├── .env.example
├── .env.markdown
├── .github
    └── workflows
    │   ├── create-release.yml
    │   └── run-tests.yml
├── .gitignore
├── Dockerfile
├── LICENSE
├── Pipfile
├── Pipfile.lock
├── README.md
├── docker-compose.yml
├── docs
    ├── README.md
    └── history4feed.png
├── history4feed
    ├── __init__.py
    ├── app
    │   ├── __init__.py
    │   ├── admin.py
    │   ├── apps.py
    │   ├── autoschema.py
    │   ├── migrations
    │   │   ├── 0001_initial.py
    │   │   ├── 0002_feed_freshness_alter_feed_feed_type.py
    │   │   ├── 0003_alter_feed_description.py
    │   │   ├── 0004_alter_fulltextjob_status_alter_job_state.py
    │   │   ├── 0005_feed_datetime_modified.py
    │   │   └── __init__.py
    │   ├── models.py
    │   ├── openapi_params.py
    │   ├── serializers.py
    │   ├── settings.py
    │   ├── tests.py
    │   ├── utils.py
    │   └── views.py
    ├── asgi.py
    ├── h4fscripts
    │   ├── __init__.py
    │   ├── build_rss.py
    │   ├── celery.py
    │   ├── exceptions.py
    │   ├── h4f.py
    │   ├── sitemap_helpers.py
    │   ├── task_helper.py
    │   ├── wayback_helpers.py
    │   └── xml_utils.py
    ├── settings.py
    ├── urls.py
    └── wsgi.py
├── manage.py
├── pyproject.toml
├── requirements.txt
├── run.sh
└── tests
    ├── README.md
    ├── __init__.py
    ├── requirements.txt
    ├── st
        ├── .env.schemathesis
        ├── __init__.py
        ├── hooks.py
        └── st.py
    ├── test_01_add_feeds.py
    ├── test_02_add_post.py
    ├── test_03_delete_post.py
    ├── test_04_delete_feed.py
    ├── test_05_post_filters.py
    ├── test_06_patch_feed.py
    ├── test_07_patch_post.py
    ├── test_99_delete_all_feeds.py
    └── utils.py


/.env.example:
--------------------------------------------------------------------------------
 1 | # POSTGRES
 2 | POSTGRES_HOST=
 3 | POSTGRES_PORT=
 4 | POSTGRES_DB=
 5 | POSTGRES_USER=
 6 | POSTGRES_PASSWORD=
 7 | #django settings
 8 | DJANGO_SECRET=
 9 | DJANGO_DEBUG=
10 | DJANGO_ALLOWED_HOSTS=
11 | DJANGO_CORS_ALLOW_ALL_ORIGINS=
12 | DJANGO_CORS_ALLOWED_ORIGINS=
13 | # CELERY
14 | CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=
15 | # SCRAPE BACKFILL SETTINGS
16 | EARLIEST_SEARCH_DATE=
17 | # PROXY
18 | SCRAPFILE_APIKEY=
19 | # SETTINGS TO AVOID RATE LIMITS
20 | WAYBACK_SLEEP_SECONDS=
21 | WAYBACK_BACKOFF_TIME=
22 | REQUEST_RETRY_COUNT=
23 | # API SETTINGS
24 | DEFAULT_PAGE_SIZE=
25 | MAX_PAGE_SIZE=
26 | # SERPER
27 | SERPER_API_KEY=


--------------------------------------------------------------------------------
/.env.markdown:
--------------------------------------------------------------------------------
 1 | # Environmental file info
 2 | 
 3 | If you're running in production, you should set these securely.
 4 | 
 5 | However, if you just want to experiment, set the following values
 6 | 
 7 | ## Django Settings
 8 | 
 9 | These are all Django settings, defined in `history4feed/settings.py`
10 | 
11 | * `DJANGO_SECRET`: `insecure_django_secret`
12 | * `DJANGO_DEBUG`: `True`
13 | * `DJANGO_ALLOWED_HOSTS`: BLANK
14 | * `DJANGO_CORS_ALLOW_ALL_ORIGINS`: `True`
15 | * `DJANGO_CORS_ALLOWED_ORIGINS`: LEAVE EMPTY
16 | 
17 | ## Postgres Settings
18 | 
19 | These are all Django settings, defined in `history4feed/settings.py`
20 | 
21 | * `POSTGRES_HOST`: `pgdb`
22 | * `POSTGRES_PORT`: BLANK
23 | * `POSTGRES_DB`: `postgres`
24 | * `POSTGRES_USER`: `postgres`
25 | * `POSTGRES_PASSWORD`: `postgres`
26 | 
27 | ## Celery settings
28 | 
29 | * `CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP`: `1`
30 | 
31 | ## history4feed API settings
32 | 
33 | These define how the API behaves.
34 | 
35 | * `MAX_PAGE_SIZE`: `50`
36 | 	* This is the maximum number of results the API will ever return before pagination
37 | * `DEFAULT_PAGE_SIZE`: `50`
38 | 	* The default page size of result returned by the API
39 | 
40 | ## Search Index Mode (Serper)
41 | 
42 | Search index mode, uses the [Serper API Key](https://serper.dev/) to scrape search results.
43 | 
44 | * `SERPER_API_KEY`
45 | 	* [Get your key here](https://serper.dev/api-key).
46 | 
47 | ## Scrape backfill settings
48 | 
49 | * `EARLIEST_SEARCH_DATE`: `2020-01-01T00:00:00Z`
50 | 	* determines how far history4feed will backfill posts for newly added feeds. e.g. `EARLIEST_SEARCH_DATE=2020-01-01T00:00:00Z` will import all posts with a publish date >= `2020-01-01T00:00:00Z`
51 | 
52 | ## Proxy settings
53 | 
54 | * `SCRAPFILE_APIKEY`: YOUR_API_KEY
55 | 	* We strongly recommend using the [ScrapFly](https://scrapfly.io/) proxy service with history4feed. Though we have no affiliation with them, it is the best proxy service we've tested and thus built in support for it to history4feed.
56 | 
57 | ## Settings to avoid rate limits if not using Scrapfly
58 | 
59 | If you're not using a Proxy it is very likely you'll run into rate limits on the WayBack Machine and the blogs you're requesting the full text from. You should therefore consider the following options
60 | 
61 | * `WAYBACK_SLEEP_SECONDS`: `45`
62 | 	* This is useful when a large amount of posts are returned. This sets the time between each request to get the full text of the article to reduce servers blocking robotic requests.
63 | * `REQUEST_RETRY_COUNT`: `3`
64 | 	* This is useful when a large amount of posts are returned. This sets the number of retries when a non-200 response is returned.


--------------------------------------------------------------------------------
/.github/workflows/create-release.yml:
--------------------------------------------------------------------------------
 1 | name: Create Release
 2 | run-name: creating release
 3 | on:
 4 |   workflow_dispatch:
 5 |   push:
 6 |     branches:
 7 |       - main
 8 | 
 9 | jobs:
10 |   create-release:
11 |     runs-on: ubuntu-latest
12 |     permissions:
13 |       contents: write
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: "3.11"
20 |       - name: Install pypa/build
21 |         run: python3 -m pip install build --user
22 | 
23 |       - name: Build a binary wheel and a source tarball
24 |         run: python3 -m build 
25 | 
26 |       - name: Make release
27 |         env:
28 |           GITHUB_TOKEN: ${{ github.token }}
29 |         run: |
30 |           REF_NAME="${{ github.ref_name }}-$(date +"%Y-%m-%d-%H-%M-%S")"
31 |           gh release create "$REF_NAME" --repo '${{ github.repository }}' --notes ""
32 |           gh release upload "$REF_NAME" dist/** --repo '${{ github.repository }}'
33 | 


--------------------------------------------------------------------------------
/.github/workflows/run-tests.yml:
--------------------------------------------------------------------------------
 1 | name: Run Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request:
 8 | 
 9 | jobs:
10 |   test-schema-thesis:
11 |     runs-on: ubuntu-latest
12 |     environment: test_pipeline    
13 | 
14 |     steps:
15 |       - name: Checkout repository
16 |         uses: actions/checkout@v4
17 |         with:
18 |           fetch-depth: 1
19 | 
20 |       - name: Set up Python
21 |         uses: actions/setup-python@v5
22 |         with:
23 |           python-version: "3.11"
24 |     
25 |           
26 |       - name: Set .env for docker-compose
27 |         run: |
28 |             echo "EARLIEST_SEARCH_DATE=$(date -u -d yesterday +'%Y-%m-%dT%H:%M:%SZ')" >> .env
29 |             echo "SCRAPFLY_APIKEY=${{secrets.SCRAPFLY_APIKEY}}" >> .env
30 | 
31 | 
32 |             cat tests/st/.env.schemathesis >> .env
33 | 
34 |             echo ==== env file start =====
35 |             cat .env
36 |             echo
37 |             echo ==== env file end   =====
38 | 
39 |          
40 |       - name: Start docker-compose
41 |         uses: hoverkraft-tech/compose-action@v2.0.2
42 |         with:
43 |           compose-file: |
44 |             docker-compose.yml
45 |           compose-flags:
46 |             --env-file .env
47 |             -p h4f-action
48 | 
49 |       - name: Get IP addresses
50 |         id: get_ip
51 |         run: |
52 |               IP_ADDRESS=$(docker network inspect -f '{{range.IPAM.Config}}{{.Gateway}}{{end}}' h4f-action_default)
53 |               echo "ip_address=$IP_ADDRESS" >> "$GITHUB_OUTPUT"
54 |               echo "IP_ADDRESS=$IP_ADDRESS" >> "$GITHUB_OUTPUT"
55 |               echo "SERVICE_BASE_URL=http://$IP_ADDRESS:8002/" >> "$GITHUB_OUTPUT"
56 |               cat "$GITHUB_OUTPUT"
57 | 
58 |       - name: Wait for server to start
59 |         run: |
60 |               RETRY_DELAY=3
61 |               RETRY_COUNT=10
62 |               echo "Waiting for server to start"
63 |               curl --retry-delay $RETRY_DELAY --retry $RETRY_COUNT --retry-connrefused ${{ steps.get_ip.outputs.SERVICE_BASE_URL }} > /dev/null
64 |               if [ $? -ne 0 ]; then
65 |                 echo "exiting after waiting $(( $RETRY_DELAY * $RETRY_COUNT )) seconds for server to start"
66 |                 exit 1
67 |               fi
68 | 
69 |     
70 | 
71 |       - name: test all endpoints 1
72 |         id: test-endpoints
73 |         run: |
74 |           pip install -r tests/requirements.txt
75 |           export SERVICE_BASE_URL="${{ steps.get_ip.outputs.SERVICE_BASE_URL }}"
76 | 
77 |           pytest tests/
78 | 
79 |       - name: run schemathesis
80 |         uses: schemathesis/action@v1
81 |         env:
82 |           SCHEMATHESIS_HOOKS: tests.st.hooks
83 |         with:
84 |           schema: ${{ steps.get_ip.outputs.SERVICE_BASE_URL }}/api/schema/
85 |           checks: all
86 |           wait-for-schema: '30'
87 |           args: '--generation-allow-x00 false --show-trace'
88 |           version: 3.39.15
89 | 
90 |       - name: test delete all
91 |         id: test-endpoints-2
92 |         run: |
93 |             pip install -r tests/requirements.txt
94 |             export DELETE_ALL_FEEDS=true
95 |             export SERVICE_BASE_URL="${{ steps.get_ip.outputs.SERVICE_BASE_URL }}"
96 |             pytest tests/test_99_delete_all_feeds.py
97 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | staticfiles
 64 | 
 65 | # Flask stuff:
 66 | instance/
 67 | .webassets-cache
 68 | 
 69 | # Scrapy stuff:
 70 | .scrapy
 71 | 
 72 | # Sphinx documentation
 73 | docs/_build/
 74 | 
 75 | # PyBuilder
 76 | .pybuilder/
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | #   For a library or package, you might want to ignore these files since the code is
 88 | #   intended to run in multiple environments; otherwise, check them in:
 89 | # .python-version
 90 | 
 91 | # pipenv
 92 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 93 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 94 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 95 | #   install all needed dependencies.
 96 | #Pipfile.lock
 97 | 
 98 | # poetry
 99 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
100 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
101 | #   commonly ignored for libraries.
102 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
103 | #poetry.lock
104 | 
105 | # pdm
106 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
107 | #pdm.lock
108 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
109 | #   in version control.
110 | #   https://pdm.fming.dev/#use-with-ide
111 | .pdm.toml
112 | 
113 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
114 | __pypackages__/
115 | 
116 | # Celery stuff
117 | celerybeat-schedule
118 | celerybeat.pid
119 | 
120 | # SageMath parsed files
121 | *.sage.py
122 | 
123 | # Environments
124 | .env
125 | .venv
126 | env/
127 | venv/
128 | ENV/
129 | env.bak/
130 | venv.bak/
131 | 
132 | # Spyder project settings
133 | .spyderproject
134 | .spyproject
135 | 
136 | # Rope project settings
137 | .ropeproject
138 | 
139 | # mkdocs documentation
140 | /site
141 | 
142 | # mypy
143 | .mypy_cache/
144 | .dmypy.json
145 | dmypy.json
146 | 
147 | # Pyre type checker
148 | .pyre/
149 | 
150 | # pytype static type analyzer
151 | .pytype/
152 | 
153 | # Cython debug symbols
154 | cython_debug/
155 | 
156 | # PyCharm
157 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
158 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
159 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
160 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
161 | #.idea/
162 | 
163 | config*
164 | 
165 | # ignore venv in config
166 | 
167 | history4feed-venv/
168 | 
169 | # ignore created dirs with generated data
170 | 
171 | logs/
172 | output/
173 | 
174 | # mac files
175 | .DS_Store
176 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11
2 | ENV PYTHONUNBUFFERED=1
3 | WORKDIR /usr/src/app
4 | COPY requirements.txt ./
5 | RUN pip install -r requirements.txt
6 | 
7 | COPY . /usr/src/app


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright 2020 DOGESEC (https://www.dogesec.com/)
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | readability-lxml = "*"
 8 | python-dateutil = "*"
 9 | brotlipy = "*"
10 | python-dotenv = "*"
11 | djangorestframework = "*"
12 | drf-spectacular = "*"
13 | lxml_html_clean = "*"
14 | celery = "*"
15 | redis = "*"
16 | psycopg2-binary = "*"
17 | gunicorn = "*"
18 | django-filter = "*"
19 | requests = "*"
20 | fake-useragent = "==1.5.1"
21 | 
22 | [dev-packages]
23 | autopep8 = "*"
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # history4feed
 2 | 
 3 | [![codecov](https://codecov.io/gh/muchdogesec/history4feed/graph/badge.svg?token=3Z5LELB8OP)](https://codecov.io/gh/muchdogesec/history4feed)
 4 | 
 5 | ## Overview
 6 | 
 7 | ![](docs/history4feed.png)
 8 | 
 9 | It is common for feeds (RSS or XML) to only include a limited number of posts. I generally see the latest 3 - 5 posts of a blog in a feed. For blogs that have been operating for years, this means potentially thousands of posts are missed.
10 | 
11 | There is no way to page through historic articles using an RSS or ATOM feed (they were not designed for this), which means the first poll of the feed will only contain the limited number of articles in the feed. This limit is defined by the blog owner.
12 | 
13 | history4feed can be used to create a complete history for a blog and output it as an RSS feed.
14 | 
15 | history4feed offers an API interface that;
16 | 
17 | 1. takes an RSS / ATOM feed URL
18 | 2. downloads a Wayback Machine archive for the feed
19 | 3. identified all unique blog posts in the historic feeds downloaded
20 | 4. downloads a HTML version of the article content on each page
21 | 5. stores the post record in the databases
22 | 6. exposes the posts as JSON or XML RSS
23 | 
24 | ## tl;dr
25 | 
26 | [![history4feed](https://img.youtube.com/vi/z1ATbiecbg4/0.jpg)](https://www.youtube.com/watch?v=z1ATbiecbg4)
27 | 
28 | [Watch the demo](https://www.youtube.com/watch?v=z1ATbiecbg4).
29 | 
30 | ## Install
31 | 
32 | ### Download and configure
33 | 
34 | ```shell
35 | # clone the latest code
36 | git clone https://github.com/muchdogesec/history4feed
37 | ```
38 | 
39 | ### Configuration options
40 | 
41 | history4feed has various settings that are defined in an `.env` file.
42 | 
43 | To create a template for the file:
44 | 
45 | ```shell
46 | cp .env.example .env
47 | ```
48 | 
49 | To see more information about how to set the variables, and what they do, read the `.env.markdown` file.
50 | 
51 | ### Build the Docker Image
52 | 
53 | ```shell
54 | sudo docker compose build
55 | ```
56 | 
57 | ### Start the server
58 | 
59 | ```shell
60 | sudo docker compose up
61 | ```
62 | 
63 | ### Access the server
64 | 
65 | The webserver (Django) should now be running on: http://127.0.0.1:8002/
66 | 
67 | You can access the Swagger UI for the API in a browser at: http://127.0.0.1:8002/api/schema/swagger-ui/
68 | 
69 | ## Useful supporting tools
70 | 
71 | * [Full Text, Full Archive RSS Feeds for any Blog](https://www.dogesec.com/blog/full_text_rss_atom_blog_feeds/)
72 | * [An up-to-date list of threat intel blogs that post cyber threat intelligence research](https://github.com/muchdogesec/awesome_threat_intel_blogs)
73 | * [Donate to the Wayback Machine](https://archive.org/donate)
74 | 
75 | ## Support
76 | 
77 | [Minimal support provided via the DOGESEC community](https://community.dogesec.com/).
78 | 
79 | ## License
80 | 
81 | [Apache 2.0](/LICENSE).


--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
 1 | services:
 2 |     django:
 3 |         image: history4feed
 4 |         build: .
 5 |         command: >
 6 |                 bash -c "
 7 |                     python manage.py collectstatic --no-input &&
 8 |                         python manage.py makemigrations &&
 9 |                                 python manage.py migrate &&
10 |                                     gunicorn history4feed.wsgi:application --bind 0.0.0.0:8002 --reload
11 |                         "
12 |         volumes:
13 |             - .:/usr/src/app/
14 |         ports:
15 |             - 8002:8002
16 |         environment:
17 |             - DEBUG=1
18 |             - CELERY_BROKER_URL=redis://redis:6379/0
19 |         env_file:
20 |             - ./.env
21 |         depends_on:
22 |             pgdb:
23 |                 condition: service_healthy
24 |             redis:
25 |                 condition: service_started
26 |     celery:
27 |         image: history4feed
28 |         build: .
29 |         command: >
30 |                 bash -c "
31 |                   celery -A history4feed.h4fscripts worker -l INFO
32 |                   "
33 |         volumes:
34 |             - .:/usr/src/app
35 |         environment:
36 |             - DEBUG=1
37 |             - CELERY_BROKER_URL=redis://redis:6379/0
38 |             - result_backend=redis://redis:6379/1
39 |         env_file:
40 |             - ./.env
41 |         depends_on:
42 |             - django
43 |             - redis
44 |     pgdb:
45 |         image: postgres
46 |         env_file:
47 |           - ./.env
48 |         volumes:
49 |             - pgdata:/var/lib/postgresql/data/
50 |         healthcheck:
51 |             test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER} -d ${POSTGRES_DB}"]
52 |             interval: 10s
53 |             retries: 5
54 |             start_period: 30s
55 |             timeout: 10s
56 |     redis:
57 |         image: "redis:alpine"
58 | volumes:
59 |     pgdata:


--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
  1 | ## Basics of RSS
  2 | 
  3 | RSS stands for Really Simple Syndication. Simply put, RSS is a standardized format using a computer (and human) readable format that shows what has changed for a website, and is especially used by blogs, podcasts, news sites, etc, for this reason.
  4 | 
  5 | Here is a sample of an RSS feed from The Record by the Recorded Future team; `https://therecord.media/feed/`.
  6 | 
  7 | Note, in many cases a blog will clearly show their RSS (or ATOM) feed URL, but not all. Whilst not all blogs have RSS feeds, if you open up a browser, navigate to the blog, and click view page source, you can usually find the feed address under the `link rel="alternate" type="application/rss+xml"` or `application/atom+xml` HTML tag.
  8 | 
  9 | Here's an example...
 10 | 
 11 | ```shell
 12 | curl "https://krebsonsecurity.com/" > demo_1.html
 13 | ```
 14 | 
 15 | ```html
 16 | <link rel="alternate" type="application/rss+xml" title="Krebs on Security &raquo; Feed" href="https://krebsonsecurity.com/feed/" />
 17 | <link rel="alternate" type="application/rss+xml" title="Krebs on Security &raquo; Comments Feed" href="https://krebsonsecurity.com/comments/feed/" />
 18 | ```
 19 | 
 20 | Note, you might see more than one feed, above one is for posts, the other for blog comments.
 21 | 
 22 | It's not always that simple to detect the feed URL...
 23 | 
 24 | The Recorded Future Record RSS feed;
 25 | 
 26 | ```shell
 27 | curl "https://therecord.media/news" > demo_2.html
 28 | ```
 29 | 
 30 | Is nestled in custom properties...
 31 | 
 32 | ```js
 33 | "rssLink":{"id":12,"target":"_blank","externalUrl":"https://therecord.media/feed/"
 34 | ```
 35 | 
 36 | Sometimes a feed will exist, but is not exposed in the HTML (in which case you can try and guess the URL pattern for it). Some blogs just have no feeds.
 37 | 
 38 | In some cases, a blog will also have feeds per category (vs getting the entire blog, which you might not always want), which you can find using the category/tag/etc, URL. e.g.
 39 | 
 40 | ```shell
 41 | curl "https://blogs.infoblox.com/category/cyber-threat-intelligence/" > demo_3.html
 42 | ```
 43 | 
 44 | ```html
 45 | <link rel="alternate" type="application/rss+xml" title="Infoblox Blog &raquo; Feed" href="https://blogs.infoblox.com/feed/" />
 46 | <link rel="alternate" type="application/rss+xml" title="Infoblox Blog &raquo; Comments Feed" href="https://blogs.infoblox.com/comments/feed/" />
 47 | <link rel="alternate" type="application/rss+xml" title="Infoblox Blog &raquo; Cyber Threat Intelligence Category Feed" href="https://blogs.infoblox.com/category/cyber-threat-intelligence/feed/" />
 48 | ```
 49 | 
 50 | Generally an RSS feed has an XML structure containing at least the following items;
 51 | 
 52 | ```xml
 53 | <?xml version="1.0" encoding="UTF-8" ?>
 54 | <rss version="2.0">
 55 | 
 56 | <channel>
 57 |   <title>W3Schools Home Page</title>
 58 |   <link>https://www.w3schools.com</link>
 59 |   <description>Free web building tutorials</description>
 60 |   <item>
 61 |     <title>RSS Tutorial</title>
 62 |     <link>https://www.w3schools.com/xml/xml_rss.asp</link>
 63 |     <description>New RSS tutorial on W3Schools</description>
 64 |     <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
 65 |   </item>
 66 |   <item>
 67 |     <title>XML Tutorial</title>
 68 |     <link>https://www.w3schools.com/xml</link>
 69 |     <description>New XML tutorial on W3Schools</description>
 70 |     <pubDate>Tue, 10 Jun 2003 11:34:12 GMT</pubDate>
 71 |   </item>
 72 | </channel>
 73 | 
 74 | </rss>
 75 | ``` 
 76 | 
 77 | The `<channel>` tags capture the entire feed including metadata about the feed (`title`, `link`, and `description` in this case). There are many other optional elements that can be included in the `<channel>` tags, [as defined here](https://www.rssboard.org/rss-specification).
 78 | 
 79 | Each article in the feed is defined inside each `<item>` tag with sub-elements, generally the most important being:
 80 | 
 81 | * `title`: The title of the post / article
 82 | * `link`: The URL of the post / article
 83 | * `description`: The article content
 84 | * `pubDate`: The date the article was published
 85 | 
 86 | There are many other optional elements that can be included in the `<item>` tags, [as defined here](https://www.rssboard.org/rss-specification).
 87 | 
 88 | ## Basics of ATOM
 89 | 
 90 | Atom is a similar format to RSS and used for the same reasons. It is a slightly newer format than XML (although almost 20 years old) and designed to cover some of the shortcomings of RSS.
 91 | 
 92 | Here is a sample of an ATOM feed from the 0patch blog...
 93 | 
 94 | ```shell
 95 | curl "https://blog.0patch.com/" > demo_4.html
 96 | ```
 97 | 
 98 | ```html
 99 | <link rel="alternate" type="application/atom+xml" title="0patch Blog - Atom" href="https://blog.0patch.com/feeds/posts/default" />
100 | <link rel="alternate" type="application/rss+xml" title="0patch Blog - RSS" href="https://blog.0patch.com/feeds/posts/default?alt=rss" />
101 | <link rel="service.post" type="application/atom+xml" title="0patch Blog - Atom" href="https://www.blogger.com/feeds/7114610046316422325/posts/default" />
102 | ```
103 | 
104 | Note, an RSS version is also available above; `application/rss+xml` vs `application/atom+xml`.
105 | 
106 | An ATOM feed has a similar XML structure to RSS, however, you will notice some of the element names are different.
107 | 
108 | ```xml
109 |   <?xml version="1.0" encoding="utf-8"?>
110 |    <feed xmlns="http://www.w3.org/2005/Atom">
111 | 
112 |      <title>Example Feed</title>
113 |      <link href="http://example.org/"/>
114 |      <updated>2003-12-13T18:30:02Z</updated>
115 |      <author>
116 |        <name>John Doe</name>
117 |      </author>
118 |      <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
119 | 
120 |      <entry>
121 |        <title>Atom-Powered Robots Run Amok</title>
122 |        <link href="http://example.org/2003/12/13/atom03"/>
123 |        <id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
124 |        <published>2003-12-13T18:30:02Z</published>
125 |        <updated>2003-12-13T18:30:02Z</updated>
126 |        <title>Something</title>
127 |        <content>Some text.</content>
128 |      </entry>
129 |    </feed>
130 | ```
131 | 
132 | The blog information is captured at the top of the document.
133 | 
134 | Each article in the feed is defined inside each `<entry>` tag with sub-elements, generally the most important being:
135 | 
136 | * `title`: The title of the post / article
137 | * `id`: The UUID of the post
138 | * `link`: The URL of the post / article
139 | * `published`: The date the article was published
140 | * `content`: The article content
141 | 
142 | There are many other optional elements that can be included in the `<item>` tags, [as defined here](https://validator.w3.org/feed/docs/atom.html).
143 | 
144 | ## The solution
145 | 
146 | There are two ways I came up with to get historic posts from a blog;
147 | 
148 | 1. Scrape the blog for historic posts. This is the most accurate way to do it, though given the different structure of blogs and websites, this can become complex, requiring a fair bit of manual scraping logic to be written for each blog you want to follow
149 | 2. [Inspired by this Reddit thread](https://www.reddit.com/r/webscraping/comments/zxduid/python_library_to_scrape_rssfeeds_from/), use the Wayback Machine's archive. Often the Wayback Machine will have captured snapshots of a feed (though not always). For example, `https://therecord.media/feed/` has been captured [187 times between November 1, 2020 and August 12, 2022](https://web.archive.org/web/20220000000000*/https://therecord.media/feed/).
150 | 
151 | Whilst the Wayback Machine will completely miss some blog archives, a particular problem for smaller sites that are less likely to be regularly indexed by the WBM), and potentially miss certain feed items where the RSS feed updates faster the WBM re-indexes the site, I chose this approach as it is currently the most scalable way I could come up with to backfill history (and most of the requirements for my use-cases were from high profile sites with a fairly small publish rate).
152 | 
153 | [Waybackpack](https://github.com/jsvine/waybackpack) is a command-line tool that lets you download the entire Wayback Machine archive for a given URL for this purpose.
154 | 
155 | Here is an example of how to use it with The Record Feed;
156 | 
157 | ```shell
158 | python3 -m venv tutorial_env
159 | source tutorial_env/bin/activate
160 | pip3 install waybackpack
161 | waybackpack https://therecord.media/feed/ -d ~/Downloads/therecord_media_feed --from-date 2015 --uniques-only  
162 | ```
163 | 
164 | In the above command I am requesting all unique feed pages downloaded by the Wayback Machine (`--uniques-only `) from 2015 (`--from-date 2015`) from the feed URL (`https://therecord.media/feed/`)
165 | 
166 | Which produces about 100 unique `index.html` files (where `index.html` is the actual RSS feed). They are nested in folders named with the index datetime (time captured by WBM) in the format `YYYYMMDDHHMMSS` like so;
167 | 
168 | ```
169 | 
170 | ~/Downloads/therecord_media_feed
171 | ├── 20220808162900
172 | │   └── therecord.media
173 | │       └── feed
174 | │           └── index.html
175 | ├── 20220805213430
176 | │   └── therecord.media
177 | │       └── feed
178 | │           └── index.html
179 | ...
180 | └── 20201101220102
181 |     └── therecord.media
182 |         └── feed
183 |             └── index.html
184 | ```
185 | 
186 | It is important to point out unique entries just mean the `index.html` files have at least one difference. That is to say, much of the file can actually be the same (and include the same articles). Also whilst saved as .html documents, the content is actually pure .xml.
187 | 
188 | Take `20220808162900 > therecord.media > index.html` and `20220805213430 > therecord.media > index.html`
189 | 
190 | Both of these files contain the same item;
191 | 
192 | ```xml
193 | <item>
194 |     <title>Twitter confirms January breach, urges pseudonymous accounts to not add email or phone number</title>
195 |     <link>https://therecord.media/twitter-confirms-january-breach-urges-pseudonymous-accounts-to-not-add-email-or-phone-number/</link>
196 | ```
197 | 
198 | history4feed looks at all unique `<link>` elements in the downloaded `index.html` files to find the unique `<items>`s.
199 | 
200 | Note, this blog is in RSS format. 
201 | 
202 | Here's another example, this time using an ATOM feed as an example;
203 | 
204 | ```shell
205 | waybackpack https://www.schneier.com/feed/atom/ -d ~/Downloads/schneier_feed --from-date 2015 --uniques-only  
206 | ```
207 | 
208 | Looking at a snippet from one of the `index.html` files;
209 | 
210 | ```xml
211 |     <entry>
212 |         <author>
213 |             <name>Bruce Schneier</name>
214 |         </author>
215 |         <title type="html"><![CDATA[Friday Squid Blogging: Vegan Chili Squid]]></title>
216 |         <link rel="alternate" type="text/html" href="https://www.schneier.com/blog/archives/2021/01/friday-squid-blogging-vegan-chili-squid.html" />
217 |         <id>https://www.schneier.com/?p=60711</id>
218 |         <updated>2021-01-04T16:50:54Z</updated>
219 |         <published>2021-01-22T22:19:15Z</published>
220 | ```
221 | 
222 | Here, history4feed looks at the `<link href` value to find the unique entries between each `index.html`.
223 | 
224 | ## Dealing with partial content in feeds
225 | 
226 | The `description` field (in RSS feeds) and `content` field (in ATOM feeds) can contain the entirety of the raw article, including the html formatting. You can see this in The Record's RSS feed. Sometime the HTML content is decoded or encoded.
227 | 
228 | However, some blogs choose to use snippets in their RSS feed content. For example, choosing only to include the first paragraph - requiring a subscriber to read the full content outside of their feed aggregator.
229 | 
230 | I wanted to include a full-text feed in the historical output created by history4feed.
231 | 
232 | To do this, once a historical feed is created, the feed is passed to the [readability-lxml library](https://pypi.org/project/readability-lxml/).
233 | 
234 | history4feed takes all the source URLs (either `<entry.link href>` property value (ATOM) in or `<item.link>` tags (RSS)) for the articles in the feeds and passes them to readability-lxml.
235 | 
236 | The result is then reprinted in the `description` or `content` field depending on feed type, overwriting the potentially partial content that it originally contained.
237 | 
238 | Note, history4feed cannot detect if a feed is full or partial so will always request the full content for all items via readability-lxml, regardless of whether the feed content is partial or full.
239 | 
240 | ## Dealing with encoding in post content
241 | 
242 | For ATOM properties;
243 | 
244 | * `title`: The title of the post / article
245 | * `description`: The article content
246 | 
247 | And for RSS properties;
248 | 
249 | * `title`: The title of the post / article
250 | * `content`: The article content
251 | 
252 | The data is typically printed in one of three ways, either;
253 | 
254 | * Encoded: e.g. contains `&gt` vs `>`
255 | * Decoded Raw: standard HTML tags
256 | * Decoded CDATA: the actual Decoded Raw HTML is inside `<![CDATA[Decoded Raw HTML]]>` tags
257 | 
258 | As an example, endcoded
259 | 
260 | ```html
261 | &gt;img src=&quot;https://cms.therecord.media/uploads/2023_0706_Ransomware_Tracker_Most_Prolific_Groups_6a567c11da.jpg&quot;&lt;
262 | ```
263 | 
264 | Which as decoded raw html looks as follows
265 | 
266 | ```html
267 | <img src="https://cms.therecord.media/uploads/2023_0706_Ransomware_Tracker_Most_Prolific_Groups_6a567c11da.jpg">
268 | ```
269 | 
270 | Which as decoded CDATA looks like
271 | 
272 | ```html
273 | <![CDATA[<img src="https://cms.therecord.media/uploads/2023_0706_Ransomware_Tracker_Most_Prolific_Groups_6a567c11da.jpg">]]>
274 | ```
275 | 
276 | In the responses provided by history4feed, the XML endpoint will return encoded HTML, the JSON response will return decoded HTML.
277 | 
278 | ## Live feed data (data not from WBM)
279 | 
280 | In addition to the historical feed information pulled by the Wayback Machine, history4feed also includes the latest posts in the live feed URL.
281 | 
282 | Live feed data always takes precedence. history4feed will remove duplicate entries found in the Wayback Machine response also present in the live feed, and will instead use the live feed version by default.
283 | 
284 | ## Rebuilding the feed (for output XML API output)
285 | 
286 | history4feed stores data in the database as JSON.
287 | 
288 | However, to support an RSS XML API endpoint (that can be used with a feed reader), history4feed converts all feeds and their content into a single RSS formatted XML file at request time.
289 | 
290 | RSS is always the output, regardless of wether input was ATOM or RSS.
291 | 
292 | The RSS files for each feed contain a simplified header;
293 | 
294 | ```xml
295 | <?xml version="1.0" encoding="UTF-8" ?>
296 |     <rss version="2.0">
297 |         <channel>
298 |             <title>CHANNEL.TITLE (RSS) / FEED.TITLE (ATOM)</title>
299 |             <description>CHANNEL.DESCRIPTION (RSS) / FEED.SUBTITLE (ATOM)</description>
300 |             <link>FEED URL ENTERED BY USER</link>
301 |             <lastBuildDate>SCRIPT EXECUTION TIME</lastBuildDate>
302 |             <generator>https://www.github.com/history4feed</generator>
303 |             <ITEMS></ITEMS>
304 |         </channel>
305 |     </rss>
306 | ```
307 | 
308 | Each item the be printed between `<ITEMS></ITEMS>` tags is rebuilt as follows;
309 | 
310 | ```xml
311 |             <item>
312 |                 <title>CHANNEL.ITEM.TITLE (RSS) / FEED.ENTRY.TITLE (ATOM)</title>
313 |                 <description>CHANNEL.ITEM.DESCRIPTION (RSS) / FEED.ENTRY.CONTENT (ATOM) EITHER ENCODED OR DECODED BASED ON USER SETTING -- THIS IS THE FULL BLOG POST AFTER FULL TEXT EXTRACTED</description>
314 |                 <link>CHANNEL.ITEM.LINK (RSS) / FEED.ENTRY.LINK (ATOM)</link>
315 |                 <pubDate>CHANNEL.ITEM.PUBDATE (RSS) / FEED.ENTRY.PUBLISHED (ATOM)</pubDate>
316 |                 <author>CHANNEL.ITEM.AUTHOR (RSS) / FEED.ENTRY.AUTHOR (ATOM)</author>
317 |                 <category>CHANNEL.ITEM.CATERGORY [N] (RSS) / FEED.ENTRY.CATEGORY [N] (ATOM)</category>
318 |                 <category>CHANNEL.ITEM.CATERGORY [N] (RSS) / FEED.ENTRY.CATEGORY [N] (ATOM)</category>
319 |             </item>
320 | ```
321 | 
322 | ## Dealing with feed validation on input
323 | 
324 | ATOM feeds are XML documents. ATOM feeds can be validated by checking for the header tags where `<feed` tag contains the text `atom` somewhere inside it, e.g. https://www.schneier.com/feed/atom/
325 | 
326 | RSS feeds are very similar to ATOM in many ways. RSS feeds can be validated as they contain an `<rss>` tag in the header of the document. e.g. https://www.hackread.com/feed/
327 | 
328 | Feeds are validated to ensure they contain this data before any processing is carries out.
329 | 
330 | For example, the source of https://github.com/signalscorps/history4feed/ does not show an RSS or ATOM feed, so would return an error.
331 | 
332 | ## Dealing with IP throttling during full text requests
333 | 
334 | Many sites will stop robotic request to their content. As the full text function of history4feed relies on accessing each blog post individually this can result in potentially thousands of requests to the Wayback Machine and which have a high risk of being blocked.
335 | 
336 | history4feed has two potential workarounds to solve this problem;
337 | 
338 | ### 1. Use a proxy
339 | 
340 | history4feed supports the use of [ScrapFly](https://scrapfly.io/).
341 | 
342 | This is a paid service ([with a free tier](https://scrapfly.io/pricing)). In my own research, its the best proxy for web scraping.
343 | 
344 | You will need to register for an account and grab your API key.
345 | 
346 | Note, due to many site blocking access to Russian IPs, the request includes the following proxy locations only;
347 | 
348 | ```shell
349 | country=us,ca,mx,gb,fr,de,au,at,be,hr,cz,dk,ee,fi,ie,se,es,pt,nl
350 | ```
351 | 
352 | ### 2. Use inbuilt app settings
353 | 
354 | It's best to request only what you need, and also slow down the rate at which the content is requested (so the request look more like a human).
355 | 
356 | history4feed supports the following options;
357 | 
358 | * sleep times: sets the time between each request to get the full post text
359 | * time range: an earliest and latest post time can be set, reducing the number of items returned in a single script run. Similarly, you can reduce the content by ignoring entries in the live feed.
360 | * retries: by default, when in full text mode history4feed will retry the page a certain number of times in case of error. If it still fails after retries count reached, the script will fail. You can change the retries as you require.
361 | 
362 | ## A note on error handling
363 | 
364 | Due to the way old feeds are pulled from WBM, it is likely some will now be deleted (404s). Similarly, the site might reject requests (403's -- see proxy use as a solution to this).
365 | 
366 | history4feed will soft handle these errors and log the failure, including the HTTP status code and the particular URL that failed. You can view the logs for each run in the `logs/` directory.
367 | 
368 | This means that if it's required you can go back and get this post manually. However, one limitation of soft error handling is you won't be able to do this using the same history4feed install though.


--------------------------------------------------------------------------------
/docs/history4feed.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/docs/history4feed.png


--------------------------------------------------------------------------------
/history4feed/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/history4feed/__init__.py


--------------------------------------------------------------------------------
/history4feed/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/history4feed/app/__init__.py


--------------------------------------------------------------------------------
/history4feed/app/admin.py:
--------------------------------------------------------------------------------
1 | from django.contrib import admin
2 | 
3 | # Register your models here.
4 | 


--------------------------------------------------------------------------------
/history4feed/app/apps.py:
--------------------------------------------------------------------------------
1 | from django.apps import AppConfig
2 | 
3 | 
4 | class AppConfig(AppConfig):
5 |     default_auto_field = 'django.db.models.BigAutoField'
6 |     name = 'history4feed.app'
7 |     label = 'history4feed'
8 | 


--------------------------------------------------------------------------------
/history4feed/app/autoschema.py:
--------------------------------------------------------------------------------
 1 | from drf_spectacular.openapi import AutoSchema
 2 | from rest_framework.serializers import Serializer
 3 | from rest_framework.views import exception_handler
 4 | from rest_framework.exceptions import ValidationError
 5 | from django.core import exceptions
 6 | from dogesec_commons.utils.autoschema import CustomAutoSchema
 7 | 
 8 | class H4FSchema(CustomAutoSchema):
 9 |     def _is_list_view(self, serializer: Serializer | type[Serializer] | None = None) -> bool:
10 |         if self.path.endswith("/xml/"):
11 |             return True
12 |         return super()._is_list_view(serializer)


--------------------------------------------------------------------------------
/history4feed/app/migrations/0001_initial.py:
--------------------------------------------------------------------------------
 1 | # Generated by Django 5.0.9 on 2025-02-14 09:30
 2 | 
 3 | import django.db.models.deletion
 4 | import history4feed.app.models
 5 | import uuid
 6 | from django.db import migrations, models
 7 | 
 8 | 
 9 | class Migration(migrations.Migration):
10 | 
11 |     initial = True
12 | 
13 |     dependencies = [
14 |     ]
15 | 
16 |     operations = [
17 |         migrations.CreateModel(
18 |             name='Category',
19 |             fields=[
20 |                 ('name', history4feed.app.models.SlugField(max_length=1000, primary_key=True, serialize=False)),
21 |             ],
22 |         ),
23 |         migrations.CreateModel(
24 |             name='Feed',
25 |             fields=[
26 |                 ('id', models.UUIDField(help_text='UUID of feed generated by history4feed', primary_key=True, serialize=False)),
27 |                 ('title', models.CharField(help_text='found in the <channel> of RSS output. Is always kept up to date with the latest feed import values for this property.', max_length=1000)),
28 |                 ('description', models.CharField(help_text='found in the <channel> of RSS output. Is always kept up to date with the latest feed import values for this property.', max_length=10240)),
29 |                 ('url', models.URLField(help_text='\nThe URL of the RSS or ATOM feed\n\nNote this will be validated to ensure the feed is in the correct format.\n', max_length=1000, unique=True, validators=[history4feed.app.models.normalize_url])),
30 |                 ('earliest_item_pubdate', models.DateTimeField(help_text='pubdate of earliest post', null=True)),
31 |                 ('latest_item_pubdate', models.DateTimeField(help_text='pubdate of latest post', null=True)),
32 |                 ('datetime_added', models.DateTimeField(auto_now_add=True, help_text='date feed entry was added to database')),
33 |                 ('feed_type', models.CharField(choices=[('rss', 'Rss'), ('atom', 'Atom'), ('skeleton', 'Skeleton')], editable=False, help_text='type of feed', max_length=12)),
34 |                 ('pretty_url', models.URLField(default=None, max_length=1000, null=True)),
35 |             ],
36 |         ),
37 |         migrations.CreateModel(
38 |             name='Job',
39 |             fields=[
40 |                 ('id', models.UUIDField(default=uuid.uuid4, help_text='UUID of job', primary_key=True, serialize=False)),
41 |                 ('state', models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('success', 'Success'), ('failed', 'Failed')], default='pending', help_text='state of the job', max_length=12)),
42 |                 ('run_datetime', models.DateTimeField(auto_now_add=True, help_text='time job was executed')),
43 |                 ('earliest_item_requested', models.DateTimeField(help_text='shows the earliest time for posts requested. Useful for when jobs are run to see if the time range it runs across is expected', null=True)),
44 |                 ('latest_item_requested', models.DateTimeField(help_text='shows the latest time for posts requested', null=True)),
45 |                 ('info', models.CharField(help_text='contains a useful summary of the job (e.g. number of posts retrieved, errors logged)', max_length=10240)),
46 |                 ('include_remote_blogs', models.BooleanField(default=False)),
47 |                 ('feed', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='history4feed.feed')),
48 |             ],
49 |         ),
50 |         migrations.CreateModel(
51 |             name='Post',
52 |             fields=[
53 |                 ('id', models.UUIDField(help_text='UUID of items generated by history4feed', primary_key=True, serialize=False)),
54 |                 ('datetime_added', models.DateTimeField(auto_now_add=True)),
55 |                 ('datetime_updated', models.DateTimeField(auto_now=True)),
56 |                 ('title', models.CharField(help_text='found in the <item> element of feed output', max_length=1000)),
57 |                 ('description', models.CharField(blank=True, help_text='found in the <item> element of feed output', max_length=2097152)),
58 |                 ('link', models.URLField(help_text='link to full article. found in the <item> element of feed output', max_length=1000, validators=[history4feed.app.models.normalize_url])),
59 |                 ('pubdate', models.DateTimeField(help_text='date of publication.')),
60 |                 ('author', models.CharField(blank=True, help_text='author of the post', max_length=1000, null=True)),
61 |                 ('is_full_text', models.BooleanField(default=False, help_text='if full text has been retrieved')),
62 |                 ('content_type', models.CharField(default='plain/text', help_text='content type of the description', max_length=200)),
63 |                 ('added_manually', models.BooleanField(default=False)),
64 |                 ('deleted_manually', models.BooleanField(default=False, help_text='this post is hidden from user')),
65 |                 ('categories', models.ManyToManyField(blank=True, help_text='categories of the post', related_name='posts', to='history4feed.category')),
66 |                 ('feed', models.ForeignKey(help_text='feed id this item belongs too', on_delete=django.db.models.deletion.CASCADE, related_name='posts', to='history4feed.feed')),
67 |             ],
68 |         ),
69 |         migrations.CreateModel(
70 |             name='FulltextJob',
71 |             fields=[
72 |                 ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
73 |                 ('status', models.CharField(choices=[('retrieved', 'Retrieved'), ('skipped', 'Skipped'), ('failed', 'Failed'), ('retrieving', 'Retrieving')], default='retrieving', max_length=15)),
74 |                 ('error_str', models.CharField(blank=True, max_length=1500, null=True)),
75 |                 ('link', models.CharField(max_length=1500)),
76 |                 ('job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='fulltext_jobs', to='history4feed.job')),
77 |                 ('post', models.ForeignKey(null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='fulltext_jobs', to='history4feed.post')),
78 |             ],
79 |         ),
80 |         migrations.AddConstraint(
81 |             model_name='post',
82 |             constraint=models.UniqueConstraint(fields=('link', 'feed'), name='unique_link_by_feed'),
83 |         ),
84 |     ]
85 | 


--------------------------------------------------------------------------------
/history4feed/app/migrations/0002_feed_freshness_alter_feed_feed_type.py:
--------------------------------------------------------------------------------
 1 | # Generated by Django 5.1.6 on 2025-02-24 12:48
 2 | 
 3 | from django.db import migrations, models
 4 | 
 5 | 
 6 | class Migration(migrations.Migration):
 7 | 
 8 |     dependencies = [
 9 |         ('history4feed', '0001_initial'),
10 |     ]
11 | 
12 |     operations = [
13 |         migrations.AddField(
14 |             model_name='feed',
15 |             name='freshness',
16 |             field=models.DateTimeField(default=None, null=True),
17 |         ),
18 |         migrations.AlterField(
19 |             model_name='feed',
20 |             name='feed_type',
21 |             field=models.CharField(choices=[('rss', 'Rss'), ('atom', 'Atom'), ('skeleton', 'Skeleton'), ('search_index', 'Search Index')], editable=False, help_text='type of feed', max_length=12),
22 |         ),
23 |     ]
24 | 


--------------------------------------------------------------------------------
/history4feed/app/migrations/0003_alter_feed_description.py:
--------------------------------------------------------------------------------
 1 | # Generated by Django 5.1.6 on 2025-03-28 10:32
 2 | 
 3 | from django.db import migrations, models
 4 | 
 5 | 
 6 | class Migration(migrations.Migration):
 7 | 
 8 |     dependencies = [
 9 |         ('history4feed', '0002_feed_freshness_alter_feed_feed_type'),
10 |     ]
11 | 
12 |     operations = [
13 |         migrations.AlterField(
14 |             model_name='feed',
15 |             name='description',
16 |             field=models.CharField(default=None, help_text='found in the <channel> of RSS output. Is always kept up to date with the latest feed import values for this property.', max_length=10240, null=True),
17 |         ),
18 |     ]
19 | 


--------------------------------------------------------------------------------
/history4feed/app/migrations/0004_alter_fulltextjob_status_alter_job_state.py:
--------------------------------------------------------------------------------
 1 | # Generated by Django 5.1.6 on 2025-05-02 13:03
 2 | 
 3 | from django.db import migrations, models
 4 | 
 5 | 
 6 | class Migration(migrations.Migration):
 7 | 
 8 |     dependencies = [
 9 |         ('history4feed', '0003_alter_feed_description'),
10 |     ]
11 | 
12 |     operations = [
13 |         migrations.AlterField(
14 |             model_name='fulltextjob',
15 |             name='status',
16 |             field=models.CharField(choices=[('retrieved', 'Retrieved'), ('skipped', 'Skipped'), ('cancelled', 'Cancelled'), ('failed', 'Failed'), ('retrieving', 'Retrieving')], default='retrieving', max_length=15),
17 |         ),
18 |         migrations.AlterField(
19 |             model_name='job',
20 |             name='state',
21 |             field=models.CharField(choices=[('pending', 'Pending'), ('running', 'Running'), ('success', 'Success'), ('cancelled', 'Cancelled'), ('failed', 'Failed')], default='pending', help_text='state of the job', max_length=12),
22 |         ),
23 |     ]
24 | 


--------------------------------------------------------------------------------
/history4feed/app/migrations/0005_feed_datetime_modified.py:
--------------------------------------------------------------------------------
 1 | # Generated by Django 5.1.6 on 2025-05-02 14:53
 2 | 
 3 | from django.db import migrations, models
 4 | 
 5 | 
 6 | class Migration(migrations.Migration):
 7 | 
 8 |     dependencies = [
 9 |         ('history4feed', '0004_alter_fulltextjob_status_alter_job_state'),
10 |     ]
11 | 
12 |     operations = [
13 |         migrations.AddField(
14 |             model_name='feed',
15 |             name='datetime_modified',
16 |             field=models.DateTimeField(default=None, help_text='date feed entry was edited in the database', null=True),
17 |         ),
18 |     ]
19 | 


--------------------------------------------------------------------------------
/history4feed/app/migrations/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/history4feed/app/migrations/__init__.py


--------------------------------------------------------------------------------
/history4feed/app/models.py:
--------------------------------------------------------------------------------
  1 | from textwrap import dedent
  2 | from typing import Iterable
  3 | from urllib.parse import urlparse
  4 | import uuid
  5 | from .settings import history4feed_server_settings
  6 | from django.db import models
  7 | from rest_framework import validators
  8 | from uuid import uuid4
  9 | from django.utils.text import slugify
 10 | import hyperlink
 11 | from django.db.models import Min, Max
 12 | from django.db.models import OuterRef, Subquery
 13 | from django.db.models import F
 14 | from django.utils import timezone
 15 | 
 16 | POST_DESCRIPTION_MAX_LENGTH = 2 * 1024 * 1024 # 2MiB
 17 | FEED_DESCRIPTION_MAX_LENGTH = 10*1024 # 10KiB
 18 | 
 19 | class JobState(models.TextChoices):
 20 |     PENDING    = "pending"
 21 |     RUNNING    = "running"
 22 |     SUCCESS    = "success"
 23 |     CANCELLED  = "cancelled"
 24 |     FAILED     = "failed"
 25 | 
 26 | class FeedType(models.TextChoices):
 27 |     RSS = "rss"
 28 |     ATOM = "atom"
 29 |     SKELETON = "skeleton"
 30 |     SEARCH_INDEX = "search_index"
 31 | 
 32 | # Create your models here.
 33 | 
 34 | class SlugField(models.CharField):
 35 |     def get_prep_value(self, value):
 36 |         return slugify(str(value))
 37 | 
 38 | class Category(models.Model):
 39 |     name = SlugField(max_length=1000, primary_key=True)
 40 | 
 41 | 
 42 | def stix_id(url):
 43 |     return uuid.uuid5(uuid.UUID(str(history4feed_server_settings.HISTORY4FEED_NAMESPACE)), url)
 44 | 
 45 | def normalize_url(url):
 46 |     try:
 47 |         u = hyperlink.parse(url)
 48 |         return u.normalize(url).to_text()
 49 |     except Exception as e:
 50 |         raise validators.ValidationError(f"URL normalization failed")
 51 |     
 52 | AUTO_TITLE_TRAIL = "%^%*(%"
 53 | def title_as_string(value: str):
 54 |     if value.endswith(AUTO_TITLE_TRAIL):
 55 |         value = value[:-len(AUTO_TITLE_TRAIL)]
 56 |     return value
 57 | 
 58 | class Feed(models.Model):
 59 |     id = models.UUIDField(primary_key=True, help_text="UUID of feed generated by history4feed")
 60 |     title = models.CharField(max_length=1000, help_text="found in the <channel> of RSS output. Is always kept up to date with the latest feed import values for this property.")
 61 |     description = models.CharField(max_length=FEED_DESCRIPTION_MAX_LENGTH, help_text="found in the <channel> of RSS output. Is always kept up to date with the latest feed import values for this property.", null=True, default=None)
 62 |     url = models.URLField(max_length=1000, unique=True, help_text=dedent("""
 63 |         The URL of the RSS or ATOM feed
 64 | 
 65 |         Note this will be validated to ensure the feed is in the correct format.
 66 |     """), validators=[normalize_url])
 67 |     earliest_item_pubdate = models.DateTimeField(null=True, help_text="pubdate of earliest post")
 68 |     latest_item_pubdate = models.DateTimeField(null=True, help_text="pubdate of latest post")
 69 |     datetime_added = models.DateTimeField(auto_now_add=True, editable=False, help_text="date feed entry was added to database")
 70 |     datetime_modified = models.DateTimeField(default=None, null=True, help_text="date feed entry was edited in the database")
 71 |     feed_type = models.CharField(choices=FeedType.choices, max_length=12, null=False, editable=False, help_text="type of feed")
 72 |     pretty_url = models.URLField(max_length=1000, null=True, default=None)
 73 |     freshness = models.DateTimeField(null=True, default=None)
 74 | 
 75 |     def get_post_count(self):
 76 |         return self.posts.filter(deleted_manually=False).count()
 77 |     
 78 |     def save(self, *args, **kwargs) -> None:
 79 |         if not self.id:
 80 |             self.id = stix_id(self.url)
 81 |         self.earliest_item_pubdate, self.latest_item_pubdate = self.posts.aggregate(min=Min('pubdate'), max=Max('pubdate')).values()
 82 |         self.datetime_modified = self.datetime_modified or self.datetime_added
 83 |         return super().save(*args, **kwargs)
 84 |     
 85 |     def get_pretty_url(self):
 86 |         return self.pretty_url or self.url
 87 |     
 88 |     def set_title(self, title):
 89 |         if not self.title or self.title.endswith(AUTO_TITLE_TRAIL):
 90 |             self.title = title + AUTO_TITLE_TRAIL
 91 |     
 92 |     def set_description(self, description):
 93 |         if not self.description or self.description.endswith(AUTO_TITLE_TRAIL):
 94 |             self.description = description + AUTO_TITLE_TRAIL
 95 | 
 96 | class Job(models.Model):
 97 |     id = models.UUIDField(primary_key=True, default=uuid4, help_text="UUID of job")
 98 |     state = models.CharField(choices=JobState.choices, max_length=12, default=JobState.PENDING, null=False, help_text="state of the job")
 99 |     run_datetime = models.DateTimeField(auto_now_add=True, editable=False, help_text="time job was executed")
100 |     earliest_item_requested = models.DateTimeField(null=True, help_text="shows the earliest time for posts requested. Useful for when jobs are run to see if the time range it runs across is expected")
101 |     latest_item_requested = models.DateTimeField(null=True, help_text="shows the latest time for posts requested")
102 |     feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
103 |     info = models.CharField(max_length=FEED_DESCRIPTION_MAX_LENGTH, help_text="contains a useful summary of the job (e.g. number of posts retrieved, errors logged)")
104 |     include_remote_blogs = models.BooleanField(default=False)
105 | 
106 |     def urls(self):
107 |         retval = {}
108 |         ft_job: FulltextJob = None
109 |         for ft_job in self.fulltext_jobs.all():
110 |             retval[ft_job.status] = retval.get(ft_job.status, [])
111 |             retval[ft_job.status].append(dict(url=ft_job.link, id=ft_job.post_id))
112 |         return retval
113 |     
114 |     def should_skip_post(self, post_link: str):
115 |         return (not self.include_remote_blogs) and urlparse(self.feed.url).hostname.split('.')[-2:] != urlparse(post_link).hostname.split('.')[-2:]
116 |     
117 |     def cancel(self):
118 |         if self.state in [JobState.PENDING, JobState.RUNNING]:
119 |             self.state = JobState.CANCELLED
120 |         self.save()
121 |         return
122 |     
123 |     def is_cancelled(self):
124 |         return self.state == JobState.CANCELLED
125 | 
126 | 
127 | class FullTextState(models.TextChoices):
128 |     RETRIEVED  = "retrieved"
129 |     SKIPPED    = "skipped"
130 |     CANCELLED  = "cancelled"
131 |     FAILED     = "failed"
132 |     RETRIEVING = "retrieving"
133 | 
134 | class Post(models.Model):
135 |     id = models.UUIDField(primary_key=True, help_text="UUID of items generated by history4feed")
136 |     datetime_added = models.DateTimeField(auto_now_add=True, editable=False)
137 |     datetime_updated = models.DateTimeField(auto_now=True)
138 |     title = models.CharField(max_length=1000, help_text="found in the <item> element of feed output")
139 |     description = models.CharField(max_length=POST_DESCRIPTION_MAX_LENGTH, blank=True, help_text="found in the <item> element of feed output")
140 |     link = models.URLField(max_length=1000, help_text="link to full article. found in the <item> element of feed output", validators=[normalize_url])
141 |     pubdate = models.DateTimeField(help_text="date of publication.")
142 |     author = models.CharField(max_length=1000, help_text="author of the post", null=True, blank=True)
143 |     categories = models.ManyToManyField(Category, related_name="posts", help_text="categories of the post", blank=True)
144 |     feed = models.ForeignKey(Feed, on_delete=models.CASCADE, related_name="posts", help_text="feed id this item belongs too")
145 |     is_full_text = models.BooleanField(default=False, help_text="if full text has been retrieved")
146 |     content_type = models.CharField(default="plain/text", max_length=200, help_text="content type of the description")
147 |     added_manually = models.BooleanField(default=False)
148 |     deleted_manually = models.BooleanField(default=False, help_text="this post is hidden from user")
149 | 
150 |     class  Meta:
151 |         constraints = [
152 |             models.UniqueConstraint(fields=["link", "feed"], name="unique_link_by_feed"),
153 |         ]
154 | 
155 |     def add_categories(self, categories):
156 |         categories = [Category.objects.get_or_create(name=name)[0] for name in categories]
157 |         self.categories.set(categories)
158 | 
159 | 
160 |     def save(self, *args, **kwargs) -> None:
161 |         if not self.id:
162 |             pubdate = self.pubdate.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
163 |             self.id = stix_id(f"{self.feed.id}+{self.link}+{pubdate}")
164 |         return super().save(*args, **kwargs)
165 |     
166 |     @classmethod
167 |     def visible_posts(cls):
168 |         return cls.objects.filter(deleted_manually=False).annotate(
169 |             last_job_id=Subquery(
170 |                 FulltextJob.objects.filter(
171 |                     post_id=OuterRef('pk')
172 |                 ).order_by('-job__run_datetime')  # Ordering by publication date to get the latest book
173 |                 .values('job__id')[:1]  # We take only the first (most recent) book
174 |             )
175 |         )
176 | 
177 | class FulltextJob(models.Model):
178 |     post = models.ForeignKey(Post, on_delete=models.SET_NULL, null=True, related_name="fulltext_jobs")
179 |     job = models.ForeignKey(Job, related_name="fulltext_jobs", on_delete=models.CASCADE)
180 |     status = models.CharField(max_length=15, choices=FullTextState.choices, default=FullTextState.RETRIEVING)
181 |     error_str = models.CharField(max_length=1500, null=True, blank=True)
182 |     link = models.CharField(max_length=1500)
183 | 
184 | 
185 |         
186 |     def is_cancelled(self):
187 |         return self.job.state == JobState.CANCELLED


--------------------------------------------------------------------------------
/history4feed/app/openapi_params.py:
--------------------------------------------------------------------------------
 1 | from drf_spectacular.utils import OpenApiParameter, OpenApiResponse, OpenApiExample
 2 | from drf_spectacular.types import OpenApiTypes
 3 | from textwrap import dedent
 4 | from .serializers import PostSerializer
 5 | 
 6 | 
 7 | FEED_ID_PARAM = OpenApiParameter(
 8 |     "feed_id",
 9 |     type=OpenApiTypes.UUID,
10 |     description="The ID of the Feed. You can search for Feed IDs using the GET Feeds endpoints. e.g. `6c6e6448-04d4-42a3-9214-4f0f7d02694e`",
11 |     location=OpenApiParameter.PATH,
12 | )
13 | JOB_ID_PARAM = OpenApiParameter(
14 |     "job_id",
15 |     type=OpenApiTypes.UUID,
16 |     description="The ID of the Job. You can search for Job IDs using the GET Jobs endpoints. e.g. `7db25a55-55e4-4bc5-b189-3e2ca4e304e5`",
17 |     location=OpenApiParameter.PATH,
18 | )
19 | POST_ID_PARAM = OpenApiParameter(
20 |     "post_id",
21 |     type=OpenApiTypes.UUID,
22 |     description="The ID of the Post. You can search for Post IDs using the GET Posts endpoints for a specific Feed. e.g. `797e94b1-efdc-4e66-a748-f2b6a5896a89`",
23 |     location=OpenApiParameter.PATH,
24 | )
25 | 
26 | 
27 | XML_RESPONSE = OpenApiResponse(
28 |     response=PostSerializer(many=True),
29 |     description="",
30 |     examples=[
31 |         OpenApiExample(
32 |             "xml",
33 |             value=dedent(
34 |                 """
35 |             <?xml version="1.0" ?>
36 |             <rss version="2.0">
37 |                 <channel>
38 |                 <title>Example CTI Blog</title>
39 |                 <description>
40 |                 </description>
41 |                 <link>https://cti.example.com/feed/</link>
42 |                 <lastBuildDate>2024-07-02T17:07:31+00:00</lastBuildDate>
43 |                 <item>
44 |                     <title>DNS Probing Operation</title>
45 |                     <link href="https://cti.example.com/blog/dns-probing-operation/">https://cti.example.com/blog/dns-probing-operation/</link>
46 |                     <pubDate>2024-06-03T15:00:52+00:00</pubDate>
47 |                     <description>&lt;html&gt;&lt;/html&gt;</description>
48 |                     <category>infoblox-threat-intel</category>
49 |                     <category>dns</category>
50 |                     <category>dns-intel</category>
51 |                     <category>dns-threat-intelligence</category>
52 |                     <category>malware</category>
53 |                     <author>
54 |                     <name>John Doe (Admin)</name>
55 |                     </author>
56 |                 </item>
57 |                 </channel>
58 |             </rss>
59 | """
60 |             ),
61 |         )
62 |     ],
63 | )
64 | 
65 | HTTP404_EXAMPLE = OpenApiExample("http-404", {"message": "resource not found", "code": 404})
66 | HTTP400_EXAMPLE = OpenApiExample("http-400", {"message": "request not understood", "code": 400})
67 | 


--------------------------------------------------------------------------------
/history4feed/app/serializers.py:
--------------------------------------------------------------------------------
  1 | from rest_framework import serializers, validators, exceptions
  2 | from .models import AUTO_TITLE_TRAIL, FEED_DESCRIPTION_MAX_LENGTH, Category, Feed, Post, Job, normalize_url, FeedType, title_as_string
  3 | from django.db import models as django_models
  4 | from django.utils.translation import gettext_lazy as _
  5 | 
  6 | class TitleField(serializers.CharField):
  7 |     def to_internal_value(self, data):
  8 |         return super().to_internal_value(data)
  9 |     def to_representation(self, value):
 10 |         return title_as_string(super().to_representation(value))
 11 |     
 12 | class InvalidFeed(exceptions.APIException):
 13 |     status_code = 406
 14 | 
 15 | class FeedSerializer(serializers.ModelSerializer):
 16 |     count_of_posts = serializers.IntegerField(source='get_post_count', read_only=True, help_text="Number of posts in feed")
 17 |     include_remote_blogs = serializers.BooleanField(write_only=True, default=False)
 18 |     pretty_url = serializers.URLField(allow_null=True, required=False, help_text="This is a cosmetic URL. It is designed to show the actual blog link to browse to in a web browser (not the feed)")
 19 |     title = TitleField(required=False, max_length=256, allow_null=True, allow_blank=True)
 20 |     description = TitleField(required=False, max_length=FEED_DESCRIPTION_MAX_LENGTH, allow_null=True, allow_blank=True)
 21 |     use_search_index = serializers.BooleanField(default=False, write_only=True, help_text="should use search index instead")
 22 |     class Meta:
 23 |         model = Feed
 24 |         # fields = '__all__'
 25 |         exclude = ['freshness']
 26 |         read_only_fields = ['id', 'earliest_item_pubdate', 'latest_item_pubdate', 'datetime_added', "datetime_modified"]
 27 | 
 28 |     def create(self, validated_data: dict):
 29 |         validated_data = validated_data.copy()
 30 |         validated_data.pop('include_remote_blogs', None)
 31 |         validated_data.pop('use_search_index', None)
 32 |         return super().create(validated_data)
 33 |     
 34 | class SkeletonFeedSerializer(FeedSerializer):
 35 |     include_remote_blogs = None
 36 |     use_search_index = None
 37 |     title = serializers.CharField(required=True, help_text="title of feed")
 38 |     description = serializers.CharField(required=False, help_text="description of feed", allow_blank=True)
 39 |     feed_type = serializers.HiddenField(default=FeedType.SKELETON)
 40 |     
 41 | class SearchIndexFeedSerializer(FeedSerializer):
 42 |     title = serializers.CharField(required=True, help_text="title of feed")
 43 |     description = serializers.CharField(required=True, help_text="description of feed")
 44 |     feed_type = serializers.HiddenField(default=FeedType.SEARCH_INDEX)
 45 | 
 46 | 
 47 | class FeedCreatedJobSerializer(FeedSerializer):
 48 |     job_id = serializers.UUIDField(read_only=True, help_text="only returns with POST /feeds/")
 49 |     job_state = serializers.CharField(read_only=True, help_text="only returns with POST /feeds/")
 50 |     
 51 | 
 52 | class PostListSerializer(serializers.ListSerializer):
 53 |     child = None
 54 | 
 55 |     @property
 56 |     def feed_id(self):
 57 |         return self.context.get('feed_id')
 58 |     
 59 | 
 60 |     def run_child_validation(self, data):
 61 |         """
 62 |         Run validation on child serializer.
 63 |         You may need to override this method to support multiple updates. For example:
 64 | 
 65 |         self.child.instance = self.instance.get(pk=data['id'])
 66 |         self.child.initial_data = data
 67 |         return super().run_child_validation(data)
 68 |         """
 69 |         data.setdefault('feed', self.feed_id)
 70 |         return self.child.run_validation(data)
 71 |     
 72 |     def create(self, validated_data: list[dict]):
 73 |         instances = []
 74 |         for attrs in validated_data:
 75 |             feed_id = attrs.setdefault('feed_id', self.feed_id)
 76 |             instance = None
 77 |             try:
 78 |                 instance = Post.objects.get(feed_id=feed_id, link=attrs['link'])
 79 |             except:
 80 |                 pass
 81 |             if instance:
 82 |                 instance = self.child.update(instance, attrs)
 83 |             else:
 84 |                 instance = self.child.create(attrs)
 85 | 
 86 |             instances.append(instance)
 87 |         return instances
 88 | 
 89 | class PostSerializer(serializers.ModelSerializer):
 90 |     # categories = serializers.ManyRelatedField()
 91 |     class Meta:
 92 |         list_serializer_class = PostListSerializer
 93 |         model = Post
 94 |         exclude = ['feed', 'deleted_manually']
 95 |         read_only_fields = ["id", "datetime_updated", "datetime_added", "description", "is_full_text", "content_type", "added_manually"]
 96 |         
 97 |     
 98 |     def run_validation(self, data=...):
 99 |         if categories := data.get('categories'):
100 |             data['categories'] = [Category.objects.get_or_create(name=name)[0].name for name in categories]
101 |         return super().run_validation(data)
102 |     
103 | class PostWithFeedIDSerializer(PostSerializer):
104 |     feed_id = serializers.UUIDField()
105 | 
106 | class PatchSerializer(serializers.Serializer):
107 |     pass
108 | 
109 | class FeedPatchSerializer(serializers.ModelSerializer):
110 |     title = serializers.CharField(required=True, help_text="title of feed")
111 |     description = serializers.CharField(required=True, help_text="description of feed")
112 | 
113 |     class Meta:
114 |         model = Feed
115 |         fields = ['title', 'description', 'pretty_url']
116 | 
117 | class FeedFetchSerializer(FeedPatchSerializer, FeedSerializer):
118 |     class Meta:
119 |         model = Feed
120 |         fields = ['include_remote_blogs']
121 | 
122 | class PostCreateSerializer(PostSerializer):
123 |     link = serializers.URLField(validators=[normalize_url])
124 |     class feed_class(serializers.HiddenField):
125 |         def get_default(self):
126 |             return self.context.get('feed_id')
127 |     feed_id = feed_class(default=None)
128 |         
129 |     class Meta:
130 |         list_serializer_class = PostListSerializer
131 |         model = Post
132 |         fields = ["title", "link", "pubdate", "author", "categories", "feed_id"]
133 |         validators = [
134 |             validators.UniqueTogetherValidator(
135 |                 queryset=Post.visible_posts(),
136 |                 fields=('feed_id', 'link'),
137 |                 message='Post with link already exists in feed.',
138 |             )
139 |         ]
140 | 
141 | class PostPatchSerializer(PostSerializer):
142 |     class Meta:
143 |         model = Post
144 |         fields = ["title", "pubdate", "author", "categories"]
145 | 
146 |     
147 | class CreatePostsSerializer(serializers.Serializer):
148 |     posts = PostCreateSerializer(many=True, allow_empty=False)
149 | 
150 |     def create(self, validated_data):
151 |         posts = [{**post, **self.save_kwargs} for post in validated_data["posts"]]
152 |         
153 |         return self.fields['posts'].create(posts)
154 |     
155 |     def save(self, **kwargs):
156 |         self.save_kwargs = kwargs
157 |         return super().save(**kwargs)
158 | 
159 | 
160 | 
161 | class JobUrlStatusSerializer(serializers.Serializer):
162 |     class joburlstatus(serializers.Serializer):
163 |         url = serializers.URLField()
164 |         id = serializers.UUIDField()
165 |     retrieved = joburlstatus(many=True, default=[])
166 |     retrieving = joburlstatus(many=True, default=[])
167 |     skipped = joburlstatus(many=True, default=[])
168 |     failed = joburlstatus(many=True, default=[])
169 |     cancelled = joburlstatus(many=True, default=[])
170 | 
171 | class JobSerializer(serializers.ModelSerializer):
172 |     count_of_items = serializers.IntegerField(read_only=True)
173 |     feed_id = serializers.UUIDField(read_only=True, source='feed.id')
174 |     urls = JobUrlStatusSerializer()
175 |     class Meta:
176 |         model = Job
177 |         # fields = '__all__'
178 |         exclude = ['feed']
179 | 
180 | class PostJobSerializer(JobSerializer):
181 |     pass
182 | 


--------------------------------------------------------------------------------
/history4feed/app/settings.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime, timezone
 2 | from typing import Any, Dict, get_type_hints
 3 | import uuid
 4 | 
 5 | from django.conf import settings
 6 | from rest_framework.settings import APISettings, perform_import, api_settings
 7 | 
 8 | H4F_DEFAULTS: dict[str, any] = {
 9 |     'SCRAPFLY_KEY': '',
10 |     'WAYBACK_SLEEP_SECONDS': 20,
11 |     'EARLIEST_SEARCH_DATE': datetime(2020, 1, 1, tzinfo=timezone.utc),
12 |     'REQUEST_RETRY_COUNT': 3,
13 |     'HISTORY4FEED_NAMESPACE': uuid.UUID("6c6e6448-04d4-42a3-9214-4f0f7d02694e"),
14 |     "BRAVE_SEARCH_API_KEY": None
15 | }
16 | 
17 | IMPORT_STRINGS = [
18 | ]
19 | 
20 | class History4FeedServerSettings(APISettings):
21 |     SCRAPFLY_KEY: str
22 |     WAYBACK_SLEEP_SECONDS: int
23 |     EARLIEST_SEARCH_DATE: datetime
24 |     REQUEST_RETRY_COUNT: int
25 |     HISTORY4FEED_NAMESPACE : str|uuid.UUID
26 |     BRAVE_SEARCH_API_KEY: str
27 | 
28 | history4feed_server_settings = History4FeedServerSettings(
29 |     user_settings=getattr(settings, 'HISTORY4FEED_SETTINGS', {}),  # type: ignore
30 |     defaults=H4F_DEFAULTS,  # type: ignore
31 |     import_strings=IMPORT_STRINGS,
32 | )


--------------------------------------------------------------------------------
/history4feed/app/tests.py:
--------------------------------------------------------------------------------
1 | from django.test import TestCase
2 | 
3 | # Create your tests here.
4 | 


--------------------------------------------------------------------------------
/history4feed/app/utils.py:
--------------------------------------------------------------------------------
 1 | from rest_framework import pagination, response, renderers
 2 | from rest_framework.filters import OrderingFilter, BaseFilterBackend
 3 | from django.utils.encoding import force_str
 4 | from django.db.models import Q
 5 | from datetime import datetime, UTC
 6 | import typing
 7 | from dogesec_commons.utils import Pagination, Ordering
 8 | from django.utils import timezone
 9 | from django.forms import DateTimeField
10 | from django_filters.rest_framework import filters
11 | 
12 | class DatetimeFieldUTC(DateTimeField):
13 |     def to_python(self, value):
14 |         value = super().to_python(value) 
15 |         return value and value.astimezone(UTC)
16 |     
17 | class DatetimeFilter(filters.Filter):
18 |     field_class = DatetimeFieldUTC
19 | 
20 | class MinMaxDateFilter(BaseFilterBackend):
21 |     min_val = datetime.min
22 |     max_value = datetime.max
23 |     def get_fields(self, view):
24 |         out = {}
25 |         fields = getattr(view, 'minmax_date_fields', [])
26 |         if not isinstance(fields, list):
27 |             return out
28 |         for field in fields:
29 |             out[f"{field}_max"] = field
30 |             out[f"{field}_min"] = field
31 |         return out
32 |     
33 |     def parse_date(self, value):
34 |         return DatetimeFieldUTC().to_python(value)
35 | 
36 |     def filter_queryset(self, request, queryset, view):
37 |         valid_fields = self.get_fields(view)
38 |         valid_params = [(k, v) for k, v in request.query_params.items() if k in valid_fields]
39 |         queries =  {}
40 |         for param, value in valid_params:
41 |             field_name = valid_fields[param]
42 |             if param.endswith('_max'):
43 |                 queries[f"{field_name}__lte"] = self.parse_date(value)
44 |             else:
45 |                 queries[f"{field_name}__gte"] = self.parse_date(value)
46 |         return queryset.filter(Q(**queries))
47 | 
48 |     def get_schema_operation_parameters(self, view):
49 |         parameters = []
50 |         valid_fields = self.get_fields(view)
51 |         for query_name, field_name in valid_fields.items():
52 |             _type = "Maximum"
53 |             if query_name.endswith('min'):
54 |                 _type = "Minimum"
55 |             parameter = {
56 |                 'name': query_name,
57 |                 'required': False,
58 |                 'in': 'query',
59 |                 'description': f"{_type} value of `{field_name}` to filter by in format `YYYY-MM-DD`.",
60 |                 'schema': {
61 |                     'type': 'string', 'format': 'date',
62 |                 },
63 |             }
64 |             parameters.append(parameter)
65 |         return parameters
66 | 
67 | 
68 | 
69 | # use pagination to modify how xml/rss renders
70 | class XMLPostPagination(Pagination):
71 |     def get_paginated_response_schema(self, schema):
72 |         return {
73 |             'type': 'string',
74 |             'example': '<?xml version="1.0" ?>'
75 |         }
76 |     
77 |     def get_paginated_response(self, data):
78 |         return response.Response(data, headers={
79 |             'rss_page_size': self.get_page_size(self.request),
80 |             'rss_page_number': self.page.number,
81 |             'rss_page_results_count': len(self.page),
82 |             'rss_total_results_count': self.page.paginator.count,
83 |         }, content_type="application/rss+xml; charset=UTF-8")
84 |     
85 |     def get_schema_operation_parameters(self, view):
86 |         return super().get_schema_operation_parameters(view)
87 | 
88 | class RSSRenderer(renderers.BaseRenderer):
89 |     media_type = "application/rss+xml"
90 |     format = "xml"
91 | 
92 |     def render(self, data, accepted_media_type=None, renderer_context=None):
93 |         return data


--------------------------------------------------------------------------------
/history4feed/app/views.py:
--------------------------------------------------------------------------------
  1 | from django.shortcuts import get_object_or_404
  2 | 
  3 | from .autoschema import H4FSchema
  4 | 
  5 | from .openapi_params import (
  6 |     HTTP400_EXAMPLE,
  7 |     HTTP404_EXAMPLE,
  8 |     JOB_ID_PARAM,
  9 |     FEED_ID_PARAM,
 10 |     POST_ID_PARAM,
 11 |     XML_RESPONSE,
 12 | )
 13 | from .utils import (
 14 |     DatetimeFilter,
 15 |     Ordering,
 16 |     Pagination,
 17 |     MinMaxDateFilter,
 18 |     RSSRenderer,
 19 |     XMLPostPagination,
 20 | )
 21 | from dogesec_commons.utils.serializers import CommonErrorSerializer
 22 | # from .openapi_params import FEED_PARAMS, POST_PARAMS
 23 | 
 24 | from .serializers import CreatePostsSerializer, FeedCreatedJobSerializer, FeedFetchSerializer, FeedPatchSerializer, PostPatchSerializer, PostWithFeedIDSerializer, SearchIndexFeedSerializer, SkeletonFeedSerializer, PatchSerializer, PostJobSerializer, PostSerializer, FeedSerializer, JobSerializer, PostCreateSerializer
 25 | from .models import AUTO_TITLE_TRAIL, FulltextJob, JobState, Post, Feed, Job, FeedType
 26 | from rest_framework import (
 27 |     viewsets,
 28 |     request,
 29 |     response,
 30 |     mixins,
 31 |     decorators,
 32 |     renderers,
 33 |     pagination,
 34 |     status,
 35 |     validators,
 36 | )
 37 | from django.http import HttpResponse
 38 | from ..h4fscripts import h4f, task_helper, build_rss
 39 | from drf_spectacular.utils import (
 40 |     extend_schema,
 41 |     extend_schema_view,
 42 |     OpenApiResponse,
 43 |     OpenApiExample,
 44 | )
 45 | from drf_spectacular.types import OpenApiTypes
 46 | from django_filters.rest_framework import (
 47 |     DjangoFilterBackend,
 48 |     FilterSet,
 49 |     Filter,
 50 |     BaseCSVFilter,
 51 |     UUIDFilter,
 52 |     BaseInFilter,
 53 |     filters,
 54 | )
 55 | from django.db.models import Count, Q, Subquery, OuterRef
 56 | from datetime import datetime
 57 | import textwrap
 58 | from django.utils import timezone
 59 | 
 60 | from history4feed.app import serializers
 61 | 
 62 | from history4feed.app import utils
 63 | 
 64 | from drf_spectacular.views import SpectacularAPIView
 65 | 
 66 | class SchemaViewCached(SpectacularAPIView):
 67 |     _schema = None
 68 |     
 69 |     def _get_schema_response(self, request):
 70 |         version = self.api_version or request.version or self._get_version_parameter(request)
 71 |         if not self.__class__._schema:
 72 |             generator = self.generator_class(urlconf=self.urlconf, api_version=version, patterns=self.patterns)
 73 |             self.__class__._schema = generator.get_schema(request=request, public=self.serve_public)
 74 |         return response.Response(
 75 |             data=self.__class__._schema,
 76 |             headers={"Content-Disposition": f'inline; filename="{self._get_filename(request, version)}"'}
 77 |         )
 78 | 
 79 | class Response(response.Response):
 80 |     DEFAULT_HEADERS = {
 81 |         "Access-Control-Allow-Origin": "*",
 82 |     }
 83 |     CONTENT_TYPE = "application/json"
 84 | 
 85 |     def __init__(
 86 |         self,
 87 |         data=None,
 88 |         status=None,
 89 |         template_name=None,
 90 |         headers=None,
 91 |         exception=False,
 92 |         content_type=CONTENT_TYPE,
 93 |     ):
 94 |         headers = headers or {}
 95 |         headers.update(self.DEFAULT_HEADERS)
 96 |         super().__init__(data, status, template_name, headers, exception, content_type)
 97 | 
 98 | 
 99 | class ErrorResp(Response):
100 |     def __init__(self, status, title, details=None):
101 |         super().__init__({"message": title, "code": status}, status=status)
102 | 
103 | 
104 | # Create your views here.
105 | 
106 | @extend_schema_view(
107 |     retrieve=extend_schema(
108 |         summary="Get a Post",
109 |         description=textwrap.dedent(
110 |             """
111 |             This will return a single Post by its ID. It is useful if you only want to get the data for a single entry.
112 |             """
113 |         ),
114 |         responses={
115 |             200: PostWithFeedIDSerializer,
116 |             404: OpenApiResponse(CommonErrorSerializer, "Post not found", examples=[HTTP404_EXAMPLE]),
117 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
118 |         },
119 |     ),
120 |     list=extend_schema(
121 |         summary="Search for Posts",
122 |         description=textwrap.dedent(
123 |             """
124 |             Search through Posts from all Blogs. Filter by the ones you're interested in.
125 |             """
126 |         ),
127 |         responses={
128 |             200: PostWithFeedIDSerializer,
129 |             404: OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]),
130 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
131 |         },
132 |     ),
133 |     destroy=extend_schema(
134 |         summary="Delete a Post by ID",
135 |        description=textwrap.dedent(
136 |             """
137 |             This will delete the post inside of the feed. Deleting the post will remove it forever and it will not be reindexed on subsequent feed updates. The only way to re-index it is to add it manually.
138 |             """
139 |         ),
140 |     ),
141 |     reindex=extend_schema(
142 |         summary="Update a Post in a Feed",
143 |         description=textwrap.dedent(
144 |             """
145 |              When blog posts are modified, the RSS or ATOM feeds or search results are not often updated with the new modification time. As such, fetching for blog will cause these updated posts to be missed.
146 | 
147 |             To ensure the post stored in the database matches the one currently published you can make a request to this endpoint using the Post ID to update it.
148 | 
149 |             This update will only change the content (`description`) stored for the Post. It will not update the `title`, `pubdate`, `author`, or `categories`. If you need to update these properties you can use the Update Post Metadata endpoint.
150 | 
151 |             **IMPORTANT**: This action will delete the original post as well as all the STIX SDO and SRO objects created during the processing of the original text. Mostly this is not an issue, however, if the post has been removed at source you will end up with an empty entry for this Post.
152 | 
153 |             The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint.
154 |             """
155 |         ),
156 |         responses={
157 |             201: PostJobSerializer,
158 |             404: OpenApiResponse(CommonErrorSerializer, "post does not exist", examples=[HTTP404_EXAMPLE]),
159 |         },
160 |         request=PatchSerializer,
161 |     ),
162 |     partial_update=extend_schema(
163 |         summary="Update a Posts Metadata",
164 |         description=textwrap.dedent(
165 |             """
166 |             In most cases, the automatically indexed metadata (or user submitted metadata in the case of manually added Posts) will be fine.
167 | 
168 |             However, these may be occasions you want to change the values of the `title`, `pubdate`, `author`, or `categories` for a Post.
169 | 
170 |             The following key/values are accepted in the body of the request:
171 | 
172 |             * `pubdate` (required): The date of the blog post in the format `YYYY-MM-DD`. history4feed cannot accurately determine a post date in all cases, so you must enter it manually.
173 |             * `title` (required):  history4feed cannot accurately determine the title of a post in all cases, so you must enter it manually.
174 |             * `author` (optional): the value to be stored for the author of the post.
175 |             * `categories` (optional) : the value(s) to be stored for the category of the post. Pass as a list like `["tag1","tag2"]`.
176 | 
177 |             Only one key/value is required. If no values are passed, they will be remain unchanged from the current state.
178 | 
179 |             It is not possible to manually modify any other values for the Post object. You can update the post content using the Update a Post in A Feed endpoint.
180 |             """
181 |         ),
182 |         responses={
183 |             201: PostSerializer,
184 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
185 |             404: OpenApiResponse(CommonErrorSerializer, "post does not exist", examples=[HTTP404_EXAMPLE]),
186 |         },
187 |         request=PostPatchSerializer,
188 |     ),
189 | 
190 | )
191 | class PostOnlyView(mixins.RetrieveModelMixin, mixins.ListModelMixin, viewsets.GenericViewSet):
192 |     openapi_path_params = [POST_ID_PARAM]
193 |     openapi_tags = ["Posts"]
194 |     serializer_class = PostWithFeedIDSerializer
195 |     lookup_url_kwarg = "post_id"
196 |     pagination_class = Pagination("posts")
197 |     filter_backends = [DjangoFilterBackend, Ordering, MinMaxDateFilter]
198 |     ordering_fields = ["pubdate", "title", "datetime_updated", "datetime_added"]
199 |     ordering = "pubdate_descending"
200 |     minmax_date_fields = ["pubdate"]
201 | 
202 |     class filterset_class(FilterSet):
203 |         feed_id = filters.BaseInFilter(help_text="Filter the results by one or more `feed_id`(s). e.g. `3f388179-4683-4495-889f-690c5de3ae7c`")
204 |         title = Filter(
205 |             help_text="Filter the content by the `title` of the post. Will search for titles that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.",
206 |             lookup_expr="icontains",
207 |         )
208 |         description = Filter(
209 |             help_text="Filter by the content post `description`. Will search for descriptions that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.",
210 |             lookup_expr="icontains",
211 |         )
212 |         link = Filter(
213 |             help_text="Filter the content by a posts `link`. Will search for links that contain the value entered. Search is wildcard so `dogesec` will return any URL that contains the string `dogesec`.",
214 |             lookup_expr="icontains",
215 |         )
216 |         job_id = Filter(help_text="Filter the results by the Job ID the Post was downloaded or updated in. e.g. `6606bd0c-9d9d-4ffd-81bb-81c9196ccfe6`", field_name="fulltext_jobs__job_id")
217 |         job_state = filters.ChoiceFilter(choices=JobState.choices, help_text="Filter by job status")
218 |         updated_after = DatetimeFilter(help_text="Only show posts with a `datetime_updated` after the time specified. It must be in `YYYY-MM-DD HH:MM[:ss[.uuuuuu]][TZ]`, e.g. `2020-01-01 00:00`", field_name="datetime_updated", lookup_expr="gt")
219 | 
220 |     def get_queryset(self):
221 |         return Post.visible_posts() \
222 |                 .annotate(job_state=Subquery(Job.objects.filter(pk=OuterRef('last_job_id')).values('state')[:1]))
223 |     
224 |     def partial_update(self, request, *args, **kwargs):
225 |         instance = self.get_object()
226 |         serializer = PostPatchSerializer(instance, data=request.data, partial=True)
227 |         serializer.is_valid(raise_exception=True)
228 |         serializer.save()
229 | 
230 |         if getattr(instance, '_prefetched_objects_cache', None):
231 |             # If 'prefetch_related' has been applied to a queryset, we need to
232 |             # forcibly invalidate the prefetch cache on the instance.
233 |             instance._prefetched_objects_cache = {}
234 | 
235 |         s = self.get_serializer(instance)
236 |         return Response(s.data, status=status.HTTP_201_CREATED)
237 |     
238 |     @decorators.action(detail=True, methods=['PATCH'])
239 |     def reindex(self, request, *args, **kwargs):
240 |         post, job_obj = self.new_reindex_post_job(request)
241 |         job_resp = JobSerializer(job_obj).data.copy()
242 |         job_resp.update(post_id=post.id)
243 |         return Response(job_resp, status=status.HTTP_201_CREATED)
244 | 
245 |     def new_reindex_post_job(self, request):
246 |         s = PatchSerializer(data=request.data)
247 |         s.is_valid(raise_exception=True)
248 |         post: Post = self.get_object()
249 |         job_obj = task_helper.new_patch_posts_job(post.feed, [post])
250 |         return post, job_obj
251 | 
252 |     def destroy(self, *args, **kwargs):
253 |         obj = self.get_object()
254 |         obj.deleted_manually = True
255 |         obj.save()
256 |         obj.feed.save()
257 |         return Response(None, status=status.HTTP_204_NO_CONTENT)
258 | 
259 | 
260 | 
261 | class FeedView(viewsets.ModelViewSet):
262 |     openapi_tags = ["Feeds"]
263 |     serializer_class = FeedSerializer
264 |     queryset = Feed.objects.all()
265 |     lookup_url_kwarg = "feed_id"
266 |     pagination_class = Pagination("feeds")
267 |     http_method_names = ["get", "post", "patch", "delete"]
268 | 
269 |     filter_backends = [DjangoFilterBackend, Ordering, MinMaxDateFilter]
270 |     ordering_fields = [
271 |         "datetime_added",
272 |         "title",
273 |         "url",
274 |         "count_of_posts",
275 |         "earliest_item_pubdate",
276 |         "latest_item_pubdate",
277 |     ]
278 |     ordering = ["-datetime_added"]
279 |     minmax_date_fields = ["earliest_item_pubdate", "latest_item_pubdate"]
280 | 
281 |     class filterset_class(FilterSet):
282 |         title = Filter(
283 |             help_text="Filter by the content in feed title. Will search for titles that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.",
284 |             lookup_expr="icontains",
285 |         )
286 |         description = Filter(
287 |             help_text="Filter by the content in feed description. Will search for descriptions that contain the value entered. Search is wildcard so `exploit` will match `exploited` and `exploits`.",
288 |             lookup_expr="icontains",
289 |         )
290 |         url = Filter(
291 |             help_text="Filter by the content in a feeds URL. Will search for URLs that contain the value entered. Search is wildcard so `google` will match `google.com` and `google.co.uk`.",
292 |             lookup_expr="icontains",
293 |         )
294 |         id = BaseCSVFilter(
295 |             help_text="Filter by feed id(s), comma-separated, e.g `6c6e6448-04d4-42a3-9214-4f0f7d02694e,2bce5b30-7014-4a5d-ade7-12913fe6ac36`",
296 |             lookup_expr="in",
297 |         )
298 |         feed_type = filters.MultipleChoiceFilter(
299 |             help_text="Filter by `feed_type`",
300 |             choices=FeedType.choices,
301 |         )
302 | 
303 | 
304 |     def get_queryset(self):
305 |         return Feed.objects.all().annotate(count_of_posts=Count("posts"))
306 | 
307 |     @extend_schema(
308 |         summary="Create a New Feed",
309 |         description=textwrap.dedent(
310 |             """
311 |             Use this endpoint to create to a new feed.
312 | 
313 |             The following key/values are accepted in the body of the request:
314 | 
315 |             * `url` (required): a valid RSS or ATOM feed URL (if `use_search_index` = `false`) OR the URL of the blog (if `use_search_index` = `true`).
316 |             * `include_remote_blogs` (required): is a boolean setting and will ask history4feed to ignore any feeds not on the same domain as the URL of the feed. Some RSS/ATOM feeds include remote posts from other sites (e.g. for a paid promotion). This setting (set to `false` allows you to ignore remote posts that do not use the same domain as the `url` used). Generally you should set `include_remote_blogs` to `false`. The one exception is when things like feed aggregators (e.g. Feedburner) URLs are used, where the actual blog posts are not on the `feedburner.com` (or whatever) domain. In this case `include_remote_blogs` should be set to `true`.
317 |             * `pretty_url` (optional): you can also include a secondary URL in the database. This is designed to be used to show the link to the blog (not the RSS/ATOM) feed so that a user can navigate to the blog in their browser.
318 |             * `title` (optional): the title of the feed will be used if not passed. You can also manually pass the title of the blog here.
319 |             * `description` (optional): the description of the feed will be used if not passed. You can also manually pass the description of the blog here.
320 |             * `use_search_index` (optional, default is `false`): If the `url` is not a valid RSS or ATOM feed you must set this mode to `true`. Set to `true` this mode uses search results that contain the base `url` passed vs. the RSS/ATOM feed entries (when this mode is set to `false`). This mode is only be able to index results in Google Search, so can miss some sites entirely where they are not indexed by Google. You must also pass a `title` and `description` when setting this mode to `true`. Note, you can use the skeleton endpoint to create a feed manually from a non RSS/ATOM URL or where search results do not satisfy your use case.
321 | 
322 |             The `id` of a Feed is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` and the value used is `<FEED_URL>` (e.g. `https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-encoded.xml` would have the id `d1d96b71-c687-50db-9d2b-d0092d1d163a`). Therefore, you cannot add a URL that already exists, you must first delete it to add it with new settings.
323 | 
324 |             Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` and the value used `<FEED_ID>+<POST_URL>+<POST_PUB_TIME (to .000000Z)>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`).
325 | 
326 |             The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint.
327 |             """
328 |         ),
329 |         responses={
330 |             201: FeedCreatedJobSerializer,
331 |             400: OpenApiResponse(CommonErrorSerializer, "Bad request", examples=[HTTP400_EXAMPLE]),
332 |             406: OpenApiResponse(CommonErrorSerializer, "Invalid feed url", examples=[OpenApiExample(name="http-406", value={"detail": "invalid feed url", "code": 406})]),
333 |         },
334 |         request=FeedSerializer,
335 |     )
336 |     def create(self, request: request.Request, **kwargs):
337 | 
338 |         job_obj = self.new_create_job(request)
339 |         resp_data = self.serializer_class(job_obj.feed).data.copy()
340 |         resp_data.update(
341 |             job_state=job_obj.state,
342 |             job_id=job_obj.id,
343 |         )
344 |         return Response(resp_data, status=status.HTTP_201_CREATED)
345 |     
346 |     def new_create_job(self, request: request.Request):
347 |         feed_data = {}
348 |         s = FeedSerializer(data=request.data)
349 |         s.is_valid(raise_exception=True)
350 |         if s.validated_data["use_search_index"]:
351 |             s = SearchIndexFeedSerializer(data=request.data)
352 |             s.is_valid(raise_exception=True)
353 |             feed_data.update(feed_type=FeedType.SEARCH_INDEX)
354 |         else:
355 |             try:
356 |                 feed_data = h4f.parse_feed_from_url(s.data["url"])
357 |             except Exception as e:
358 |                 raise serializers.InvalidFeed(s.data["url"])
359 | 
360 |         for k in ['title', 'description']:
361 |             if v := s.validated_data.get(k):
362 |                 feed_data[k] = v
363 |             elif v := feed_data.get(k):
364 |                 feed_data[k] = v + AUTO_TITLE_TRAIL
365 |         
366 |         s = FeedSerializer(data={**s.data, **feed_data})
367 |         s.is_valid(raise_exception=True)
368 | 
369 |         feed_obj: Feed = s.save(feed_type=feed_data['feed_type'])
370 |         job_obj = task_helper.new_job(feed_obj, s.validated_data.get('include_remote_blogs', False))
371 |         return job_obj
372 | 
373 |     @extend_schema(
374 |         summary="Create a New Skeleton Feed",
375 |         description=textwrap.dedent(
376 |             """
377 |             Sometimes it might be the case you want to curate a blog manually using various URLs from different blogs. This is what `skeleton` feeds are designed for, allowing you to create a skeleton feed and then add posts to it manually later on using the add post manually endpoint.
378 | 
379 |             The following key/values are accepted in the body of the request:
380 | 
381 |             * `url` (required): the URL to be attached to the feed. Needs to be a URL (because this is what feed ID is generated from), however does not need to be valid.
382 |             * `pretty_url` (optional): you can also include a secondary URL in the database. This is designed to be used to show the link to the blog (not the RSS/ATOM) feed so that a user can navigate to the blog in their browser.
383 |             * `title` (required): the title of the feed
384 |             * `description` (optional): the description of the feed
385 | 
386 |             The response will return the created Feed object with the Feed `id`.
387 |             """
388 |         ),
389 |         responses={
390 |             201: FeedSerializer,
391 |             400: OpenApiResponse(CommonErrorSerializer, "Bad request", examples=[HTTP400_EXAMPLE]),
392 |         },
393 |         request=SkeletonFeedSerializer,
394 |     )
395 |     @decorators.action(methods=['POST'], detail=False)
396 |     def skeleton(self, request: request.Request, **kwargs):
397 |         s = SkeletonFeedSerializer(data=request.data)
398 |         s.is_valid(raise_exception=True)
399 |         instance = s.save()
400 |         return Response(FeedSerializer(instance).data, status=status.HTTP_201_CREATED)
401 |     
402 |     @extend_schema(
403 |         parameters=[FEED_ID_PARAM],
404 |         summary="Update a Feeds Metadata",
405 |         request=FeedPatchSerializer,
406 |         description=textwrap.dedent(
407 |             """
408 |             Update the metadata of the Feed.
409 | 
410 |             Note, it is not possible to update the `url` of the feed. You must delete the Feed and add it again to modify the `url`.
411 | 
412 |             The following key/values are accepted in the body of the request:
413 | 
414 |             * `title` (optional): update the `title` of the Feed
415 |             * `description` (optional): update the `description` of the Feed
416 |             * `pretty_url` (optional): update the `pretty_url of the Feed
417 | 
418 |             Only one/key value is required in the request. For those not passed, the current value will remain unchanged.
419 | 
420 |             The response will contain the newly updated Feed object.
421 | 
422 |             Every time the feed is updated, the `datetime_modified` property in the Feed object will be updated accordingly.
423 |             """
424 |         ),
425 |         responses={
426 |             201: FeedSerializer,
427 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
428 |             (404, "application/json"): OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]),
429 |         },
430 |     )
431 |     def partial_update(self, request, *args, **kwargs):
432 |         feed_obj: Feed = self.get_object()
433 |         s = FeedPatchSerializer(feed_obj, data=request.data, partial=True)
434 |         s.is_valid(raise_exception=True)
435 |         s.save(datetime_modified=timezone.now())
436 |         return Response(self.serializer_class(feed_obj).data, status=status.HTTP_201_CREATED)
437 |     
438 |     @extend_schema(
439 |         parameters=[FEED_ID_PARAM],
440 |         summary="Fetch Updates for a Feed",
441 |         request=FeedFetchSerializer,
442 |         description=textwrap.dedent(
443 |             """
444 |             Use this endpoint to check for new posts on this blog since the last post time. An update request will immediately trigger a job to get the posts between `latest_item_pubdate` for feed and time you make a request to this endpoint.
445 | 
446 |             The following key/values are accepted in the body of the request:
447 | 
448 |              * `include_remote_blogs` (required): is a boolean setting and will ask history4feed to ignore any feeds not on the same domain as the URL of the feed. Some feeds include remote posts from other sites (e.g. for a paid promotion). This setting (set to `false` allows you to ignore remote posts that do not use the same domain as the `url` used). Generally you should set `include_remote_blogs` to `false`. The one exception is when things like feed aggregators (e.g. Feedburner) URLs are used, where the actual blog posts are not on the `feedburner.com` (or whatever) domain. In this case `include_remote_blogs` should be set to `true`.
449 | 
450 |             Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` (history4feed) and the value used `<FEED_ID>+<POST_URL>+<POST_PUB_TIME (to .000000Z)>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`).
451 | 
452 |             **IMPORTANT:** this request will fail if run against a Skeleton type feed. Skeleton feeds can only be updated by adding posts to them manually using the Manually Add a Post to a Feed endpoint.
453 | 
454 |             **IMPORTANT:** this endpoint can miss updates that have happened to currently indexed posts (where the RSS or ATOM feed or search results do not report the updated date correctly -- which is actually very common). To solve this issue for currently indexed blog posts, use the Update a Post in a Feed endpoint directly.
455 | 
456 |             The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint.
457 |             """
458 |         ),
459 |         responses={
460 |             201: FeedCreatedJobSerializer,
461 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
462 |             (404, "application/json"): OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]),
463 |         },
464 |     )
465 |     @decorators.action(methods=["PATCH"], detail=True)
466 |     def fetch(self, request, *args, **kwargs):
467 |         job_obj = self.new_fetch_job(request)
468 |         feed = self.serializer_class(self.get_object()).data.copy()
469 |         feed.update(
470 |             job_state=job_obj.state,
471 |             job_id=job_obj.id,
472 |         )
473 |         return Response(feed, status=status.HTTP_201_CREATED)
474 | 
475 |     def new_fetch_job(self, request):
476 |         feed_obj: Feed = self.get_object()
477 |         if feed_obj.feed_type == FeedType.SKELETON:
478 |             raise validators.ValidationError(f"fetch not supported for feed of type {feed_obj.feed_type}")
479 |         s = FeedFetchSerializer(feed_obj, data=request.data, partial=True)
480 |         s.is_valid(raise_exception=True)
481 |         s.save()
482 |         return task_helper.new_job(feed_obj, s.validated_data.get('include_remote_blogs', False))
483 | 
484 |     @extend_schema(
485 |         summary="Search for Feeds",
486 |         description=textwrap.dedent(
487 |             """
488 |             Use this endpoint to get a list of all the feeds you are currently subscribed to. This endpoint is usually used to get the ID of Deed you want to get blog post data for in a follow up request to the GET Feed Posts endpoints or to get the status of a job related to the Feed in a follow up request to the GET Job endpoint. If you already know the id of the Feed already, you can use the GET Feeds by ID endpoint.
489 |             """
490 |         ),
491 |         responses={
492 |             200: FeedSerializer,
493 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
494 |         },
495 |     )
496 |     def list(self, request, *args, **kwargs):
497 |         return super().list(request, *args, **kwargs)
498 | 
499 |     @extend_schema(
500 |         parameters=[FEED_ID_PARAM],
501 |         summary="Get a Feed",
502 |         description=textwrap.dedent(
503 |             """
504 |             Use this endpoint to get information about a specific feed using its ID. You can search for a Feed ID using the GET Feeds endpoint, if required.
505 |             """
506 |         ),
507 |         responses={
508 |             200: FeedSerializer,
509 |             404: OpenApiResponse(CommonErrorSerializer, "Not found", examples=[HTTP404_EXAMPLE]),
510 |         },
511 |     )
512 |     def retrieve(self, request, *args, **kwargs):
513 |         return super().retrieve(request, *args, **kwargs)
514 | 
515 |     @extend_schema(
516 |         parameters=[FEED_ID_PARAM],
517 |         summary="Delete a Feed",
518 |         description=textwrap.dedent(
519 |             """
520 |             Use this endpoint to delete a feed using its ID. This will delete all posts (items) that belong to the feed and cannot be reversed.
521 |             """
522 |         ),
523 |         responses={
524 |             204: {},
525 |             404: OpenApiResponse(
526 |                 CommonErrorSerializer,
527 |                 "Feed does not exist",
528 |                 examples=[HTTP404_EXAMPLE],
529 |             ),
530 |         },
531 |     )
532 |     def destroy(self, request, *args, **kwargs):
533 |         return super().destroy(request, *args, **kwargs)
534 |     
535 | class RSSView(viewsets.GenericViewSet):
536 |     class filterset_class(PostOnlyView.filterset_class):
537 |         feed_id = None
538 |     openapi_tags = ["Feeds"]
539 |     renderer_classes=[RSSRenderer]
540 |     lookup_url_kwarg = 'feed_id'
541 | 
542 |     @extend_schema(
543 |         parameters=[FEED_ID_PARAM],
544 |         filters=True,
545 |         summary="RSS Feed for Feed",
546 |         description=textwrap.dedent(
547 |             """
548 |             Use this endpoint with your feed reader. The response of this endpoint is valid RSS XML for the Posts in the Feed. If you want more flexibility (perhaps to build a custom integration) use the JSON version of this endpoint.
549 |             """
550 |         ),
551 |         responses={
552 |             (200, RSSRenderer.media_type): XML_RESPONSE,
553 |             (404, "application/json"): OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]),
554 |             (400, "application/json"): OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
555 |         },
556 |     )
557 |     @decorators.action(
558 |         methods=["get"],
559 |         detail=True,
560 |         pagination_class=XMLPostPagination("xml_posts"),
561 |     )
562 |     def rss(self, request: request.Request, *args, feed_id=None, **kwargs):
563 |         feed_obj = get_object_or_404(Feed, id=feed_id)
564 |         queryset = self.filter_queryset(self.get_queryset())
565 |         page = self.paginate_queryset(queryset)
566 |         body = build_rss.build_rss(feed_obj, page)
567 |         return self.paginator.get_paginated_response(body)
568 |     
569 |     def get_queryset(self):
570 |         return PostOnlyView.get_queryset(self).filter(feed_id=self.kwargs.get("feed_id"))
571 | 
572 | 
573 | 
574 | @extend_schema_view(
575 |     retrieve=extend_schema(
576 |         parameters=[FEED_ID_PARAM, POST_ID_PARAM],
577 |         summary="Get a Post in a Feed",
578 |         description=textwrap.dedent(
579 |             """
580 |             This will return a single Post in a Feed using its ID. It is useful if you only want to get the data for a single entry.
581 |             """
582 |         ),
583 |         responses={
584 |             200: PostSerializer,
585 |             404: OpenApiResponse(CommonErrorSerializer, "Feed or post not found", examples=[HTTP404_EXAMPLE]),
586 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
587 |         },
588 |     ),
589 |     list=extend_schema(
590 |         summary="Search for Posts in a Feed (JSON)",
591 |         description=textwrap.dedent(
592 |             """
593 |             Use this endpoint if you want to search through all Posts in a Feed. The response of this endpoint is JSON, and is useful if you're building a custom integration to a downstream tool. If you just want to import the data for this blog into your feed reader use the RSS version of this endpoint.
594 |             """
595 |         ),
596 |         responses={
597 |             200: PostSerializer,
598 |             404: OpenApiResponse(CommonErrorSerializer, "Feed not found", examples=[HTTP404_EXAMPLE]),
599 |             400: OpenApiResponse(CommonErrorSerializer, "Request not understood", examples=[HTTP400_EXAMPLE]),
600 |         },
601 |     ),
602 | )
603 | 
604 | class feed_post_view(
605 |     mixins.CreateModelMixin,
606 |     viewsets.GenericViewSet
607 | ):
608 | 
609 |     openapi_tags = ["Feeds"]
610 |     serializer_class = PostSerializer
611 | 
612 |     class filterset_class(PostOnlyView.filterset_class):
613 |         feed_id = None
614 |     
615 |     
616 |     def get_queryset(self):
617 |         return PostOnlyView.get_queryset(self).filter(feed_id=self.kwargs.get("feed_id"))
618 |     
619 |     
620 |     @extend_schema(
621 |         parameters=[FEED_ID_PARAM],
622 |         summary="Manually Add a Post to A Feed",
623 |         description=textwrap.dedent(
624 |             """
625 |             Sometimes historic posts are missed when a feed is indexed (typically when no Wayback Machine archive exists).
626 | 
627 |             This endpoint allows you to add Posts manually to a Feed.
628 | 
629 |             If the feed you want to add a post to does not already exist, you should first add it using the POST Feed or POST skeleton feed endpoints.
630 | 
631 |             The following key/values are accepted in the body of the request:
632 | 
633 |             * `link` (required - must be unique): The URL of the blog post. This is where the content of the post is found. It cannot be the same as the `url` of a post already in this feed. If you want to update the post, use the PATCH post endpoint.
634 |             * `pubdate` (required): The date of the blog post in the format `YYYY-MM-DDTHH:MM:SS.sssZ`. history4feed cannot accurately determine a post date in all cases, so you must enter it manually.
635 |             * `title` (required):  history4feed cannot accurately determine the title of a post in all cases, so you must enter it manually.
636 |             * `author` (optional): the value to be stored for the author of the post.
637 |             * `categories` (optional) : the value(s) to be stored for the category of the post. Pass as a list like `["tag1","tag2"]`.
638 | 
639 |             The response will return the Job information responsible for getting the requested data you can track using the `id` returned via the GET Jobs by ID endpoint.
640 | 
641 |             Each post ID is generated using a UUIDv5. The namespace used is `6c6e6448-04d4-42a3-9214-4f0f7d02694e` and the value used `<FEED_ID>+<POST_URL>+<POST_PUB_TIME (to .000000Z)>` (e.g. `d1d96b71-c687-50db-9d2b-d0092d1d163a+https://muchdogesec.github.io/fakeblog123///test3/2024/08/20/update-post.html+2024-08-20T10:00:00.000000Z` = `22173843-f008-5afa-a8fb-7fc7a4e3bfda`).
642 | 
643 |             _Note: We do have a proof-of-concept to scrape a site for all blog post urls, titles, and pubdate called [sitemap2posts](https://github.com/muchdogesec/sitemap2posts) which can help form the request body needed for this endpoint._
644 |             """
645 |         ),
646 |         responses={
647 |             201: PostJobSerializer,
648 |             404: OpenApiResponse(CommonErrorSerializer, "Feed does not exist", examples=[HTTP404_EXAMPLE]),
649 |         },
650 |         request=CreatePostsSerializer,
651 |     )
652 |     def create(self, request, *args, feed_id=None, **kwargs):
653 |         job_obj = self.new_create_post_job(request, feed_id)
654 |         job_resp = JobSerializer(job_obj).data.copy()
655 |         # job_resp.update(post_id=post.id)
656 |         return Response(job_resp, status=status.HTTP_201_CREATED)
657 | 
658 |     def new_create_post_job(self, request, feed_id):
659 |         feed_obj = get_object_or_404(Feed, id=feed_id)
660 |         data = dict(request.data) #, feed_id=feed_id, feed=feed_id)
661 | 
662 |         s = CreatePostsSerializer(data=data, context=dict(feed_id=feed_id))
663 |         s.is_valid(raise_exception=True)
664 | 
665 |         posts = s.save(added_manually=True, deleted_manually=False)
666 | 
667 |         job_obj = task_helper.new_patch_posts_job(feed_obj, posts)
668 |         return job_obj
669 |     
670 | 
671 |     @extend_schema(
672 |         summary="Update all Posts in a Feed",
673 |         description=textwrap.dedent(
674 |             """
675 |                 This endpoint will reindex the Post content (`description`) for all Post IDs currently listed in the Feed.
676 | 
677 |                 This request will only change the content (`description`) stored for the Post ID. It will not update the title, pubdate, author, or categories. If you need to update these properties you can use the Update Post Metadata endpoint.
678 | 
679 |                 Note, if you only want to update the content of a single post, it is much more effecient to use the Update a Post in a Feed endpoint.
680 |             """
681 |         ),
682 |         responses={
683 |             201: PostJobSerializer,
684 |             404: OpenApiResponse(CommonErrorSerializer, "Feed does not exist", examples=[HTTP404_EXAMPLE]),
685 |         },
686 |         request={},
687 |     )
688 |     @decorators.action(methods=["PATCH"], detail=False, url_path='reindex')
689 |     def reindex_feed(self, request, *args, feed_id=None, **kwargs):
690 |         job_obj = self.new_reindex_feed_job(feed_id)
691 |         job_resp = JobSerializer(job_obj).data.copy()
692 |         # job_resp.update(post_id=post.id)
693 |         return Response(job_resp, status=status.HTTP_201_CREATED)
694 | 
695 |     def new_reindex_feed_job(self, feed_id):
696 |         posts = self.get_queryset().all()
697 |         feed_obj = get_object_or_404(Feed, id=feed_id)
698 | 
699 |         job_obj = task_helper.new_patch_posts_job(feed_obj, posts)
700 |         return job_obj
701 | 
702 | 
703 | class FeedPostView(
704 |     feed_post_view
705 | ):
706 |     pass
707 | 
708 | class JobView(
709 |     mixins.RetrieveModelMixin, mixins.ListModelMixin, viewsets.GenericViewSet
710 | ):
711 |     serializer_class = JobSerializer
712 |     pagination_class = Pagination("jobs")
713 |     filter_backends = [DjangoFilterBackend, Ordering]
714 |     ordering_fields = ["run_datetime", "state"]
715 |     ordering = "run_datetime_descending"
716 |     openapi_tags = ["Jobs"]
717 |     lookup_url_kwarg = "job_id"
718 |     lookup_field = "id"
719 | 
720 |     class filterset_class(FilterSet):
721 |         feed_id = Filter(
722 |             help_text="Filter Jobs by the ID of the Feed they belong to. You can search for Feed IDs using the GET Feeds endpoints. Note a Feed can have multiple jobs associated with it where a PATCH request has been run to update the Feed. e.g. `6c6e6448-04d4-42a3-9214-4f0f7d02694e`"
723 |         )
724 |         state = Filter(help_text="Filter by the status of a Job")
725 |         post_id = UUIDFilter(help_text="Filter Jobs by the ID of the Post they belong to. You can search for Post IDs using the GET Posts endpoint. Note a Post can have multiple jobs associated with it where a PATCH request has been run to update a Feed or a Post. e.g `797e94b1-efdc-4e66-a748-f2b6a5896a89`", field_name="fulltext_jobs__post_id")
726 | 
727 |     def get_queryset(self):
728 |         return Job.objects.all().annotate(count_of_items=Count("fulltext_jobs"))
729 | 
730 |     def filter_queryset(self, queryset):
731 |         return super().filter_queryset(queryset)
732 | 
733 |     @extend_schema(
734 |         summary="Search Jobs",
735 |         description=textwrap.dedent(
736 |             """
737 |             Jobs track the status of the request to get posts for Feeds. For every new Feed added and every update to a Feed requested a job will be created. The `id` of a job is printed in the POST and PATCH responses respectively, but you can use this endpoint to search for the id again, if required.
738 |             """
739 |         ),
740 |         responses={
741 |             200: JobSerializer,
742 |             400: OpenApiResponse(
743 |                 CommonErrorSerializer,
744 |                 "Request not understood",
745 |                 [HTTP400_EXAMPLE],
746 |             ),
747 |         },
748 |     )
749 |     def list(self, request, *args, **kwargs):
750 |         return super().list(request, *args, **kwargs)
751 | 
752 |     @extend_schema(
753 |         parameters=[JOB_ID_PARAM],
754 |         summary="Get a Job",
755 |         description=textwrap.dedent(
756 |             """
757 |             Using a Job ID you can retrieve information about its state via this endpoint. This is useful to see if a Job to get data is complete, how many posts were imported in the job, or if an error has occurred.
758 |             """
759 |         ),
760 |         responses={
761 |             200: JobSerializer,
762 |             404: OpenApiResponse(
763 |                 CommonErrorSerializer,
764 |                 "Job not found",
765 |                 [HTTP404_EXAMPLE],
766 |             ),
767 |         },
768 |     )
769 |     def retrieve(self, request, *args, **kwargs):
770 |         return super().retrieve(request, *args, **kwargs)
771 |     
772 | 
773 |     @extend_schema(
774 |         parameters=[JOB_ID_PARAM],
775 |         summary="Kill a running Job that is indexing Posts",
776 |         description=textwrap.dedent(
777 |             """
778 |             Using a Job ID you can kill it whilst it is still in `running` or `pending` state.
779 | 
780 |             If any posts have already been downloaded before the job is complete, they will still remain and you will need to delete them using the delete endpoints manually.
781 | 
782 |             The job will enter `cancelled` state when cancelled.
783 |             """
784 |         ),
785 |         responses={
786 |             204: {},
787 |             404: OpenApiResponse(
788 |                 CommonErrorSerializer,
789 |                 "Job not found",
790 |                 [HTTP404_EXAMPLE],
791 |             ),
792 |         },
793 |     )
794 |     @decorators.action(methods=['DELETE'], detail=True, url_path="kill")
795 |     def cancel_job(self, request, *args, **kwargs):
796 |         obj: Job = self.get_object()
797 |         obj.cancel()
798 |         return Response(status=status.HTTP_204_NO_CONTENT)
799 | 


--------------------------------------------------------------------------------
/history4feed/asgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | ASGI config for history4feed project.
 3 | 
 4 | It exposes the ASGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.asgi import get_asgi_application
13 | 
14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings')
15 | 
16 | application = get_asgi_application()
17 | 


--------------------------------------------------------------------------------
/history4feed/h4fscripts/__init__.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | LOG_PRINT = 105
 3 | 
 4 | def newLogger(name: str) -> logging.Logger:
 5 |     # Configure logging
 6 |     logging.addLevelName(LOG_PRINT, "LOG")
 7 |     stream_handler = logging.StreamHandler()  # Log to stdout and stderr
 8 |     stream_handler.setLevel(logging.INFO)
 9 |     logging.basicConfig(
10 |         level=logging.INFO,
11 |         format=f"%(asctime)s [%(levelname)s] %(message)s",
12 |         handlers=[stream_handler],
13 |         datefmt='%d-%b-%y %H:%M:%S'
14 |     )
15 |     logger = logging.getLogger("history4feed")
16 |     logger.print = lambda msg: logger.log(LOG_PRINT, msg)
17 |     logger.print("=====================history4feed======================")
18 | 
19 |     return logger
20 | 
21 | logger = newLogger("h4f-logger")


--------------------------------------------------------------------------------
/history4feed/h4fscripts/build_rss.py:
--------------------------------------------------------------------------------
 1 | from .xml_utils import createRSSHeader, createCDataElement, createTextElement
 2 | from ..app.models import Feed, Post
 3 | from django.db.models.manager import BaseManager
 4 | from xml.dom.minidom import Document
 5 | 
 6 | 
 7 | def build_rss(feed_obj: Feed, posts_set: BaseManager[Post]):
 8 |     document, channel = createRSSHeader(feed_obj.title,  feed_obj.description, feed_obj.url, feed_obj.latest_item_pubdate)
 9 |     for post in posts_set:
10 |         channel.appendChild(build_entry_element(post, document))
11 | 
12 |     return document.toprettyxml()
13 | 
14 | def build_entry_element(post: Post, d: Document):
15 |         element = d.createElement('item')
16 |         element.appendChild(createTextElement(d, "title", post.title))
17 | 
18 |         link = createTextElement(d, "link", post.link)
19 |         link.setAttribute("href", post.link)
20 |         element.appendChild(link)
21 |         element.appendChild(createTextElement(d, "pubDate", post.pubdate.isoformat()))
22 |         if post.description:
23 |             description = post.description
24 |             description = description
25 |             element.appendChild(createTextElement(d, "description", description))
26 | 
27 |         for category in post.categories.all():
28 |             element.appendChild(createTextElement(d, "category", category.name))
29 | 
30 |         if post.author:
31 |             author = d.createElement('author')
32 |             author.appendChild(createTextElement(d, "name", post.author))
33 |             element.appendChild(author)
34 |         return element


--------------------------------------------------------------------------------
/history4feed/h4fscripts/celery.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from celery import Celery
 3 | # Set the default Django settings module for the 'celery' program.
 4 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings')
 5 | 
 6 | app = Celery('history4feed')
 7 | 
 8 | 
 9 | app.config_from_object('os:environ', namespace='CELERY')
10 | 
11 | # Load task modules from all registered Django apps.
12 | app.autodiscover_tasks()


--------------------------------------------------------------------------------
/history4feed/h4fscripts/exceptions.py:
--------------------------------------------------------------------------------
 1 | class history4feedException(Exception):
 2 |     pass
 3 | class UnknownFeedtypeException(history4feedException):
 4 |     pass
 5 | class ParseArgumentException(history4feedException):
 6 |     pass
 7 | class FetchRedirect(history4feedException):
 8 |     pass
 9 | 
10 | class ScrapflyError(Exception):
11 |     pass


--------------------------------------------------------------------------------
/history4feed/h4fscripts/h4f.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass
  2 | import time
  3 | from io import BytesIO, StringIO
  4 | from xml.dom.minidom import Element, parse
  5 | import os
  6 | from history4feed.app.settings import history4feed_server_settings as settings
  7 | import requests
  8 | from dateutil.parser import parse as parse_date
  9 | from readability import Document as ReadabilityDocument
 10 | import brotli
 11 | from types import SimpleNamespace
 12 | from .import logger
 13 | from .xml_utils import getAtomLink, getFirstChildByTag, getFirstElementByTag, getText
 14 | from .exceptions import history4feedException, UnknownFeedtypeException, FetchRedirect, ScrapflyError
 15 | import fake_useragent
 16 | from urllib.parse import urljoin
 17 | 
 18 | def fetch_page_with_retries(url, retry_count=3, sleep_seconds=settings.WAYBACK_SLEEP_SECONDS, **kwargs):
 19 |     ua = fake_useragent.UserAgent()
 20 |     session = requests.Session()
 21 |     session.max_redirects = 3
 22 |     headers = kwargs.get('headers', {})
 23 |     headers.update({
 24 |         "User-Agent": ua.random,
 25 |     })
 26 |     kwargs.update(headers=headers)
 27 |     error = None
 28 |     for i in range(retry_count+1):
 29 |         try:
 30 |             if i > 0:
 31 |                 time.sleep(sleep_seconds * 1.5 ** (i-1))
 32 |             return fetch_page(session, url, **kwargs)
 33 |         except FatalError:
 34 |             raise
 35 |         except BaseException as e:
 36 |             error = e
 37 |             print(error)
 38 |     raise ConnectionError(f"could not fetch page after {retry_count} retries") from error
 39 |     
 40 | class FatalError(Exception):
 41 |     pass
 42 | 
 43 | def fetch_page(session, url, headers=None) -> tuple[bytes, str, str]:
 44 |     proxy_apikey = os.getenv("SCRAPFLY_APIKEY")
 45 |     headers = headers or {}
 46 | 
 47 |     if proxy_apikey:
 48 |         logger.info(f"Fetching `{url}` via scrapfly.io")
 49 |         headers = dict((f"headers[{k}]", v) for k, v in headers.items())
 50 |         resp = session.get("https://api.scrapfly.io/scrape", params=dict(**headers, key=proxy_apikey, url=url, country="us,ca,mx,gb,fr,de,au,at,be,hr,cz,dk,ee,fi,ie,se,es,pt,nl"))
 51 |         json_data = resp.json()
 52 |         if resp.status_code != 200:
 53 |             raise ScrapflyError(json_data)
 54 |         result = SimpleNamespace(**json_data['result'])
 55 |         if result.status_code > 499:
 56 |             raise FatalError(f"Got server error {result.status_code}, stopping")
 57 |         if result.status_code > 399:
 58 |             raise history4feedException(f"PROXY_GET Request failed for `{url}`, status: {result.status_code}, reason: {result.status}")
 59 |         elif result.status_code > 299:
 60 |             raise FetchRedirect(f"PROXY_GET for `{url}` redirected, status: {result.status_code}, reason: {result.status}")
 61 |         return result.content.encode(), result.content_type, result.url
 62 | 
 63 |     logger.info(f"Fetching `{url}`")
 64 |     resp: requests.Response  = session.get(url, headers=headers)
 65 |     content = resp.content
 66 |     if not resp.ok:
 67 |         raise history4feedException(f"GET Request failed for `{url}`, status: {resp.status_code}, reason: {resp.reason}")
 68 | 
 69 |     # some times, wayback returns br encoding, try decompressing
 70 |     try:
 71 |         content = brotli.decompress(content)
 72 |     except Exception as err:
 73 |         logger.print(f"brotli decompress fail: {err}")
 74 |     return content, resp.headers.get("content-type"), resp.url
 75 | 
 76 | def parse_feed_from_url(url):
 77 |     data, content_type, url = fetch_page_with_retries(url, retry_count=0)
 78 |     return parse_feed_from_content(data, url)
 79 | 
 80 | 
 81 | @dataclass
 82 | class PostDict:
 83 |     link: str
 84 |     title: str
 85 |     pubdate: str
 86 |     author: str = None
 87 |     categories: list[str] = None
 88 |     description: str = "EMPTY BODY"
 89 |     content_type: str =  "text/html"
 90 | 
 91 | def parse_feed_from_content(data: bytes, url: str):
 92 |     feed_data = {}
 93 |     try:
 94 |         if isinstance(data, str):
 95 |             document = parse(StringIO(data))
 96 |         else:
 97 |             document = parse(BytesIO(data))
 98 |         # check if it's atom or rss
 99 |         if rss := getFirstElementByTag(document, "rss"):
100 |             channel = getFirstElementByTag(rss, "channel")
101 |             feed_data['description'] = getText(getFirstElementByTag(channel, "description"))
102 |             feed_data['title'] = getText(getFirstElementByTag(channel, "title"))
103 |             # feed_data['rel'] = getText(getFirstElementByTag(channel, "link"))
104 | 
105 |             feed_data["feed_type"] = "rss"
106 |         elif feed := getFirstElementByTag(document, "feed"):
107 |             feed_data['description'] = getText(getFirstElementByTag(feed, "subtitle"))
108 |             feed_data['title'] = getText(getFirstElementByTag(feed, "title"))
109 |             # feed_data['rel'] = getAtomLink(feed)
110 | 
111 |             feed_data["feed_type"] = "atom"
112 |         else:
113 |             raise UnknownFeedtypeException("feed is neither RSS or ATOM")
114 |         feed_data["url"] = url
115 |         return feed_data
116 |     except BaseException as e:
117 |         raise UnknownFeedtypeException(f"Failed to parse feed from `{url}`") from e
118 | 
119 | def get_publish_date(item):
120 |     published = getFirstElementByTag(item, "published")
121 |     if not published:
122 |         published = getFirstElementByTag(item, "pubDate")
123 |     return parse_date(getText(published))
124 | 
125 | def get_categories(entry: Element) -> list[str]:
126 |     categories = []
127 |     for category in entry.getElementsByTagName('category'):
128 |         cat = category.getAttribute('term') or getText(category)
129 |         if not cat:
130 |             cat = category
131 |         categories.append(cat)
132 |     return categories
133 | 
134 | def get_author(item):
135 |     author = getFirstElementByTag(item, "dc:creator")
136 |     if not author:
137 |         author = getFirstElementByTag(item, "author")
138 |         author = getFirstElementByTag(author, "name") or author
139 |     return getText(author)
140 | 
141 | 
142 | def parse_items(elem, link):
143 |     return PostDict(
144 |         # element = elem,
145 |         link = link,
146 |         title = getText(getFirstElementByTag(elem, "title")),
147 |         pubdate = get_publish_date(elem),
148 |         author = get_author(elem),
149 |         categories = get_categories(elem),
150 |         description="",
151 |         content_type="plain/text",
152 |     )
153 | 
154 | def parse_posts_from_rss_feed(base_url, data) -> dict[str, PostDict]:
155 |     entries = {}
156 |     document = parse(BytesIO(data))
157 |     channel = getFirstElementByTag(document, "channel")
158 | 
159 |     for item in channel.getElementsByTagName("item"):
160 |         link = urljoin(base_url, getText(getFirstElementByTag(item, "link")).strip())
161 |         entries[link] = parse_items(item, link)
162 |         entries[link].description = parse_rss_description(item)
163 |     return entries
164 | 
165 | def parse_posts_from_atom_feed(base_url, data):
166 |     entries = {}
167 |     document = parse(BytesIO(data))
168 | 
169 |     for item in document.getElementsByTagName("entry"):
170 |         link = urljoin(base_url, getAtomLink(item, rel='alternate'))
171 |         entries[link] = parse_items(item, link)
172 |         entries[link].description, content_type = parse_atom_description(item)
173 |         if content_type:
174 |             entries[link].content_type = content_type
175 |     return entries
176 | 
177 | def parse_atom_description(item: Element):
178 |     description = ""
179 |     if summary := getFirstChildByTag(item, "summary"):
180 |         description = getText(summary)
181 |     if content := getFirstChildByTag(item, "content"):
182 |         description = getText(content)
183 |     return description, None
184 | 
185 | def parse_rss_description(item: Element):
186 |     return getText(getFirstChildByTag(item, "description"))
187 | 
188 | 
189 | def is_valid_atom_feed(xml):
190 |     pass
191 | 
192 | def is_valid_atom_feed(xml):
193 |     pass
194 | 
195 | def get_full_text(link):
196 |     try:
197 |         page, content_type, url = fetch_page_with_retries(link)
198 |         doc  = ReadabilityDocument(page, url=url)
199 |         return doc.summary(), content_type
200 |     except BaseException as e:
201 |         raise history4feedException(f"Error processing fulltext: {e}") from e
202 |     
203 | 


--------------------------------------------------------------------------------
/history4feed/h4fscripts/sitemap_helpers.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import logging
 3 | import os
 4 | import time
 5 | from collections import namedtuple
 6 | from urllib.parse import urlencode
 7 | from .h4f import FatalError, PostDict, fetch_page_with_retries
 8 | from history4feed.app.settings import history4feed_server_settings as settings
 9 | import requests
10 | from datetime import UTC, datetime as dt
11 | import time
12 | import requests
13 | from datetime import datetime as dt, date, timedelta
14 | from dateparser import parse as parse_date
15 | DEFAULT_USER_AGENT = "curl"
16 | 
17 | class SearchIndexError(FatalError):
18 |     pass
19 | 
20 | def fetch_posts_links_with_serper(site, from_time: dt, to_time: dt = None) -> dict[str, PostDict]:
21 |     s = requests.Session()
22 |     s.headers.update({
23 |         'X-API-KEY':  os.getenv("SERPER_API_KEY"),
24 |         'Content-Type': 'application/json'
25 |     })
26 | 
27 |     params = dict(num=100, page=1)
28 |     entries: dict[str, PostDict] = {}
29 |     to_time = to_time or dt.now(UTC)
30 |     if not to_time.tzinfo:
31 |         to_time = to_time.replace(tzinfo=UTC)
32 | 
33 |     frame_start = from_time - timedelta(days=1)
34 |     credits_used = 0
35 | 
36 |     while frame_start < to_time:
37 |         frame_end = frame_start + timedelta(days=100)
38 |         params.update(q=f"site:{site} after:{frame_start.date().isoformat()} before:{frame_end.date().isoformat()}", page=1)
39 |         while True:
40 |             resp = s.get("https://google.serper.dev/search", params=params)
41 |             if not resp.ok:
42 |                 raise SearchIndexError(f"Serper Request GOT {resp.status_code}: {resp.text}")
43 |             data = resp.json()
44 |             credits_used += data['credits']
45 |             for d in data['organic']:
46 |                 date = d.get('date')
47 |                 if date:
48 |                     date = parse_date(date)
49 |                 else:
50 |                     date = min(frame_end, to_time)
51 |                 post = PostDict(link=d['link'], title=d['title'], pubdate=date, categories=[])
52 |                 entries[post.link] = post
53 |             params['page'] += 1
54 |             if len(data['organic']) < params['num']:
55 |                 break
56 |         frame_start = frame_end - timedelta(days=1)
57 |     logging.info(f"got {len(entries)} posts between {from_time} and {to_time}, used {credits_used} credits")
58 |     return entries
59 | 
60 | 


--------------------------------------------------------------------------------
/history4feed/h4fscripts/task_helper.py:
--------------------------------------------------------------------------------
  1 | import time
  2 | from celery import shared_task, Task as CeleryTask
  3 | import celery
  4 | from celery.result import ResultSet, AsyncResult
  5 | import redis
  6 | 
  7 | from history4feed.h4fscripts.sitemap_helpers import fetch_posts_links_with_serper
  8 | 
  9 | from ..app import models
 10 | from . import h4f, wayback_helpers, logger, exceptions
 11 | from datetime import UTC, datetime
 12 | from history4feed.app.settings import history4feed_server_settings as settings
 13 | 
 14 | from urllib.parse import urlparse
 15 | from contextlib import contextmanager
 16 | from django.core.cache import cache
 17 | from rest_framework.exceptions import APIException, Throttled
 18 | from django.db import transaction
 19 | 
 20 | LOCK_EXPIRE = 60 * 60
 21 | 
 22 | def get_lock_id(feed: models.Feed):
 23 |     lock_id = f"feed-lock-{feed.id}"
 24 |     logger.debug("using lock id %s", lock_id)
 25 |     return lock_id
 26 | 
 27 | def queue_lock(feed: models.Feed, job=None):
 28 |     lock_value = dict(feed_id=str(feed.id))
 29 |     if job:
 30 |         lock_value["job_id"] = str(job.id)
 31 |         
 32 |     status = cache.add(get_lock_id(feed), lock_value, timeout=LOCK_EXPIRE)
 33 |     return status
 34 | 
 35 | 
 36 | 
 37 | def new_job(feed: models.Feed, include_remote_blogs):
 38 |     with transaction.atomic():
 39 |         job_obj = models.Job.objects.create(
 40 |             feed=feed,
 41 |             earliest_item_requested=feed.latest_item_pubdate or settings.EARLIEST_SEARCH_DATE,
 42 |             latest_item_requested=datetime.now(UTC),
 43 |             include_remote_blogs=include_remote_blogs,
 44 |         )
 45 |         if not queue_lock(feed, job_obj):
 46 |             raise Throttled(detail={"message": "A job is already running for this feed", **cache.get(get_lock_id(feed))})
 47 | 
 48 |         (start_job.s(job_obj.pk)| retrieve_posts_from_links.s(job_obj.pk) | wait_for_all_with_retry.s() | collect_and_schedule_removal.si(job_obj.pk)).apply_async(countdown=5, link_error=error_handler.s(job_obj.pk))
 49 |         return job_obj
 50 | 
 51 | def new_patch_posts_job(feed: models.Feed, posts: list[models.Post], include_remote_blogs=True):
 52 |     job_obj = models.Job.objects.create(
 53 |         feed=posts[0].feed,
 54 |         state=models.JobState.PENDING,
 55 |         include_remote_blogs=include_remote_blogs,
 56 |     )
 57 |     ft_jobs = [models.FulltextJob.objects.create(
 58 |         job_id=job_obj.id,
 59 |         post_id=post.id,
 60 |         link=post.link,
 61 |     ) for post in posts]
 62 |     chain = celery.chain([retrieve_full_text.si(ft_job.pk) for ft_job in ft_jobs])
 63 |     ( start_post_job.si(job_obj.id) | chain | collect_and_schedule_removal.si(job_obj.pk)).apply_async(link_error=error_handler.s(job_obj.pk), countdown=5)
 64 |     return job_obj
 65 | 
 66 | @shared_task(bind=True, default_retry_delay=10)
 67 | def start_post_job(self: CeleryTask, job_id):
 68 |     job = models.Job.objects.get(pk=job_id)
 69 |     if job.is_cancelled():
 70 |         job.info = "job cancelled while in queue"
 71 |         job.save()
 72 |         return False
 73 |     if not queue_lock(job.feed, job):
 74 |         return self.retry(max_retries=360)
 75 |     job.state = models.JobState.RUNNING
 76 |     job.save()
 77 |     return True
 78 | 
 79 | @shared_task
 80 | def start_job(job_id):
 81 |     job = models.Job.objects.get(pk=job_id)
 82 |     feed = job.feed
 83 |     job.state = models.JobState.RUNNING
 84 |     job.save()
 85 |     try:
 86 |         if feed.feed_type == models.FeedType.SEARCH_INDEX:
 87 |             return [feed.url]
 88 |         return wayback_helpers.get_wayback_urls(feed.url, job.earliest_item_requested, job.latest_item_requested)
 89 |     except BaseException as e:
 90 |         job.state = models.JobState.FAILED
 91 |         job.info = str(e)
 92 |         job.save()
 93 |         return []
 94 | 
 95 | @shared_task(bind=True, default_retry_delay=10)
 96 | def wait_for_all_with_retry(self, result_ids):
 97 |     if not result_ids:
 98 |         return []
 99 |     result_set = ResultSet([AsyncResult(task_id) for task_id in result_ids])
100 |     if not result_set.ready():
101 |         return self.retry(max_retries=360)
102 |     return result_ids
103 | 
104 | @shared_task
105 | def retrieve_posts_from_links(urls, job_id):
106 |     if not urls:
107 |         return []
108 |     full_text_chain = models.Job.objects.get(pk=job_id)
109 |     feed = full_text_chain.feed
110 |     chains = []
111 |     parsed_feed = {}
112 |     job = models.Job.objects.get(id=job_id)
113 |     for index, url in enumerate(urls):
114 |         if job.is_cancelled():
115 |             break
116 |         error = None
117 |         if feed.feed_type == models.FeedType.SEARCH_INDEX:
118 |             start_time = feed.freshness or settings.EARLIEST_SEARCH_DATE
119 |             if not start_time.tzinfo:
120 |                 start_time = start_time.replace(tzinfo=UTC)
121 |             crawled_posts = fetch_posts_links_with_serper(url, from_time=start_time, to_time=job.run_datetime)
122 |             posts = [add_new_post(feed, job, post_dict) for post_dict in crawled_posts.values()]
123 |         else:
124 |             parsed_feed, posts, error = retrieve_posts_from_url(url, feed, job)
125 |         if error:
126 |             logger.exception(error)
127 |             continue
128 |         if not posts:
129 |             logger.warning('no new post in `%s`', url)
130 |             continue
131 | 
132 |         chain_tasks = []
133 |         for post in posts:
134 |             ftjob_entry = models.FulltextJob.objects.create(
135 |                 job_id=job_id,
136 |                 post_id=post.id,
137 |                 link=post.link,
138 |             )
139 |             chain_tasks.append(retrieve_full_text.si(ftjob_entry.pk))
140 |         full_text_chain = celery.chain(chain_tasks)
141 |         chains.append(full_text_chain.apply_async())
142 | 
143 |     if parsed_feed:
144 |         feed.set_description(parsed_feed['description'])
145 |         feed.set_title(parsed_feed['title'])
146 |     feed.freshness = job.run_datetime
147 |     
148 |     feed.save()
149 |     logger.info("====\n"*5)
150 |     return [result.id for result in chains]
151 | 
152 | class JobCancelled(Exception):
153 |     pass
154 | 
155 | @shared_task(bind=True)
156 | def collect_and_schedule_removal(sender, job_id):
157 |     logger.print(f"===> {sender=}, {job_id=} ")
158 |     job = models.Job.objects.get(pk=job_id)
159 |     remove_lock(job)
160 |     if job.state == models.JobState.RUNNING:
161 |         job.state = models.JobState.SUCCESS
162 |         job.save()
163 | 
164 | def remove_lock(job):
165 |     if cache.delete(get_lock_id(job.feed)):
166 |         logger.debug("lock deleted")
167 |     else:
168 |         logger.debug("Failed to remove lock")
169 | 
170 | def retrieve_posts_from_url(url, db_feed: models.Feed, job: models.Job):
171 |     back_off_seconds = settings.WAYBACK_SLEEP_SECONDS
172 |     all_posts: list[models.Post] = []
173 |     error = None
174 |     parsed_feed = {}
175 |     for i in range(settings.REQUEST_RETRY_COUNT):
176 |         if i != 0:
177 |             time.sleep(back_off_seconds)
178 |         try:
179 |             if job.is_cancelled():
180 |                 raise JobCancelled("job was terminated by user")
181 |             data, content_type, url = h4f.fetch_page_with_retries(url)
182 |             parsed_feed = h4f.parse_feed_from_content(data, url)
183 |             if parsed_feed['feed_type'] == models.FeedType.ATOM:
184 |                 posts = h4f.parse_posts_from_atom_feed(url, data)
185 |             elif parsed_feed['feed_type'] == models.FeedType.RSS:
186 |                 posts = h4f.parse_posts_from_rss_feed(url, data)
187 |             else:
188 |                 raise exceptions.UnknownFeedtypeException("unknown feed type `{}` at {}".format(parsed_feed['feed_type'], url))
189 |             for post_dict in posts.values():
190 |                 # make sure that post and feed share the same domain
191 |                 post = add_new_post(db_feed, job, post_dict)
192 |                 if not post:
193 |                     continue
194 |                 all_posts.append(post)
195 |             db_feed.save()
196 |             logger.info(f"saved {len(posts)} posts for {url}")
197 |             break
198 |         except ConnectionError as e:
199 |             logger.error(e, exc_info=True)
200 |             error = e
201 |             logger.info(f"job with url {url} ran into an issue {e}, backing off for {back_off_seconds} seconds")
202 |             back_off_seconds *= 1.2
203 |         except BaseException as e:
204 |             logger.error(e, exc_info=True)
205 |             error = e
206 |             break
207 |     return parsed_feed, all_posts, error
208 | 
209 | def add_new_post(db_feed: models.Feed, job: models.Job, post_dict: h4f.PostDict):
210 |     # make sure that post and feed share the same domain
211 |     if job.should_skip_post(post_dict.link):
212 |         models.FulltextJob.objects.create(
213 |                 job_id=job.id,
214 |                 status=models.FullTextState.SKIPPED,
215 |                 link=post_dict.link,
216 |         )
217 |         return None
218 |     categories = post_dict.categories
219 |     del post_dict.categories
220 |     post, created = models.Post.objects.get_or_create(defaults=post_dict.__dict__, feed=db_feed, link=post_dict.link)
221 |     if not created or post.deleted_manually:
222 |         return None
223 | 
224 |     post.save()
225 |     post.add_categories(categories)
226 |     return post
227 |         
228 | @shared_task(bind=True)
229 | def retrieve_full_text(self, ftjob_pk):
230 |     fulltext_job = models.FulltextJob.objects.get(pk=ftjob_pk)
231 |     try:
232 |         if fulltext_job.is_cancelled():
233 |             raise JobCancelled()
234 |         else:
235 |             fulltext_job.post.description, fulltext_job.post.content_type = h4f.get_full_text(fulltext_job.post.link)
236 |             fulltext_job.status = models.FullTextState.RETRIEVED
237 |             fulltext_job.error_str = ""
238 |             fulltext_job.post.is_full_text = True
239 |     except JobCancelled:
240 |         fulltext_job.status = models.FullTextState.CANCELLED
241 |         fulltext_job.error_str = "job cancelled while retrieving fulltext"
242 |     except BaseException as e:
243 |         fulltext_job.error_str = str(e)
244 |         fulltext_job.status = models.FullTextState.FAILED
245 |     fulltext_job.save()
246 |     fulltext_job.post.save()
247 |     logger.print(f"{self}")
248 | 
249 | 
250 | 
251 | from celery import signals
252 | @signals.worker_ready.connect
253 | def mark_old_jobs_as_failed(**kwargs):
254 |     models.Job.objects.filter(state__in=[models.JobState.PENDING, models.JobState.RUNNING]).update(state=models.JobState.CANCELLED, info="job cancelled automatically on server startup")
255 | 
256 | @shared_task
257 | def error_handler(request, exc: Exception, traceback, job_id):
258 |     job = models.Job.objects.get(pk=job_id)
259 |     job.state = models.JobState.FAILED
260 |     job.info = f"job failed: {exc}"
261 |     job.save()
262 |     remove_lock(job)
263 |     logger.error('Job {3} with task_id {0} raised exception: {1!r}\n{2!r}'.format(
264 |           request.id, exc, traceback, job_id))


--------------------------------------------------------------------------------
/history4feed/h4fscripts/wayback_helpers.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import time
 3 | from datetime import datetime as dt, UTC
 4 | from collections import namedtuple
 5 | from urllib.parse import urlencode
 6 | from .h4f import FatalError, fetch_page_with_retries
 7 | from history4feed.app.settings import history4feed_server_settings as settings
 8 | 
 9 | DEFAULT_USER_AGENT = "curl"
10 | 
11 | 
12 | CDXSearchResult = namedtuple("CDXSearchResult", ["urlkey", "timestamp", "original_url", "mimetype", "statuscode", "digest", "length"])
13 | 
14 | def cdx_search(url, earliest: dt, latest: dt=None, retry_count=3, sleep_seconds=settings.WAYBACK_SLEEP_SECONDS, user_agent="curl") -> list[CDXSearchResult]:
15 |     latest = latest or dt.now(UTC)
16 |     query = urlencode([
17 |         ("from", as_wayback_date(earliest)),
18 |         ("to", as_wayback_date(latest)),
19 |         ("url", url),
20 |         ("filter", "statuscode:200"),
21 |         ("output", "json"),
22 |         ("collapse", "digest"),
23 |     ])
24 | 
25 |     headers = {}
26 | 
27 |     error = None
28 | 
29 |     for i in range(retry_count+1):
30 |         if i > 0:
31 |             time.sleep(sleep_seconds * 1.5**(i-1))
32 |         try:
33 |             res, content_type, _ = fetch_page_with_retries(f"http://web.archive.org/cdx/search/cdx?{query}", headers=headers)
34 |             res_json = json.loads(res)
35 |             error = None
36 |             break
37 |         except FatalError:
38 |             return []
39 |         except BaseException as e:
40 |             error = e
41 |             continue
42 |     if error:
43 |         raise error
44 |     out = {}
45 |     for v in res_json[1:]:
46 |         try:
47 |             v[6] = int(v[6])
48 |             v[4] = int(v[4])
49 |             v = CDXSearchResult(*v)
50 |             out[v.digest] = v
51 |         except:
52 |             pass
53 |     return list(out.values())
54 |         
55 | def as_wayback_date(date: dt) -> str:
56 |     return date.strftime('%Y%m%d')
57 | 
58 | def get_wayback_urls(url, from_date, to_date=None):
59 |     to_date = to_date or dt.now(UTC)
60 |     urls = []
61 |     results = cdx_search(url, from_date, to_date)
62 |     for result in results:
63 |         urls.append(f"https://web.archive.org/web/{result.timestamp}id_/{result.original_url}")
64 |     urls.append(url)
65 |     return urls


--------------------------------------------------------------------------------
/history4feed/h4fscripts/xml_utils.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime, timezone
 2 | from xml.dom.minidom import Document, Element
 3 | 
 4 | 
 5 | def createTextElement(document: Document, tagName, text):
 6 |     el = document.createElement(tagName)
 7 |     txtNode = document.createTextNode(text or "")
 8 |     el.appendChild(txtNode)
 9 |     return el
10 | 
11 | def createCDataElement(document: Document, tagName, text):
12 |     el = document.createElement(tagName)
13 |     txtNode = document.createCDATASection(text or "")
14 |     el.appendChild(txtNode)    
15 |     return el
16 | 
17 | def createRSSHeader(title, description,  url, last_build_date=None):
18 |     last_build_date = last_build_date or datetime.now(timezone.utc)
19 |     d = Document()
20 |     rss = d.createElement("rss")
21 |     d.appendChild(rss)
22 |     rss.setAttribute("version", "2.0")
23 |     channel = d.createElement("channel")
24 |     rss.appendChild(channel)
25 |     channel.appendChild(createTextElement(d, "title", title))
26 |     channel.appendChild(createTextElement(d, "description", description))
27 |     channel.appendChild(createTextElement(d, "link", url))
28 |     channel.appendChild(createTextElement(d, "lastBuildDate", last_build_date.isoformat()))
29 |     # channel.appendChild(createTextElement(d, "generator", LINK_TO_SELF))
30 |     return d, channel
31 | 
32 | 
33 | def getText(nodelist: list[Element]):
34 |     if not nodelist:
35 |         return ''
36 |     if not isinstance(nodelist, list):
37 |         nodelist = nodelist.childNodes
38 |     rc = []
39 |     for node in nodelist:
40 |         if node.nodeType == node.TEXT_NODE or node.nodeType == node.CDATA_SECTION_NODE:
41 |             rc.append(node.data)
42 |     return ''.join(rc)
43 | 
44 | def getFirstElementByTag(node, tag):
45 |     if not node:
46 |         return None
47 |     elems = node.getElementsByTagName(tag)
48 |     return (elems or None) and elems[0]
49 | 
50 | def getFirstChildByTag(node: Element, tag):
51 |     child = None
52 |     for c in node.childNodes:
53 |         if c.nodeName == tag:
54 |             child = c
55 |             break
56 |     return child
57 | 
58 | 
59 | def getAtomLink(node: Element, rel='self'):
60 |     links = [child for child in node.childNodes if child.nodeType == child.ELEMENT_NODE and child.tagName in ['link', 'atom:link']]
61 | 
62 |     link = links[0]
63 |     for l in links:
64 |         r = l.attributes.get('rel')
65 |         if r and r.value == rel:
66 |             link = l
67 |             break
68 |     return link.attributes['href'].value


--------------------------------------------------------------------------------
/history4feed/settings.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Django settings for history4feed project.
  3 | 
  4 | Generated by 'django-admin startproject' using Django 5.0.6.
  5 | 
  6 | For more information on this file, see
  7 | https://docs.djangoproject.com/en/5.0/topics/settings/
  8 | 
  9 | For the full list of settings and their values, see
 10 | https://docs.djangoproject.com/en/5.0/ref/settings/
 11 | """
 12 | 
 13 | import os
 14 | from pathlib import Path
 15 | from dotenv import load_dotenv
 16 | from datetime import UTC, datetime
 17 | from textwrap import dedent
 18 | 
 19 | load_dotenv()
 20 | 
 21 | # Build paths inside the project like this: BASE_DIR / 'subdir'.
 22 | BASE_DIR = Path(__file__).resolve().parent.parent
 23 | 
 24 | 
 25 | # Quick-start development settings - unsuitable for production
 26 | # See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/
 27 | 
 28 | # SECURITY WARNING: keep the secret key used in production secret!
 29 | SECRET_KEY = os.environ.get('DJANGO_SECRET', "insecure_django_secret")
 30 | 
 31 | # SECURITY WARNING: don't run with debug turned on in production!
 32 | DEBUG = os.getenv('DJANGO_DEBUG', False)
 33 | 
 34 | ALLOWED_HOSTS = os.getenv('DJANGO_ALLOWED_HOSTS', "localhost 127.0.0.1 [::1]").split()
 35 | 
 36 | CELERY_BROKER_URL = os.environ["CELERY_BROKER_URL"]
 37 | 
 38 | CACHES = {
 39 |     'default': {
 40 |         'BACKEND': 'django.core.cache.backends.redis.RedisCache',
 41 |         'LOCATION': CELERY_BROKER_URL,  # Use the appropriate Redis server URL
 42 |         'OPTIONS': {
 43 |             # 'CLIENT_CLASS': 'django.core.cache.backends.redis.RedisCacheClient',
 44 |         }
 45 |     }
 46 | }
 47 | 
 48 | #CORS_ALLOW_ALL_ORIGINS = os.environ.get('DJANGO_CORS_ALLOW_ALL_ORIGINS', True)
 49 | #CORS_ALLOWED_ORIGINS = [os.environ.get('DJANGO_CORS_ALLOWED_ORIGINS', "http://127.0.0.1:8002")]
 50 | 
 51 | # Application definition
 52 | 
 53 | INSTALLED_APPS = [
 54 |     'django.contrib.admin',
 55 |     'django.contrib.auth',
 56 |     'django.contrib.contenttypes',
 57 |     'django.contrib.sessions',
 58 |     'django.contrib.messages',
 59 |     'django.contrib.staticfiles',
 60 |     'rest_framework',
 61 |     'drf_spectacular',
 62 |     'django.contrib.postgres',
 63 |     'history4feed.app',
 64 | ]
 65 | 
 66 | MIDDLEWARE = [
 67 |     'django.middleware.security.SecurityMiddleware',
 68 |     'whitenoise.middleware.WhiteNoiseMiddleware',
 69 |     'django.contrib.sessions.middleware.SessionMiddleware',
 70 |     'django.middleware.common.CommonMiddleware',
 71 |     'django.middleware.csrf.CsrfViewMiddleware',
 72 |     'django.contrib.auth.middleware.AuthenticationMiddleware',
 73 |     'django.contrib.messages.middleware.MessageMiddleware',
 74 |     'django.middleware.clickjacking.XFrameOptionsMiddleware',
 75 | ]
 76 | 
 77 | ROOT_URLCONF = 'history4feed.urls'
 78 | 
 79 | TEMPLATES = [
 80 |     {
 81 |         'BACKEND': 'django.template.backends.django.DjangoTemplates',
 82 |         'DIRS': [],
 83 |         'APP_DIRS': True,
 84 |         'OPTIONS': {
 85 |             'context_processors': [
 86 |                 'django.template.context_processors.debug',
 87 |                 'django.template.context_processors.request',
 88 |                 'django.contrib.auth.context_processors.auth',
 89 |                 'django.contrib.messages.context_processors.messages',
 90 |             ],
 91 |         },
 92 |     },
 93 | ]
 94 | 
 95 | WSGI_APPLICATION = 'history4feed.wsgi.application'
 96 | 
 97 | 
 98 | # Database
 99 | # https://docs.djangoproject.com/en/5.0/ref/settings/#databases
100 | 
101 | DATABASES = {
102 |     'default': {
103 |         'ENGINE': 'django.db.backends.postgresql',
104 |         'NAME': os.getenv('POSTGRES_DB'),      # Database name
105 |         'USER': os.getenv('POSTGRES_USER'),          # Database user
106 |         'PASSWORD': os.getenv('POSTGRES_PASSWORD'),  # Database password
107 |         'HOST': os.getenv('POSTGRES_HOST'),          # PostgreSQL service name in Docker Compose
108 |         'PORT': os.getenv('POSTGRES_PORT'),  # PostgreSQL default port
109 |     },
110 |     'sqlite': {
111 |         'ENGINE': 'django.db.backends.sqlite3',
112 |         'NAME': BASE_DIR / 'db.sqlite3',
113 |     },
114 | }
115 | 
116 | 
117 | # Password validation
118 | # https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators
119 | 
120 | AUTH_PASSWORD_VALIDATORS = [
121 |     {
122 |         'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
123 |     },
124 |     {
125 |         'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
126 |     },
127 |     {
128 |         'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
129 |     },
130 |     {
131 |         'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
132 |     },
133 | ]
134 | 
135 | 
136 | # Internationalization
137 | # https://docs.djangoproject.com/en/5.0/topics/i18n/
138 | 
139 | LANGUAGE_CODE = 'en-us'
140 | 
141 | TIME_ZONE = 'UTC'
142 | 
143 | USE_I18N = True
144 | 
145 | USE_TZ = True
146 | 
147 | 
148 | # Static files (CSS, JavaScript, Images)
149 | # https://docs.djangoproject.com/en/5.0/howto/static-files/
150 | 
151 | STATIC_URL = 'static/'
152 | STATIC_ROOT = BASE_DIR / "staticfiles"
153 | 
154 | # Default primary key field type
155 | # https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field
156 | 
157 | DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
158 | 
159 | 
160 | REST_FRAMEWORK = {
161 |     # YOUR SETTINGS
162 |     'DEFAULT_SCHEMA_CLASS': 'history4feed.app.autoschema.H4FSchema',
163 |     'DEFAULT_FILTER_BACKENDS': ['django_filters.rest_framework.DjangoFilterBackend'],
164 |     'DEFAULT_AUTHENTICATION_CLASSES': [],
165 |     'EXCEPTION_HANDLER': "dogesec_commons.utils.custom_exception_handler",
166 |     'DATETIME_FORMAT': '%Y-%m-%dT%H:%M:%SZ',
167 | }
168 | 
169 | SPECTACULAR_SETTINGS = {
170 |     'TITLE': "history4feed API",
171 |     'DESCRIPTION': dedent("""
172 |         history4feed can be used to create a complete history for a blog and output it as an RSS feed.
173 |     """),
174 |     'VERSION': '1.0.0',
175 |     'CONTACT': {
176 |         'email': 'noreply@dogesec.com',
177 |         'url': 'https://github.com/muchdogesec/history4feed',
178 |     },
179 |     'TAGS': [
180 |         {
181 |             "name": "Feeds",
182 |             "description": "Subscribe and retrieve Feeds"
183 |         },
184 |         {
185 |             "name": "Posts",
186 |             "description": "Retrieve Posts in Feeds"
187 |         },
188 |         {
189 |             "name": "Jobs",
190 |             "description": "Check the status of data retrieval from Feeds"
191 |         },
192 |     ],
193 | 
194 | }
195 | 
196 | DEFAULT_PAGE_SIZE = int(os.getenv("DEFAULT_PAGE_SIZE", 50))
197 | MAXIMUM_PAGE_SIZE = int(os.getenv("MAX_PAGE_SIZE", 50))
198 | 
199 | HISTORY4FEED_SETTINGS = {
200 |     'WAYBACK_SLEEP_SECONDS': int(os.getenv("WAYBACK_SLEEP_SECONDS", 20)),
201 |     'EARLIEST_SEARCH_DATE': datetime.strptime(os.environ.get("EARLIEST_SEARCH_DATE", "2024-01-01T00:00:00Z"), "%Y-%m-%dT%H:%M:%SZ").replace(tzinfo=UTC),
202 |     'REQUEST_RETRY_COUNT': int(os.getenv("REQUEST_RETRY_COUNT", 3)),
203 | }


--------------------------------------------------------------------------------
/history4feed/urls.py:
--------------------------------------------------------------------------------
 1 | """
 2 | URL configuration for history4feed project.
 3 | 
 4 | The `urlpatterns` list routes URLs to views. For more information please see:
 5 |     https://docs.djangoproject.com/en/5.0/topics/http/urls/
 6 | Examples:
 7 | Function views
 8 |     1. Add an import:  from my_app import views
 9 |     2. Add a URL to urlpatterns:  path('', views.home, name='home')
10 | Class-based views
11 |     1. Add an import:  from other_app.views import Home
12 |     2. Add a URL to urlpatterns:  path('', Home.as_view(), name='home')
13 | Including another URLconf
14 |     1. Import the include() function: from django.urls import include, path
15 |     2. Add a URL to urlpatterns:  path('blog/', include('blog.urls'))
16 | """
17 | from django.contrib import admin
18 | from django.urls import include, path
19 | from .app import views
20 | from rest_framework import routers
21 | from drf_spectacular.views import SpectacularAPIView, SpectacularRedocView, SpectacularSwaggerView
22 | 
23 | 
24 | from django.http import JsonResponse
25 | def handler404(*args, **kwargs):
26 |     return JsonResponse(dict(code=404, message='non-existent page'), status=404)
27 | 
28 | def handler500(*args, **kwargs):
29 |     return JsonResponse(dict(code=500, message='internal server error'), status=500)
30 | 
31 | 
32 | API_VERSION = "v1"
33 | 
34 | router = routers.SimpleRouter(use_regex_path=False)
35 | router.register("feeds", views.FeedView, "feed-view")
36 | router.register("feeds/<uuid:feed_id>/posts", views.FeedPostView, "feed-post-view")
37 | router.register("feeds", views.RSSView, "feed-rss-view")
38 | router.register("posts", views.PostOnlyView, "post-view")
39 | router.register("jobs", views.JobView, "job-view")
40 | 
41 | 
42 | urlpatterns = [
43 |     path(f'api/{API_VERSION}/', include(router.urls)),
44 |     path('admin/', admin.site.urls),
45 | 
46 |     # YOUR PATTERNS
47 |     path('api/schema/', views.SchemaViewCached.as_view(), name='schema'),
48 |     # Optional UI:
49 |     path('api/schema/swagger-ui/', SpectacularSwaggerView.as_view(url_name='schema'), name='swagger-ui'),
50 | ]
51 | 


--------------------------------------------------------------------------------
/history4feed/wsgi.py:
--------------------------------------------------------------------------------
 1 | """
 2 | WSGI config for history4feed project.
 3 | 
 4 | It exposes the WSGI callable as a module-level variable named ``application``.
 5 | 
 6 | For more information on this file, see
 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/
 8 | """
 9 | 
10 | import os
11 | 
12 | from django.core.wsgi import get_wsgi_application
13 | 
14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings')
15 | 
16 | application = get_wsgi_application()
17 | 


--------------------------------------------------------------------------------
/manage.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | """Django's command-line utility for administrative tasks."""
 3 | import os
 4 | import sys
 5 | 
 6 | 
 7 | def main():
 8 |     """Run administrative tasks."""
 9 |     os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'history4feed.settings')
10 |     try:
11 |         from django.core.management import execute_from_command_line
12 |     except ImportError as exc:
13 |         raise ImportError(
14 |             "Couldn't import Django. Are you sure it's installed and "
15 |             "available on your PYTHONPATH environment variable? Did you "
16 |             "forget to activate a virtual environment?"
17 |         ) from exc
18 |     execute_from_command_line(sys.argv)
19 | 
20 | 
21 | if __name__ == '__main__':
22 |     main()
23 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["hatchling"]
 3 | build-backend = "hatchling.build"
 4 | 
 5 | [project]
 6 | name = "history4feed"
 7 | version = "0.0.1-pre"
 8 | authors = [
 9 |   { name="DOGESEC", email="support@dogesec.com" },
10 | ]
11 | description = "History4Feed"
12 | readme = "README.md"
13 | requires-python = ">=3.9"
14 | classifiers = [
15 |     "Programming Language :: Python :: 3",
16 |     "License :: OSI Approved :: Apache Software License",
17 |     "Operating System :: OS Independent",
18 | ]
19 | dependencies = [
20 |     "djangorestframework>=3.15.2",
21 |     "drf-spectacular>=0.27.2",
22 |     "celery>=5.4.0; python_version >= '3.8'",
23 |     "psycopg2-binary>=2.9.10",
24 |     "redis",
25 |     "brotlipy>=0.7.0",
26 |     "lxml-html-clean>=0.4.1",
27 |     "fake-useragent>=1.5.1",
28 |     "hyperlink",
29 |     "django-filter>=24.2",
30 |     "dateparser>=1.2.1",
31 |   ]
32 | [project.urls]
33 | Homepage = "https://github.com/muchdogesec/history4feed"
34 | Issues = "https://github.com/muchdogesec/history4feed/issues"
35 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | -i https://pypi.org/simple
 2 | amqp==5.2.0; python_version >= '3.6'
 3 | asgiref==3.8.1; python_version >= '3.8'
 4 | attrs==23.2.0; python_version >= '3.7'
 5 | billiard==4.2.0; python_version >= '3.7'
 6 | brotlipy==0.7.0
 7 | celery==5.4.0; python_version >= '3.8'
 8 | certifi==2025.4.26; python_version >= '3.6'
 9 | chardet==5.2.0; python_version >= '3.7'
10 | charset-normalizer==3.3.2; python_full_version >= '3.7.0'
11 | click==8.1.7; python_version >= '3.7'
12 | click-didyoumean==0.3.1; python_full_version >= '3.6.2'
13 | click-plugins==1.1.1
14 | click-repl==0.3.0; python_version >= '3.6'
15 | cssselect==1.2.0; python_version >= '3.7'
16 | django==5.1.7; python_version >= '3.10'
17 | django-filter==24.2; python_version >= '3.8'
18 | djangorestframework==3.15.2; python_version >= '3.6'
19 | drf-spectacular==0.27.2; python_version >= '3.7'
20 | gunicorn==23.0.0; python_version >= '3.7'
21 | idna==3.7; python_version >= '3.5'
22 | inflection==0.5.1; python_version >= '3.5'
23 | jsonschema==4.22.0; python_version >= '3.8'
24 | jsonschema-specifications==2023.12.1; python_version >= '3.8'
25 | kombu==5.3.7; python_version >= '3.8'
26 | lxml==5.2.2; python_version >= '3.6'
27 | lxml-html-clean==0.4.1
28 | packaging==24.0; python_version >= '3.7'
29 | prompt-toolkit==3.0.45; python_full_version >= '3.7.0'
30 | psycopg2-binary==2.9.10; python_version >= '3.7'
31 | pycparser==2.22; python_version >= '3.8'
32 | python-dateutil==2.9.0.post0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
33 | python-dotenv==1.0.1; python_version >= '3.8'
34 | pyyaml==6.0.1; python_version >= '3.6'
35 | readability-lxml==0.8.1
36 | redis==5.0.4; python_version >= '3.7'
37 | referencing==0.35.1; python_version >= '3.8'
38 | requests==2.32.3; python_version >= '3.8'
39 | six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'
40 | sqlparse==0.5.0; python_version >= '3.8'
41 | tzdata==2024.1; python_version >= '2'
42 | uritemplate==4.1.1; python_version >= '3.6'
43 | urllib3==2.2.2; python_version >= '3.8'
44 | vine==5.1.0; python_version >= '3.6'
45 | wcwidth==0.2.13
46 | fake-useragent==1.5.1
47 | whitenoise==6.7.0
48 | hyperlink==21.0.0
49 | dateparser==1.2.1
50 | stix2arango @ https://github.com/muchdogesec/stix2arango/releases/download/main-2025-02-04-14-14-39/stix2arango-0.0.4rc0-py3-none-any.whl
51 | dogesec_commons @ https://github.com/muchdogesec/dogesec_commons/releases/download/main-2025-05-26-13-08-23/dogesec_commons-0.0.7rc1-py3-none-any.whl


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | python manage.py migrate
2 | #gunicorn history4feed.wsgi:application --bind 0.0.0.0:8002 --reload
3 | python manage.py runserver 0.0.0.0:8002


--------------------------------------------------------------------------------
/tests/README.md:
--------------------------------------------------------------------------------
 1 | # Tests
 2 | 
 3 | ## Environment setup
 4 | 
 5 | ```shell
 6 | python3 -m venv history4feed-venv && \
 7 | source history4feed-venv/bin/activate && \
 8 | pip3 install -r requirements.txt
 9 | ````
10 | 
11 | ## API schema tests
12 | 
13 | ```shell
14 | st run --checks all http://127.0.0.1:8002/api/schema --generation-allow-x00 true
15 | ```
16 | 
17 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/tests/__init__.py


--------------------------------------------------------------------------------
/tests/requirements.txt:
--------------------------------------------------------------------------------
1 | python-dotenv==1.0.1
2 | parameterized==0.9.0
3 | pytest==8.3.4
4 | requests==2.32.2
5 | python-dateutil==2.9.0.post0
6 | pytest-subtests
7 | schemathesis==3.38.7; python_version >= '3.8'


--------------------------------------------------------------------------------
/tests/st/.env.schemathesis:
--------------------------------------------------------------------------------
 1 | DJANGO_ALLOWED_HOSTS=*
 2 | DJANGO_CORS_ALLOW_ALL_ORIGINS=*
 3 | DJANGO_CORS_ALLOWED_ORIGINS=*
 4 | DJANGO_DEBUG=
 5 | DEFAULT_PAGE_SIZE=5000
 6 | CELERY_BROKER_CONNECTION_RETRY_ON_STARTUP=1
 7 | POSTGRES_HOST=pgdb
 8 | POSTGRES_DB=postgres
 9 | POSTGRES_USER=postgres
10 | POSTGRES_PASSWORD=postgres


--------------------------------------------------------------------------------
/tests/st/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/muchdogesec/history4feed/085ea559ffde6f62aa8d76b2cb889e4bec6fba26/tests/st/__init__.py


--------------------------------------------------------------------------------
/tests/st/hooks.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import schemathesis, schemathesis.schemas
 3 | from schemathesis.specs.openapi.schemas import BaseOpenAPISchema
 4 | from schemathesis import Case
 5 | from schemathesis.transports.responses import GenericResponse
 6 | 
 7 | @schemathesis.hook
 8 | def after_load_schema(
 9 |     context: schemathesis.hooks.HookContext,
10 |     schema: BaseOpenAPISchema,
11 | ) -> None:
12 |     
13 |     schema.add_link(
14 |         source=schema["/api/v1/jobs/"]['GET'],
15 |         target=schema["/api/v1/jobs/{job_id}/"]['GET'],
16 |         status_code=200,
17 |         parameters={"path.job_id": '$response.body#/jobs/0/id'}
18 |     )
19 |     for method in ['GET', 'PATCH', 'DELETE']:
20 |         schema.add_link(
21 |             source=schema['/api/v1/feeds/']['GET'],
22 |             target=schema['/api/v1/feeds/{feed_id}/'][method],
23 |             status_code=200,
24 |             parameters={"path.feed_id": "$response.body#/feeds/0/id"}
25 |         )
26 | 
27 |     for method in ['GET', 'PATCH', 'DELETE']:
28 |         schema.add_link(
29 |             source=schema['/api/v1/posts/']['GET'],
30 |             target=schema['/api/v1/posts/{post_id}/'][method],
31 |             status_code=200,
32 |             parameters={"path.post_id": "$response.body#/posts/0/id"}
33 |         )


--------------------------------------------------------------------------------
/tests/st/st.py:
--------------------------------------------------------------------------------
 1 | from hypothesis.stateful import initialize
 2 | import schemathesis
 3 | import hooks
 4 | 
 5 | schema = schemathesis.from_uri("http://localhost:8006/api/schema/")
 6 | 
 7 | 
 8 | 
 9 | 
10 | BaseAPIWorkflow = schema.as_state_machine()
11 | BaseAPIWorkflow.run()


--------------------------------------------------------------------------------
/tests/test_01_add_feeds.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import time
  3 | from types import SimpleNamespace
  4 | import unittest, pytest
  5 | from urllib.parse import urljoin
  6 | 
  7 | from tests.utils import remove_unknown_keys, wait_for_jobs
  8 | 
  9 | base_url = os.environ["SERVICE_BASE_URL"]
 10 | import requests
 11 | 
 12 | 
 13 | DATA = [
 14 |     {
 15 |         "id": "d1d96b71-c687-50db-9d2b-d0092d1d163a",
 16 |         "feed_type": "rss",
 17 |         "include_remote_blogs": False,
 18 |         "url": "https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-encoded.xml",
 19 |     },
 20 |     {
 21 |         "id": "cb0ba709-b841-521a-a3f2-5e1429f4d366",
 22 |         "feed_type": "atom",
 23 |         "pretty_url": "https://muchdogesec.github.io/fakeblog123/",
 24 |         "title": "Custom Title",
 25 |         "description": "Custom description",
 26 |         "include_remote_blogs": False,
 27 |         "url": "https://muchdogesec.github.io/fakeblog123/feeds/atom-feed-decoded.xml",
 28 |     },
 29 |     {
 30 |         "id": "121e5557-7277-5aa3-945d-e466c6bf92d5",
 31 |         "title": "Custom Title 2",
 32 |         "feed_type": "atom",
 33 |         "include_remote_blogs": False,
 34 |         "url": "https://muchdogesec.github.io/fakeblog123/feeds/atom-feed-cdata.xml",
 35 |     },
 36 |     {
 37 |         "id": "8f89731d-b9de-5931-9182-5460af59ca84",
 38 |         "description": "Custom description 2",
 39 |         "feed_type": "rss",
 40 |         "include_remote_blogs": False,
 41 |         "url": "https://muchdogesec.github.io/fakeblog123/feeds/rss-feed-decoded.xml",
 42 |     },
 43 |     {
 44 |         "id": "9c04d319-a949-52df-bcb6-5a73a1458fe5",
 45 |         "feed_type": "atom",
 46 |         "include_remote_blogs": False,
 47 |         "url": "https://muchdogesec.github.io/fakeblog123/feeds/atom-feed-decoded-partial.xml",
 48 |     },
 49 |     {
 50 |         "id": "d63dad15-8e23-57eb-80f7-715cedf85f33", # not passed in request
 51 |         "feed_type": "skeleton", # not passed in request
 52 |         "pretty_url": "https://muchdogesec.github.io/fakeblog123/about/",
 53 |         "url": "https://muchdogesec.github.io/fakeblog123/",
 54 |         "title": "Skeleton custom Title",
 55 |         "description": "Skeleton custom description"
 56 |     }
 57 | ]
 58 | 
 59 | def all_blog_parameters():
 60 |     return [
 61 |         pytest.param(k["url"], k, k.get("should_fail", False))
 62 |             for k in DATA
 63 |     ]
 64 | 
 65 | @pytest.mark.parametrize(
 66 |         ["url", "blog_data", "should_fail"],
 67 |         all_blog_parameters(),
 68 | )
 69 | def test_add_blog(url, blog_data: dict, should_fail):
 70 |     payload = remove_unknown_keys(blog_data, ["pretty_url", "title", "description", "include_remote_blogs", "url"])
 71 | 
 72 |     endpoint = urljoin(base_url, "api/v1/feeds/")
 73 | 
 74 |     if blog_data["feed_type"] == "skeleton":
 75 |         post_resp = requests.post(urljoin(endpoint, "skeleton/"), json=payload)
 76 |     else:
 77 |         post_resp = requests.post(endpoint, json=payload)
 78 | 
 79 |     if should_fail:
 80 |         assert not post_resp.ok, "add feed request expected to fail"
 81 |         return
 82 | 
 83 |     assert post_resp.status_code == 201, f"request failed: {post_resp.text}"
 84 |     post_resp_data = post_resp.json()
 85 |     job_id = post_resp_data.get("job_id")
 86 |     feed_id = post_resp_data["id"]
 87 |     if job_id:
 88 |         wait_for_jobs(job_id)
 89 | 
 90 |     feed_resp = requests.get(urljoin(base_url, f"api/v1/feeds/{feed_id}/"))
 91 |     resp_data = feed_resp.json()
 92 | 
 93 |     assert resp_data["id"] == blog_data["id"]
 94 | 
 95 |     if expected_pretty_url := blog_data.get("pretty_url"):
 96 |         assert resp_data["pretty_url"] == expected_pretty_url
 97 | 
 98 |     if expected_title := blog_data.get("title"):
 99 |         assert resp_data["title"] == expected_title
100 | 
101 |     if expected_description := blog_data.get("description"):
102 |         assert resp_data["description"] == expected_description
103 | 
104 |     if expected_feed_type := blog_data.get("feed_type"):
105 |         assert resp_data["feed_type"] == expected_feed_type
106 | 
107 |     if payload.get('use_search_index'):
108 |         assert resp_data["feed_type"] == "search_index"
109 | 


--------------------------------------------------------------------------------
/tests/test_02_add_post.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | from types import SimpleNamespace
 4 | import unittest, pytest
 5 | from urllib.parse import urljoin
 6 | 
 7 | from tests.utils import remove_unknown_keys, wait_for_jobs
 8 | 
 9 | base_url = os.environ["SERVICE_BASE_URL"]
10 | import requests
11 | 
12 | def all_posts():
13 |     DATA = [
14 |         {
15 |             "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33",
16 |             "title": "Example COM",
17 |             "id": "223565cd-dd4f-54c2-9bbd-63019f39554f",
18 |             "link": "https://example.com/",
19 |             "pubdate": "2024-08-11T16:12:03Z",
20 |             "author": "test",
21 |             "categories": [
22 |                 "test",
23 |                 "test2"
24 |             ]
25 |         },
26 |         {
27 |             "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33",
28 |             "title": "Example ORG",
29 |             "id": "a378c839-0940-56fb-b52c-e5b78d34ec94",
30 |             "link": "https://example.org/",
31 |             "pubdate": "2024-03-22T16:11:03Z",
32 |             "author": "test",
33 |             "categories": [
34 |                 "test",
35 |                 "test2"
36 |             ]
37 |         },
38 |         {
39 |             "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33",
40 |             "title": "Example COM under real",
41 |             "id": "223565cd-dd4f-54c2-9bbd-63019f39554f",
42 |             "link": "https://example.com/",
43 |             "pubdate": "2024-08-11T16:12:03Z",
44 |             "author": "test",
45 |             "categories": [
46 |                 "test",
47 |                 "test2"
48 |             ],
49 |             "should_fail": True, #already added
50 |         },
51 |         {
52 |             "feed_id": "d63dad15-8e23-57eb-80f7-715cedf85f33",
53 |             "title": "Example ORG under real",
54 |             "id": "a378c839-0940-56fb-b52c-e5b78d34ec94",
55 |             "link": "https://example.org/",
56 |             "pubdate": "2024-03-22T16:11:03Z",
57 |             "author": "test",
58 |             "categories": [
59 |                 "test",
60 |                 "test2"
61 |             ],
62 |             "should_fail": True, #already added
63 |         },
64 |     ]
65 |     return [
66 |         [d["feed_id"], d["link"], d, d.get("should_fail")]
67 |             for d in DATA
68 |     ]
69 | 
70 | @pytest.mark.parametrize(
71 |     ["feed_id", "post_url", "post_data", "should_fail"],
72 |     all_posts()
73 | )
74 | def test_add_post(feed_id, post_url, post_data, should_fail):
75 |     payload = remove_unknown_keys(post_data, ["link", "title", "pubdate", "author", "categories"])
76 |     post_job_resp = requests.post(urljoin(base_url, f"api/v1/feeds/{feed_id}/posts/"), json=dict(posts=[payload]))
77 | 
78 |     if should_fail:
79 |         assert post_job_resp.status_code == 400, "add feed request expected to fail"
80 |         return
81 |     
82 |     assert post_job_resp.status_code == 201, f"request failed: {post_job_resp.text}"
83 |     post_job_resp_data = post_job_resp.json()
84 |     assert post_job_resp_data["feed_id"] == feed_id, "wrong feed id"
85 |     assert len(post_job_resp_data["urls"]["retrieving"]) == 1, "one post expected"
86 |     post_id = post_job_resp_data["urls"]["retrieving"][0]["id"]
87 |     expected_id = post_data["id"]
88 |     assert post_id == expected_id
89 |     job_id = post_job_resp_data['id']
90 | 
91 |     job_data = wait_for_jobs(job_id)
92 |     post_data_resp = requests.get(urljoin(base_url, f"api/v1/posts/{post_id}/"))
93 |     post_data_resp_data = post_data_resp.json()
94 |     assert post_data_resp_data["title"] == post_data["title"]
95 |     assert post_data_resp_data["pubdate"] == post_data["pubdate"]
96 |     assert set(post_data_resp_data["categories"]) == set(post_data.get("categories", []))
97 | 


--------------------------------------------------------------------------------
/tests/test_03_delete_post.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | import os
 4 | import time
 5 | from types import SimpleNamespace
 6 | import unittest, pytest
 7 | from urllib.parse import urljoin
 8 | 
 9 | from tests.utils import remove_unknown_keys, wait_for_jobs
10 | 
11 | base_url = os.environ["SERVICE_BASE_URL"]
12 | import requests
13 | 
14 | @pytest.mark.parametrize(
15 |     ["post_id", "should_fail"],
16 |     [
17 |         ["9c04d319-a949-52df-bcb6-5a73a1458fe5", True], #post does not exist
18 |         ["4aa844cb-18e6-58cc-bed1-4c22abf3b977", False],
19 |         ["4aa844cb-18e6-58cc-bed1-4c22abf3b977", True], #post already deleted
20 |     ]
21 | )
22 | def test_delete_post(post_id, should_fail):
23 |     post_url = urljoin(base_url, f"api/v1/posts/{post_id}/")
24 |     delete_resp = requests.delete(post_url)
25 | 
26 |     if should_fail:
27 |         assert delete_resp.status_code == 404, f"delete post request expected to fail: {delete_resp.text}"
28 |         return
29 |     assert delete_resp.status_code == 204, f"unexpected status, body: {delete_resp.text}"
30 | 
31 | 
32 |     get_resp = requests.get(post_url)
33 |     assert get_resp.status_code == 404, f"post should already be deleted"
34 | 


--------------------------------------------------------------------------------
/tests/test_04_delete_feed.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | import os
 4 | import time
 5 | from types import SimpleNamespace
 6 | import unittest, pytest
 7 | from urllib.parse import urljoin
 8 | 
 9 | from tests.utils import remove_unknown_keys, wait_for_jobs
10 | 
11 | base_url = os.environ["SERVICE_BASE_URL"]
12 | import requests
13 | 
14 | @pytest.mark.parametrize(
15 |     ["feed_id", "should_fail"],
16 |     [
17 |         ["c2fe0594-f463-5362-afe7-6950bda94bc6", True], #feed does not exist
18 |         ["9c04d319-a949-52df-bcb6-5a73a1458fe5", False],
19 |         ["9c04d319-a949-52df-bcb6-5a73a1458fe5", True], #feed already deleted
20 |     ]
21 | )
22 | def test_delete_feed(feed_id, should_fail):
23 |     feed_url = urljoin(base_url, f"api/v1/feeds/{feed_id}/")
24 |     delete_resp = requests.delete(feed_url)
25 | 
26 |     if should_fail:
27 |         assert delete_resp.status_code == 404, f"delete feed request expected to fail: {delete_resp.text}"
28 |         return
29 |     assert delete_resp.status_code == 204, f"unexpected status, body: {delete_resp.text}"
30 | 
31 | 
32 |     get_resp = requests.get(feed_url)
33 |     assert get_resp.status_code == 404, f"feed should already be deleted"
34 | 


--------------------------------------------------------------------------------
/tests/test_05_post_filters.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | import time
  4 | from types import SimpleNamespace
  5 | import unittest, pytest
  6 | from urllib.parse import urljoin
  7 | 
  8 | from tests.utils import get_post_ids_for_job, is_sorted, remove_unknown_keys, wait_for_jobs
  9 | 
 10 | base_url = os.environ["SERVICE_BASE_URL"]
 11 | import requests
 12 | 
 13 | 
 14 | @pytest.mark.parametrize(
 15 |     ["filters", "expected_ids"],
 16 |     [
 17 |         [
 18 |             dict(feed_id="d1d96b71-c687-50db-9d2b-d0092d1d163a"),
 19 |             [
 20 |                 "f8c75694-a834-5e35-b0a3-52034a1d9f6d",
 21 |                 "85a762c9-00f9-5c0c-9858-498883e13ea1",
 22 |                 "29be2407-d5d1-5b47-bbb5-1c51a84d48eb",
 23 |                 "84a8ff1c-c463-5a97-b0c4-93daf7102b5f",
 24 |                 "cfdb68b8-3d80-572d-9350-58baf57eabfb",
 25 |                 "8f16d2be-7b06-5f3c-a851-9cce31b4fec8",
 26 |             ],
 27 |         ],  # feed does not exist
 28 |         [
 29 |             dict(link="test2/2024/08/07"),
 30 |             [
 31 |                 "afef9ebd-2dee-5ab9-be0b-96c2ad83a1bb",
 32 |                 "48310096-d1f3-5e30-9910-5d7d0fd400be",
 33 |                 "d8aa9854-43fc-5816-b7ef-fc93810b29a5",
 34 |                 "f8c75694-a834-5e35-b0a3-52034a1d9f6d",
 35 |             ],
 36 |         ],
 37 |         [
 38 |             dict(title="uPdATe this Post"),
 39 |             [
 40 |                 "58514345-4e10-54c9-8f2c-d81507088079",
 41 |                 "8c72f15c-abeb-5c90-b239-6429f53696f9",
 42 |                 "8f16d2be-7b06-5f3c-a851-9cce31b4fec8",
 43 |                 "f214c1fd-5370-5dff-bd49-fd74bf32c7fe",
 44 |             ],
 45 |         ],
 46 |         [
 47 |             dict(title="example org"),
 48 |             [
 49 |                 "a378c839-0940-56fb-b52c-e5b78d34ec94",
 50 |             ],
 51 |         ],
 52 |         [
 53 |             dict(description="example domain"),
 54 |             [
 55 |                 "223565cd-dd4f-54c2-9bbd-63019f39554f",
 56 |                 "a378c839-0940-56fb-b52c-e5b78d34ec94",
 57 |             ],
 58 |         ],
 59 |     ],
 60 | )
 61 | def test_filters_generic(filters: dict, expected_ids: list[str]):
 62 |     expected_ids = set(expected_ids)
 63 |     url = urljoin(base_url, "api/v1/posts/")
 64 |     resp = requests.get(url, params=filters)
 65 |     resp_data = resp.json()
 66 |     assert resp_data["total_results_count"] == len(expected_ids)
 67 |     assert {post["id"] for post in resp_data["posts"]} == expected_ids
 68 | 
 69 | 
 70 | def random_posts_values(key, count):
 71 |     url = urljoin(base_url, "api/v1/posts/")
 72 |     resp = requests.get(url)
 73 |     data = resp.json()
 74 |     return [post[key] for post in random.choices(data["posts"], k=count)]
 75 | 
 76 | 
 77 | def more_pubdate_filters(count):
 78 |     filters = []
 79 |     pubdates = random_posts_values("pubdate", 50)
 80 |     for i in range(count):
 81 |         mmin = mmax = None
 82 |         if random.random() > 0.7:
 83 |             mmax = random.choice(pubdates)
 84 |         if random.random() < 0.3:
 85 |             mmin = random.choice(pubdates)
 86 |         if mmin or mmax:
 87 |             filters.append([mmin, mmax])
 88 |     return filters
 89 | 
 90 | 
 91 | @pytest.mark.parametrize(
 92 |     ["pubdate_min", "pubdate_max"],
 93 |     [
 94 |         ["2024-03-22T16:11:03Z", "2024-08-11T16:12:03Z"],
 95 |         ["2025-03-22T16:11:03Z", "2024-08-11T16:12:03Z"],
 96 |     ],
 97 | )
 98 | def test_pubdate_minmax(pubdate_min, pubdate_max):
 99 |     filters = {}
100 |     if pubdate_min:
101 |         filters.update(pubdate_min=pubdate_min)
102 |     if pubdate_max:
103 |         filters.update(pubdate_max=pubdate_max)
104 | 
105 |     assert pubdate_max or pubdate_min, "at least one of two filters required"
106 | 
107 |     url = urljoin(base_url, "api/v1/posts/")
108 |     resp = requests.get(url, params=filters)
109 |     assert resp.status_code == 200
110 |     resp_data = resp.json()
111 |     for d in resp_data["posts"]:
112 |         if pubdate_min:
113 |             assert (
114 |                 d["pubdate"] >= pubdate_min
115 |             ), "pubdate must not be less than pubdate_min"
116 |         if pubdate_max:
117 |             assert (
118 |                 d["pubdate"] <= pubdate_max
119 |             ), "pubdate must not be greater than pubdate_max"
120 | 
121 | 
122 | @pytest.mark.parametrize(
123 |     "updated_after", ["2024-03-22T16:11:03Z", "2030-03-22T16:11:03Z"]
124 | )
125 | def test_updated_after(updated_after):
126 |     assert updated_after, "value cannot be None"
127 | 
128 |     url = urljoin(base_url, "api/v1/posts/")
129 |     resp = requests.get(url, params=dict(pubdate_min=updated_after))
130 |     assert resp.status_code == 200
131 |     resp_data = resp.json()
132 |     for d in resp_data["posts"]:
133 |         assert (
134 |             d["datetime_updated"] >= updated_after
135 |         ), "datetime_updated must not be greater than updated_after"
136 | 
137 | 
138 | def test_extra_updated_after(subtests):
139 |     for datetime_updated in random_posts_values("datetime_updated", 12):
140 |         with subtests.test(
141 |             "randomly_generated updated_after query", updated_after=datetime_updated
142 |         ):
143 |             test_updated_after(datetime_updated)
144 | 
145 | 
146 | def test_extra_pubdate_filters(subtests):
147 |     for dmin, dmax in more_pubdate_filters(22):
148 |         with subtests.test(
149 |             "randomly_generated pubdate_* query", pubdate_min=dmin, pubdate_max=dmax
150 |         ):
151 |             test_pubdate_minmax(dmin, dmax)
152 | 
153 | 
154 | def test_job_filter(subtests):
155 |     def test_job_id_filter(job_id, post_ids):
156 |         url = urljoin(base_url, "api/v1/posts/")
157 |         resp = requests.get(url, params=dict(job_id=job_id))
158 |         data = resp.json()
159 |         for post in data["posts"]:
160 |             assert post['id'] in post_ids, "post does not belong to job"
161 |         assert data['total_results_count'] == len(post_ids)
162 | 
163 |     jobs_resp = requests.get(urljoin(base_url, "api/v1/jobs/"))
164 |     for job in jobs_resp.json()['jobs']:
165 |         with subtests.test("test_job_id_filter", job_id=job['id']):
166 |             test_job_id_filter(job['id'], [x[0] for x in get_post_ids_for_job(job)])
167 | 
168 | 
169 | @pytest.mark.parametrize(
170 |         ["sort_filter", "expected_sort"],
171 |         [
172 |         ("", "pubdate_descending"), #default filter
173 |         ("pubdate_descending", "pubdate_descending"),
174 |         ("pubdate_ascending", "pubdate_ascending"),
175 |         ("title_descending", "title_descending"),
176 |         ("title_ascending", "title_ascending"),
177 |         ("datetime_updated_descending", "datetime_updated_descending"),
178 |         ("datetime_updated_ascending", "datetime_updated_ascending"),
179 |         ("datetime_added_descending", "datetime_added_descending"),
180 |         ("datetime_added_ascending", "datetime_added_ascending"),
181 |     ]
182 | )
183 | def test_list_posts_sort(sort_filter: str, expected_sort: str):
184 |     reports_url = urljoin(base_url, f"api/v1/posts/")
185 |     filters = dict(sort=sort_filter) if sort_filter else None
186 |     get_resp = requests.get(reports_url, params=filters)
187 |     assert get_resp.status_code == 200, f"response: {get_resp.text}"
188 |     posts = get_resp.json()["posts"]
189 |     property, _, direction = expected_sort.rpartition('_')
190 |     def sort_fn(obj):
191 |         retval = obj[property]
192 |         print(retval)
193 |         return retval
194 |     assert is_sorted(posts, key=sort_fn, reverse=direction == 'descending'), f"expected posts to be sorted by {property} in {direction} order"
195 | 


--------------------------------------------------------------------------------
/tests/test_06_patch_feed.py:
--------------------------------------------------------------------------------
 1 | from datetime import UTC, datetime
 2 | import os
 3 | import time
 4 | from types import SimpleNamespace
 5 | import unittest, pytest
 6 | from urllib.parse import urljoin
 7 | from dateutil.parser import parse as parse_date
 8 | 
 9 | from tests.utils import remove_unknown_keys, wait_for_jobs
10 | 
11 | base_url = os.environ["SERVICE_BASE_URL"]
12 | import requests
13 | @pytest.mark.parametrize(
14 |         ["feed_id", "metadata"],
15 |         [
16 |             ["d1d96b71-c687-50db-9d2b-d0092d1d163a", dict(title="updated title")],
17 |             ["d63dad15-8e23-57eb-80f7-715cedf85f33", dict(title="updated title", description="new description")],
18 |             ["d1d96b71-c687-50db-9d2b-d0092d1d163a", dict(pretty_url="https://muchdogesec.github.io/fakeblog123/?added_later=true")],
19 |         ]
20 | )
21 | def test_update_feed_metadata(feed_id, metadata):
22 |     resp = requests.patch(urljoin(base_url, f"api/v1/feeds/{feed_id}/"), json=metadata)
23 |     assert resp.status_code == 201
24 |     resp_data = resp.json()
25 | 
26 |     if expected_pretty_url := metadata.get("pretty_url"):
27 |         assert resp_data["pretty_url"] == expected_pretty_url
28 | 
29 |     if expected_title := metadata.get("title"):
30 |         assert resp_data["title"] == expected_title
31 | 
32 |     if expected_description := metadata.get("description"):
33 |         assert resp_data["description"] == expected_description
34 | 
35 | # def test_feed_reindex(feed_id):
36 | #     start_time = datetime.now(UTC)
37 | #     resp = requests.patch(urljoin(base_url, f"api/v1/feeds/{feed_id}/"))
38 | #     assert resp.status_code == 201
39 | #     resp_data = resp.json()
40 | 


--------------------------------------------------------------------------------
/tests/test_07_patch_post.py:
--------------------------------------------------------------------------------
 1 | from datetime import UTC, datetime
 2 | import os
 3 | import time
 4 | from types import SimpleNamespace
 5 | import unittest, pytest
 6 | from urllib.parse import urljoin
 7 | from dateutil.parser import parse as parse_date
 8 | 
 9 | from tests.utils import remove_unknown_keys, wait_for_jobs
10 | 
11 | base_url = os.environ["SERVICE_BASE_URL"]
12 | import requests
13 | @pytest.mark.parametrize(
14 |         ["post_id", "metadata"],
15 |         [
16 |             ["58514345-4e10-54c9-8f2c-d81507088079", dict(title="updated post title")],
17 |             ["a378c839-0940-56fb-b52c-e5b78d34ec94", dict(title="updated title", author="new post author")],
18 |             ["58514345-4e10-54c9-8f2c-d81507088079", dict(pubdate="2009-03-04T14:56:07Z")],
19 |         ]
20 | )
21 | def test_update_post_metadata(post_id, metadata):
22 |     resp = requests.patch(urljoin(base_url, f"api/v1/posts/{post_id}/"), json=metadata)
23 |     assert resp.status_code == 201
24 |     resp_data = resp.json()
25 | 
26 |     if expected_pretty_url := metadata.get("pretty_url"):
27 |         assert resp_data["pretty_url"] == expected_pretty_url
28 | 
29 |     if expected_categories := metadata.get("categories"):
30 |         assert resp_data["categories"] == expected_categories
31 | 
32 |     if expected_author := metadata.get("author"):
33 |         assert resp_data["author"] == expected_author
34 | 
35 |     
36 |     if expected_pubdate := metadata.get("pubdate"):
37 |         assert resp_data["pubdate"] == expected_pubdate


--------------------------------------------------------------------------------
/tests/test_99_delete_all_feeds.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import time
 3 | from types import SimpleNamespace
 4 | import unittest, pytest
 5 | from urllib.parse import urljoin
 6 | 
 7 | base_url = os.environ["SERVICE_BASE_URL"]
 8 | import requests
 9 | 
10 | 
11 | def get_all_feeds():
12 |     if not os.getenv('DELETE_ALL_FEEDS'):
13 |         return []
14 |     resp = requests.get(urljoin(base_url, "api/v1/feeds/"))
15 |     return [[feed["id"]] for feed in resp.json()["feeds"]]
16 | 
17 | @pytest.mark.parametrize(
18 |         ["feed_id"],
19 |         get_all_feeds(),
20 | )
21 | def test_delete_blog(feed_id):
22 |     resp = requests.delete(urljoin(base_url, f"api/v1/feeds/{feed_id}/"))
23 |     assert resp.status_code == 204, "unexpected status code"
24 |     resp = requests.get(urljoin(base_url, f"api/v1/feeds/{feed_id}/"))
25 |     assert resp.status_code == 404, "feed should not exist after deletion"
26 | 


--------------------------------------------------------------------------------
/tests/utils.py:
--------------------------------------------------------------------------------
 1 | from itertools import tee
 2 | from operator import lt
 3 | import os
 4 | import time
 5 | from types import SimpleNamespace
 6 | import unittest, pytest
 7 | from urllib.parse import urljoin
 8 | 
 9 | base_url = os.environ["SERVICE_BASE_URL"]
10 | import requests
11 | 
12 | 
13 | 
14 | def remove_unknown_keys(data: dict, known_keys: list):
15 |     payload = data.copy()
16 |     for k in list(payload.keys()):
17 |         if k not in known_keys:
18 |             payload.pop(k, None)
19 |     return payload
20 | 
21 | 
22 | def wait_for_jobs(job_id):
23 |     try_count = 0
24 |     while True:
25 |         job_data = requests.get(f"{base_url}/api/v1/jobs/{job_id}/").json()
26 |         job_status = job_data["state"]
27 |         if job_status in ["success", "failed"]:
28 |             assert job_status == "success", f"response: {job_data}"
29 |             return job_data
30 |         try_count += 1
31 |         assert try_count < 30, "stopped after 30 retries"
32 |         time.sleep(3)
33 | 
34 | 
35 | def get_post_ids_for_job(job:dict):
36 |     retval = []
37 |     for type, d in job['urls'].items():
38 |         if type == 'skipped':
39 |             continue
40 |         for p in d:
41 |             retval.append((p['id'], type))
42 |     return retval
43 |     
44 | 
45 | def is_sorted(iterable, key=None, reverse=False):
46 |     it = iterable if (key is None) else map(key, iterable)
47 |     a, b = tee(it)
48 |     next(b, None)
49 |     if reverse:
50 |         b, a = a, b
51 |     return not any(map(lt, b, a))


--------------------------------------------------------------------------------