├── .gitignore ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.rst ├── feedsearch ├── __init__.py ├── __version__.py ├── feedfinder.py ├── feedinfo.py ├── feedsearch.py ├── lib.py ├── site_meta.py └── url.py ├── search.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .idea/ 104 | .vscode/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 David Beath 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | twine = "*" 8 | black = "*" 9 | "flake8" = "*" 10 | rope = "*" 11 | 12 | [packages] 13 | requests = "*" 14 | "beautifulsoup4" = "*" 15 | feedparser = "*" 16 | click = "*" 17 | werkzeug = "*" 18 | 19 | [requires] 20 | python_version = "3.5" 21 | 22 | [pipenv] 23 | allow_prereleases = true 24 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "e3fd544b327cb4788ca7440bee564bbeb55f243e04f76c4b342d2f7bc0037c28" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.5" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.python.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "beautifulsoup4": { 20 | "hashes": [ 21 | "sha256:05fd825eb01c290877657a56df4c6e4c311b3965bda790c613a3d6fb01a5462a", 22 | "sha256:9fbb4d6e48ecd30bcacc5b63b94088192dcda178513b2ae3c394229f8911b887", 23 | "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae" 24 | ], 25 | "index": "pypi", 26 | "version": "==4.8.2" 27 | }, 28 | "certifi": { 29 | "hashes": [ 30 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", 31 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" 32 | ], 33 | "version": "==2019.11.28" 34 | }, 35 | "chardet": { 36 | "hashes": [ 37 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 38 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 39 | ], 40 | "version": "==3.0.4" 41 | }, 42 | "click": { 43 | "hashes": [ 44 | "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", 45 | "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" 46 | ], 47 | "index": "pypi", 48 | "version": "==7.0" 49 | }, 50 | "feedparser": { 51 | "hashes": [ 52 | "sha256:150ccca4cfc3481f7ff503988a91bbdbbbc3406d6444bfe9cfe6c1001d378e73", 53 | "sha256:87185443d6e12cf870125bdc9211168c60895e7dd7209b5c082897ddb1b11efb" 54 | ], 55 | "index": "pypi", 56 | "version": "==6.0.0b1" 57 | }, 58 | "idna": { 59 | "hashes": [ 60 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 61 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 62 | ], 63 | "version": "==2.8" 64 | }, 65 | "requests": { 66 | "hashes": [ 67 | "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", 68 | "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" 69 | ], 70 | "index": "pypi", 71 | "version": "==2.22.0" 72 | }, 73 | "sgmllib3k": { 74 | "hashes": [ 75 | "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9" 76 | ], 77 | "markers": "python_version >= '3.0'", 78 | "version": "==1.0.0" 79 | }, 80 | "soupsieve": { 81 | "hashes": [ 82 | "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", 83 | "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda" 84 | ], 85 | "version": "==1.9.5" 86 | }, 87 | "urllib3": { 88 | "hashes": [ 89 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", 90 | 
"sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" 91 | ], 92 | "version": "==1.25.8" 93 | }, 94 | "werkzeug": { 95 | "hashes": [ 96 | "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", 97 | "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" 98 | ], 99 | "index": "pypi", 100 | "version": "==1.0.0" 101 | } 102 | }, 103 | "develop": { 104 | "appdirs": { 105 | "hashes": [ 106 | "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92", 107 | "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e" 108 | ], 109 | "version": "==1.4.3" 110 | }, 111 | "attrs": { 112 | "hashes": [ 113 | "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", 114 | "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" 115 | ], 116 | "version": "==19.3.0" 117 | }, 118 | "black": { 119 | "hashes": [ 120 | "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b", 121 | "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539" 122 | ], 123 | "index": "pypi", 124 | "version": "==19.10b0" 125 | }, 126 | "bleach": { 127 | "hashes": [ 128 | "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", 129 | "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" 130 | ], 131 | "version": "==3.1.0" 132 | }, 133 | "certifi": { 134 | "hashes": [ 135 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", 136 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" 137 | ], 138 | "version": "==2019.11.28" 139 | }, 140 | "cffi": { 141 | "hashes": [ 142 | "sha256:001bf3242a1bb04d985d63e138230802c6c8d4db3668fb545fb5005ddf5bb5ff", 143 | "sha256:00789914be39dffba161cfc5be31b55775de5ba2235fe49aa28c148236c4e06b", 144 | "sha256:028a579fc9aed3af38f4892bdcc7390508adabc30c6af4a6e4f611b0c680e6ac", 145 | "sha256:14491a910663bf9f13ddf2bc8f60562d6bc5315c1f09c704937ef17293fb85b0", 146 | "sha256:1cae98a7054b5c9391eb3249b86e0e99ab1e02bb0cc0575da191aedadbdf4384", 147 | "sha256:2089ed025da3919d2e75a4d963d008330c96751127dd6f73c8dc0c65041b4c26", 148 | "sha256:2d384f4a127a15ba701207f7639d94106693b6cd64173d6c8988e2c25f3ac2b6", 149 | "sha256:337d448e5a725bba2d8293c48d9353fc68d0e9e4088d62a9571def317797522b", 150 | "sha256:399aed636c7d3749bbed55bc907c3288cb43c65c4389964ad5ff849b6370603e", 151 | "sha256:3b911c2dbd4f423b4c4fcca138cadde747abdb20d196c4a48708b8a2d32b16dd", 152 | "sha256:3d311bcc4a41408cf5854f06ef2c5cab88f9fded37a3b95936c9879c1640d4c2", 153 | "sha256:62ae9af2d069ea2698bf536dcfe1e4eed9090211dbaafeeedf5cb6c41b352f66", 154 | "sha256:66e41db66b47d0d8672d8ed2708ba91b2f2524ece3dee48b5dfb36be8c2f21dc", 155 | "sha256:675686925a9fb403edba0114db74e741d8181683dcf216be697d208857e04ca8", 156 | "sha256:7e63cbcf2429a8dbfe48dcc2322d5f2220b77b2e17b7ba023d6166d84655da55", 157 | "sha256:8a6c688fefb4e1cd56feb6c511984a6c4f7ec7d2a1ff31a10254f3c817054ae4", 158 | "sha256:8c0ffc886aea5df6a1762d0019e9cb05f825d0eec1f520c51be9d198701daee5", 159 | "sha256:95cd16d3dee553f882540c1ffe331d085c9e629499ceadfbda4d4fde635f4b7d", 160 | "sha256:99f748a7e71ff382613b4e1acc0ac83bf7ad167fb3802e35e90d9763daba4d78", 161 | "sha256:b8c78301cefcf5fd914aad35d3c04c2b21ce8629b5e4f4e45ae6812e461910fa", 162 | "sha256:c420917b188a5582a56d8b93bdd8e0f6eca08c84ff623a4c16e809152cd35793", 163 | "sha256:c43866529f2f06fe0edc6246eb4faa34f03fe88b64a0a9a942561c8e22f4b71f", 164 | "sha256:cab50b8c2250b46fe738c77dbd25ce017d5e6fb35d3407606e7a4180656a5a6a", 165 | 
"sha256:cef128cb4d5e0b3493f058f10ce32365972c554572ff821e175dbc6f8ff6924f", 166 | "sha256:cf16e3cf6c0a5fdd9bc10c21687e19d29ad1fe863372b5543deaec1039581a30", 167 | "sha256:e56c744aa6ff427a607763346e4170629caf7e48ead6921745986db3692f987f", 168 | "sha256:e577934fc5f8779c554639376beeaa5657d54349096ef24abe8c74c5d9c117c3", 169 | "sha256:f2b0fa0c01d8a0c7483afd9f31d7ecf2d71760ca24499c8697aeb5ca37dc090c" 170 | ], 171 | "version": "==1.14.0" 172 | }, 173 | "chardet": { 174 | "hashes": [ 175 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 176 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 177 | ], 178 | "version": "==3.0.4" 179 | }, 180 | "click": { 181 | "hashes": [ 182 | "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", 183 | "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" 184 | ], 185 | "index": "pypi", 186 | "version": "==7.0" 187 | }, 188 | "cryptography": { 189 | "hashes": [ 190 | "sha256:02079a6addc7b5140ba0825f542c0869ff4df9a69c360e339ecead5baefa843c", 191 | "sha256:1df22371fbf2004c6f64e927668734070a8953362cd8370ddd336774d6743595", 192 | "sha256:369d2346db5934345787451504853ad9d342d7f721ae82d098083e1f49a582ad", 193 | "sha256:3cda1f0ed8747339bbdf71b9f38ca74c7b592f24f65cdb3ab3765e4b02871651", 194 | "sha256:44ff04138935882fef7c686878e1c8fd80a723161ad6a98da31e14b7553170c2", 195 | "sha256:4b1030728872c59687badcca1e225a9103440e467c17d6d1730ab3d2d64bfeff", 196 | "sha256:58363dbd966afb4f89b3b11dfb8ff200058fbc3b947507675c19ceb46104b48d", 197 | "sha256:6ec280fb24d27e3d97aa731e16207d58bd8ae94ef6eab97249a2afe4ba643d42", 198 | "sha256:7270a6c29199adc1297776937a05b59720e8a782531f1f122f2eb8467f9aab4d", 199 | "sha256:73fd30c57fa2d0a1d7a49c561c40c2f79c7d6c374cc7750e9ac7c99176f6428e", 200 | "sha256:7f09806ed4fbea8f51585231ba742b58cbcfbfe823ea197d8c89a5e433c7e912", 201 | "sha256:90df0cc93e1f8d2fba8365fb59a858f51a11a394d64dbf3ef844f783844cc793", 202 | "sha256:971221ed40f058f5662a604bd1ae6e4521d84e6cad0b7b170564cc34169c8f13", 203 | "sha256:a518c153a2b5ed6b8cc03f7ae79d5ffad7315ad4569b2d5333a13c38d64bd8d7", 204 | "sha256:b0de590a8b0979649ebeef8bb9f54394d3a41f66c5584fff4220901739b6b2f0", 205 | "sha256:b43f53f29816ba1db8525f006fa6f49292e9b029554b3eb56a189a70f2a40879", 206 | "sha256:d31402aad60ed889c7e57934a03477b572a03af7794fa8fb1780f21ea8f6551f", 207 | "sha256:de96157ec73458a7f14e3d26f17f8128c959084931e8997b9e655a39c8fde9f9", 208 | "sha256:df6b4dca2e11865e6cfbfb708e800efb18370f5a46fd601d3755bc7f85b3a8a2", 209 | "sha256:ecadccc7ba52193963c0475ac9f6fa28ac01e01349a2ca48509667ef41ffd2cf", 210 | "sha256:fb81c17e0ebe3358486cd8cc3ad78adbae58af12fc2bf2bc0bb84e8090fa5ce8" 211 | ], 212 | "version": "==2.8" 213 | }, 214 | "docutils": { 215 | "hashes": [ 216 | "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", 217 | "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc" 218 | ], 219 | "version": "==0.16" 220 | }, 221 | "entrypoints": { 222 | "hashes": [ 223 | "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", 224 | "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" 225 | ], 226 | "version": "==0.3" 227 | }, 228 | "flake8": { 229 | "hashes": [ 230 | "sha256:45681a117ecc81e870cbf1262835ae4af5e7a8b08e40b944a8a6e6b895914cfb", 231 | "sha256:49356e766643ad15072a789a20915d3c91dc89fd313ccd71802303fd67e4deca" 232 | ], 233 | "index": "pypi", 234 | "version": "==3.7.9" 235 | }, 236 | "idna": { 237 | "hashes": [ 238 | 
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 239 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 240 | ], 241 | "version": "==2.8" 242 | }, 243 | "importlib-metadata": { 244 | "hashes": [ 245 | "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", 246 | "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" 247 | ], 248 | "markers": "python_version < '3.8'", 249 | "version": "==1.5.0" 250 | }, 251 | "jeepney": { 252 | "hashes": [ 253 | "sha256:0ba6d8c597e9bef1ebd18aaec595f942a264e25c1a48f164d46120eacaa2e9bb", 254 | "sha256:6f45dce1125cf6c58a1c88123d3831f36a789f9204fbad3172eac15f8ccd08d0" 255 | ], 256 | "markers": "sys_platform == 'linux'", 257 | "version": "==0.4.2" 258 | }, 259 | "keyring": { 260 | "hashes": [ 261 | "sha256:1f393f7466314068961c7e1d508120c092bd71fa54e3d93b76180b526d4abc56", 262 | "sha256:24ae23ab2d6adc59138339e56843e33ec7b0a6b2f06302662477085c6c0aca00" 263 | ], 264 | "version": "==21.1.0" 265 | }, 266 | "mccabe": { 267 | "hashes": [ 268 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 269 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" 270 | ], 271 | "version": "==0.6.1" 272 | }, 273 | "pathspec": { 274 | "hashes": [ 275 | "sha256:163b0632d4e31cef212976cf57b43d9fd6b0bac6e67c26015d611a647d5e7424", 276 | "sha256:562aa70af2e0d434367d9790ad37aed893de47f1693e4201fd1d3dca15d19b96" 277 | ], 278 | "version": "==0.7.0" 279 | }, 280 | "pkginfo": { 281 | "hashes": [ 282 | "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb", 283 | "sha256:a6d9e40ca61ad3ebd0b72fbadd4fba16e4c0e4df0428c041e01e06eb6ee71f32" 284 | ], 285 | "version": "==1.5.0.1" 286 | }, 287 | "pycodestyle": { 288 | "hashes": [ 289 | "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", 290 | "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" 291 | ], 292 | "version": "==2.5.0" 293 | }, 294 | "pycparser": { 295 | "hashes": [ 296 | "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" 297 | ], 298 | "version": "==2.19" 299 | }, 300 | "pyflakes": { 301 | "hashes": [ 302 | "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", 303 | "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" 304 | ], 305 | "version": "==2.1.1" 306 | }, 307 | "pygments": { 308 | "hashes": [ 309 | "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b", 310 | "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe" 311 | ], 312 | "version": "==2.5.2" 313 | }, 314 | "readme-renderer": { 315 | "hashes": [ 316 | "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", 317 | "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d" 318 | ], 319 | "version": "==24.0" 320 | }, 321 | "regex": { 322 | "hashes": [ 323 | "sha256:07b39bf943d3d2fe63d46281d8504f8df0ff3fe4c57e13d1656737950e53e525", 324 | "sha256:0932941cdfb3afcbc26cc3bcf7c3f3d73d5a9b9c56955d432dbf8bbc147d4c5b", 325 | "sha256:0e182d2f097ea8549a249040922fa2b92ae28be4be4895933e369a525ba36576", 326 | "sha256:10671601ee06cf4dc1bc0b4805309040bb34c9af423c12c379c83d7895622bb5", 327 | "sha256:23e2c2c0ff50f44877f64780b815b8fd2e003cda9ce817a7fd00dea5600c84a0", 328 | "sha256:26ff99c980f53b3191d8931b199b29d6787c059f2e029b2b0c694343b1708c35", 329 | "sha256:27429b8d74ba683484a06b260b7bb00f312e7c757792628ea251afdbf1434003", 330 | 
"sha256:3e77409b678b21a056415da3a56abfd7c3ad03da71f3051bbcdb68cf44d3c34d", 331 | "sha256:4e8f02d3d72ca94efc8396f8036c0d3bcc812aefc28ec70f35bb888c74a25161", 332 | "sha256:4eae742636aec40cf7ab98171ab9400393360b97e8f9da67b1867a9ee0889b26", 333 | "sha256:6a6ae17bf8f2d82d1e8858a47757ce389b880083c4ff2498dba17c56e6c103b9", 334 | "sha256:6a6ba91b94427cd49cd27764679024b14a96874e0dc638ae6bdd4b1a3ce97be1", 335 | "sha256:7bcd322935377abcc79bfe5b63c44abd0b29387f267791d566bbb566edfdd146", 336 | "sha256:98b8ed7bb2155e2cbb8b76f627b2fd12cf4b22ab6e14873e8641f266e0fb6d8f", 337 | "sha256:bd25bb7980917e4e70ccccd7e3b5740614f1c408a642c245019cff9d7d1b6149", 338 | "sha256:d0f424328f9822b0323b3b6f2e4b9c90960b24743d220763c7f07071e0778351", 339 | "sha256:d58e4606da2a41659c84baeb3cfa2e4c87a74cec89a1e7c56bee4b956f9d7461", 340 | "sha256:e3cd21cc2840ca67de0bbe4071f79f031c81418deb544ceda93ad75ca1ee9f7b", 341 | "sha256:e6c02171d62ed6972ca8631f6f34fa3281d51db8b326ee397b9c83093a6b7242", 342 | "sha256:e7c7661f7276507bce416eaae22040fd91ca471b5b33c13f8ff21137ed6f248c", 343 | "sha256:ecc6de77df3ef68fee966bb8cb4e067e84d4d1f397d0ef6fce46913663540d77" 344 | ], 345 | "version": "==2020.1.8" 346 | }, 347 | "requests": { 348 | "hashes": [ 349 | "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", 350 | "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" 351 | ], 352 | "index": "pypi", 353 | "version": "==2.22.0" 354 | }, 355 | "requests-toolbelt": { 356 | "hashes": [ 357 | "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", 358 | "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" 359 | ], 360 | "version": "==0.9.1" 361 | }, 362 | "rope": { 363 | "hashes": [ 364 | "sha256:52423a7eebb5306a6d63bdc91a7c657db51ac9babfb8341c9a1440831ecf3203", 365 | "sha256:ae1fa2fd56f64f4cc9be46493ce54bed0dd12dee03980c61a4393d89d84029ad", 366 | "sha256:d2830142c2e046f5fc26a022fe680675b6f48f81c7fc1f03a950706e746e9dfe" 367 | ], 368 | "index": "pypi", 369 | "version": "==0.16.0" 370 | }, 371 | "secretstorage": { 372 | "hashes": [ 373 | "sha256:15da8a989b65498e29be338b3b279965f1b8f09b9668bd8010da183024c8bff6", 374 | "sha256:b5ec909dde94d4ae2fa26af7c089036997030f0cf0a5cb372b4cccabd81c143b" 375 | ], 376 | "markers": "sys_platform == 'linux'", 377 | "version": "==3.1.2" 378 | }, 379 | "six": { 380 | "hashes": [ 381 | "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", 382 | "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" 383 | ], 384 | "version": "==1.14.0" 385 | }, 386 | "toml": { 387 | "hashes": [ 388 | "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c", 389 | "sha256:235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e" 390 | ], 391 | "version": "==0.10.0" 392 | }, 393 | "tqdm": { 394 | "hashes": [ 395 | "sha256:251ee8440dbda126b8dfa8a7c028eb3f13704898caaef7caa699b35e119301e2", 396 | "sha256:fe231261cfcbc6f4a99165455f8f6b9ef4e1032a6e29bccf168b4bf42012f09c" 397 | ], 398 | "version": "==4.42.1" 399 | }, 400 | "twine": { 401 | "hashes": [ 402 | "sha256:c1af8ca391e43b0a06bbc155f7f67db0bf0d19d284bfc88d1675da497a946124", 403 | "sha256:d561a5e511f70275e5a485a6275ff61851c16ffcb3a95a602189161112d9f160" 404 | ], 405 | "index": "pypi", 406 | "version": "==3.1.1" 407 | }, 408 | "typed-ast": { 409 | "hashes": [ 410 | "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", 411 | "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", 412 | 
"sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", 413 | "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", 414 | "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", 415 | "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", 416 | "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", 417 | "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", 418 | "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", 419 | "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", 420 | "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", 421 | "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", 422 | "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", 423 | "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", 424 | "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", 425 | "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", 426 | "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", 427 | "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", 428 | "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", 429 | "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", 430 | "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" 431 | ], 432 | "version": "==1.4.1" 433 | }, 434 | "urllib3": { 435 | "hashes": [ 436 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", 437 | "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" 438 | ], 439 | "version": "==1.25.8" 440 | }, 441 | "webencodings": { 442 | "hashes": [ 443 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", 444 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" 445 | ], 446 | "version": "==0.5.1" 447 | }, 448 | "zipp": { 449 | "hashes": [ 450 | "sha256:5c56e330306215cd3553342cfafc73dda2c60792384117893f3a83f8a1209f50", 451 | "sha256:d65287feb793213ffe11c0f31b81602be31448f38aeb8ffc2eb286c4f6f6657e" 452 | ], 453 | "version": "==2.2.0" 454 | } 455 | } 456 | } 457 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Feedsearch 2 | ========== 3 | .. image:: https://img.shields.io/pypi/v/feedsearch.svg 4 | :target: https://pypi.python.org/pypi/feedsearch 5 | 6 | .. image:: https://img.shields.io/pypi/l/feedsearch.svg 7 | :target: https://pypi.python.org/pypi/feedsearch 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/feedsearch.svg 10 | :target: https://pypi.python.org/pypi/feedsearch 11 | 12 | .. image:: https://pepy.tech/badge/feedsearch 13 | :target: https://pepy.tech/project/feedsearch 14 | 15 | Feedsearch is a Python library for searching websites for RSS, Atom, and JSON feeds. 16 | 17 | It was originally based on 18 | `Feedfinder2 `_ written by 19 | `Dan Foreman-Mackey `_, which in turn is based on 20 | `feedfinder `_ - originally written by 21 | `Mark Pilgrim `_ 22 | and subsequently maintained by 23 | `Aaron Swartz `_ until his untimely death. 
24 | 25 | Feedsearch now differs significantly from Feedfinder2, in that Feedsearch supports JSON feeds, allows for 26 | optional fetching of Feed and Site metadata, and optionally searches the content of internally linked pages 27 | and default CMS feed locations. 28 | 29 | **Please Note:** Development of this library is no longer ongoing except in the case of fixing reported bugs. 30 | Further development of Feedsearch functionality has now moved to 31 | `Feedsearch Crawler `_. 32 | 33 | Usage 34 | ----- 35 | 36 | Feedsearch is called with the single function ``search``: 37 | 38 | .. code-block:: python 39 | 40 | >>> from feedsearch import search 41 | >>> feeds = search('xkcd.com') 42 | >>> feeds 43 | [FeedInfo('https://xkcd.com/atom.xml'), FeedInfo('https://xkcd.com/rss.xml')] 44 | >>> feeds[0].url 45 | 'https://xkcd.com/atom.xml' 46 | 47 | To get Feed and Site metadata: 48 | 49 | .. code-block:: python 50 | 51 | >>> feeds = search('propublica.org', info=True) 52 | >>> feeds 53 | [FeedInfo('http://feeds.propublica.org/propublica/main')] 54 | >>> pprint(vars(feeds[0])) 55 | {'bozo': 0, 56 | 'content_type': 'text/xml; charset=UTF-8', 57 | 'description': 'Latest Articles and Investigations from ProPublica, an ' 58 | 'independent, non-profit newsroom that produces investigative ' 59 | 'journalism in the public interest.', 60 | 'favicon': 'https://assets.propublica.org/prod/v3/images/favicon.ico', 61 | 'favicon_data_uri': '', 62 | 'hubs': ['http://feedpress.superfeedr.com/'], 63 | 'is_push': True, 64 | 'score': 4, 65 | 'self_url': 'http://feeds.propublica.org/propublica/main', 66 | 'site_name': 'ProPublica', 67 | 'site_url': 'https://www.propublica.org/', 68 | 'title': 'Articles and Investigations - ProPublica', 69 | 'url': 'http://feeds.propublica.org/propublica/main', 70 | 'version': 'rss20'} 71 | 72 | Search will always return a list of *FeedInfo* objects, each of which will always have a *url* property. 73 | Feeds are sorted by the *score* value from highest to lowest, with a higher score theoretically indicating 74 | a more relevant feed compared to the original URL provided. 75 | 76 | If you only want the raw urls, then use a list comprehension on the result, or set the 77 | *as_urls* parameter to *True*: 78 | 79 | .. code-block:: python 80 | 81 | >>> feeds = search('http://jsonfeed.org') 82 | >>> feeds 83 | [FeedInfo('https://jsonfeed.org/xml/rss.xml'), FeedInfo('https://jsonfeed.org/feed.json')] 84 | >>> urls = [f.url for f in feeds] 85 | >>> urls 86 | ['https://jsonfeed.org/xml/rss.xml', 'https://jsonfeed.org/feed.json'] 87 | 88 | >>> feeds = search('http://jsonfeed.org', as_urls=True) 89 | >>> feeds 90 | ['https://jsonfeed.org/xml/rss.xml', 'https://jsonfeed.org/feed.json'] 91 | 92 | In addition to the URL, the ``search`` function takes the following optional keyword arguments (a combined example follows the Search Order section below): 93 | 94 | - **info**: *bool*: Get Feed and Site Metadata. Defaults False. 95 | - **check_all**: *bool*: Check all internally linked pages from ``<a>`` tags for feeds, and default CMS feeds. 96 | Only checks one level down. Defaults False. May be very slow. 97 | - **user_agent**: *str*: User-Agent Header string. Defaults to Package name. 98 | - **timeout**: *float* or *tuple(float, float)*: Timeout for each request in the search (not a timeout for the ``search`` 99 | method itself). Defaults to 3 seconds. See 100 | `Requests timeout documentation `_ for more info. 101 | - **max_redirects**: *int*: Maximum number of redirects for each request. Defaults to 30. 102 | - **parser**: *str*: BeautifulSoup parser for HTML parsing.
Defaults to 'html.parser'. 103 | - **exceptions**: *bool*: If False, will gracefully handle Requests exceptions and attempt to keep searching. 104 | If True, will leave Requests exceptions uncaught to be handled by the caller. Defaults False. 105 | - **verify**: *bool* or *str*: Verify SSL Certificates. See 106 | `Requests SSL documentation `_ for more info. 107 | - **favicon_data_uri**: *bool*: Convert Favicon to Data Uri. Defaults False. 108 | - **as_urls**: *bool*: Return found Feeds as a list of URL strings instead of FeedInfo objects. 109 | - **cms**: *bool*: Check default CMS feed location if no feeds already found and site is using a known CMS. Defaults True. 110 | - **discovery_only**: *bool*: Only search for RSS discovery tags (e.g. ``<link rel="alternate">``). Defaults False. 111 | Overridden by **check_all** if **check_all** is True. 112 | 113 | FeedInfo Values 114 | --------------- 115 | 116 | FeedInfo objects may have the following values if *info* is *True*: 117 | 118 | - **bozo**: *int*: Set to 1 when feed data is not well formed or may not be a feed. Defaults 0. 119 | - **content_type**: *str*: Content-Type value of the returned feed. 120 | - **description**: *str*: Feed description. 121 | - **favicon**: *str*: Url of site Favicon. 122 | - **favicon_data_uri**: *str*: Data Uri of site Favicon. 123 | - **hubs**: *List[str]*: List of `Websub `_ hubs of feed if available. 124 | - **is_push**: *bool*: True if feed contains valid Websub data. 125 | - **score**: *int*: Computed relevance of feed url value to provided URL. May be safely ignored. 126 | - **self_url**: *str*: *rel="self"* value returned from feed links. In some cases may be different from feed url. 127 | - **site_name**: *str*: Name of feed's website. 128 | - **site_url**: *str*: URL of feed's website. 129 | - **title**: *str*: Feed Title. 130 | - **url**: *str*: URL location of feed. 131 | - **version**: *str*: Feed version `XML values `_, 132 | or `JSON feed `_. 133 | 134 | 135 | Search Order 136 | ------------ 137 | 138 | Feedsearch searches for feeds in the following order: 139 | 140 | 1. If the URL points directly to a feed, then return that feed. 141 | 2. If **discovery_only** is True, search only ``<link rel="alternate">`` tags. Return unless **check_all** is True. 142 | 3. Search all ``<link>`` tags. Return if feeds are found and **check_all** is False. 143 | 4. If **cms** or **check_all** is True, search for default CMS feeds if the site is using a known CMS. Return if feeds are found and **check_all** is False. 144 | 5. Search all ``<a>`` tags. Return if **check_all** is False. 145 | 6. This point will only be reached if **check_all** is True. 146 | 7. Fetch the content of all internally pointing ``<a>`` tags whose URL paths indicate they may contain feeds (e.g. /feed /rss /atom). All ``<link>`` and ``<a>`` tags of the fetched content are searched, although not recursively. Return if feeds are found. This step may be very slow, so be sure that you actually want **check_all** enabled. 147 | 8. If step 7 failed to find feeds, then as a last resort we make a few guesses for potential feed urls.
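The example below combines several of the keyword arguments described above into a single call. It is illustrative only: the site URL, User-Agent string, and parameter values are placeholders, not recommendations.

.. code-block:: python

    >>> from feedsearch import search
    >>> feeds = search(
    ...     'example.com',          # site to search; scheme is coerced to HTTPS if missing
    ...     info=True,              # fetch Feed and Site metadata
    ...     check_all=True,         # also check internal <a> pages and default CMS feeds (slow)
    ...     timeout=(3.05, 10),     # (connect, read) timeout per request
    ...     user_agent='MyFeedBot/1.0',
    ...     exceptions=False,       # handle Requests exceptions gracefully and keep searching
    ...     as_urls=False,          # return FeedInfo objects rather than URL strings
    ... )
    >>> urls = [f.url for f in feeds]  # highest scored (most relevant) feed first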
148 | -------------------------------------------------------------------------------- /feedsearch/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .feedsearch import search 4 | 5 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 6 | -------------------------------------------------------------------------------- /feedsearch/__version__.py: -------------------------------------------------------------------------------- 1 | __title__ = "feedsearch" 2 | __description__ = "Search sites for RSS, Atom, and JSON feeds" 3 | __url__ = "https://github.com/DBeath/feedsearch" 4 | __version__ = "1.0.12" 5 | __author__ = "David Beath" 6 | __author_email__ = "davidgbeath@gmail.com" 7 | __license__ = "MIT" 8 | -------------------------------------------------------------------------------- /feedsearch/feedfinder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Tuple, Union 3 | from urllib.parse import urljoin, urlparse 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | from .feedinfo import FeedInfo 8 | from .site_meta import SiteMeta 9 | from .url import URL 10 | from .lib import create_soup 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class FeedFinder: 16 | def __init__( 17 | self, coerced_url: str, feed_info: bool = False, favicon_data_uri: bool = False 18 | ) -> None: 19 | self.get_feed_info = feed_info # type: bool 20 | self.favicon_data_uri = favicon_data_uri # type: bool 21 | self.soup = None 22 | self.site_meta = None 23 | self.feeds = [] # type: list 24 | self.urls = [] # type: List[URL] 25 | self.coerced_url = coerced_url # type: str 26 | 27 | def check_urls(self, urls: List[str]) -> List[FeedInfo]: 28 | """ 29 | Check if a list of Urls contain feeds 30 | 31 | :param urls: List of Url strings 32 | :return: List of FeedInfo objects 33 | """ 34 | feeds = [] 35 | for url_str in urls: 36 | url = self.get_url(url_str) 37 | if url.is_feed: 38 | feed = self.create_feed_info(url) 39 | feeds.append(feed) 40 | 41 | return feeds 42 | 43 | def create_feed_info(self, url: URL) -> FeedInfo: 44 | """ 45 | Creates a FeedInfo object from a URL object 46 | 47 | :param url: URL object 48 | :return: FeedInfo 49 | """ 50 | info = FeedInfo(url.url, content_type=url.content_type) 51 | 52 | if self.get_feed_info: 53 | info.get_info(data=url.data, headers=url.headers) 54 | 55 | if self.site_meta: 56 | info.add_site_info( 57 | self.site_meta.site_url, 58 | self.site_meta.site_name, 59 | self.site_meta.icon_url, 60 | self.site_meta.icon_data_uri, 61 | ) 62 | 63 | return info 64 | 65 | @staticmethod 66 | def search_links(soup: BeautifulSoup, url: str, rel: bool = False) -> List[str]: 67 | """ 68 | Search all links on a page for feeds 69 | 70 | :param soup: BeautifulSoup dict 71 | :param url: Url of the soup 72 | :param rel: If true, only search for RSS discovery type "alternate" links 73 | :return: list 74 | """ 75 | links = [] # type: List[str] 76 | if rel: 77 | link_tags = soup.find_all("link", rel="alternate") 78 | else: 79 | link_tags = soup.find_all("link") 80 | for link in link_tags: 81 | if link.get("type") in [ 82 | "application/rss+xml", 83 | "text/xml", 84 | "application/atom+xml", 85 | "application/x.atom+xml", 86 | "application/x-atom+xml", 87 | "application/json", 88 | ]: 89 | links.append(urljoin(url, link.get("href", ""))) 90 | 91 | return links 92 | 93 | @staticmethod 94 | def search_a_tags(soup: BeautifulSoup) -> 
Tuple[List[str], List[str]]: 95 | """ 96 | Search all 'a' tags on a page for feeds 97 | 98 | :return: Tuple[list, list] 99 | """ 100 | local, remote = [], [] 101 | for a in soup.find_all("a"): 102 | href = a.get("href", None) 103 | if href is None: 104 | continue 105 | if "://" not in href and URL.is_feed_url(href): 106 | local.append(href) 107 | if URL.is_feedlike_url(href): 108 | remote.append(href) 109 | 110 | return local, remote 111 | 112 | def get_site_info(self, url: Union[str, URL]) -> None: 113 | """ 114 | Search for site metadata 115 | 116 | :param url: Site Url 117 | :return: None 118 | """ 119 | if isinstance(url, str): 120 | self.site_meta = SiteMeta(url) 121 | elif isinstance(url, URL): 122 | self.site_meta = SiteMeta(url.url, data=url.data) 123 | if self.site_meta: 124 | self.site_meta.parse_site_info(self.favicon_data_uri) 125 | 126 | def get_url(self, url: Union[str, URL]) -> URL: 127 | """ 128 | Return a unique URL object containing fetched URL data 129 | 130 | :param url: URL string or URL object 131 | :return: URL object 132 | """ 133 | if isinstance(url, str): 134 | if "://" not in url: 135 | url = urljoin(self.coerced_url, url) 136 | url = URL(url, immediate_get=False) 137 | if url in self.urls: 138 | url = self.urls[self.urls.index(url)] 139 | else: 140 | self.urls.append(url) 141 | if not url.data: 142 | url.get_is_feed(url.url) 143 | return url 144 | 145 | def internal_feedlike_urls(self) -> List[URL]: 146 | """ 147 | Return a list of URLs that point to internal pages 148 | which may contain feeds. 149 | 150 | :return: List of URL objects 151 | """ 152 | internal = [] # type: List[URL] 153 | parsed_coerced = urlparse(self.coerced_url) 154 | for url in self.urls: 155 | if not url.is_feed and url.fetched and url.feedlike_url: 156 | parsed = urlparse(url.url) 157 | # We want to check that the url is internal. 158 | # The coerced netloc is likely to be less complete (i.e. missing www subdomain) 159 | # than the netloc of the fetched url. 160 | if parsed_coerced.netloc in parsed.netloc: 161 | internal.append(url) 162 | return internal 163 | 164 | def check_url_data(self, urls: List[URL]) -> List[FeedInfo]: 165 | """ 166 | Check the data of each URL for links which may be feeds, 167 | then check the links and return any found feeds. 
168 | 169 | :return: List of FeedInfo objects 170 | """ 171 | found = [] # type: List[FeedInfo] 172 | 173 | for url in urls: 174 | if not url.is_feed and url.data: 175 | to_search = [] # type: List[str] 176 | url_soup = create_soup(url.data) 177 | to_search.extend(self.search_links(url_soup, url.url)) 178 | local, remote = self.search_a_tags(url_soup) 179 | to_search.extend(local) 180 | to_search.extend(remote) 181 | found.extend(self.check_urls(to_search)) 182 | 183 | return found 184 | -------------------------------------------------------------------------------- /feedsearch/feedinfo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Tuple, Any, List 4 | 5 | import feedparser 6 | from bs4 import BeautifulSoup 7 | 8 | from .lib import bs4_parser, parse_header_links 9 | from .url import URL 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class FeedInfo: 15 | def __init__( 16 | self, 17 | url: str, 18 | site_url: str = "", 19 | title: str = "", 20 | description: str = "", 21 | site_name: str = "", 22 | favicon: str = "", 23 | hubs: list = None, 24 | is_push: bool = False, 25 | content_type: str = "", 26 | version: str = "", 27 | self_url: str = "", 28 | score: int = 0, 29 | bozo: int = 0, 30 | favicon_data_uri: str = "", 31 | ) -> None: 32 | self.url = url 33 | self.site_url = site_url 34 | self.title = title 35 | self.description = description 36 | self.site_name = site_name 37 | self.favicon = favicon 38 | self.hubs = hubs or [] 39 | self.is_push = is_push 40 | self.content_type = content_type 41 | self.version = version 42 | self.self_url = self_url 43 | self.bozo = bozo 44 | self.score = score 45 | self.favicon_data_uri = favicon_data_uri 46 | 47 | def __repr__(self): 48 | return "{0}({1})".format(self.__class__.__name__, self.url.__repr__()) 49 | 50 | def __eq__(self, other): 51 | return self.url == other.url 52 | 53 | def __hash__(self): 54 | return hash(self.url) 55 | 56 | def get_info(self, data: Any = None, headers: dict = None) -> None: 57 | """ 58 | Get Feed info from data. 59 | 60 | :param data: Feed data, XML string or JSON object 61 | :param headers: HTTP Headers of the Feed Url 62 | :return: None 63 | """ 64 | logger.debug("Getting FeedInfo for %s", self.url) 65 | 66 | # Get data from URL if no data provided 67 | url_object = None 68 | if not data: 69 | url_object = URL(self.url) 70 | if url_object.is_feed: 71 | self.update_from_url( 72 | url_object.url, 73 | url_object.content_type, 74 | url_object.data, 75 | url_object.headers, 76 | ) 77 | 78 | if not headers and url_object: 79 | headers = url_object.headers 80 | 81 | # Check link headers first for WebSub content discovery 82 | # https://www.w3.org/TR/websub/#discovery 83 | if headers: 84 | self.hubs, self.self_url = self.header_links(headers) 85 | 86 | # Try to parse data as JSON 87 | try: 88 | json_data = json.loads(data) 89 | logger.debug("%s data is JSON", self) 90 | self.content_type = "application/json" 91 | self.parse_json(json_data) 92 | return 93 | except json.JSONDecodeError: 94 | pass 95 | 96 | self.parse_xml(data) 97 | 98 | def parse_xml(self, data: str) -> None: 99 | """ 100 | Get info from XML (RSS or ATOM) feed. 
101 | :param data: XML string 102 | :return: None 103 | """ 104 | # Parse data with feedparser 105 | # Don't wrap this in try/except, feedparser eats errors and returns bozo instead 106 | parsed = self.parse_feed(data) 107 | if not parsed or parsed.get("bozo") == 1: 108 | self.bozo = 1 109 | logger.warning("No valid feed data in %s", self.url) 110 | return 111 | 112 | feed = parsed.get("feed") 113 | 114 | # Only search if no hubs already present from headers 115 | if not self.hubs: 116 | self.hubs, self.self_url = self.websub_links(feed) 117 | 118 | if self.hubs and self.self_url: 119 | self.is_push = True 120 | 121 | self.version = parsed.get("version") 122 | self.title = self.feed_title(feed) 123 | self.description = self.feed_description(feed) 124 | 125 | def parse_json(self, data: dict) -> None: 126 | """ 127 | Get info from JSON feed. 128 | 129 | :param data: JSON object 130 | :return: None 131 | """ 132 | self.version = data.get("version") 133 | if "https://jsonfeed.org/version/" not in self.version: 134 | self.bozo = 1 135 | return 136 | 137 | feed_url = data.get("feed_url") 138 | # Check URL from feed data if mismatch 139 | if feed_url and feed_url != self.url: 140 | url = URL(feed_url) 141 | if url.is_feed: 142 | self.update_from_url(url.url, url.content_type, url.data) 143 | return 144 | 145 | self.title = data.get("title") 146 | self.description = data.get("description") 147 | 148 | favicon = data.get("favicon") 149 | if favicon: 150 | self.favicon = favicon 151 | 152 | # Only search if no hubs already present from headers 153 | if not self.hubs: 154 | try: 155 | self.hubs = list(hub.get("url") for hub in data.get("hubs", [])) 156 | except (IndexError, AttributeError): 157 | pass 158 | 159 | if self.hubs: 160 | self.is_push = True 161 | 162 | @staticmethod 163 | def parse_feed(text: str) -> dict: 164 | """ 165 | Parse feed with feedparser. 166 | 167 | :param text: Feed string 168 | :return: dict 169 | """ 170 | return feedparser.parse(text) 171 | 172 | @staticmethod 173 | def feed_title(feed: dict) -> str: 174 | """ 175 | Get feed title 176 | 177 | :param feed: feed dict 178 | :return: str 179 | """ 180 | title = feed.get("title", None) 181 | if not title: 182 | return "" 183 | return FeedInfo.clean_title(title) 184 | 185 | @staticmethod 186 | def clean_title(title: str) -> str: 187 | """ 188 | Cleans title string, and shortens if too long. 189 | Have had issues with dodgy feed titles. 190 | 191 | :param title: Title string 192 | :return: str 193 | """ 194 | try: 195 | title = BeautifulSoup(title, bs4_parser).get_text() 196 | if len(title) > 1024: 197 | title = title[:1020] + "..." 198 | return title 199 | except Exception as ex: 200 | logger.exception("Failed to clean title: %s", ex) 201 | return "" 202 | 203 | @staticmethod 204 | def feed_description(feed: dict) -> str: 205 | """ 206 | Get feed description. 207 | 208 | :param feed: feed dict 209 | :return: str 210 | """ 211 | subtitle = feed.get("subtitle", None) 212 | if subtitle: 213 | return subtitle 214 | return feed.get("description", None) 215 | 216 | @staticmethod 217 | def websub_links(feed: dict) -> Tuple[List[str], str]: 218 | """ 219 | Returns a tuple containing the hub url and the self url for 220 | a parsed feed. 
221 | 222 | :param feed: An RSS feed parsed by feedparser 223 | :return: tuple 224 | """ 225 | links = feed.get("links", []) 226 | return FeedInfo.find_hubs_and_self_links(links) 227 | 228 | def add_site_info( 229 | self, url: str = "", name: str = "", icon: str = "", icon_data_uri: str = "" 230 | ) -> None: 231 | """ 232 | Adds site meta info to FeedInfo 233 | 234 | :param url: Site URL 235 | :param name: Site Name 236 | :param icon: Site Favicon 237 | :param icon_data_uri: Site Favicon as Data Uri 238 | :return: None 239 | """ 240 | self.site_url = url 241 | self.site_name = name 242 | self.favicon = icon 243 | self.favicon_data_uri = icon_data_uri 244 | 245 | def update_from_url( 246 | self, url: str, content_type: str = "", data: Any = None, headers: dict = None 247 | ) -> None: 248 | """ 249 | Update a FeedInfo object from a Url object 250 | 251 | :param url: Url string 252 | :param content_type: Content-Type of returned Url 253 | :param data: Data from returned Url 254 | :param headers: Dict of headers 255 | :return: None 256 | """ 257 | self.url = url 258 | self.content_type = content_type 259 | self.get_info(data, headers) 260 | 261 | @classmethod 262 | def create_from_url(cls, url: str, content_type: str = ""): 263 | """ 264 | Create a FeedInfo object from a Url 265 | 266 | :param url: Url string 267 | :param content_type: Content-Type of returned Url 268 | :return: FeedInfo 269 | """ 270 | return cls(url=url, content_type=content_type) 271 | 272 | def serialize(self) -> str: 273 | """ 274 | Attempt to serialize FeedInfo to JSON string 275 | 276 | :return: JSON 277 | """ 278 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) 279 | 280 | @staticmethod 281 | def header_links(headers: dict) -> Tuple[List[str], str]: 282 | """ 283 | Attempt to get self and hub links from HTTP headers 284 | https://www.w3.org/TR/websub/#x4-discovery 285 | 286 | :param headers: Dict of HTTP headers 287 | :return: None 288 | """ 289 | link_header = headers.get("Link") 290 | links = [] # type: list 291 | if link_header: 292 | links = parse_header_links(link_header) 293 | return FeedInfo.find_hubs_and_self_links(links) 294 | 295 | @staticmethod 296 | def find_hubs_and_self_links(links: List[dict]) -> Tuple[List[str], str]: 297 | """ 298 | Parses a list of links into self and hubs urls 299 | 300 | :param links: List of parsed HTTP Link Dicts 301 | :return: Tuple 302 | """ 303 | hub_urls = [] # type: List[str] 304 | self_url = "" # type: str 305 | 306 | if not links: 307 | return [], "" 308 | 309 | for link in links: 310 | try: 311 | if link["rel"] == "hub": 312 | href = link["href"] # type: str 313 | hub_urls.append(href) 314 | elif link["rel"] == "self": 315 | self_url = link["href"] 316 | except KeyError: 317 | continue 318 | 319 | return hub_urls, self_url 320 | -------------------------------------------------------------------------------- /feedsearch/feedsearch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from typing import List, Tuple, Union 4 | from urllib.parse import urljoin 5 | 6 | from .feedfinder import FeedFinder 7 | from .feedinfo import FeedInfo 8 | from .lib import ( 9 | coerce_url, 10 | create_requests_session, 11 | create_soup, 12 | default_timeout, 13 | get_site_root, 14 | set_bs4_parser, 15 | timeit, 16 | get_exceptions, 17 | set_exceptions, 18 | ) 19 | from requests import ReadTimeout 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def search( 25 | url, 26 | info: 
bool = False, 27 | check_all: bool = False, 28 | cms: bool = True, 29 | discovery_only: bool = False, 30 | favicon_data_uri: bool = False, 31 | as_urls: bool = False, 32 | timeout: Union[float, Tuple[float, float]] = default_timeout, 33 | user_agent: str = "", 34 | max_redirects: int = 30, 35 | parser: str = "html.parser", 36 | exceptions: bool = False, 37 | verify: Union[bool, str] = True, 38 | ) -> Union[List[FeedInfo], List[str]]: 39 | """ 40 | Search for RSS or ATOM feeds at a given URL 41 | 42 | :param url: URL 43 | :param info: Get Feed and Site Metadata 44 | :param check_all: Check all <link> and <a> tags on page 45 | :param cms: Check default CMS feed location if site is using a known CMS. 46 | :param discovery_only: Only search for RSS discovery tags (e.g. <link rel="alternate">). 47 | :param favicon_data_uri: Fetch Favicon and convert to Data Uri 48 | :param as_urls: Return found Feeds as a list of URL strings instead 49 | of FeedInfo objects 50 | :param timeout: Request timeout, either a float or (float, float). 51 | See Requests documentation: http://docs.python-requests.org/en/master/user/advanced/#timeouts 52 | :param user_agent: User-Agent Header string 53 | :param max_redirects: Maximum Request redirects 54 | :param parser: BeautifulSoup parser ('html.parser', 'lxml', etc.). 55 | Defaults to 'html.parser' 56 | :param exceptions: If False, will gracefully handle Requests exceptions and 57 | attempt to keep searching. If True, will leave Requests exceptions 58 | uncaught to be handled externally. 59 | :param verify: Verify SSL Certificates. 60 | See Requests documentation: https://requests.readthedocs.io/en/master/user/advanced/#ssl-cert-verification 61 | :return: List of found feeds as FeedInfo objects or URL strings (depending on "as_urls" parameter). 62 | FeedInfo objects will always have a "url" value. 63 | """ 64 | # Wrap find_feeds in a Requests session 65 | with create_requests_session( 66 | user_agent=user_agent, 67 | max_redirects=max_redirects, 68 | timeout=timeout, 69 | exceptions=exceptions, 70 | verify=verify, 71 | ): 72 | # Set BeautifulSoup parser 73 | set_bs4_parser(parser) 74 | # Find feeds 75 | feeds = _find_feeds( 76 | url, 77 | feed_info=info, 78 | check_all=check_all, 79 | cms=cms, 80 | discovery_only=discovery_only, 81 | favicon_data_uri=favicon_data_uri, 82 | ) 83 | # If as_urls is true, return only URL strings 84 | if as_urls: 85 | return list(f.url for f in feeds) 86 | else: 87 | return feeds 88 | 89 | 90 | @timeit 91 | def _find_feeds( 92 | url: str, 93 | feed_info: bool = False, 94 | check_all: bool = False, 95 | cms: bool = True, 96 | discovery_only: bool = False, 97 | favicon_data_uri: bool = False, 98 | ) -> List[FeedInfo]: 99 | """ 100 | Finds feeds 101 | 102 | :param url: URL 103 | :param check_all: Check all the pages linked from <a> tags for feeds 104 | :param feed_info: Get Feed and Site Metadata 105 | :param favicon_data_uri: Fetch Favicon and convert to Data Uri 106 | :param cms: Check default CMS feed location if site is using a known CMS. 107 | :param discovery_only: Only search for RSS discovery tags (e.g. <link rel="alternate">). 108 | :return: List of found feeds as FeedInfo objects. 109 | """ 110 | # Format the URL properly.
Use HTTPS 111 | coerced_url = coerce_url(url) # type: str 112 | 113 | # Create Feedfinder 114 | finder = FeedFinder( 115 | coerced_url, feed_info=feed_info, favicon_data_uri=favicon_data_uri 116 | ) 117 | 118 | # Initialise List of found Feeds 119 | feeds = [] # type: list 120 | 121 | start_time = time.perf_counter() 122 | 123 | # Download the requested URL 124 | logger.info("Finding feeds at URL: %s", coerced_url) 125 | 126 | # If the Caller provided an explicit HTTPS URL or asked for exceptions 127 | # to be raised, then make the first fetch without explicit exception 128 | # handling, as we don't want to retry with HTTP only. 129 | if url.startswith("https://") or get_exceptions(): 130 | found_url = finder.get_url(coerced_url) 131 | # Else, we perform the fetch with exception handling, so we can retry 132 | # with an HTTP URL if we had a ReadTimeout using HTTPS. 133 | else: 134 | try: 135 | # Set context to raise RequestExceptions on first fetch. 136 | set_exceptions(True) 137 | found_url = finder.get_url(coerced_url) 138 | except ReadTimeout: 139 | # Set Local Context exception settings back to Caller provided settings. 140 | set_exceptions(False) 141 | # Coerce URL with HTTP instead of HTTPS 142 | coerced_url = coerce_url(url, https=False) 143 | finder.coerced_url = coerced_url 144 | found_url = finder.get_url(coerced_url) 145 | finally: 146 | # Always set Local Context exception settings back to Caller provided settings. 147 | set_exceptions(False) 148 | 149 | search_time = int((time.perf_counter() - start_time) * 1000) 150 | logger.debug("Searched url in %sms", search_time) 151 | 152 | # If URL is valid, then get site info if feed_info is True 153 | if found_url and found_url.is_valid: 154 | if feed_info: 155 | finder.get_site_info(found_url) 156 | # Return nothing if there is no data from the URL 157 | else: 158 | return [] 159 | 160 | # If URL is already a feed, create and return FeedInfo 161 | if found_url.is_feed: 162 | found = finder.create_feed_info(found_url) 163 | feeds.append(found) 164 | return feeds 165 | 166 | # Parse text with BeautifulSoup 167 | finder.soup = create_soup(found_url.data) 168 | 169 | # If discovery_only, then search only for <link rel="alternate"> discovery tags and return 170 | if discovery_only and not check_all: 171 | logger.debug('Looking for <link rel="alternate"> tags.') 172 | links = finder.search_links(finder.soup, found_url.url, rel=True) 173 | found_links = finder.check_urls(links) 174 | feeds.extend(found_links) 175 | logger.info('Found %s feed <link rel="alternate"> tags.', len(found_links)) 176 | return sort_urls(feeds, url) 177 | 178 | # Search for <link> tags 179 | logger.debug("Looking for <link> tags.") 180 | links = finder.search_links(finder.soup, found_url.url) 181 | found_links = finder.check_urls(links) 182 | feeds.extend(found_links) 183 | logger.info("Found %s feed <link> tags.", len(found_links)) 184 | 185 | search_time = int((time.perf_counter() - start_time) * 1000) 186 | logger.debug("Searched <link> tags in %sms", search_time) 187 | 188 | # Return if feeds are already found and check_all is False. 189 | if feeds and not check_all: 190 | return sort_urls(feeds, url) 191 | 192 | # Search for default CMS feeds. 193 | if cms or check_all: 194 | if not finder.site_meta: 195 | finder.get_site_info(coerced_url) 196 | logger.debug("Looking for CMS feeds.") 197 | cms_urls = finder.site_meta.cms_feed_urls() 198 | found_cms = finder.check_urls(cms_urls) 199 | logger.info("Found %s CMS feeds.", len(found_cms)) 200 | feeds.extend(found_cms) 201 | 202 | # Return if feeds are already found and check_all is False.
203 | if feeds and not check_all: 204 | return sort_urls(feeds, url) 205 | 206 | # Look for <a> tags. 207 | logger.debug("Looking for <a> tags.") 208 | local, remote = finder.search_a_tags(finder.soup) 209 | 210 | # Check the local URLs. 211 | local = [urljoin(coerced_url, l) for l in local] # type: list 212 | found_local = finder.check_urls(local) 213 | feeds.extend(found_local) 214 | logger.info("Found %s local links to feeds.", len(found_local)) 215 | 216 | # Check the remote URLs. 217 | # Join any relative remote hrefs against the coerced URL, then check 218 | # the combined list of local and remote URLs. 219 | remote = [urljoin(coerced_url, l) for l in remote] # type: list 220 | hrefs = local + remote 221 | found_hrefs = finder.check_urls(hrefs) 222 | feeds.extend(found_hrefs) 223 | logger.info("Found %s links to feeds.", len(found_hrefs)) 224 | 225 | search_time = int((time.perf_counter() - start_time) * 1000) 226 | logger.debug("Searched links in %sms", search_time) 227 | 228 | # Only check internal pages if check_all is True. 229 | if not check_all: 230 | return sort_urls(feeds, url) 231 | 232 | # Check all possible internal urls that may point to a feed page. 233 | internal = finder.internal_feedlike_urls() 234 | found_internal = finder.check_url_data(internal) 235 | feeds.extend(found_internal) 236 | 237 | search_time = int((time.perf_counter() - start_time) * 1000) 238 | logger.debug("Searched internal pages in %sms", search_time) 239 | 240 | # Return if feeds are found. Guessing URLs is a last resort. 241 | if feeds: 242 | return sort_urls(feeds, url) 243 | 244 | # Guessing potential URLs. 245 | fns = [ 246 | "atom.xml", 247 | "index.atom", 248 | "index.rdf", 249 | "rss.xml", 250 | "index.xml", 251 | "index.rss", 252 | "index.json", 253 | ] 254 | urls = list(urljoin(coerced_url, f) for f in fns) 255 | found_guessed = finder.check_urls(urls) 256 | feeds.extend(found_guessed) 257 | logger.info("Found %s guessed links to feeds.", len(found_guessed)) 258 | 259 | search_time = int((time.perf_counter() - start_time) * 1000) 260 | logger.debug("Searched guessed urls in %sms", search_time) 261 | 262 | return sort_urls(feeds, url) 263 | 264 | 265 | def url_feed_score(url: str, original_url: str = "") -> int: 266 | """ 267 | Return a Score based on estimated relevance of the feed Url 268 | to the original search Url 269 | 270 | :param url: Feed Url 271 | :param original_url: Searched Url 272 | :return: Score integer 273 | """ 274 | score = 0 275 | 276 | if original_url: 277 | url_domain = get_site_root(url) 278 | original_domain = get_site_root(original_url) 279 | 280 | if original_domain not in url_domain: 281 | score -= 17 282 | 283 | if "comments" in url: 284 | score -= 15 285 | if "georss" in url: 286 | score -= 9 287 | if "alt" in url: 288 | score -= 7 289 | kw = ["atom", "rss", ".xml", "feed", "rdf"] 290 | for p, t in zip(range(len(kw) * 2, 0, -2), kw): 291 | if t in url: 292 | score += p 293 | if url.startswith("https"): 294 | score += 9 295 | return score 296 | 297 | 298 | def sort_urls(feeds: List[FeedInfo], original_url: str = "") -> List[FeedInfo]: 299 | """ 300 | Sort list of feeds based on Url score 301 | 302 | :param feeds: List of FeedInfo objects 303 | :param original_url: Searched Url 304 | :return: List of FeedInfo objects 305 | """ 306 | for feed in feeds: 307 | feed.score = url_feed_score(feed.url, original_url) 308 | sorted_urls = sorted(list(set(feeds)), key=lambda x: x.score, reverse=True) 309 | logger.info("Returning sorted URLs: %s", sorted_urls) 310 | return sorted_urls 311 |
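The following standalone snippet illustrates how ``url_feed_score`` and ``sort_urls`` above rank candidate feeds. It is a minimal sketch, not part of the package itself; the URLs are invented for the example, and the printed scores depend on the keyword weights defined in ``url_feed_score``.

.. code-block:: python

    # Illustration of feed URL scoring, assuming the feedsearch package above is installed.
    from feedsearch.feedinfo import FeedInfo
    from feedsearch.feedsearch import sort_urls, url_feed_score

    original = "https://example.com"
    candidates = [
        FeedInfo("https://example.com/comments/feed"),  # penalised for the "comments" keyword
        FeedInfo("https://example.com/atom.xml"),       # rewarded for "atom", ".xml", and https
        FeedInfo("http://other-site.example/rss.xml"),  # penalised for being on another domain
    ]

    # Print the raw relevance score of each candidate against the original URL.
    for feed in candidates:
        print(feed.url, url_feed_score(feed.url, original))

    # sort_urls() assigns the same scores and returns the feeds ordered best-first.
    print(sort_urls(candidates, original))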
--------------------------------------------------------------------------------
/feedsearch/lib.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import logging
3 | import time
4 | from contextlib import contextmanager
5 | from typing import Optional, Union, Tuple
6 | 
7 | import requests
8 | from bs4 import BeautifulSoup
9 | from requests import Response
10 | from requests.exceptions import RequestException
11 | from werkzeug.local import Local, release_local
12 | from werkzeug.urls import url_parse, url_fix
13 | 
14 | from .__version__ import __version__
15 | 
16 | LOCAL_CONTEXT = Local()
17 | 
18 | logger = logging.getLogger(__name__)
19 | 
20 | bs4_parser = "html.parser"
21 | 
22 | default_timeout = 3.05
23 | 
24 | 
25 | def get_session():
26 |     """
27 |     Returns the Requests Session for the current local context.
28 |     Creates a Session with default values if none exists.
29 | 
30 |     :return: Requests Session
31 |     """
32 |     return getattr(LOCAL_CONTEXT, "session", create_requests_session())
33 | 
34 | 
35 | def get_timeout():
36 |     """
37 |     Returns the Request timeout for the current local context.
38 | 
39 |     :return: Request timeout
40 |     """
41 |     return getattr(LOCAL_CONTEXT, "timeout", default_timeout)
42 | 
43 | 
44 | def get_exceptions() -> bool:
45 |     """
46 |     Returns the exception handling settings for the current local context.
47 | 
48 |     :return: Catch exception boolean
49 |     """
50 |     return getattr(LOCAL_CONTEXT, "exceptions", False)
51 | 
52 | 
53 | def set_exceptions(value: bool = False) -> None:
54 |     """
55 |     Set the exception handling settings for the current local context.
56 | 
57 |     :return: None
58 |     """
59 |     setattr(LOCAL_CONTEXT, "exceptions", value)
60 | 
61 | 
62 | def _user_agent() -> str:
63 |     """
64 |     Return User-Agent string
65 | 
66 |     :return: str
67 |     """
68 |     return "FeedSearch/{0} (https://github.com/DBeath/feedsearch)".format(__version__)
69 | 
70 | 
71 | @contextmanager
72 | def create_requests_session(
73 |     user_agent: str = "",
74 |     max_redirects: int = 30,
75 |     timeout: Union[float, Tuple[float, float]] = default_timeout,
76 |     exceptions: bool = False,
77 |     verify: Union[bool, str] = True,
78 | ):
79 |     """
80 |     Creates a Requests Session and sets User-Agent header and Max Redirects
81 | 
82 |     :param user_agent: User-Agent string
83 |     :param max_redirects: Max number of redirects before failure
84 |     :param timeout: Request Timeout
85 |     :param exceptions: If False, will gracefully handle Requests exceptions and attempt to keep searching.
86 |         If True, will leave Requests exceptions uncaught to be handled externally.
87 |     :param verify: Verify SSL Certificates.
88 | :return: Requests session 89 | """ 90 | # Create a request session 91 | session = requests.session() 92 | 93 | # Set User-Agent header 94 | user_agent = user_agent if user_agent else _user_agent() 95 | session.headers.update({"User-Agent": user_agent}) 96 | 97 | session.max_redirects = max_redirects 98 | session.verify = verify 99 | 100 | # Add request session to local context 101 | setattr(LOCAL_CONTEXT, "session", session) 102 | setattr(LOCAL_CONTEXT, "timeout", timeout) 103 | setattr(LOCAL_CONTEXT, "exceptions", exceptions) 104 | 105 | yield session 106 | 107 | # Close request session 108 | session.close() 109 | 110 | # Clean up local context 111 | release_local(LOCAL_CONTEXT) 112 | 113 | 114 | def requests_session( 115 | user_agent: str = "", 116 | max_redirects: int = 30, 117 | timeout: Union[float, Tuple[float, float]] = default_timeout, 118 | exceptions: bool = False, 119 | verify: Union[bool, str] = True, 120 | ): 121 | """ 122 | Wraps a requests session around a function. 123 | 124 | :param user_agent: User Agent for requests 125 | :param max_redirects: Maximum number of redirects 126 | :param timeout: Request Timeout 127 | :param exceptions: If True, rethrow exceptions. 128 | :param verify: Verify SSL Certificates. 129 | :return: decorator function 130 | """ 131 | 132 | def decorator(func): 133 | @functools.wraps(func) 134 | def wrapper(*args, **kwargs): 135 | with create_requests_session( 136 | user_agent, max_redirects, timeout, exceptions, verify 137 | ): 138 | # Call wrapped function 139 | return func(*args, **kwargs) 140 | 141 | return wrapper 142 | 143 | return decorator 144 | 145 | 146 | def set_bs4_parser(parser: str) -> None: 147 | """ 148 | Sets the parser used by BeautifulSoup 149 | 150 | :param parser: BeautifulSoup parser 151 | :return: None 152 | """ 153 | if parser: 154 | global bs4_parser 155 | bs4_parser = parser 156 | 157 | 158 | def get_url( 159 | url: str, 160 | timeout: Union[float, Tuple[float, float]] = default_timeout, 161 | exceptions: bool = False, 162 | **kwargs 163 | ) -> Optional[Response]: 164 | """ 165 | Performs a GET request on a URL 166 | 167 | :param url: URL string 168 | :param timeout: Request Timeout 169 | :param exceptions: If False, will gracefully handle Requests exceptions and attempt to keep searching. 170 | If True, will reraise Requests exceptions to be handled externally. 
171 |     :return: Requests Response object
172 |     """
173 |     timeout = timeout if timeout else get_timeout()
174 | 
175 |     logger.info("Fetching URL: %s", url)
176 |     start_time = time.perf_counter()
177 |     try:
178 |         session = get_session()
179 |         response = session.get(url, timeout=timeout, **kwargs)
180 |         response.raise_for_status()
181 |     except RequestException as ex:
182 |         logger.warning("RequestException while getting URL: %s, %s", url, str(ex))
183 |         if exceptions:
184 |             raise
185 |         return None
186 |     finally:
187 |         dur = int((time.perf_counter() - start_time) * 1000)
188 |         logger.debug("Performed fetch of URL: %s in %sms", url, dur)
189 |     return response
190 | 
191 | 
192 | def create_soup(text: str) -> BeautifulSoup:
193 |     """
194 |     Parses a string into a BeautifulSoup object
195 | 
196 |     :param text: Html string
197 |     :return: BeautifulSoup object
198 |     """
199 |     return BeautifulSoup(text, bs4_parser)
200 | 
201 | 
202 | def coerce_url(url: str, https: bool = True) -> str:
203 |     """
204 |     Coerce URL to valid format
205 | 
206 |     :param url: URL
207 |     :param https: Force https if no scheme in url
208 |     :return: str
209 |     """
210 |     url = url.strip()
211 |     if url.startswith("feed://"):
212 |         return url_fix("http://{0}".format(url[7:]))
213 |     for proto in ["http://", "https://"]:
214 |         if url.startswith(proto):
215 |             return url_fix(url)
216 |     if https:
217 |         return url_fix("https://{0}".format(url))
218 |     else:
219 |         return url_fix("http://{0}".format(url))
220 | 
221 | 
222 | def get_site_root(url: str) -> str:
223 |     """
224 |     Find the root domain of a url
225 |     """
226 |     url = coerce_url(url)
227 |     parsed = url_parse(url, scheme="http")
228 |     return parsed.netloc
229 | 
230 | 
231 | def timeit(func):
232 |     """
233 |     A decorator used to log the function execution time
234 |     """
235 | 
236 |     @functools.wraps(func)
237 |     def wrap(*args, **kwargs):
238 |         start = time.perf_counter()
239 | 
240 |         result = func(*args, **kwargs)
241 | 
242 |         dur = int((time.perf_counter() - start) * 1000)
243 | 
244 |         logger.debug("Function name=%s duration=%sms", func.__name__, dur)
245 | 
246 |         return result
247 | 
248 |     return wrap
249 | 
250 | 
251 | def parse_header_links(value):
252 |     """
253 |     Return a list of Dicts of parsed link headers.
254 |     i.e.
Link: ; rel=front; type="image/jpeg", 255 | ; rel=back;type="image/jpeg" 256 | 257 | :param value: HTTP Link header to parse 258 | :return: List of Dicts 259 | """ 260 | 261 | links = [] 262 | 263 | replace_chars = " '\"" 264 | 265 | for val in value.split(","): 266 | try: 267 | url, params = val.split(";", 1) 268 | except ValueError: 269 | url, params = val, "" 270 | 271 | link = {"url": url.strip("<> '\"")} 272 | 273 | for param in params.split(";"): 274 | try: 275 | key, value = param.split("=") 276 | except ValueError: 277 | break 278 | 279 | link[key.strip(replace_chars)] = value.strip(replace_chars) 280 | 281 | links.append(link) 282 | 283 | return links 284 | -------------------------------------------------------------------------------- /feedsearch/site_meta.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import re 4 | 5 | from typing import List, Set, Dict, Any 6 | from bs4 import BeautifulSoup, ResultSet 7 | from werkzeug.urls import url_parse 8 | 9 | from .lib import get_url, coerce_url, create_soup, get_timeout, get_exceptions 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | WORDPRESS_URLS = ["/feed"] 15 | 16 | 17 | class SiteMeta: 18 | def __init__(self, url: str, data: Any = None, soup: BeautifulSoup = None) -> None: 19 | self.url = url # type: str 20 | self.data = data # type: Any 21 | self.soup = soup # type: BeautifulSoup 22 | self.site_url = "" # type: str 23 | self.site_name = "" # type: str 24 | self.icon_url = "" # type: str 25 | self.icon_data_uri = "" # type: str 26 | self.domain = "" # type: str 27 | 28 | def parse_site_info(self, favicon_data_uri: bool = False): 29 | """ 30 | Finds Site Info from root domain of site 31 | 32 | :return: None 33 | """ 34 | self.domain = self.get_domain(self.url) 35 | 36 | # Only fetch url again if domain is different from provided url or if 37 | # no site data already provided. 38 | if self.domain != self.url.strip("/") or not self.data: 39 | logger.debug( 40 | "Domain %s is different from URL %s. 
Fetching domain.", 41 | self.domain, 42 | self.url, 43 | ) 44 | response = get_url(self.domain, get_timeout(), get_exceptions()) 45 | if not response or not response.text: 46 | return 47 | self.data = response.text 48 | 49 | if not self.soup: 50 | self.soup = create_soup(self.data) 51 | 52 | self.site_url = self.find_site_url(self.soup, self.domain) 53 | self.site_name = self.find_site_name(self.soup) 54 | self.icon_url = self.find_site_icon_url(self.domain) 55 | 56 | if favicon_data_uri and self.icon_url: 57 | self.icon_data_uri = self.create_data_uri(self.icon_url) 58 | 59 | def find_site_icon_url(self, url: str) -> str: 60 | """ 61 | Attempts to find Site Favicon 62 | 63 | :param url: Root domain Url of Site 64 | :return: str 65 | """ 66 | icon_rel = ["apple-touch-icon", "shortcut icon", "icon"] 67 | 68 | icon = "" 69 | for rel in icon_rel: 70 | link = self.soup.find(name="link", rel=rel) 71 | if link: 72 | icon = link.get("href", None) 73 | if icon[0] == "/": 74 | icon = "{0}{1}".format(url, icon) 75 | if icon == "favicon.ico": 76 | icon = "{0}/{1}".format(url, icon) 77 | if not icon: 78 | send_url = url + "/favicon.ico" 79 | logger.debug("Trying url %s for favicon", send_url) 80 | response = get_url(send_url, get_timeout(), get_exceptions()) 81 | if response and response.status_code == 200: 82 | logger.debug("Received url %s for favicon", response.url) 83 | icon = response.url 84 | return icon 85 | 86 | @staticmethod 87 | def find_site_name(soup) -> str: 88 | """ 89 | Attempts to find Site Name 90 | 91 | :param soup: BeautifulSoup of site 92 | :return: str 93 | """ 94 | site_name_meta = [ 95 | "og:site_name", 96 | "og:title", 97 | "application:name", 98 | "twitter:app:name:iphone", 99 | ] 100 | 101 | for p in site_name_meta: 102 | try: 103 | name = soup.find(name="meta", property=p).get("content") 104 | if name: 105 | return name 106 | except AttributeError: 107 | pass 108 | 109 | try: 110 | title = soup.find(name="title").text 111 | if title: 112 | return title 113 | except AttributeError: 114 | pass 115 | 116 | return "" 117 | 118 | @staticmethod 119 | def find_site_url(soup, url: str) -> str: 120 | """ 121 | Attempts to find the canonical Url of the Site 122 | 123 | :param soup: BeautifulSoup of site 124 | :param url: Current Url of site 125 | :return: str 126 | """ 127 | canonical = soup.find(name="link", rel="canonical") 128 | try: 129 | site = canonical.get("href") 130 | if site: 131 | return site 132 | except AttributeError: 133 | pass 134 | 135 | meta = soup.find(name="meta", property="og:url") 136 | try: 137 | site = meta.get("content") 138 | except AttributeError: 139 | return url 140 | return site 141 | 142 | @staticmethod 143 | def get_domain(url: str) -> str: 144 | """ 145 | Finds root domain of Url, including scheme 146 | 147 | :param url: URL string 148 | :return: str 149 | """ 150 | url = coerce_url(url) 151 | parsed = url_parse(url) 152 | domain = "{0}://{1}".format(parsed.scheme, parsed.netloc) 153 | return domain 154 | 155 | @staticmethod 156 | def create_data_uri(img_url: str) -> str: 157 | """ 158 | Creates a Data Uri for a Favicon 159 | 160 | :param img_url: Url of Favicon 161 | :return: str 162 | """ 163 | response = get_url(img_url, get_timeout(), get_exceptions(), stream=True) 164 | if not response or int(response.headers["content-length"]) > (1024 * 1024): 165 | response.close() 166 | return "" 167 | 168 | uri = "" 169 | try: 170 | encoded = base64.b64encode(response.content) 171 | uri = "data:image/png;base64," + encoded.decode("utf-8") 172 | except 
Exception as e: 173 | logger.warning("Failure encoding image: %s", e) 174 | 175 | response.close() 176 | return uri 177 | 178 | def cms_feed_urls(self) -> List[str]: 179 | """ 180 | Checks if a site is using a popular CMS, and returns 181 | a list of default feed urls to check. 182 | 183 | :return: List[str] 184 | """ 185 | 186 | site_feeds = {"WordPress": ["/feed"]} # type: Dict[str, List[str]] 187 | 188 | possible_urls = set() # type: Set[str] 189 | if not self.soup: 190 | return [] 191 | 192 | site_names = set() # type: Set[str] 193 | 194 | metas = self.soup.find_all(name="meta") 195 | site_names.update(self.check_meta(metas)) 196 | 197 | links = self.soup.find_all(name="link") 198 | site_names.update(self.check_links(links)) 199 | 200 | for name in site_names: 201 | urls = site_feeds.get(name) # type: List[str] 202 | if urls: 203 | possible_urls.update(urls) 204 | 205 | # Return urls appended to the root domain to allow searching 206 | urls = [] # type: List[str] 207 | for url in possible_urls: 208 | urls.append(self.domain + url) 209 | return urls 210 | 211 | @staticmethod 212 | def check_meta(metas: ResultSet) -> Set[str]: 213 | """ 214 | Check site meta to find possible CMS values. 215 | 216 | :param metas: ResultSet of Site Meta values 217 | :return: Set of possible CMS names 218 | """ 219 | meta_tests = {"generator": {"WordPress": "WordPress\\s*(.*)"}} 220 | 221 | results = set() # type: Set[str] 222 | 223 | def get_meta_value(inner_type: str, inner_metas: ResultSet): 224 | for meta in inner_metas: 225 | if inner_type in meta.get("property", ""): 226 | yield meta.get("content") 227 | 228 | for test_type, tests in meta_tests.items(): 229 | meta_values = list(get_meta_value(test_type, metas)) 230 | for meta_value in meta_values: 231 | for site_name, pattern in tests.items(): 232 | if re.search(pattern, meta_value, flags=re.I): 233 | results.add(site_name) 234 | 235 | return results 236 | 237 | @staticmethod 238 | def check_links(links: ResultSet) -> Set[str]: 239 | link_tests = {"WordPress": "/wp-content/"} 240 | 241 | results = set() # type: Set[str] 242 | 243 | def get_link_href(inner_links: ResultSet): 244 | for link in inner_links: 245 | yield link.get("href") 246 | 247 | link_hrefs = list(get_link_href(links)) 248 | for site_name, pattern in link_tests.items(): 249 | for href in link_hrefs: 250 | if not href: 251 | continue 252 | if re.search(pattern, href, flags=re.I): 253 | results.add(site_name) 254 | 255 | return results 256 | -------------------------------------------------------------------------------- /feedsearch/url.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | from .lib import get_url, get_timeout, get_exceptions 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class URL: 10 | def __init__(self, url: str, data: Any = None, immediate_get: bool = True) -> None: 11 | """ 12 | Initialise URL object and immediately fetch URL to check if feed. 
13 | 
14 |         :param url: URL string
15 |         """
16 |         self.url = url  # type: str
17 |         self.data = data  # type: Any
18 |         self.is_feed = False  # type: bool
19 |         self.content_type = ""  # type: str
20 |         self.headers = {}  # type: dict
21 |         self.links = {}  # type: dict
22 |         self.fetched = False  # type: bool
23 |         self.feedlike_url = self.is_feedlike_url(self.url)  # type: bool
24 | 
25 |         if immediate_get and not self.fetched:
26 |             self.get_is_feed(self.url)
27 | 
28 |     def __repr__(self):
29 |         return "{0}({1})".format(self.__class__.__name__, repr(self.url))
30 | 
31 |     def __eq__(self, other):
32 |         return self.url == other.url
33 | 
34 |     @staticmethod
35 |     def is_feed_url(url: str) -> bool:
36 |         """
37 |         Return True if URL ending contains valid feed file format.
38 | 
39 |         :param url: URL string
40 |         :return: bool
41 |         """
42 |         return any(
43 |             map(url.lower().endswith, [".rss", ".rdf", ".xml", ".atom", ".json"])
44 |         )
45 | 
46 |     @staticmethod
47 |     def is_feedlike_url(url: str) -> bool:
48 |         """
49 |         Return True if any part of the URL might identify it as a feed.
50 | 
51 |         :param url: URL string
52 |         :return: bool
53 |         """
54 |         return any(
55 |             map(url.lower().count, ["rss", "rdf", "xml", "atom", "feed", "json"])
56 |         )
57 | 
58 |     @staticmethod
59 |     def is_json_feed(json: dict) -> bool:
60 |         """
61 |         Return True if JSON contains valid JSON Feed version.
62 | 
63 |         :param json: Parsed JSON
64 |         :return: bool
65 |         """
66 |         version = json.get("version")
67 |         if not version or "https://jsonfeed.org/version/" not in version:
68 |             return False
69 |         return True
70 | 
71 |     @staticmethod
72 |     def is_feed_data(text: str, content_type: str) -> bool:
73 |         """
74 |         Return True if text string has valid feed beginning.
75 | 
76 |         :param text: Possible feed text
77 |         :param content_type: MimeType of text
78 |         :return: bool
79 |         """
80 |         data = text.lower()
81 |         if not data:
82 |             return False
83 |         if data[:100].count("<html"):
84 |             return False
85 | 
86 |         if content_type and "json" in content_type and data.count("jsonfeed.org"):
87 |             return True
88 | 
89 |         return bool(data.count("<rss") + data.count("<rdf") + data.count("<feed"))
90 | 
91 |     def get_is_feed(self, url: str) -> None:
92 |         """
93 |         Gets a URL and checks if it might be a feed.
94 | 
95 |         :param url: URL string
96 |         :return: None
97 |         """
98 |         response = get_url(url, get_timeout(), get_exceptions())
99 | 
100 |         self.fetched = True
101 | 
102 |         if not response or not response.text:
103 |             logger.debug("Nothing found at %s", url)
104 |             return
105 | 
106 |         self.url = response.url
107 |         self.content_type = response.headers.get("content-type")
108 | 
109 |         self.data = response.text
110 |         self.headers = response.headers
111 |         self.links = response.links
112 |         self.is_feed = self.is_feed_data(response.text, self.content_type)
113 | 
114 |     @property
115 |     def is_valid(self) -> bool:
116 |         """
117 |         Check if URL returned valid response
118 | 
119 |         :return: bool
120 |         """
121 |         if self.url and self.data:
122 |             return True
123 |         return False
124 | 
--------------------------------------------------------------------------------
/search.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import traceback
3 | from pprint import pprint
4 | 
5 | import click
6 | 
7 | from feedsearch import search as search_feeds
8 | 
9 | 
10 | @click.command()
11 | @click.argument("url")
12 | @click.option(
13 |     "--all/--no-all",
14 |     default=False,
15 |     help="Search all potential locations for feeds.
Warning: Slow", 16 | ) 17 | @click.option("--info/--no-info", default=False, help="Return additional feed details") 18 | @click.option( 19 | "--parser", 20 | default="html.parser", 21 | type=click.Choice(["html.parser", "lxml", "xml", "html5lib"]), 22 | help="BeautifulSoup parser ('html.parser', 'lxml', 'xml', or 'html5lib'). Defaults to 'html.parser'", 23 | ) 24 | @click.option("-v", "--verbose", is_flag=True, help="Show logging") 25 | @click.option( 26 | "--exceptions/--no-exceptions", 27 | default=False, 28 | help="If False, will gracefully handle Requests exceptions and attempt to keep searching." 29 | "If True, will leave Requests exceptions uncaught to be handled externally.", 30 | ) 31 | @click.option("--timeout", default=3.05, type=click.FLOAT, help="Request timeout") 32 | @click.option( 33 | "--favicon/--no-favicon", default=False, help="Convert Favicon into Data Uri" 34 | ) 35 | @click.option( 36 | "--urls/--no-urls", 37 | default=False, 38 | help="Return found Feeds as a list of URL strings instead of FeedInfo objects.", 39 | ) 40 | @click.option( 41 | "--cms/--no-cms", 42 | default=True, 43 | help="Check default CMS feed location if site is using a known CMS.", 44 | ) 45 | @click.option( 46 | "--discovery/--no-discovery", 47 | default=False, 48 | help='Only search for RSS discovery tags (e.g. ).', 49 | ) 50 | def search( 51 | url, all, info, parser, verbose, exceptions, timeout, favicon, urls, cms, discovery 52 | ): 53 | if verbose: 54 | logger = logging.getLogger("feedsearch") 55 | logger.setLevel(logging.DEBUG) 56 | ch = logging.StreamHandler() 57 | ch.setLevel(logging.DEBUG) 58 | formatter = logging.Formatter( 59 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s [in %(pathname)s:%(lineno)d]" 60 | ) 61 | ch.setFormatter(formatter) 62 | logger.addHandler(ch) 63 | 64 | click.echo("\nSearching URL {0}\n".format(url)) 65 | try: 66 | feeds = search_feeds( 67 | url, 68 | info=info, 69 | check_all=all, 70 | cms=cms, 71 | discovery_only=discovery, 72 | favicon_data_uri=favicon, 73 | as_urls=urls, 74 | parser=parser, 75 | exceptions=exceptions, 76 | timeout=timeout 77 | ) 78 | click.echo() 79 | for feed in feeds: 80 | if not urls: 81 | pprint(vars(feed)) 82 | print() 83 | else: 84 | click.echo("{0}".format(feed)) 85 | 86 | return feeds 87 | except Exception as e: 88 | click.echo("Exception: {0}\n".format(e)) 89 | click.echo(traceback.format_exc()) 90 | 91 | return [] 92 | 93 | 94 | if __name__ == "__main__": 95 | search() 96 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | python-tag = py35.py36.py37 3 | 4 | [flake8] 5 | # Keep in sync with .flake8. This copy here is needed for source packages 6 | # to be able to pass tests without failing selfclean check. 
7 | ignore = E302, E501, W503 8 | max-line-length = 88 9 | max-complexity = 12 10 | select = B,C,E,F,W,B9 11 | 12 | [metadata] 13 | license_file = LICENSE -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | from codecs import open 7 | 8 | from setuptools import setup 9 | 10 | here = os.path.abspath(os.path.dirname(__file__)) 11 | 12 | about = {} 13 | with open(os.path.join(here, "feedsearch", "__version__.py"), "r", "utf-8") as f: 14 | exec(f.read(), about) 15 | 16 | with open(os.path.join(here, "README.rst"), encoding="utf-8") as f: 17 | readme = f.read() 18 | 19 | if sys.argv[-1] == "publish": 20 | os.system("python3 setup.py sdist bdist_wheel") 21 | os.system("twine upload dist/*") 22 | sys.exit() 23 | 24 | packages = ["feedsearch"] 25 | 26 | required = ["requests", "beautifulsoup4", "feedparser", "click", "Werkzeug"] 27 | 28 | setup( 29 | name=about["__title__"], 30 | version=about["__version__"], 31 | description=about["__description__"], 32 | long_description=readme, 33 | author=about["__author__"], 34 | author_email=about["__author_email__"], 35 | url=about["__url__"], 36 | license=about["__license__"], 37 | packages=packages, 38 | install_requires=required, 39 | classifiers=[ 40 | "License :: OSI Approved :: MIT License", 41 | "Intended Audience :: Developers", 42 | "Development Status :: 5 - Production/Stable", 43 | "Natural Language :: English", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python :: 3.5", 46 | "Programming Language :: Python :: 3.6", 47 | "Programming Language :: Python :: 3.7", 48 | ], 49 | python_requires=">=3", 50 | ) 51 | --------------------------------------------------------------------------------
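A minimal usage sketch of the library API, mirroring the search.py CLI above. The target URL is only an example, and the snippet assumes the package is installed:

from feedsearch import search

# Search a site for feeds. check_all=False keeps the search fast by returning
# as soon as feeds are found; info=True gathers additional feed details.
feeds = search("example.com", info=True, check_all=False, timeout=5)

for feed in feeds:
    # Each result is a FeedInfo object, sorted by relevance score;
    # pass as_urls=True to get plain URL strings instead.
    print(feed.url, feed.score)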