├── .gitignore ├── LICENSE ├── Pipfile ├── Pipfile.lock ├── README.rst ├── feedsearch ├── __init__.py ├── __version__.py ├── feedfinder.py ├── feedinfo.py ├── feedsearch.py ├── lib.py ├── site_meta.py └── url.py ├── search.py ├── setup.cfg └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # SageMath parsed files 80 | *.sage.py 81 | 82 | # dotenv 83 | .env 84 | 85 | # virtualenv 86 | .venv 87 | venv/ 88 | ENV/ 89 | 90 | # Spyder project settings 91 | .spyderproject 92 | .spyproject 93 | 94 | # Rope project settings 95 | .ropeproject 96 | 97 | # mkdocs documentation 98 | /site 99 | 100 | # mypy 101 | .mypy_cache/ 102 | 103 | .idea/ 104 | .vscode/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 David Beath 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.python.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [dev-packages] 7 | twine = "*" 8 | black = "*" 9 | "flake8" = "*" 10 | rope = "*" 11 | 12 | [packages] 13 | requests = "*" 14 | "beautifulsoup4" = "*" 15 | feedparser = "*" 16 | click = "*" 17 | werkzeug = "*" 18 | 19 | [requires] 20 | python_version = "3.5" 21 | 22 | [pipenv] 23 | allow_prereleases = true 24 | -------------------------------------------------------------------------------- /Pipfile.lock: -------------------------------------------------------------------------------- 1 | { 2 | "_meta": { 3 | "hash": { 4 | "sha256": "e3fd544b327cb4788ca7440bee564bbeb55f243e04f76c4b342d2f7bc0037c28" 5 | }, 6 | "pipfile-spec": 6, 7 | "requires": { 8 | "python_version": "3.5" 9 | }, 10 | "sources": [ 11 | { 12 | "name": "pypi", 13 | "url": "https://pypi.python.org/simple", 14 | "verify_ssl": true 15 | } 16 | ] 17 | }, 18 | "default": { 19 | "beautifulsoup4": { 20 | "hashes": [ 21 | "sha256:05fd825eb01c290877657a56df4c6e4c311b3965bda790c613a3d6fb01a5462a", 22 | "sha256:9fbb4d6e48ecd30bcacc5b63b94088192dcda178513b2ae3c394229f8911b887", 23 | "sha256:e1505eeed31b0f4ce2dbb3bc8eb256c04cc2b3b72af7d551a4ab6efd5cbe5dae" 24 | ], 25 | "index": "pypi", 26 | "version": "==4.8.2" 27 | }, 28 | "certifi": { 29 | "hashes": [ 30 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", 31 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" 32 | ], 33 | "version": "==2019.11.28" 34 | }, 35 | "chardet": { 36 | "hashes": [ 37 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 38 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 39 | ], 40 | "version": "==3.0.4" 41 | }, 42 | "click": { 43 | "hashes": [ 44 | "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", 45 | "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" 46 | ], 47 | "index": "pypi", 48 | "version": "==7.0" 49 | }, 50 | "feedparser": { 51 | "hashes": [ 52 | "sha256:150ccca4cfc3481f7ff503988a91bbdbbbc3406d6444bfe9cfe6c1001d378e73", 53 | "sha256:87185443d6e12cf870125bdc9211168c60895e7dd7209b5c082897ddb1b11efb" 54 | ], 55 | "index": "pypi", 56 | "version": "==6.0.0b1" 57 | }, 58 | "idna": { 59 | "hashes": [ 60 | "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 61 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 62 | ], 63 | "version": "==2.8" 64 | }, 65 | "requests": { 66 | "hashes": [ 67 | "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", 68 | "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" 69 | ], 70 | "index": "pypi", 71 | "version": "==2.22.0" 72 | }, 73 | "sgmllib3k": { 74 | "hashes": [ 75 | "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9" 76 | ], 77 | "markers": "python_version >= '3.0'", 78 | "version": "==1.0.0" 79 | }, 80 | "soupsieve": { 81 | "hashes": [ 82 | "sha256:bdb0d917b03a1369ce964056fc195cfdff8819c40de04695a80bc813c3cfa1f5", 83 | "sha256:e2c1c5dee4a1c36bcb790e0fabd5492d874b8ebd4617622c4f6a731701060dda" 84 | ], 85 | "version": "==1.9.5" 86 | }, 87 | "urllib3": { 88 | "hashes": [ 89 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", 90 | 
"sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" 91 | ], 92 | "version": "==1.25.8" 93 | }, 94 | "werkzeug": { 95 | "hashes": [ 96 | "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", 97 | "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" 98 | ], 99 | "index": "pypi", 100 | "version": "==1.0.0" 101 | } 102 | }, 103 | "develop": { 104 | "appdirs": { 105 | "hashes": [ 106 | "sha256:9e5896d1372858f8dd3344faf4e5014d21849c756c8d5701f78f8a103b372d92", 107 | "sha256:d8b24664561d0d34ddfaec54636d502d7cea6e29c3eaf68f3df6180863e2166e" 108 | ], 109 | "version": "==1.4.3" 110 | }, 111 | "attrs": { 112 | "hashes": [ 113 | "sha256:08a96c641c3a74e44eb59afb61a24f2cb9f4d7188748e76ba4bb5edfa3cb7d1c", 114 | "sha256:f7b7ce16570fe9965acd6d30101a28f62fb4a7f9e926b3bbc9b61f8b04247e72" 115 | ], 116 | "version": "==19.3.0" 117 | }, 118 | "black": { 119 | "hashes": [ 120 | "sha256:1b30e59be925fafc1ee4565e5e08abef6b03fe455102883820fe5ee2e4734e0b", 121 | "sha256:c2edb73a08e9e0e6f65a0e6af18b059b8b1cdd5bef997d7a0b181df93dc81539" 122 | ], 123 | "index": "pypi", 124 | "version": "==19.10b0" 125 | }, 126 | "bleach": { 127 | "hashes": [ 128 | "sha256:213336e49e102af26d9cde77dd2d0397afabc5a6bf2fed985dc35b5d1e285a16", 129 | "sha256:3fdf7f77adcf649c9911387df51254b813185e32b2c6619f690b593a617e19fa" 130 | ], 131 | "version": "==3.1.0" 132 | }, 133 | "certifi": { 134 | "hashes": [ 135 | "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", 136 | "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" 137 | ], 138 | "version": "==2019.11.28" 139 | }, 140 | "cffi": { 141 | "hashes": [ 142 | "sha256:001bf3242a1bb04d985d63e138230802c6c8d4db3668fb545fb5005ddf5bb5ff", 143 | "sha256:00789914be39dffba161cfc5be31b55775de5ba2235fe49aa28c148236c4e06b", 144 | "sha256:028a579fc9aed3af38f4892bdcc7390508adabc30c6af4a6e4f611b0c680e6ac", 145 | "sha256:14491a910663bf9f13ddf2bc8f60562d6bc5315c1f09c704937ef17293fb85b0", 146 | "sha256:1cae98a7054b5c9391eb3249b86e0e99ab1e02bb0cc0575da191aedadbdf4384", 147 | "sha256:2089ed025da3919d2e75a4d963d008330c96751127dd6f73c8dc0c65041b4c26", 148 | "sha256:2d384f4a127a15ba701207f7639d94106693b6cd64173d6c8988e2c25f3ac2b6", 149 | "sha256:337d448e5a725bba2d8293c48d9353fc68d0e9e4088d62a9571def317797522b", 150 | "sha256:399aed636c7d3749bbed55bc907c3288cb43c65c4389964ad5ff849b6370603e", 151 | "sha256:3b911c2dbd4f423b4c4fcca138cadde747abdb20d196c4a48708b8a2d32b16dd", 152 | "sha256:3d311bcc4a41408cf5854f06ef2c5cab88f9fded37a3b95936c9879c1640d4c2", 153 | "sha256:62ae9af2d069ea2698bf536dcfe1e4eed9090211dbaafeeedf5cb6c41b352f66", 154 | "sha256:66e41db66b47d0d8672d8ed2708ba91b2f2524ece3dee48b5dfb36be8c2f21dc", 155 | "sha256:675686925a9fb403edba0114db74e741d8181683dcf216be697d208857e04ca8", 156 | "sha256:7e63cbcf2429a8dbfe48dcc2322d5f2220b77b2e17b7ba023d6166d84655da55", 157 | "sha256:8a6c688fefb4e1cd56feb6c511984a6c4f7ec7d2a1ff31a10254f3c817054ae4", 158 | "sha256:8c0ffc886aea5df6a1762d0019e9cb05f825d0eec1f520c51be9d198701daee5", 159 | "sha256:95cd16d3dee553f882540c1ffe331d085c9e629499ceadfbda4d4fde635f4b7d", 160 | "sha256:99f748a7e71ff382613b4e1acc0ac83bf7ad167fb3802e35e90d9763daba4d78", 161 | "sha256:b8c78301cefcf5fd914aad35d3c04c2b21ce8629b5e4f4e45ae6812e461910fa", 162 | "sha256:c420917b188a5582a56d8b93bdd8e0f6eca08c84ff623a4c16e809152cd35793", 163 | "sha256:c43866529f2f06fe0edc6246eb4faa34f03fe88b64a0a9a942561c8e22f4b71f", 164 | "sha256:cab50b8c2250b46fe738c77dbd25ce017d5e6fb35d3407606e7a4180656a5a6a", 165 | 
"sha256:cef128cb4d5e0b3493f058f10ce32365972c554572ff821e175dbc6f8ff6924f", 166 | "sha256:cf16e3cf6c0a5fdd9bc10c21687e19d29ad1fe863372b5543deaec1039581a30", 167 | "sha256:e56c744aa6ff427a607763346e4170629caf7e48ead6921745986db3692f987f", 168 | "sha256:e577934fc5f8779c554639376beeaa5657d54349096ef24abe8c74c5d9c117c3", 169 | "sha256:f2b0fa0c01d8a0c7483afd9f31d7ecf2d71760ca24499c8697aeb5ca37dc090c" 170 | ], 171 | "version": "==1.14.0" 172 | }, 173 | "chardet": { 174 | "hashes": [ 175 | "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae", 176 | "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691" 177 | ], 178 | "version": "==3.0.4" 179 | }, 180 | "click": { 181 | "hashes": [ 182 | "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13", 183 | "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7" 184 | ], 185 | "index": "pypi", 186 | "version": "==7.0" 187 | }, 188 | "cryptography": { 189 | "hashes": [ 190 | "sha256:02079a6addc7b5140ba0825f542c0869ff4df9a69c360e339ecead5baefa843c", 191 | "sha256:1df22371fbf2004c6f64e927668734070a8953362cd8370ddd336774d6743595", 192 | "sha256:369d2346db5934345787451504853ad9d342d7f721ae82d098083e1f49a582ad", 193 | "sha256:3cda1f0ed8747339bbdf71b9f38ca74c7b592f24f65cdb3ab3765e4b02871651", 194 | "sha256:44ff04138935882fef7c686878e1c8fd80a723161ad6a98da31e14b7553170c2", 195 | "sha256:4b1030728872c59687badcca1e225a9103440e467c17d6d1730ab3d2d64bfeff", 196 | "sha256:58363dbd966afb4f89b3b11dfb8ff200058fbc3b947507675c19ceb46104b48d", 197 | "sha256:6ec280fb24d27e3d97aa731e16207d58bd8ae94ef6eab97249a2afe4ba643d42", 198 | "sha256:7270a6c29199adc1297776937a05b59720e8a782531f1f122f2eb8467f9aab4d", 199 | "sha256:73fd30c57fa2d0a1d7a49c561c40c2f79c7d6c374cc7750e9ac7c99176f6428e", 200 | "sha256:7f09806ed4fbea8f51585231ba742b58cbcfbfe823ea197d8c89a5e433c7e912", 201 | "sha256:90df0cc93e1f8d2fba8365fb59a858f51a11a394d64dbf3ef844f783844cc793", 202 | "sha256:971221ed40f058f5662a604bd1ae6e4521d84e6cad0b7b170564cc34169c8f13", 203 | "sha256:a518c153a2b5ed6b8cc03f7ae79d5ffad7315ad4569b2d5333a13c38d64bd8d7", 204 | "sha256:b0de590a8b0979649ebeef8bb9f54394d3a41f66c5584fff4220901739b6b2f0", 205 | "sha256:b43f53f29816ba1db8525f006fa6f49292e9b029554b3eb56a189a70f2a40879", 206 | "sha256:d31402aad60ed889c7e57934a03477b572a03af7794fa8fb1780f21ea8f6551f", 207 | "sha256:de96157ec73458a7f14e3d26f17f8128c959084931e8997b9e655a39c8fde9f9", 208 | "sha256:df6b4dca2e11865e6cfbfb708e800efb18370f5a46fd601d3755bc7f85b3a8a2", 209 | "sha256:ecadccc7ba52193963c0475ac9f6fa28ac01e01349a2ca48509667ef41ffd2cf", 210 | "sha256:fb81c17e0ebe3358486cd8cc3ad78adbae58af12fc2bf2bc0bb84e8090fa5ce8" 211 | ], 212 | "version": "==2.8" 213 | }, 214 | "docutils": { 215 | "hashes": [ 216 | "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", 217 | "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc" 218 | ], 219 | "version": "==0.16" 220 | }, 221 | "entrypoints": { 222 | "hashes": [ 223 | "sha256:589f874b313739ad35be6e0cd7efde2a4e9b6fea91edcc34e58ecbb8dbe56d19", 224 | "sha256:c70dd71abe5a8c85e55e12c19bd91ccfeec11a6e99044204511f9ed547d48451" 225 | ], 226 | "version": "==0.3" 227 | }, 228 | "flake8": { 229 | "hashes": [ 230 | "sha256:45681a117ecc81e870cbf1262835ae4af5e7a8b08e40b944a8a6e6b895914cfb", 231 | "sha256:49356e766643ad15072a789a20915d3c91dc89fd313ccd71802303fd67e4deca" 232 | ], 233 | "index": "pypi", 234 | "version": "==3.7.9" 235 | }, 236 | "idna": { 237 | "hashes": [ 238 | 
"sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", 239 | "sha256:ea8b7f6188e6fa117537c3df7da9fc686d485087abf6ac197f9c46432f7e4a3c" 240 | ], 241 | "version": "==2.8" 242 | }, 243 | "importlib-metadata": { 244 | "hashes": [ 245 | "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", 246 | "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" 247 | ], 248 | "markers": "python_version < '3.8'", 249 | "version": "==1.5.0" 250 | }, 251 | "jeepney": { 252 | "hashes": [ 253 | "sha256:0ba6d8c597e9bef1ebd18aaec595f942a264e25c1a48f164d46120eacaa2e9bb", 254 | "sha256:6f45dce1125cf6c58a1c88123d3831f36a789f9204fbad3172eac15f8ccd08d0" 255 | ], 256 | "markers": "sys_platform == 'linux'", 257 | "version": "==0.4.2" 258 | }, 259 | "keyring": { 260 | "hashes": [ 261 | "sha256:1f393f7466314068961c7e1d508120c092bd71fa54e3d93b76180b526d4abc56", 262 | "sha256:24ae23ab2d6adc59138339e56843e33ec7b0a6b2f06302662477085c6c0aca00" 263 | ], 264 | "version": "==21.1.0" 265 | }, 266 | "mccabe": { 267 | "hashes": [ 268 | "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42", 269 | "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f" 270 | ], 271 | "version": "==0.6.1" 272 | }, 273 | "pathspec": { 274 | "hashes": [ 275 | "sha256:163b0632d4e31cef212976cf57b43d9fd6b0bac6e67c26015d611a647d5e7424", 276 | "sha256:562aa70af2e0d434367d9790ad37aed893de47f1693e4201fd1d3dca15d19b96" 277 | ], 278 | "version": "==0.7.0" 279 | }, 280 | "pkginfo": { 281 | "hashes": [ 282 | "sha256:7424f2c8511c186cd5424bbf31045b77435b37a8d604990b79d4e70d741148bb", 283 | "sha256:a6d9e40ca61ad3ebd0b72fbadd4fba16e4c0e4df0428c041e01e06eb6ee71f32" 284 | ], 285 | "version": "==1.5.0.1" 286 | }, 287 | "pycodestyle": { 288 | "hashes": [ 289 | "sha256:95a2219d12372f05704562a14ec30bc76b05a5b297b21a5dfe3f6fac3491ae56", 290 | "sha256:e40a936c9a450ad81df37f549d676d127b1b66000a6c500caa2b085bc0ca976c" 291 | ], 292 | "version": "==2.5.0" 293 | }, 294 | "pycparser": { 295 | "hashes": [ 296 | "sha256:a988718abfad80b6b157acce7bf130a30876d27603738ac39f140993246b25b3" 297 | ], 298 | "version": "==2.19" 299 | }, 300 | "pyflakes": { 301 | "hashes": [ 302 | "sha256:17dbeb2e3f4d772725c777fabc446d5634d1038f234e77343108ce445ea69ce0", 303 | "sha256:d976835886f8c5b31d47970ed689944a0262b5f3afa00a5a7b4dc81e5449f8a2" 304 | ], 305 | "version": "==2.1.1" 306 | }, 307 | "pygments": { 308 | "hashes": [ 309 | "sha256:2a3fe295e54a20164a9df49c75fa58526d3be48e14aceba6d6b1e8ac0bfd6f1b", 310 | "sha256:98c8aa5a9f778fcd1026a17361ddaf7330d1b7c62ae97c3bb0ae73e0b9b6b0fe" 311 | ], 312 | "version": "==2.5.2" 313 | }, 314 | "readme-renderer": { 315 | "hashes": [ 316 | "sha256:bb16f55b259f27f75f640acf5e00cf897845a8b3e4731b5c1a436e4b8529202f", 317 | "sha256:c8532b79afc0375a85f10433eca157d6b50f7d6990f337fa498c96cd4bfc203d" 318 | ], 319 | "version": "==24.0" 320 | }, 321 | "regex": { 322 | "hashes": [ 323 | "sha256:07b39bf943d3d2fe63d46281d8504f8df0ff3fe4c57e13d1656737950e53e525", 324 | "sha256:0932941cdfb3afcbc26cc3bcf7c3f3d73d5a9b9c56955d432dbf8bbc147d4c5b", 325 | "sha256:0e182d2f097ea8549a249040922fa2b92ae28be4be4895933e369a525ba36576", 326 | "sha256:10671601ee06cf4dc1bc0b4805309040bb34c9af423c12c379c83d7895622bb5", 327 | "sha256:23e2c2c0ff50f44877f64780b815b8fd2e003cda9ce817a7fd00dea5600c84a0", 328 | "sha256:26ff99c980f53b3191d8931b199b29d6787c059f2e029b2b0c694343b1708c35", 329 | "sha256:27429b8d74ba683484a06b260b7bb00f312e7c757792628ea251afdbf1434003", 330 | 
"sha256:3e77409b678b21a056415da3a56abfd7c3ad03da71f3051bbcdb68cf44d3c34d", 331 | "sha256:4e8f02d3d72ca94efc8396f8036c0d3bcc812aefc28ec70f35bb888c74a25161", 332 | "sha256:4eae742636aec40cf7ab98171ab9400393360b97e8f9da67b1867a9ee0889b26", 333 | "sha256:6a6ae17bf8f2d82d1e8858a47757ce389b880083c4ff2498dba17c56e6c103b9", 334 | "sha256:6a6ba91b94427cd49cd27764679024b14a96874e0dc638ae6bdd4b1a3ce97be1", 335 | "sha256:7bcd322935377abcc79bfe5b63c44abd0b29387f267791d566bbb566edfdd146", 336 | "sha256:98b8ed7bb2155e2cbb8b76f627b2fd12cf4b22ab6e14873e8641f266e0fb6d8f", 337 | "sha256:bd25bb7980917e4e70ccccd7e3b5740614f1c408a642c245019cff9d7d1b6149", 338 | "sha256:d0f424328f9822b0323b3b6f2e4b9c90960b24743d220763c7f07071e0778351", 339 | "sha256:d58e4606da2a41659c84baeb3cfa2e4c87a74cec89a1e7c56bee4b956f9d7461", 340 | "sha256:e3cd21cc2840ca67de0bbe4071f79f031c81418deb544ceda93ad75ca1ee9f7b", 341 | "sha256:e6c02171d62ed6972ca8631f6f34fa3281d51db8b326ee397b9c83093a6b7242", 342 | "sha256:e7c7661f7276507bce416eaae22040fd91ca471b5b33c13f8ff21137ed6f248c", 343 | "sha256:ecc6de77df3ef68fee966bb8cb4e067e84d4d1f397d0ef6fce46913663540d77" 344 | ], 345 | "version": "==2020.1.8" 346 | }, 347 | "requests": { 348 | "hashes": [ 349 | "sha256:11e007a8a2aa0323f5a921e9e6a2d7e4e67d9877e85773fba9ba6419025cbeb4", 350 | "sha256:9cf5292fcd0f598c671cfc1e0d7d1a7f13bb8085e9a590f48c010551dc6c4b31" 351 | ], 352 | "index": "pypi", 353 | "version": "==2.22.0" 354 | }, 355 | "requests-toolbelt": { 356 | "hashes": [ 357 | "sha256:380606e1d10dc85c3bd47bf5a6095f815ec007be7a8b69c878507068df059e6f", 358 | "sha256:968089d4584ad4ad7c171454f0a5c6dac23971e9472521ea3b6d49d610aa6fc0" 359 | ], 360 | "version": "==0.9.1" 361 | }, 362 | "rope": { 363 | "hashes": [ 364 | "sha256:52423a7eebb5306a6d63bdc91a7c657db51ac9babfb8341c9a1440831ecf3203", 365 | "sha256:ae1fa2fd56f64f4cc9be46493ce54bed0dd12dee03980c61a4393d89d84029ad", 366 | "sha256:d2830142c2e046f5fc26a022fe680675b6f48f81c7fc1f03a950706e746e9dfe" 367 | ], 368 | "index": "pypi", 369 | "version": "==0.16.0" 370 | }, 371 | "secretstorage": { 372 | "hashes": [ 373 | "sha256:15da8a989b65498e29be338b3b279965f1b8f09b9668bd8010da183024c8bff6", 374 | "sha256:b5ec909dde94d4ae2fa26af7c089036997030f0cf0a5cb372b4cccabd81c143b" 375 | ], 376 | "markers": "sys_platform == 'linux'", 377 | "version": "==3.1.2" 378 | }, 379 | "six": { 380 | "hashes": [ 381 | "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", 382 | "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" 383 | ], 384 | "version": "==1.14.0" 385 | }, 386 | "toml": { 387 | "hashes": [ 388 | "sha256:229f81c57791a41d65e399fc06bf0848bab550a9dfd5ed66df18ce5f05e73d5c", 389 | "sha256:235682dd292d5899d361a811df37e04a8828a5b1da3115886b73cf81ebc9100e" 390 | ], 391 | "version": "==0.10.0" 392 | }, 393 | "tqdm": { 394 | "hashes": [ 395 | "sha256:251ee8440dbda126b8dfa8a7c028eb3f13704898caaef7caa699b35e119301e2", 396 | "sha256:fe231261cfcbc6f4a99165455f8f6b9ef4e1032a6e29bccf168b4bf42012f09c" 397 | ], 398 | "version": "==4.42.1" 399 | }, 400 | "twine": { 401 | "hashes": [ 402 | "sha256:c1af8ca391e43b0a06bbc155f7f67db0bf0d19d284bfc88d1675da497a946124", 403 | "sha256:d561a5e511f70275e5a485a6275ff61851c16ffcb3a95a602189161112d9f160" 404 | ], 405 | "index": "pypi", 406 | "version": "==3.1.1" 407 | }, 408 | "typed-ast": { 409 | "hashes": [ 410 | "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", 411 | "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", 412 | 
"sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", 413 | "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", 414 | "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", 415 | "sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", 416 | "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", 417 | "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", 418 | "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", 419 | "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", 420 | "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", 421 | "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", 422 | "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", 423 | "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", 424 | "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", 425 | "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", 426 | "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", 427 | "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", 428 | "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", 429 | "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", 430 | "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" 431 | ], 432 | "version": "==1.4.1" 433 | }, 434 | "urllib3": { 435 | "hashes": [ 436 | "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", 437 | "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" 438 | ], 439 | "version": "==1.25.8" 440 | }, 441 | "webencodings": { 442 | "hashes": [ 443 | "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78", 444 | "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923" 445 | ], 446 | "version": "==0.5.1" 447 | }, 448 | "zipp": { 449 | "hashes": [ 450 | "sha256:5c56e330306215cd3553342cfafc73dda2c60792384117893f3a83f8a1209f50", 451 | "sha256:d65287feb793213ffe11c0f31b81602be31448f38aeb8ffc2eb286c4f6f6657e" 452 | ], 453 | "version": "==2.2.0" 454 | } 455 | } 456 | } 457 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | Feedsearch 2 | ========== 3 | .. image:: https://img.shields.io/pypi/v/feedsearch.svg 4 | :target: https://pypi.python.org/pypi/feedsearch 5 | 6 | .. image:: https://img.shields.io/pypi/l/feedsearch.svg 7 | :target: https://pypi.python.org/pypi/feedsearch 8 | 9 | .. image:: https://img.shields.io/pypi/pyversions/feedsearch.svg 10 | :target: https://pypi.python.org/pypi/feedsearch 11 | 12 | .. image:: https://pepy.tech/badge/feedsearch 13 | :target: https://pepy.tech/project/feedsearch 14 | 15 | Feedsearch is a Python library for searching websites for RSS, Atom, and JSON feeds. 16 | 17 | It was originally based on 18 | `Feedfinder2 `_ written by 19 | `Dan Foreman-Mackey `_, which in turn is based on 20 | `feedfinder `_ - originally written by 21 | `Mark Pilgrim `_ 22 | and subsequently maintained by 23 | `Aaron Swartz `_ until his untimely death. 
24 | 25 | Feedsearch now differs significantly from Feedfinder2, in that Feedsearch supports JSON feeds, allows for 26 | optional fetching of Feed and Site metadata, and optionally searches the content of internally linked pages 27 | and default CMS feed locations. 28 | 29 | **Please Note:** Development of this library is no longer ongoing except in the case of fixing reported bugs. 30 | Further development of Feedsearch functionality has now moved to 31 | `Feedsearch Crawler `_. 32 | 33 | Usage 34 | ----- 35 | 36 | Feedsearch is called with the single function ``search``: 37 | 38 | .. code-block:: python 39 | 40 | >>> from feedsearch import search 41 | >>> feeds = search('xkcd.com') 42 | >>> feeds 43 | [FeedInfo('https://xkcd.com/atom.xml'), FeedInfo('https://xkcd.com/rss.xml')] 44 | >>> feeds[0].url 45 | 'https://xkcd.com/atom.xml' 46 | 47 | To get Feed and Site metadata: 48 | 49 | .. code-block:: python 50 | 51 | >>> feeds = search('propublica.org', info=True) 52 | >>> feeds 53 | [FeedInfo('http://feeds.propublica.org/propublica/main')] 54 | >>> pprint(vars(feeds[0])) 55 | {'bozo': 0, 56 | 'content_type': 'text/xml; charset=UTF-8', 57 | 'description': 'Latest Articles and Investigations from ProPublica, an ' 58 | 'independent, non-profit newsroom that produces investigative ' 59 | 'journalism in the public interest.', 60 | 'favicon': 'https://assets.propublica.org/prod/v3/images/favicon.ico', 61 | 'favicon_data_uri': '', 62 | 'hubs': ['http://feedpress.superfeedr.com/'], 63 | 'is_push': True, 64 | 'score': 4, 65 | 'self_url': 'http://feeds.propublica.org/propublica/main', 66 | 'site_name': 'ProPublica', 67 | 'site_url': 'https://www.propublica.org/', 68 | 'title': 'Articles and Investigations - ProPublica', 69 | 'url': 'http://feeds.propublica.org/propublica/main', 70 | 'version': 'rss20'} 71 | 72 | Search will always return a list of *FeedInfo* objects, each of which will always have a *url* property. 73 | Feeds are sorted by the *score* value from highest to lowest, with a higher score theoretically indicating 74 | a more relevant feed compared to the original URL provided. 75 | 76 | If you only want the raw urls, then use a list comprehension on the result, or set the 77 | *as_urls* parameter to *True*: 78 | 79 | .. code-block:: python 80 | 81 | >>> feeds = search('http://jsonfeed.org') 82 | >>> feeds 83 | [FeedInfo('https://jsonfeed.org/xml/rss.xml'), FeedInfo('https://jsonfeed.org/feed.json')] 84 | >>> urls = [f.url for f in feeds] 85 | >>> urls 86 | ['https://jsonfeed.org/xml/rss.xml', 'https://jsonfeed.org/feed.json'] 87 | 88 | >>> feeds = search('http://jsonfeed.org', as_urls=True) 89 | >>> feeds 90 | ['https://jsonfeed.org/xml/rss.xml', 'https://jsonfeed.org/feed.json'] 91 | 92 | In addition to the URL, the ``search`` function takes the following optional keyword arguments (a combined example follows the Search Order section below): 93 | 94 | - **info**: *bool*: Get Feed and Site Metadata. Defaults False. 95 | - **check_all**: *bool*: Check all internally linked pages from ``<a>`` tags for feeds, and default CMS feeds. 96 | Only checks one level down. Defaults False. May be very slow. 97 | - **user_agent**: *str*: User-Agent Header string. Defaults to Package name. 98 | - **timeout**: *float* or *tuple(float, float)*: Timeout for each request in the search (not a timeout for the ``search`` 99 | method itself). Defaults to 3 seconds. See 100 | `Requests timeout documentation `_ for more info. 101 | - **max_redirects**: *int*: Maximum number of redirects for each request. Defaults to 30. 102 | - **parser**: *str*: BeautifulSoup parser for HTML parsing.
Defaults to 'html.parser'. 103 | - **exceptions**: *bool*: If False, will gracefully handle Requests exceptions and attempt to keep searching. 104 | If True, will leave Requests exceptions uncaught to be handled by the caller. Defaults False. 105 | - **verify**: *bool* or *str*: Verify SSL Certificates. See 106 | `Requests SSL documentation `_ for more info. 107 | - **favicon_data_uri**: *bool*: Convert Favicon to Data Uri. Defaults False. 108 | - **as_urls**: *bool*: Return found Feeds as a list of URL strings instead of FeedInfo objects. 109 | - **cms**: *bool*: Check default CMS feed location if no feeds already found and site is using a known CMS. Defaults True. 110 | - **discovery_only**: *bool*: Only search for RSS discovery tags (e.g. ``<link rel="alternate">``). Defaults False. 111 | Overridden by **check_all** if **check_all** is True. 112 | 113 | FeedInfo Values 114 | --------------- 115 | 116 | FeedInfo objects may have the following values if *info* is *True*: 117 | 118 | - **bozo**: *int*: Set to 1 when feed data is not well formed or may not be a feed. Defaults 0. 119 | - **content_type**: *str*: Content-Type value of the returned feed. 120 | - **description**: *str*: Feed description. 121 | - **favicon**: *str*: Url of site Favicon. 122 | - **favicon_data_uri**: *str*: Data Uri of site Favicon. 123 | - **hubs**: *List[str]*: List of `Websub `_ hubs of feed if available. 124 | - **is_push**: *bool*: True if feed contains valid Websub data. 125 | - **score**: *int*: Computed relevance of feed url value to provided URL. May be safely ignored. 126 | - **self_url**: *str*: *rel="self"* value returned from feed links. In some cases may be different from feed url. 127 | - **site_name**: *str*: Name of feed's website. 128 | - **site_url**: *str*: URL of feed's website. 129 | - **title**: *str*: Feed Title. 130 | - **url**: *str*: URL location of feed. 131 | - **version**: *str*: Feed version `XML values `_, 132 | or `JSON feed `_. 133 | 134 | 135 | Search Order 136 | ------------ 137 | 138 | Feedsearch searches for feeds in the following order: 139 | 140 | 1. If the URL points directly to a feed, then return that feed. 141 | 2. If **discovery_only** is True, search only ``<link rel="alternate">`` tags. Return unless **check_all** is True. 142 | 3. Search all ``<link>`` tags. Return if feeds are found and **check_all** is False. 143 | 4. If **cms** or **check_all** is True, search for default CMS feeds if the site is using a known CMS. Return if feeds are found and **check_all** is False. 144 | 5. Search all ``<a>`` tags. Return if **check_all** is False. 145 | 6. This point will only be reached if **check_all** is True. 146 | 7. Fetch the content of all internally pointing ``<a>`` tags whose URL paths indicate they may contain feeds (e.g. /feed /rss /atom). All ``<link>`` and ``<a>`` tags of the fetched content are searched, although not recursively. Return if feeds are found. This step may be very slow, so be sure that you actually want **check_all** enabled. 147 | 8. If step 7 failed to find feeds, then as a last resort we make a few guesses for potential feed urls.
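The example below combines several of the keyword arguments described above into a single call. It is illustrative only: the site URL, User-Agent string, and parameter values are placeholders, not recommendations.

.. code-block:: python

    >>> from feedsearch import search
    >>> feeds = search(
    ...     'example.com',          # site to search; scheme is coerced to HTTPS if missing
    ...     info=True,              # fetch Feed and Site metadata
    ...     check_all=True,         # also check internal <a> pages and default CMS feeds (slow)
    ...     timeout=(3.05, 10),     # (connect, read) timeout per request
    ...     user_agent='MyFeedBot/1.0',
    ...     exceptions=False,       # handle Requests exceptions gracefully and keep searching
    ...     as_urls=False,          # return FeedInfo objects rather than URL strings
    ... )
    >>> urls = [f.url for f in feeds]  # highest scored (most relevant) feed first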
148 | -------------------------------------------------------------------------------- /feedsearch/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .feedsearch import search 4 | 5 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 6 | -------------------------------------------------------------------------------- /feedsearch/__version__.py: -------------------------------------------------------------------------------- 1 | __title__ = "feedsearch" 2 | __description__ = "Search sites for RSS, Atom, and JSON feeds" 3 | __url__ = "https://github.com/DBeath/feedsearch" 4 | __version__ = "1.0.12" 5 | __author__ = "David Beath" 6 | __author_email__ = "davidgbeath@gmail.com" 7 | __license__ = "MIT" 8 | -------------------------------------------------------------------------------- /feedsearch/feedfinder.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Tuple, Union 3 | from urllib.parse import urljoin, urlparse 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | from .feedinfo import FeedInfo 8 | from .site_meta import SiteMeta 9 | from .url import URL 10 | from .lib import create_soup 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class FeedFinder: 16 | def __init__( 17 | self, coerced_url: str, feed_info: bool = False, favicon_data_uri: bool = False 18 | ) -> None: 19 | self.get_feed_info = feed_info # type: bool 20 | self.favicon_data_uri = favicon_data_uri # type: bool 21 | self.soup = None 22 | self.site_meta = None 23 | self.feeds = [] # type: list 24 | self.urls = [] # type: List[URL] 25 | self.coerced_url = coerced_url # type: str 26 | 27 | def check_urls(self, urls: List[str]) -> List[FeedInfo]: 28 | """ 29 | Check if a list of Urls contain feeds 30 | 31 | :param urls: List of Url strings 32 | :return: List of FeedInfo objects 33 | """ 34 | feeds = [] 35 | for url_str in urls: 36 | url = self.get_url(url_str) 37 | if url.is_feed: 38 | feed = self.create_feed_info(url) 39 | feeds.append(feed) 40 | 41 | return feeds 42 | 43 | def create_feed_info(self, url: URL) -> FeedInfo: 44 | """ 45 | Creates a FeedInfo object from a URL object 46 | 47 | :param url: URL object 48 | :return: FeedInfo 49 | """ 50 | info = FeedInfo(url.url, content_type=url.content_type) 51 | 52 | if self.get_feed_info: 53 | info.get_info(data=url.data, headers=url.headers) 54 | 55 | if self.site_meta: 56 | info.add_site_info( 57 | self.site_meta.site_url, 58 | self.site_meta.site_name, 59 | self.site_meta.icon_url, 60 | self.site_meta.icon_data_uri, 61 | ) 62 | 63 | return info 64 | 65 | @staticmethod 66 | def search_links(soup: BeautifulSoup, url: str, rel: bool = False) -> List[str]: 67 | """ 68 | Search all links on a page for feeds 69 | 70 | :param soup: BeautifulSoup dict 71 | :param url: Url of the soup 72 | :param rel: If true, only search for RSS discovery type "alternate" links 73 | :return: list 74 | """ 75 | links = [] # type: List[str] 76 | if rel: 77 | link_tags = soup.find_all("link", rel="alternate") 78 | else: 79 | link_tags = soup.find_all("link") 80 | for link in link_tags: 81 | if link.get("type") in [ 82 | "application/rss+xml", 83 | "text/xml", 84 | "application/atom+xml", 85 | "application/x.atom+xml", 86 | "application/x-atom+xml", 87 | "application/json", 88 | ]: 89 | links.append(urljoin(url, link.get("href", ""))) 90 | 91 | return links 92 | 93 | @staticmethod 94 | def search_a_tags(soup: BeautifulSoup) -> 
Tuple[List[str], List[str]]: 95 | """ 96 | Search all 'a' tags on a page for feeds 97 | 98 | :return: Tuple[list, list] 99 | """ 100 | local, remote = [], [] 101 | for a in soup.find_all("a"): 102 | href = a.get("href", None) 103 | if href is None: 104 | continue 105 | if "://" not in href and URL.is_feed_url(href): 106 | local.append(href) 107 | if URL.is_feedlike_url(href): 108 | remote.append(href) 109 | 110 | return local, remote 111 | 112 | def get_site_info(self, url: Union[str, URL]) -> None: 113 | """ 114 | Search for site metadata 115 | 116 | :param url: Site Url 117 | :return: None 118 | """ 119 | if isinstance(url, str): 120 | self.site_meta = SiteMeta(url) 121 | elif isinstance(url, URL): 122 | self.site_meta = SiteMeta(url.url, data=url.data) 123 | if self.site_meta: 124 | self.site_meta.parse_site_info(self.favicon_data_uri) 125 | 126 | def get_url(self, url: Union[str, URL]) -> URL: 127 | """ 128 | Return a unique URL object containing fetched URL data 129 | 130 | :param url: URL string or URL object 131 | :return: URL object 132 | """ 133 | if isinstance(url, str): 134 | if "://" not in url: 135 | url = urljoin(self.coerced_url, url) 136 | url = URL(url, immediate_get=False) 137 | if url in self.urls: 138 | url = self.urls[self.urls.index(url)] 139 | else: 140 | self.urls.append(url) 141 | if not url.data: 142 | url.get_is_feed(url.url) 143 | return url 144 | 145 | def internal_feedlike_urls(self) -> List[URL]: 146 | """ 147 | Return a list of URLs that point to internal pages 148 | which may contain feeds. 149 | 150 | :return: List of URL objects 151 | """ 152 | internal = [] # type: List[URL] 153 | parsed_coerced = urlparse(self.coerced_url) 154 | for url in self.urls: 155 | if not url.is_feed and url.fetched and url.feedlike_url: 156 | parsed = urlparse(url.url) 157 | # We want to check that the url is internal. 158 | # The coerced netloc is likely to be less complete (i.e. missing www subdomain) 159 | # than the netloc of the fetched url. 160 | if parsed_coerced.netloc in parsed.netloc: 161 | internal.append(url) 162 | return internal 163 | 164 | def check_url_data(self, urls: List[URL]) -> List[FeedInfo]: 165 | """ 166 | Check the data of each URL for links which may be feeds, 167 | then check the links and return any found feeds. 
168 | 169 | :return: List of FeedInfo objects 170 | """ 171 | found = [] # type: List[FeedInfo] 172 | 173 | for url in urls: 174 | if not url.is_feed and url.data: 175 | to_search = [] # type: List[str] 176 | url_soup = create_soup(url.data) 177 | to_search.extend(self.search_links(url_soup, url.url)) 178 | local, remote = self.search_a_tags(url_soup) 179 | to_search.extend(local) 180 | to_search.extend(remote) 181 | found.extend(self.check_urls(to_search)) 182 | 183 | return found 184 | -------------------------------------------------------------------------------- /feedsearch/feedinfo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | from typing import Tuple, Any, List 4 | 5 | import feedparser 6 | from bs4 import BeautifulSoup 7 | 8 | from .lib import bs4_parser, parse_header_links 9 | from .url import URL 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | class FeedInfo: 15 | def __init__( 16 | self, 17 | url: str, 18 | site_url: str = "", 19 | title: str = "", 20 | description: str = "", 21 | site_name: str = "", 22 | favicon: str = "", 23 | hubs: list = None, 24 | is_push: bool = False, 25 | content_type: str = "", 26 | version: str = "", 27 | self_url: str = "", 28 | score: int = 0, 29 | bozo: int = 0, 30 | favicon_data_uri: str = "", 31 | ) -> None: 32 | self.url = url 33 | self.site_url = site_url 34 | self.title = title 35 | self.description = description 36 | self.site_name = site_name 37 | self.favicon = favicon 38 | self.hubs = hubs or [] 39 | self.is_push = is_push 40 | self.content_type = content_type 41 | self.version = version 42 | self.self_url = self_url 43 | self.bozo = bozo 44 | self.score = score 45 | self.favicon_data_uri = favicon_data_uri 46 | 47 | def __repr__(self): 48 | return "{0}({1})".format(self.__class__.__name__, self.url.__repr__()) 49 | 50 | def __eq__(self, other): 51 | return self.url == other.url 52 | 53 | def __hash__(self): 54 | return hash(self.url) 55 | 56 | def get_info(self, data: Any = None, headers: dict = None) -> None: 57 | """ 58 | Get Feed info from data. 59 | 60 | :param data: Feed data, XML string or JSON object 61 | :param headers: HTTP Headers of the Feed Url 62 | :return: None 63 | """ 64 | logger.debug("Getting FeedInfo for %s", self.url) 65 | 66 | # Get data from URL if no data provided 67 | url_object = None 68 | if not data: 69 | url_object = URL(self.url) 70 | if url_object.is_feed: 71 | self.update_from_url( 72 | url_object.url, 73 | url_object.content_type, 74 | url_object.data, 75 | url_object.headers, 76 | ) 77 | 78 | if not headers and url_object: 79 | headers = url_object.headers 80 | 81 | # Check link headers first for WebSub content discovery 82 | # https://www.w3.org/TR/websub/#discovery 83 | if headers: 84 | self.hubs, self.self_url = self.header_links(headers) 85 | 86 | # Try to parse data as JSON 87 | try: 88 | json_data = json.loads(data) 89 | logger.debug("%s data is JSON", self) 90 | self.content_type = "application/json" 91 | self.parse_json(json_data) 92 | return 93 | except json.JSONDecodeError: 94 | pass 95 | 96 | self.parse_xml(data) 97 | 98 | def parse_xml(self, data: str) -> None: 99 | """ 100 | Get info from XML (RSS or ATOM) feed. 
101 | :param data: XML string 102 | :return: None 103 | """ 104 | # Parse data with feedparser 105 | # Don't wrap this in try/except, feedparser eats errors and returns bozo instead 106 | parsed = self.parse_feed(data) 107 | if not parsed or parsed.get("bozo") == 1: 108 | self.bozo = 1 109 | logger.warning("No valid feed data in %s", self.url) 110 | return 111 | 112 | feed = parsed.get("feed") 113 | 114 | # Only search if no hubs already present from headers 115 | if not self.hubs: 116 | self.hubs, self.self_url = self.websub_links(feed) 117 | 118 | if self.hubs and self.self_url: 119 | self.is_push = True 120 | 121 | self.version = parsed.get("version") 122 | self.title = self.feed_title(feed) 123 | self.description = self.feed_description(feed) 124 | 125 | def parse_json(self, data: dict) -> None: 126 | """ 127 | Get info from JSON feed. 128 | 129 | :param data: JSON object 130 | :return: None 131 | """ 132 | self.version = data.get("version") 133 | if "https://jsonfeed.org/version/" not in self.version: 134 | self.bozo = 1 135 | return 136 | 137 | feed_url = data.get("feed_url") 138 | # Check URL from feed data if mismatch 139 | if feed_url and feed_url != self.url: 140 | url = URL(feed_url) 141 | if url.is_feed: 142 | self.update_from_url(url.url, url.content_type, url.data) 143 | return 144 | 145 | self.title = data.get("title") 146 | self.description = data.get("description") 147 | 148 | favicon = data.get("favicon") 149 | if favicon: 150 | self.favicon = favicon 151 | 152 | # Only search if no hubs already present from headers 153 | if not self.hubs: 154 | try: 155 | self.hubs = list(hub.get("url") for hub in data.get("hubs", [])) 156 | except (IndexError, AttributeError): 157 | pass 158 | 159 | if self.hubs: 160 | self.is_push = True 161 | 162 | @staticmethod 163 | def parse_feed(text: str) -> dict: 164 | """ 165 | Parse feed with feedparser. 166 | 167 | :param text: Feed string 168 | :return: dict 169 | """ 170 | return feedparser.parse(text) 171 | 172 | @staticmethod 173 | def feed_title(feed: dict) -> str: 174 | """ 175 | Get feed title 176 | 177 | :param feed: feed dict 178 | :return: str 179 | """ 180 | title = feed.get("title", None) 181 | if not title: 182 | return "" 183 | return FeedInfo.clean_title(title) 184 | 185 | @staticmethod 186 | def clean_title(title: str) -> str: 187 | """ 188 | Cleans title string, and shortens if too long. 189 | Have had issues with dodgy feed titles. 190 | 191 | :param title: Title string 192 | :return: str 193 | """ 194 | try: 195 | title = BeautifulSoup(title, bs4_parser).get_text() 196 | if len(title) > 1024: 197 | title = title[:1020] + "..." 198 | return title 199 | except Exception as ex: 200 | logger.exception("Failed to clean title: %s", ex) 201 | return "" 202 | 203 | @staticmethod 204 | def feed_description(feed: dict) -> str: 205 | """ 206 | Get feed description. 207 | 208 | :param feed: feed dict 209 | :return: str 210 | """ 211 | subtitle = feed.get("subtitle", None) 212 | if subtitle: 213 | return subtitle 214 | return feed.get("description", None) 215 | 216 | @staticmethod 217 | def websub_links(feed: dict) -> Tuple[List[str], str]: 218 | """ 219 | Returns a tuple containing the hub url and the self url for 220 | a parsed feed. 
221 | 222 | :param feed: An RSS feed parsed by feedparser 223 | :return: tuple 224 | """ 225 | links = feed.get("links", []) 226 | return FeedInfo.find_hubs_and_self_links(links) 227 | 228 | def add_site_info( 229 | self, url: str = "", name: str = "", icon: str = "", icon_data_uri: str = "" 230 | ) -> None: 231 | """ 232 | Adds site meta info to FeedInfo 233 | 234 | :param url: Site URL 235 | :param name: Site Name 236 | :param icon: Site Favicon 237 | :param icon_data_uri: Site Favicon as Data Uri 238 | :return: None 239 | """ 240 | self.site_url = url 241 | self.site_name = name 242 | self.favicon = icon 243 | self.favicon_data_uri = icon_data_uri 244 | 245 | def update_from_url( 246 | self, url: str, content_type: str = "", data: Any = None, headers: dict = None 247 | ) -> None: 248 | """ 249 | Update a FeedInfo object from a Url object 250 | 251 | :param url: Url string 252 | :param content_type: Content-Type of returned Url 253 | :param data: Data from returned Url 254 | :param headers: Dict of headers 255 | :return: None 256 | """ 257 | self.url = url 258 | self.content_type = content_type 259 | self.get_info(data, headers) 260 | 261 | @classmethod 262 | def create_from_url(cls, url: str, content_type: str = ""): 263 | """ 264 | Create a FeedInfo object from a Url 265 | 266 | :param url: Url string 267 | :param content_type: Content-Type of returned Url 268 | :return: FeedInfo 269 | """ 270 | return cls(url=url, content_type=content_type) 271 | 272 | def serialize(self) -> str: 273 | """ 274 | Attempt to serialize FeedInfo to JSON string 275 | 276 | :return: JSON 277 | """ 278 | return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4) 279 | 280 | @staticmethod 281 | def header_links(headers: dict) -> Tuple[List[str], str]: 282 | """ 283 | Attempt to get self and hub links from HTTP headers 284 | https://www.w3.org/TR/websub/#x4-discovery 285 | 286 | :param headers: Dict of HTTP headers 287 | :return: None 288 | """ 289 | link_header = headers.get("Link") 290 | links = [] # type: list 291 | if link_header: 292 | links = parse_header_links(link_header) 293 | return FeedInfo.find_hubs_and_self_links(links) 294 | 295 | @staticmethod 296 | def find_hubs_and_self_links(links: List[dict]) -> Tuple[List[str], str]: 297 | """ 298 | Parses a list of links into self and hubs urls 299 | 300 | :param links: List of parsed HTTP Link Dicts 301 | :return: Tuple 302 | """ 303 | hub_urls = [] # type: List[str] 304 | self_url = "" # type: str 305 | 306 | if not links: 307 | return [], "" 308 | 309 | for link in links: 310 | try: 311 | if link["rel"] == "hub": 312 | href = link["href"] # type: str 313 | hub_urls.append(href) 314 | elif link["rel"] == "self": 315 | self_url = link["href"] 316 | except KeyError: 317 | continue 318 | 319 | return hub_urls, self_url 320 | -------------------------------------------------------------------------------- /feedsearch/feedsearch.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | from typing import List, Tuple, Union 4 | from urllib.parse import urljoin 5 | 6 | from .feedfinder import FeedFinder 7 | from .feedinfo import FeedInfo 8 | from .lib import ( 9 | coerce_url, 10 | create_requests_session, 11 | create_soup, 12 | default_timeout, 13 | get_site_root, 14 | set_bs4_parser, 15 | timeit, 16 | get_exceptions, 17 | set_exceptions, 18 | ) 19 | from requests import ReadTimeout 20 | 21 | logger = logging.getLogger(__name__) 22 | 23 | 24 | def search( 25 | url, 26 | info: 
bool = False, 27 | check_all: bool = False, 28 | cms: bool = True, 29 | discovery_only: bool = False, 30 | favicon_data_uri: bool = False, 31 | as_urls: bool = False, 32 | timeout: Union[float, Tuple[float, float]] = default_timeout, 33 | user_agent: str = "", 34 | max_redirects: int = 30, 35 | parser: str = "html.parser", 36 | exceptions: bool = False, 37 | verify: Union[bool, str] = True, 38 | ) -> Union[List[FeedInfo], List[str]]: 39 | """ 40 | Search for RSS or ATOM feeds at a given URL 41 | 42 | :param url: URL 43 | :param info: Get Feed and Site Metadata 44 | :param check_all: Check all <link> and <a> tags on page 45 | :param cms: Check default CMS feed location if site is using a known CMS. 46 | :param discovery_only: Only search for RSS discovery tags (e.g. <link rel="alternate">). 47 | :param favicon_data_uri: Fetch Favicon and convert to Data Uri 48 | :param as_urls: Return found Feeds as a list of URL strings instead 49 | of FeedInfo objects 50 | :param timeout: Request timeout, either a float or (float, float). 51 | See Requests documentation: http://docs.python-requests.org/en/master/user/advanced/#timeouts 52 | :param user_agent: User-Agent Header string 53 | :param max_redirects: Maximum Request redirects 54 | :param parser: BeautifulSoup parser ('html.parser', 'lxml', etc.). 55 | Defaults to 'html.parser' 56 | :param exceptions: If False, will gracefully handle Requests exceptions and 57 | attempt to keep searching. If True, will leave Requests exceptions 58 | uncaught to be handled externally. 59 | :param verify: Verify SSL Certificates. 60 | See Requests documentation: https://requests.readthedocs.io/en/master/user/advanced/#ssl-cert-verification 61 | :return: List of found feeds as FeedInfo objects or URL strings (depending on "as_urls" parameter). 62 | FeedInfo objects will always have a "url" value. 63 | """ 64 | # Wrap find_feeds in a Requests session 65 | with create_requests_session( 66 | user_agent=user_agent, 67 | max_redirects=max_redirects, 68 | timeout=timeout, 69 | exceptions=exceptions, 70 | verify=verify, 71 | ): 72 | # Set BeautifulSoup parser 73 | set_bs4_parser(parser) 74 | # Find feeds 75 | feeds = _find_feeds( 76 | url, 77 | feed_info=info, 78 | check_all=check_all, 79 | cms=cms, 80 | discovery_only=discovery_only, 81 | favicon_data_uri=favicon_data_uri, 82 | ) 83 | # If as_urls is true, return only URL strings 84 | if as_urls: 85 | return list(f.url for f in feeds) 86 | else: 87 | return feeds 88 | 89 | 90 | @timeit 91 | def _find_feeds( 92 | url: str, 93 | feed_info: bool = False, 94 | check_all: bool = False, 95 | cms: bool = True, 96 | discovery_only: bool = False, 97 | favicon_data_uri: bool = False, 98 | ) -> List[FeedInfo]: 99 | """ 100 | Finds feeds 101 | 102 | :param url: URL 103 | :param check_all: Check all the pages linked from <a> tags for feeds 104 | :param feed_info: Get Feed and Site Metadata 105 | :param favicon_data_uri: Fetch Favicon and convert to Data Uri 106 | :param cms: Check default CMS feed location if site is using a known CMS. 107 | :param discovery_only: Only search for RSS discovery tags (e.g. <link rel="alternate">). 108 | :return: List of found feeds as FeedInfo objects. 109 | """ 110 | # Format the URL properly.
Use HTTPS 111 | coerced_url = coerce_url(url) # type: str 112 | 113 | # Create Feedfinder 114 | finder = FeedFinder( 115 | coerced_url, feed_info=feed_info, favicon_data_uri=favicon_data_uri 116 | ) 117 | 118 | # Initialise List of found Feeds 119 | feeds = [] # type: list 120 | 121 | start_time = time.perf_counter() 122 | 123 | # Download the requested URL 124 | logger.info("Finding feeds at URL: %s", coerced_url) 125 | 126 | # If the Caller provided an explicit HTTPS URL or asked for exceptions 127 | # to be raised, then make the first fetch without explicit exception 128 | # handling, as we don't want to retry with HTTP only. 129 | if url.startswith("https://") or get_exceptions(): 130 | found_url = finder.get_url(coerced_url) 131 | # Else, we perform the fetch with exception handling, so we can retry 132 | # with an HTTP URL if we had a ReadTimeout using HTTPS. 133 | else: 134 | try: 135 | # Set context to raise RequestExceptions on first fetch. 136 | set_exceptions(True) 137 | found_url = finder.get_url(coerced_url) 138 | except ReadTimeout: 139 | # Set Local Context exception settings back to Caller provided settings. 140 | set_exceptions(False) 141 | # Coerce URL with HTTP instead of HTTPS 142 | coerced_url = coerce_url(url, https=False) 143 | finder.coerced_url = coerced_url 144 | found_url = finder.get_url(coerced_url) 145 | finally: 146 | # Always set Local Context exception settings back to Caller provided settings. 147 | set_exceptions(False) 148 | 149 | search_time = int((time.perf_counter() - start_time) * 1000) 150 | logger.debug("Searched url in %sms", search_time) 151 | 152 | # If URL is valid, then get site info if feed_info is True 153 | if found_url and found_url.is_valid: 154 | if feed_info: 155 | finder.get_site_info(found_url) 156 | # Return nothing if there is no data from the URL 157 | else: 158 | return [] 159 | 160 | # If URL is already a feed, create and return FeedInfo 161 | if found_url.is_feed: 162 | found = finder.create_feed_info(found_url) 163 | feeds.append(found) 164 | return feeds 165 | 166 | # Parse text with BeautifulSoup 167 | finder.soup = create_soup(found_url.data) 168 | 169 | # If discovery_only, then search only for <link rel="alternate"> discovery tags and return 170 | if discovery_only and not check_all: 171 | logger.debug('Looking for <link rel="alternate"> tags.') 172 | links = finder.search_links(finder.soup, found_url.url, rel=True) 173 | found_links = finder.check_urls(links) 174 | feeds.extend(found_links) 175 | logger.info('Found %s feed <link rel="alternate"> tags.', len(found_links)) 176 | return sort_urls(feeds, url) 177 | 178 | # Search for <link> tags 179 | logger.debug("Looking for <link> tags.") 180 | links = finder.search_links(finder.soup, found_url.url) 181 | found_links = finder.check_urls(links) 182 | feeds.extend(found_links) 183 | logger.info("Found %s feed <link> tags.", len(found_links)) 184 | 185 | search_time = int((time.perf_counter() - start_time) * 1000) 186 | logger.debug("Searched <link> tags in %sms", search_time) 187 | 188 | # Return if feeds are already found and check_all is False. 189 | if feeds and not check_all: 190 | return sort_urls(feeds, url) 191 | 192 | # Search for default CMS feeds. 193 | if cms or check_all: 194 | if not finder.site_meta: 195 | finder.get_site_info(coerced_url) 196 | logger.debug("Looking for CMS feeds.") 197 | cms_urls = finder.site_meta.cms_feed_urls() 198 | found_cms = finder.check_urls(cms_urls) 199 | logger.info("Found %s CMS feeds.", len(found_cms)) 200 | feeds.extend(found_cms) 201 | 202 | # Return if feeds are already found and check_all is False.
203 | if feeds and not check_all: 204 | return sort_urls(feeds, url) 205 | 206 | # Look for <a> tags. 207 | logger.debug("Looking for <a> tags.") 208 | local, remote = finder.search_a_tags(finder.soup) 209 | 210 | # Check the local URLs. 211 | local = [urljoin(coerced_url, l) for l in local] # type: list 212 | found_local = finder.check_urls(local) 213 | feeds.extend(found_local) 214 | logger.info("Found %s local links to feeds.", len(found_local)) 215 | 216 | # Check the remote URLs. 217 | # Join any relative remote hrefs against the coerced URL, then check 218 | # the combined list of local and remote URLs. 219 | remote = [urljoin(coerced_url, l) for l in remote] # type: list 220 | hrefs = local + remote 221 | found_hrefs = finder.check_urls(hrefs) 222 | feeds.extend(found_hrefs) 223 | logger.info("Found %s links to feeds.", len(found_hrefs)) 224 | 225 | search_time = int((time.perf_counter() - start_time) * 1000) 226 | logger.debug("Searched links in %sms", search_time) 227 | 228 | # Only check internal pages if check_all is True. 229 | if not check_all: 230 | return sort_urls(feeds, url) 231 | 232 | # Check all possible internal urls that may point to a feed page. 233 | internal = finder.internal_feedlike_urls() 234 | found_internal = finder.check_url_data(internal) 235 | feeds.extend(found_internal) 236 | 237 | search_time = int((time.perf_counter() - start_time) * 1000) 238 | logger.debug("Searched internal pages in %sms", search_time) 239 | 240 | # Return if feeds are found. Guessing URLs is a last resort. 241 | if feeds: 242 | return sort_urls(feeds, url) 243 | 244 | # Guessing potential URLs. 245 | fns = [ 246 | "atom.xml", 247 | "index.atom", 248 | "index.rdf", 249 | "rss.xml", 250 | "index.xml", 251 | "index.rss", 252 | "index.json", 253 | ] 254 | urls = list(urljoin(coerced_url, f) for f in fns) 255 | found_guessed = finder.check_urls(urls) 256 | feeds.extend(found_guessed) 257 | logger.info("Found %s guessed links to feeds.", len(found_guessed)) 258 | 259 | search_time = int((time.perf_counter() - start_time) * 1000) 260 | logger.debug("Searched guessed urls in %sms", search_time) 261 | 262 | return sort_urls(feeds, url) 263 | 264 | 265 | def url_feed_score(url: str, original_url: str = "") -> int: 266 | """ 267 | Return a Score based on estimated relevance of the feed Url 268 | to the original search Url 269 | 270 | :param url: Feed Url 271 | :param original_url: Searched Url 272 | :return: Score integer 273 | """ 274 | score = 0 275 | 276 | if original_url: 277 | url_domain = get_site_root(url) 278 | original_domain = get_site_root(original_url) 279 | 280 | if original_domain not in url_domain: 281 | score -= 17 282 | 283 | if "comments" in url: 284 | score -= 15 285 | if "georss" in url: 286 | score -= 9 287 | if "alt" in url: 288 | score -= 7 289 | kw = ["atom", "rss", ".xml", "feed", "rdf"] 290 | for p, t in zip(range(len(kw) * 2, 0, -2), kw): 291 | if t in url: 292 | score += p 293 | if url.startswith("https"): 294 | score += 9 295 | return score 296 | 297 | 298 | def sort_urls(feeds: List[FeedInfo], original_url: str = "") -> List[FeedInfo]: 299 | """ 300 | Sort list of feeds based on Url score 301 | 302 | :param feeds: List of FeedInfo objects 303 | :param original_url: Searched Url 304 | :return: List of FeedInfo objects 305 | """ 306 | for feed in feeds: 307 | feed.score = url_feed_score(feed.url, original_url) 308 | sorted_urls = sorted(list(set(feeds)), key=lambda x: x.score, reverse=True) 309 | logger.info("Returning sorted URLs: %s", sorted_urls) 310 | return sorted_urls 311 |
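The following standalone snippet illustrates how ``url_feed_score`` and ``sort_urls`` above rank candidate feeds. It is a minimal sketch, not part of the package itself; the URLs are invented for the example, and the printed scores depend on the keyword weights defined in ``url_feed_score``.

.. code-block:: python

    # Illustration of feed URL scoring, assuming the feedsearch package above is installed.
    from feedsearch.feedinfo import FeedInfo
    from feedsearch.feedsearch import sort_urls, url_feed_score

    original = "https://example.com"
    candidates = [
        FeedInfo("https://example.com/comments/feed"),  # penalised for the "comments" keyword
        FeedInfo("https://example.com/atom.xml"),       # rewarded for "atom", ".xml", and https
        FeedInfo("http://other-site.example/rss.xml"),  # penalised for being on another domain
    ]

    # Print the raw relevance score of each candidate against the original URL.
    for feed in candidates:
        print(feed.url, url_feed_score(feed.url, original))

    # sort_urls() assigns the same scores and returns the feeds ordered best-first.
    print(sort_urls(candidates, original))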
--------------------------------------------------------------------------------
/feedsearch/lib.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import logging
3 | import time
4 | from contextlib import contextmanager
5 | from typing import Optional, Union, Tuple
6 | 
7 | import requests
8 | from bs4 import BeautifulSoup
9 | from requests import Response
10 | from requests.exceptions import RequestException
11 | from werkzeug.local import Local, release_local
12 | from werkzeug.urls import url_parse, url_fix
13 | 
14 | from .__version__ import __version__
15 | 
16 | LOCAL_CONTEXT = Local()
17 | 
18 | logger = logging.getLogger(__name__)
19 | 
20 | bs4_parser = "html.parser"
21 | 
22 | default_timeout = 3.05
23 | 
24 | 
25 | def get_session():
26 |     """
27 |     Returns the Requests Session for the current local context.
28 |     Creates a Session with default values if none exists.
29 | 
30 |     :return: Requests Session
31 |     """
32 |     return getattr(LOCAL_CONTEXT, "session", create_requests_session())
33 | 
34 | 
35 | def get_timeout():
36 |     """
37 |     Returns the Request timeout for the current local context.
38 | 
39 |     :return: Request timeout
40 |     """
41 |     return getattr(LOCAL_CONTEXT, "timeout", default_timeout)
42 | 
43 | 
44 | def get_exceptions() -> bool:
45 |     """
46 |     Returns the exception handling settings for the current local context.
47 | 
48 |     :return: Catch exception boolean
49 |     """
50 |     return getattr(LOCAL_CONTEXT, "exceptions", False)
51 | 
52 | 
53 | def set_exceptions(value: bool = False) -> None:
54 |     """
55 |     Set the exception handling settings for the current local context.
56 | 
57 |     :return: None
58 |     """
59 |     setattr(LOCAL_CONTEXT, "exceptions", value)
60 | 
61 | 
62 | def _user_agent() -> str:
63 |     """
64 |     Return User-Agent string
65 | 
66 |     :return: str
67 |     """
68 |     return "FeedSearch/{0} (https://github.com/DBeath/feedsearch)".format(__version__)
69 | 
70 | 
71 | @contextmanager
72 | def create_requests_session(
73 |     user_agent: str = "",
74 |     max_redirects: int = 30,
75 |     timeout: Union[float, Tuple[float, float]] = default_timeout,
76 |     exceptions: bool = False,
77 |     verify: Union[bool, str] = True,
78 | ):
79 |     """
80 |     Creates a Requests Session and sets User-Agent header and Max Redirects
81 | 
82 |     :param user_agent: User-Agent string
83 |     :param max_redirects: Max number of redirects before failure
84 |     :param timeout: Request Timeout
85 |     :param exceptions: If False, will gracefully handle Requests exceptions and attempt to keep searching.
86 |         If True, will leave Requests exceptions uncaught to be handled externally.
87 |     :param verify: Verify SSL Certificates.
88 | :return: Requests session 89 | """ 90 | # Create a request session 91 | session = requests.session() 92 | 93 | # Set User-Agent header 94 | user_agent = user_agent if user_agent else _user_agent() 95 | session.headers.update({"User-Agent": user_agent}) 96 | 97 | session.max_redirects = max_redirects 98 | session.verify = verify 99 | 100 | # Add request session to local context 101 | setattr(LOCAL_CONTEXT, "session", session) 102 | setattr(LOCAL_CONTEXT, "timeout", timeout) 103 | setattr(LOCAL_CONTEXT, "exceptions", exceptions) 104 | 105 | yield session 106 | 107 | # Close request session 108 | session.close() 109 | 110 | # Clean up local context 111 | release_local(LOCAL_CONTEXT) 112 | 113 | 114 | def requests_session( 115 | user_agent: str = "", 116 | max_redirects: int = 30, 117 | timeout: Union[float, Tuple[float, float]] = default_timeout, 118 | exceptions: bool = False, 119 | verify: Union[bool, str] = True, 120 | ): 121 | """ 122 | Wraps a requests session around a function. 123 | 124 | :param user_agent: User Agent for requests 125 | :param max_redirects: Maximum number of redirects 126 | :param timeout: Request Timeout 127 | :param exceptions: If True, rethrow exceptions. 128 | :param verify: Verify SSL Certificates. 129 | :return: decorator function 130 | """ 131 | 132 | def decorator(func): 133 | @functools.wraps(func) 134 | def wrapper(*args, **kwargs): 135 | with create_requests_session( 136 | user_agent, max_redirects, timeout, exceptions, verify 137 | ): 138 | # Call wrapped function 139 | return func(*args, **kwargs) 140 | 141 | return wrapper 142 | 143 | return decorator 144 | 145 | 146 | def set_bs4_parser(parser: str) -> None: 147 | """ 148 | Sets the parser used by BeautifulSoup 149 | 150 | :param parser: BeautifulSoup parser 151 | :return: None 152 | """ 153 | if parser: 154 | global bs4_parser 155 | bs4_parser = parser 156 | 157 | 158 | def get_url( 159 | url: str, 160 | timeout: Union[float, Tuple[float, float]] = default_timeout, 161 | exceptions: bool = False, 162 | **kwargs 163 | ) -> Optional[Response]: 164 | """ 165 | Performs a GET request on a URL 166 | 167 | :param url: URL string 168 | :param timeout: Request Timeout 169 | :param exceptions: If False, will gracefully handle Requests exceptions and attempt to keep searching. 170 | If True, will reraise Requests exceptions to be handled externally. 
171 |     :return: Requests Response object
172 |     """
173 |     timeout = timeout if timeout else get_timeout()
174 | 
175 |     logger.info("Fetching URL: %s", url)
176 |     start_time = time.perf_counter()
177 |     try:
178 |         session = get_session()
179 |         response = session.get(url, timeout=timeout, **kwargs)
180 |         response.raise_for_status()
181 |     except RequestException as ex:
182 |         logger.warning("RequestException while getting URL: %s, %s", url, str(ex))
183 |         if exceptions:
184 |             raise
185 |         return None
186 |     finally:
187 |         dur = int((time.perf_counter() - start_time) * 1000)
188 |         logger.debug("Performed fetch of URL: %s in %sms", url, dur)
189 |     return response
190 | 
191 | 
192 | def create_soup(text: str) -> BeautifulSoup:
193 |     """
194 |     Parses a string into a BeautifulSoup object
195 | 
196 |     :param text: Html string
197 |     :return: BeautifulSoup object
198 |     """
199 |     return BeautifulSoup(text, bs4_parser)
200 | 
201 | 
202 | def coerce_url(url: str, https: bool = True) -> str:
203 |     """
204 |     Coerce URL to valid format
205 | 
206 |     :param url: URL
207 |     :param https: Force https if no scheme in url
208 |     :return: str
209 |     """
210 |     url = url.strip()
211 |     if url.startswith("feed://"):
212 |         return url_fix("http://{0}".format(url[7:]))
213 |     for proto in ["http://", "https://"]:
214 |         if url.startswith(proto):
215 |             return url_fix(url)
216 |     if https:
217 |         return url_fix("https://{0}".format(url))
218 |     else:
219 |         return url_fix("http://{0}".format(url))
220 | 
221 | 
222 | def get_site_root(url: str) -> str:
223 |     """
224 |     Find the root domain of a url
225 |     """
226 |     url = coerce_url(url)
227 |     parsed = url_parse(url, scheme="http")
228 |     return parsed.netloc
229 | 
230 | 
231 | def timeit(func):
232 |     """
233 |     A decorator used to log the function execution time
234 |     """
235 | 
236 |     @functools.wraps(func)
237 |     def wrap(*args, **kwargs):
238 |         start = time.perf_counter()
239 | 
240 |         result = func(*args, **kwargs)
241 | 
242 |         dur = int((time.perf_counter() - start) * 1000)
243 | 
244 |         logger.debug("Function name=%s duration=%sms", func.__name__, dur)
245 | 
246 |         return result
247 | 
248 |     return wrap
249 | 
250 | 
251 | def parse_header_links(value):
252 |     """
253 |     Return a list of Dicts of parsed link headers.
254 |     i.e.
Link: ; rel=front; type="image/jpeg", 255 | ; rel=back;type="image/jpeg" 256 | 257 | :param value: HTTP Link header to parse 258 | :return: List of Dicts 259 | """ 260 | 261 | links = [] 262 | 263 | replace_chars = " '\"" 264 | 265 | for val in value.split(","): 266 | try: 267 | url, params = val.split(";", 1) 268 | except ValueError: 269 | url, params = val, "" 270 | 271 | link = {"url": url.strip("<> '\"")} 272 | 273 | for param in params.split(";"): 274 | try: 275 | key, value = param.split("=") 276 | except ValueError: 277 | break 278 | 279 | link[key.strip(replace_chars)] = value.strip(replace_chars) 280 | 281 | links.append(link) 282 | 283 | return links 284 | -------------------------------------------------------------------------------- /feedsearch/site_meta.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import logging 3 | import re 4 | 5 | from typing import List, Set, Dict, Any 6 | from bs4 import BeautifulSoup, ResultSet 7 | from werkzeug.urls import url_parse 8 | 9 | from .lib import get_url, coerce_url, create_soup, get_timeout, get_exceptions 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | 14 | WORDPRESS_URLS = ["/feed"] 15 | 16 | 17 | class SiteMeta: 18 | def __init__(self, url: str, data: Any = None, soup: BeautifulSoup = None) -> None: 19 | self.url = url # type: str 20 | self.data = data # type: Any 21 | self.soup = soup # type: BeautifulSoup 22 | self.site_url = "" # type: str 23 | self.site_name = "" # type: str 24 | self.icon_url = "" # type: str 25 | self.icon_data_uri = "" # type: str 26 | self.domain = "" # type: str 27 | 28 | def parse_site_info(self, favicon_data_uri: bool = False): 29 | """ 30 | Finds Site Info from root domain of site 31 | 32 | :return: None 33 | """ 34 | self.domain = self.get_domain(self.url) 35 | 36 | # Only fetch url again if domain is different from provided url or if 37 | # no site data already provided. 38 | if self.domain != self.url.strip("/") or not self.data: 39 | logger.debug( 40 | "Domain %s is different from URL %s. 
Fetching domain.", 41 | self.domain, 42 | self.url, 43 | ) 44 | response = get_url(self.domain, get_timeout(), get_exceptions()) 45 | if not response or not response.text: 46 | return 47 | self.data = response.text 48 | 49 | if not self.soup: 50 | self.soup = create_soup(self.data) 51 | 52 | self.site_url = self.find_site_url(self.soup, self.domain) 53 | self.site_name = self.find_site_name(self.soup) 54 | self.icon_url = self.find_site_icon_url(self.domain) 55 | 56 | if favicon_data_uri and self.icon_url: 57 | self.icon_data_uri = self.create_data_uri(self.icon_url) 58 | 59 | def find_site_icon_url(self, url: str) -> str: 60 | """ 61 | Attempts to find Site Favicon 62 | 63 | :param url: Root domain Url of Site 64 | :return: str 65 | """ 66 | icon_rel = ["apple-touch-icon", "shortcut icon", "icon"] 67 | 68 | icon = "" 69 | for rel in icon_rel: 70 | link = self.soup.find(name="link", rel=rel) 71 | if link: 72 | icon = link.get("href", None) 73 | if icon[0] == "/": 74 | icon = "{0}{1}".format(url, icon) 75 | if icon == "favicon.ico": 76 | icon = "{0}/{1}".format(url, icon) 77 | if not icon: 78 | send_url = url + "/favicon.ico" 79 | logger.debug("Trying url %s for favicon", send_url) 80 | response = get_url(send_url, get_timeout(), get_exceptions()) 81 | if response and response.status_code == 200: 82 | logger.debug("Received url %s for favicon", response.url) 83 | icon = response.url 84 | return icon 85 | 86 | @staticmethod 87 | def find_site_name(soup) -> str: 88 | """ 89 | Attempts to find Site Name 90 | 91 | :param soup: BeautifulSoup of site 92 | :return: str 93 | """ 94 | site_name_meta = [ 95 | "og:site_name", 96 | "og:title", 97 | "application:name", 98 | "twitter:app:name:iphone", 99 | ] 100 | 101 | for p in site_name_meta: 102 | try: 103 | name = soup.find(name="meta", property=p).get("content") 104 | if name: 105 | return name 106 | except AttributeError: 107 | pass 108 | 109 | try: 110 | title = soup.find(name="title").text 111 | if title: 112 | return title 113 | except AttributeError: 114 | pass 115 | 116 | return "" 117 | 118 | @staticmethod 119 | def find_site_url(soup, url: str) -> str: 120 | """ 121 | Attempts to find the canonical Url of the Site 122 | 123 | :param soup: BeautifulSoup of site 124 | :param url: Current Url of site 125 | :return: str 126 | """ 127 | canonical = soup.find(name="link", rel="canonical") 128 | try: 129 | site = canonical.get("href") 130 | if site: 131 | return site 132 | except AttributeError: 133 | pass 134 | 135 | meta = soup.find(name="meta", property="og:url") 136 | try: 137 | site = meta.get("content") 138 | except AttributeError: 139 | return url 140 | return site 141 | 142 | @staticmethod 143 | def get_domain(url: str) -> str: 144 | """ 145 | Finds root domain of Url, including scheme 146 | 147 | :param url: URL string 148 | :return: str 149 | """ 150 | url = coerce_url(url) 151 | parsed = url_parse(url) 152 | domain = "{0}://{1}".format(parsed.scheme, parsed.netloc) 153 | return domain 154 | 155 | @staticmethod 156 | def create_data_uri(img_url: str) -> str: 157 | """ 158 | Creates a Data Uri for a Favicon 159 | 160 | :param img_url: Url of Favicon 161 | :return: str 162 | """ 163 | response = get_url(img_url, get_timeout(), get_exceptions(), stream=True) 164 | if not response or int(response.headers["content-length"]) > (1024 * 1024): 165 | response.close() 166 | return "" 167 | 168 | uri = "" 169 | try: 170 | encoded = base64.b64encode(response.content) 171 | uri = "data:image/png;base64," + encoded.decode("utf-8") 172 | except 
Exception as e: 173 | logger.warning("Failure encoding image: %s", e) 174 | 175 | response.close() 176 | return uri 177 | 178 | def cms_feed_urls(self) -> List[str]: 179 | """ 180 | Checks if a site is using a popular CMS, and returns 181 | a list of default feed urls to check. 182 | 183 | :return: List[str] 184 | """ 185 | 186 | site_feeds = {"WordPress": ["/feed"]} # type: Dict[str, List[str]] 187 | 188 | possible_urls = set() # type: Set[str] 189 | if not self.soup: 190 | return [] 191 | 192 | site_names = set() # type: Set[str] 193 | 194 | metas = self.soup.find_all(name="meta") 195 | site_names.update(self.check_meta(metas)) 196 | 197 | links = self.soup.find_all(name="link") 198 | site_names.update(self.check_links(links)) 199 | 200 | for name in site_names: 201 | urls = site_feeds.get(name) # type: List[str] 202 | if urls: 203 | possible_urls.update(urls) 204 | 205 | # Return urls appended to the root domain to allow searching 206 | urls = [] # type: List[str] 207 | for url in possible_urls: 208 | urls.append(self.domain + url) 209 | return urls 210 | 211 | @staticmethod 212 | def check_meta(metas: ResultSet) -> Set[str]: 213 | """ 214 | Check site meta to find possible CMS values. 215 | 216 | :param metas: ResultSet of Site Meta values 217 | :return: Set of possible CMS names 218 | """ 219 | meta_tests = {"generator": {"WordPress": "WordPress\\s*(.*)"}} 220 | 221 | results = set() # type: Set[str] 222 | 223 | def get_meta_value(inner_type: str, inner_metas: ResultSet): 224 | for meta in inner_metas: 225 | if inner_type in meta.get("property", ""): 226 | yield meta.get("content") 227 | 228 | for test_type, tests in meta_tests.items(): 229 | meta_values = list(get_meta_value(test_type, metas)) 230 | for meta_value in meta_values: 231 | for site_name, pattern in tests.items(): 232 | if re.search(pattern, meta_value, flags=re.I): 233 | results.add(site_name) 234 | 235 | return results 236 | 237 | @staticmethod 238 | def check_links(links: ResultSet) -> Set[str]: 239 | link_tests = {"WordPress": "/wp-content/"} 240 | 241 | results = set() # type: Set[str] 242 | 243 | def get_link_href(inner_links: ResultSet): 244 | for link in inner_links: 245 | yield link.get("href") 246 | 247 | link_hrefs = list(get_link_href(links)) 248 | for site_name, pattern in link_tests.items(): 249 | for href in link_hrefs: 250 | if not href: 251 | continue 252 | if re.search(pattern, href, flags=re.I): 253 | results.add(site_name) 254 | 255 | return results 256 | -------------------------------------------------------------------------------- /feedsearch/url.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Any 3 | 4 | from .lib import get_url, get_timeout, get_exceptions 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | 9 | class URL: 10 | def __init__(self, url: str, data: Any = None, immediate_get: bool = True) -> None: 11 | """ 12 | Initialise URL object and immediately fetch URL to check if feed. 
13 | 
14 |         :param url: URL string
15 |         """
16 |         self.url = url  # type: str
17 |         self.data = data  # type: Any
18 |         self.is_feed = False  # type: bool
19 |         self.content_type = ""  # type: str
20 |         self.headers = {}  # type: dict
21 |         self.links = {}  # type: dict
22 |         self.fetched = False  # type: bool
23 |         self.feedlike_url = self.is_feedlike_url(self.url)  # type: bool
24 | 
25 |         if immediate_get and not self.fetched:
26 |             self.get_is_feed(self.url)
27 | 
28 |     def __repr__(self):
29 |         return "{0}({1})".format(self.__class__.__name__, repr(self.url))
30 | 
31 |     def __eq__(self, other):
32 |         return self.url == other.url
33 | 
34 |     @staticmethod
35 |     def is_feed_url(url: str) -> bool:
36 |         """
37 |         Return True if URL ending contains valid feed file format.
38 | 
39 |         :param url: URL string
40 |         :return: bool
41 |         """
42 |         return any(
43 |             map(url.lower().endswith, [".rss", ".rdf", ".xml", ".atom", ".json"])
44 |         )
45 | 
46 |     @staticmethod
47 |     def is_feedlike_url(url: str) -> bool:
48 |         """
49 |         Return True if any part of the URL might identify it as a feed.
50 | 
51 |         :param url: URL string
52 |         :return: bool
53 |         """
54 |         return any(
55 |             map(url.lower().count, ["rss", "rdf", "xml", "atom", "feed", "json"])
56 |         )
57 | 
58 |     @staticmethod
59 |     def is_json_feed(json: dict) -> bool:
60 |         """
61 |         Return True if JSON contains valid JSON Feed version.
62 | 
63 |         :param json: Parsed JSON
64 |         :return: bool
65 |         """
66 |         version = json.get("version")
67 |         if not version or "https://jsonfeed.org/version/" not in version:
68 |             return False
69 |         return True
70 | 
71 |     @staticmethod
72 |     def is_feed_data(text: str, content_type: str) -> bool:
73 |         """
74 |         Return True if text string has valid feed beginning.
75 | 
76 |         :param text: Possible feed text
77 |         :param content_type: MimeType of text
78 |         :return: bool
79 |         """
80 |         data = text.lower()
81 |         if not data:
82 |             return False
83 |         if data[:100].count("<html"):
84 |             return False
85 | 
86 |         if content_type and "json" in content_type and data.count("jsonfeed.org"):
87 |             return True
88 | 
89 |         return bool(data.count("<rss") + data.count("<rdf") + data.count("<feed"))
90 | 
91 |     def get_is_feed(self, url: str) -> None:
92 |         """
93 |         Gets a URL and checks if it might be a feed.
94 | 
95 |         :param url: URL string
96 |         :return: None
97 |         """
98 |         response = get_url(url, get_timeout(), get_exceptions())
99 | 
100 |         self.fetched = True
101 | 
102 |         if not response or not response.text:
103 |             logger.debug("Nothing found at %s", url)
104 |             return
105 | 
106 |         self.url = response.url
107 |         self.content_type = response.headers.get("content-type")
108 | 
109 |         self.data = response.text
110 |         self.headers = response.headers
111 |         self.links = response.links
112 |         self.is_feed = self.is_feed_data(response.text, self.content_type)
113 | 
114 |     @property
115 |     def is_valid(self) -> bool:
116 |         """
117 |         Check if URL returned valid response
118 | 
119 |         :return: bool
120 |         """
121 |         if self.url and self.data:
122 |             return True
123 |         return False
124 | 
--------------------------------------------------------------------------------
/search.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import traceback
3 | from pprint import pprint
4 | 
5 | import click
6 | 
7 | from feedsearch import search as search_feeds
8 | 
9 | 
10 | @click.command()
11 | @click.argument("url")
12 | @click.option(
13 |     "--all/--no-all",
14 |     default=False,
15 |     help="Search all potential locations for feeds.
Warning: Slow", 16 | ) 17 | @click.option("--info/--no-info", default=False, help="Return additional feed details") 18 | @click.option( 19 | "--parser", 20 | default="html.parser", 21 | type=click.Choice(["html.parser", "lxml", "xml", "html5lib"]), 22 | help="BeautifulSoup parser ('html.parser', 'lxml', 'xml', or 'html5lib'). Defaults to 'html.parser'", 23 | ) 24 | @click.option("-v", "--verbose", is_flag=True, help="Show logging") 25 | @click.option( 26 | "--exceptions/--no-exceptions", 27 | default=False, 28 | help="If False, will gracefully handle Requests exceptions and attempt to keep searching." 29 | "If True, will leave Requests exceptions uncaught to be handled externally.", 30 | ) 31 | @click.option("--timeout", default=3.05, type=click.FLOAT, help="Request timeout") 32 | @click.option( 33 | "--favicon/--no-favicon", default=False, help="Convert Favicon into Data Uri" 34 | ) 35 | @click.option( 36 | "--urls/--no-urls", 37 | default=False, 38 | help="Return found Feeds as a list of URL strings instead of FeedInfo objects.", 39 | ) 40 | @click.option( 41 | "--cms/--no-cms", 42 | default=True, 43 | help="Check default CMS feed location if site is using a known CMS.", 44 | ) 45 | @click.option( 46 | "--discovery/--no-discovery", 47 | default=False, 48 | help='Only search for RSS discovery tags (e.g. ).', 49 | ) 50 | def search( 51 | url, all, info, parser, verbose, exceptions, timeout, favicon, urls, cms, discovery 52 | ): 53 | if verbose: 54 | logger = logging.getLogger("feedsearch") 55 | logger.setLevel(logging.DEBUG) 56 | ch = logging.StreamHandler() 57 | ch.setLevel(logging.DEBUG) 58 | formatter = logging.Formatter( 59 | "%(asctime)s - %(name)s - %(levelname)s - %(message)s [in %(pathname)s:%(lineno)d]" 60 | ) 61 | ch.setFormatter(formatter) 62 | logger.addHandler(ch) 63 | 64 | click.echo("\nSearching URL {0}\n".format(url)) 65 | try: 66 | feeds = search_feeds( 67 | url, 68 | info=info, 69 | check_all=all, 70 | cms=cms, 71 | discovery_only=discovery, 72 | favicon_data_uri=favicon, 73 | as_urls=urls, 74 | parser=parser, 75 | exceptions=exceptions, 76 | timeout=timeout 77 | ) 78 | click.echo() 79 | for feed in feeds: 80 | if not urls: 81 | pprint(vars(feed)) 82 | print() 83 | else: 84 | click.echo("{0}".format(feed)) 85 | 86 | return feeds 87 | except Exception as e: 88 | click.echo("Exception: {0}\n".format(e)) 89 | click.echo(traceback.format_exc()) 90 | 91 | return [] 92 | 93 | 94 | if __name__ == "__main__": 95 | search() 96 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | python-tag = py35.py36.py37 3 | 4 | [flake8] 5 | # Keep in sync with .flake8. This copy here is needed for source packages 6 | # to be able to pass tests without failing selfclean check. 
7 | ignore = E302, E501, W503 8 | max-line-length = 88 9 | max-complexity = 12 10 | select = B,C,E,F,W,B9 11 | 12 | [metadata] 13 | license_file = LICENSE -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import sys 6 | from codecs import open 7 | 8 | from setuptools import setup 9 | 10 | here = os.path.abspath(os.path.dirname(__file__)) 11 | 12 | about = {} 13 | with open(os.path.join(here, "feedsearch", "__version__.py"), "r", "utf-8") as f: 14 | exec(f.read(), about) 15 | 16 | with open(os.path.join(here, "README.rst"), encoding="utf-8") as f: 17 | readme = f.read() 18 | 19 | if sys.argv[-1] == "publish": 20 | os.system("python3 setup.py sdist bdist_wheel") 21 | os.system("twine upload dist/*") 22 | sys.exit() 23 | 24 | packages = ["feedsearch"] 25 | 26 | required = ["requests", "beautifulsoup4", "feedparser", "click", "Werkzeug"] 27 | 28 | setup( 29 | name=about["__title__"], 30 | version=about["__version__"], 31 | description=about["__description__"], 32 | long_description=readme, 33 | author=about["__author__"], 34 | author_email=about["__author_email__"], 35 | url=about["__url__"], 36 | license=about["__license__"], 37 | packages=packages, 38 | install_requires=required, 39 | classifiers=[ 40 | "License :: OSI Approved :: MIT License", 41 | "Intended Audience :: Developers", 42 | "Development Status :: 5 - Production/Stable", 43 | "Natural Language :: English", 44 | "Operating System :: OS Independent", 45 | "Programming Language :: Python :: 3.5", 46 | "Programming Language :: Python :: 3.6", 47 | "Programming Language :: Python :: 3.7", 48 | ], 49 | python_requires=">=3", 50 | ) 51 | --------------------------------------------------------------------------------
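A minimal usage sketch of the library API, mirroring the search.py CLI above. The target URL is only an example, and the snippet assumes the package is installed:

from feedsearch import search

# Search a site for feeds. check_all=False keeps the search fast by returning
# as soon as feeds are found; info=True gathers additional feed details.
feeds = search("example.com", info=True, check_all=False, timeout=5)

for feed in feeds:
    # Each result is a FeedInfo object, sorted by relevance score;
    # pass as_urls=True to get plain URL strings instead.
    print(feed.url, feed.score)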