├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── TODO ├── poetry.lock ├── py.typed ├── pyproject.toml └── src ├── cleanurl.py ├── py.typed └── test_cleanurl.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .vscode/settings.json 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works. By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 
19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. 
This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 
90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 
121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. 
You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 
184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 
217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 
244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 
275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 
305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. 
If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 
374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 
409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 
474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. 
If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. 
Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 
599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 
631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <https://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <https://www.gnu.org/licenses/>. 
662 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | lint: 3 | poetry run mypy src 4 | poetry run flake8 --extend-ignore E501,E741,E203 src | tac 5 | 6 | test: 7 | poetry run pytest 8 | 9 | build: lint test 10 | poetry build 11 | 12 | publish: build 13 | poetry publish 14 | 15 | shell: 16 | poetry run python 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # cleanurl 2 | Remove clutter from URLs and return a canonicalized version 3 | 4 | # Install 5 | ``` 6 | pip install cleanurl 7 | ``` 8 | or if you're using poetry: 9 | ``` 10 | poetry add cleanurl 11 | ``` 12 | 13 | # Usage 14 | By default *cleanurl* returns a cleaned URL without respecting semantics. 15 | For example: 16 | 17 | ``` 18 | >>> import cleanurl 19 | >>> r = cleanurl.cleanurl('https://www.xojoc.pw/blog/focus.html?utm_content=buffercf3b2&utm_medium=social&utm_source=snapchat.com&utm_campaign=buffe') 20 | >>> r.url 21 | 'https://xojoc.pw/blog/focus' 22 | >>> r.parsed_url 23 | ParseResult(scheme='https', netloc='xojoc.pw', path='/blog/focus', params='', query='', fragment='') 24 | ``` 25 | 26 | The default parameters are useful if you want to get a *canonical* URL without caring if the resulting URL is still valid. 
27 | 28 | If you want to get a clean URL which is still valid, call it like this: 29 | 30 | ``` 31 | >>> r = cleanurl.cleanurl('https://www.xojoc.pw/blog/////focus.html', respect_semantics=True) 32 | >>> r.url 33 | 'https://www.xojoc.pw/blog/focus.html' 34 | ``` 35 | 36 | ```cleanurl.cleanurl``` parameters: 37 | 38 | - ```generic``` -> if True don't use site-specific rules 39 | - ```respect_semantics``` -> if True make sure the returned URL is still valid, although it may still contain some superfluous elements 40 | - ```host_remap``` -> whether to remap hosts. Example: 41 | ``` 42 | >>> import cleanurl 43 | >>> cleanurl.cleanurl('https://threadreaderapp.com/thread/1453753924960219145', host_remap=True).url 44 | 'https://twitter.com/i/status/1453753924960219145' 45 | >>> cleanurl.cleanurl('https://threadreaderapp.com/thread/1453753924960219145', host_remap=False).url 46 | 'https://threadreaderapp.com/thread/1453753924960219145' 47 | ``` 48 | 49 | For more examples see the [unit tests](https://github.com/xojoc/cleanurl/blob/main/src/test_cleanurl.py). 50 | 51 | 52 | # Why? 53 | While there are some libraries that handle general cases, this library has website-specific rules that normalize URLs more aggressively. 54 | 55 | # Users 56 | Initially used for [discu.eu](https://discu.eu). 57 | 58 | [Discussions around the web](https://discu.eu/q/https://github.com/xojoc/cleanurl) 59 | 60 | # Who? 61 | *cleanurl* was written by [Alexandru Cojocaru](https://xojoc.pw). 
62 | 63 | # License 64 | *cleanurl* is [Free Software](https://www.gnu.org/philosophy/free-sw.html) and is released as [AGPLv3](https://github.com/xojoc/cleanurl/blob/main/LICENSE) -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | # Normalization 2 | 3 | Detect "unique urls" rfc, wikipedia, papers, etc., stock (https://www.marketwatch.com/investing/stock/gme), man pages 4 | 5 | session ids 6 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.4.0 and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "attrs" 5 | version = "22.2.0" 6 | description = "Classes Without Boilerplate" 7 | category = "dev" 8 | optional = false 9 | python-versions = ">=3.6" 10 | files = [ 11 | {file = "attrs-22.2.0-py3-none-any.whl", hash = "sha256:29e95c7f6778868dbd49170f98f8818f78f3dc5e0e37c0b1f474e3561b240836"}, 12 | {file = "attrs-22.2.0.tar.gz", hash = "sha256:c9227bfc2f01993c03f68db37d1d15c9690188323c067c641f1a35ca58185f99"}, 13 | ] 14 | 15 | [package.extras] 16 | cov = ["attrs[tests]", "coverage-enable-subprocess", "coverage[toml] (>=5.3)"] 17 | dev = ["attrs[docs,tests]"] 18 | docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier", "zope.interface"] 19 | tests = ["attrs[tests-no-zope]", "zope.interface"] 20 | tests-no-zope = ["cloudpickle", "cloudpickle", "hypothesis", "hypothesis", "mypy (>=0.971,<0.990)", "mypy (>=0.971,<0.990)", "pympler", "pympler", "pytest (>=4.3.0)", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-mypy-plugins", "pytest-xdist[psutil]", "pytest-xdist[psutil]"] 21 | 22 | [[package]] 23 | name = "black" 24 | version = "23.1.0" 25 | description = "The uncompromising code formatter." 
26 | category = "dev" 27 | optional = false 28 | python-versions = ">=3.7" 29 | files = [ 30 | {file = "black-23.1.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:b6a92a41ee34b883b359998f0c8e6eb8e99803aa8bf3123bf2b2e6fec505a221"}, 31 | {file = "black-23.1.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:57c18c5165c1dbe291d5306e53fb3988122890e57bd9b3dcb75f967f13411a26"}, 32 | {file = "black-23.1.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:9880d7d419bb7e709b37e28deb5e68a49227713b623c72b2b931028ea65f619b"}, 33 | {file = "black-23.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e6663f91b6feca5d06f2ccd49a10f254f9298cc1f7f49c46e498a0771b507104"}, 34 | {file = "black-23.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:9afd3f493666a0cd8f8df9a0200c6359ac53940cbde049dcb1a7eb6ee2dd7074"}, 35 | {file = "black-23.1.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:bfffba28dc52a58f04492181392ee380e95262af14ee01d4bc7bb1b1c6ca8d27"}, 36 | {file = "black-23.1.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:c1c476bc7b7d021321e7d93dc2cbd78ce103b84d5a4cf97ed535fbc0d6660648"}, 37 | {file = "black-23.1.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:382998821f58e5c8238d3166c492139573325287820963d2f7de4d518bd76958"}, 38 | {file = "black-23.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bf649fda611c8550ca9d7592b69f0637218c2369b7744694c5e4902873b2f3a"}, 39 | {file = "black-23.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:121ca7f10b4a01fd99951234abdbd97728e1240be89fde18480ffac16503d481"}, 40 | {file = "black-23.1.0-cp37-cp37m-macosx_10_16_x86_64.whl", hash = "sha256:a8471939da5e824b891b25751955be52ee7f8a30a916d570a5ba8e0f2eb2ecad"}, 41 | {file = "black-23.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8178318cb74f98bc571eef19068f6ab5613b3e59d4f47771582f04e175570ed8"}, 42 | {file = "black-23.1.0-cp37-cp37m-win_amd64.whl", hash = 
"sha256:a436e7881d33acaf2536c46a454bb964a50eff59b21b51c6ccf5a40601fbef24"}, 43 | {file = "black-23.1.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:a59db0a2094d2259c554676403fa2fac3473ccf1354c1c63eccf7ae65aac8ab6"}, 44 | {file = "black-23.1.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:0052dba51dec07ed029ed61b18183942043e00008ec65d5028814afaab9a22fd"}, 45 | {file = "black-23.1.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:49f7b39e30f326a34b5c9a4213213a6b221d7ae9d58ec70df1c4a307cf2a1580"}, 46 | {file = "black-23.1.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:162e37d49e93bd6eb6f1afc3e17a3d23a823042530c37c3c42eeeaf026f38468"}, 47 | {file = "black-23.1.0-cp38-cp38-win_amd64.whl", hash = "sha256:8b70eb40a78dfac24842458476135f9b99ab952dd3f2dab738c1881a9b38b753"}, 48 | {file = "black-23.1.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:a29650759a6a0944e7cca036674655c2f0f63806ddecc45ed40b7b8aa314b651"}, 49 | {file = "black-23.1.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:bb460c8561c8c1bec7824ecbc3ce085eb50005883a6203dcfb0122e95797ee06"}, 50 | {file = "black-23.1.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:c91dfc2c2a4e50df0026f88d2215e166616e0c80e86004d0003ece0488db2739"}, 51 | {file = "black-23.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a951cc83ab535d248c89f300eccbd625e80ab880fbcfb5ac8afb5f01a258ac9"}, 52 | {file = "black-23.1.0-cp39-cp39-win_amd64.whl", hash = "sha256:0680d4380db3719ebcfb2613f34e86c8e6d15ffeabcf8ec59355c5e7b85bb555"}, 53 | {file = "black-23.1.0-py3-none-any.whl", hash = "sha256:7a0f701d314cfa0896b9001df70a530eb2472babb76086344e688829efd97d32"}, 54 | {file = "black-23.1.0.tar.gz", hash = "sha256:b0bd97bea8903f5a2ba7219257a44e3f1f9d00073d6cc1add68f0beec69692ac"}, 55 | ] 56 | 57 | [package.dependencies] 58 | click = ">=8.0.0" 59 | mypy-extensions = ">=0.4.3" 60 | packaging = ">=22.0" 61 | pathspec = ">=0.9.0" 62 | platformdirs = ">=2" 63 | 
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 64 | typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} 65 | 66 | [package.extras] 67 | colorama = ["colorama (>=0.4.3)"] 68 | d = ["aiohttp (>=3.7.4)"] 69 | jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] 70 | uvloop = ["uvloop (>=0.15.2)"] 71 | 72 | [[package]] 73 | name = "click" 74 | version = "8.1.3" 75 | description = "Composable command line interface toolkit" 76 | category = "dev" 77 | optional = false 78 | python-versions = ">=3.7" 79 | files = [ 80 | {file = "click-8.1.3-py3-none-any.whl", hash = "sha256:bb4d8133cb15a609f44e8213d9b391b0809795062913b383c62be0ee95b1db48"}, 81 | {file = "click-8.1.3.tar.gz", hash = "sha256:7682dc8afb30297001674575ea00d1814d808d6a36af415a82bd481d37ba7b8e"}, 82 | ] 83 | 84 | [package.dependencies] 85 | colorama = {version = "*", markers = "platform_system == \"Windows\""} 86 | 87 | [[package]] 88 | name = "colorama" 89 | version = "0.4.6" 90 | description = "Cross-platform colored terminal text." 
91 | category = "dev" 92 | optional = false 93 | python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" 94 | files = [ 95 | {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, 96 | {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, 97 | ] 98 | 99 | [[package]] 100 | name = "exceptiongroup" 101 | version = "1.1.1" 102 | description = "Backport of PEP 654 (exception groups)" 103 | category = "dev" 104 | optional = false 105 | python-versions = ">=3.7" 106 | files = [ 107 | {file = "exceptiongroup-1.1.1-py3-none-any.whl", hash = "sha256:232c37c63e4f682982c8b6459f33a8981039e5fb8756b2074364e5055c498c9e"}, 108 | {file = "exceptiongroup-1.1.1.tar.gz", hash = "sha256:d484c3090ba2889ae2928419117447a14daf3c1231d5e30d0aae34f354f01785"}, 109 | ] 110 | 111 | [package.extras] 112 | test = ["pytest (>=6)"] 113 | 114 | [[package]] 115 | name = "flake8" 116 | version = "6.0.0" 117 | description = "the modular source code checker: pep8 pyflakes and co" 118 | category = "dev" 119 | optional = false 120 | python-versions = ">=3.8.1" 121 | files = [ 122 | {file = "flake8-6.0.0-py2.py3-none-any.whl", hash = "sha256:3833794e27ff64ea4e9cf5d410082a8b97ff1a06c16aa3d2027339cd0f1195c7"}, 123 | {file = "flake8-6.0.0.tar.gz", hash = "sha256:c61007e76655af75e6785a931f452915b371dc48f56efd765247c8fe68f2b181"}, 124 | ] 125 | 126 | [package.dependencies] 127 | mccabe = ">=0.7.0,<0.8.0" 128 | pycodestyle = ">=2.10.0,<2.11.0" 129 | pyflakes = ">=3.0.0,<3.1.0" 130 | 131 | [[package]] 132 | name = "iniconfig" 133 | version = "2.0.0" 134 | description = "brain-dead simple config-ini parsing" 135 | category = "dev" 136 | optional = false 137 | python-versions = ">=3.7" 138 | files = [ 139 | {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, 140 | {file = 
"iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, 141 | ] 142 | 143 | [[package]] 144 | name = "langcodes" 145 | version = "3.3.0" 146 | description = "Tools for labeling human languages with IETF language tags" 147 | category = "main" 148 | optional = false 149 | python-versions = ">=3.6" 150 | files = [ 151 | {file = "langcodes-3.3.0-py3-none-any.whl", hash = "sha256:4d89fc9acb6e9c8fdef70bcdf376113a3db09b67285d9e1d534de6d8818e7e69"}, 152 | {file = "langcodes-3.3.0.tar.gz", hash = "sha256:794d07d5a28781231ac335a1561b8442f8648ca07cd518310aeb45d6f0807ef6"}, 153 | ] 154 | 155 | [package.extras] 156 | data = ["language-data (>=1.1,<2.0)"] 157 | 158 | [[package]] 159 | name = "mccabe" 160 | version = "0.7.0" 161 | description = "McCabe checker, plugin for flake8" 162 | category = "dev" 163 | optional = false 164 | python-versions = ">=3.6" 165 | files = [ 166 | {file = "mccabe-0.7.0-py2.py3-none-any.whl", hash = "sha256:6c2d30ab6be0e4a46919781807b4f0d834ebdd6c6e3dca0bda5a15f863427b6e"}, 167 | {file = "mccabe-0.7.0.tar.gz", hash = "sha256:348e0240c33b60bbdf4e523192ef919f28cb2c3d7d5c7794f74009290f236325"}, 168 | ] 169 | 170 | [[package]] 171 | name = "mypy" 172 | version = "1.1.1" 173 | description = "Optional static typing for Python" 174 | category = "dev" 175 | optional = false 176 | python-versions = ">=3.7" 177 | files = [ 178 | {file = "mypy-1.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:39c7119335be05630611ee798cc982623b9e8f0cff04a0b48dfc26100e0b97af"}, 179 | {file = "mypy-1.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:61bf08362e93b6b12fad3eab68c4ea903a077b87c90ac06c11e3d7a09b56b9c1"}, 180 | {file = "mypy-1.1.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dbb19c9f662e41e474e0cff502b7064a7edc6764f5262b6cd91d698163196799"}, 181 | {file = "mypy-1.1.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:315ac73cc1cce4771c27d426b7ea558fb4e2836f89cb0296cbe056894e3a1f78"}, 182 | {file = "mypy-1.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:5cb14ff9919b7df3538590fc4d4c49a0f84392237cbf5f7a816b4161c061829e"}, 183 | {file = "mypy-1.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:26cdd6a22b9b40b2fd71881a8a4f34b4d7914c679f154f43385ca878a8297389"}, 184 | {file = "mypy-1.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5b5f81b40d94c785f288948c16e1f2da37203c6006546c5d947aab6f90aefef2"}, 185 | {file = "mypy-1.1.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21b437be1c02712a605591e1ed1d858aba681757a1e55fe678a15c2244cd68a5"}, 186 | {file = "mypy-1.1.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d809f88734f44a0d44959d795b1e6f64b2bbe0ea4d9cc4776aa588bb4229fc1c"}, 187 | {file = "mypy-1.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:a380c041db500e1410bb5b16b3c1c35e61e773a5c3517926b81dfdab7582be54"}, 188 | {file = "mypy-1.1.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7c7b708fe9a871a96626d61912e3f4ddd365bf7f39128362bc50cbd74a634d5"}, 189 | {file = "mypy-1.1.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1c10fa12df1232c936830839e2e935d090fc9ee315744ac33b8a32216b93707"}, 190 | {file = "mypy-1.1.1-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:0a28a76785bf57655a8ea5eb0540a15b0e781c807b5aa798bd463779988fa1d5"}, 191 | {file = "mypy-1.1.1-cp37-cp37m-win_amd64.whl", hash = "sha256:ef6a01e563ec6a4940784c574d33f6ac1943864634517984471642908b30b6f7"}, 192 | {file = "mypy-1.1.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d64c28e03ce40d5303450f547e07418c64c241669ab20610f273c9e6290b4b0b"}, 193 | {file = "mypy-1.1.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:64cc3afb3e9e71a79d06e3ed24bb508a6d66f782aff7e56f628bf35ba2e0ba51"}, 194 | {file = "mypy-1.1.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ce61663faf7a8e5ec6f456857bfbcec2901fbdb3ad958b778403f63b9e606a1b"}, 195 | {file = "mypy-1.1.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:2b0c373d071593deefbcdd87ec8db91ea13bd8f1328d44947e88beae21e8d5e9"}, 196 | {file = "mypy-1.1.1-cp38-cp38-win_amd64.whl", hash = "sha256:2888ce4fe5aae5a673386fa232473014056967f3904f5abfcf6367b5af1f612a"}, 197 | {file = "mypy-1.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:19ba15f9627a5723e522d007fe708007bae52b93faab00f95d72f03e1afa9598"}, 198 | {file = "mypy-1.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:59bbd71e5c58eed2e992ce6523180e03c221dcd92b52f0e792f291d67b15a71c"}, 199 | {file = "mypy-1.1.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9401e33814cec6aec8c03a9548e9385e0e228fc1b8b0a37b9ea21038e64cdd8a"}, 200 | {file = "mypy-1.1.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4b398d8b1f4fba0e3c6463e02f8ad3346f71956b92287af22c9b12c3ec965a9f"}, 201 | {file = "mypy-1.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:69b35d1dcb5707382810765ed34da9db47e7f95b3528334a3c999b0c90fe523f"}, 202 | {file = "mypy-1.1.1-py3-none-any.whl", hash = "sha256:4e4e8b362cdf99ba00c2b218036002bdcdf1e0de085cdb296a49df03fb31dfc4"}, 203 | {file = "mypy-1.1.1.tar.gz", hash = "sha256:ae9ceae0f5b9059f33dbc62dea087e942c0ccab4b7a003719cb70f9b8abfa32f"}, 204 | ] 205 | 206 | [package.dependencies] 207 | mypy-extensions = ">=1.0.0" 208 | tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} 209 | typing-extensions = ">=3.10" 210 | 211 | [package.extras] 212 | dmypy = ["psutil (>=4.0)"] 213 | install-types = ["pip"] 214 | python2 = ["typed-ast (>=1.4.0,<2)"] 215 | reports = ["lxml"] 216 | 217 | [[package]] 218 | name = "mypy-extensions" 219 | version = "1.0.0" 220 | description = "Type system extensions for programs checked with the mypy type checker." 
221 | category = "dev" 222 | optional = false 223 | python-versions = ">=3.5" 224 | files = [ 225 | {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, 226 | {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, 227 | ] 228 | 229 | [[package]] 230 | name = "packaging" 231 | version = "23.0" 232 | description = "Core utilities for Python packages" 233 | category = "dev" 234 | optional = false 235 | python-versions = ">=3.7" 236 | files = [ 237 | {file = "packaging-23.0-py3-none-any.whl", hash = "sha256:714ac14496c3e68c99c29b00845f7a2b85f3bb6f1078fd9f72fd20f0570002b2"}, 238 | {file = "packaging-23.0.tar.gz", hash = "sha256:b6ad297f8907de0fa2fe1ccbd26fdaf387f5f47c7275fedf8cce89f99446cf97"}, 239 | ] 240 | 241 | [[package]] 242 | name = "pathspec" 243 | version = "0.11.1" 244 | description = "Utility library for gitignore style pattern matching of file paths." 245 | category = "dev" 246 | optional = false 247 | python-versions = ">=3.7" 248 | files = [ 249 | {file = "pathspec-0.11.1-py3-none-any.whl", hash = "sha256:d8af70af76652554bd134c22b3e8a1cc46ed7d91edcdd721ef1a0c51a84a5293"}, 250 | {file = "pathspec-0.11.1.tar.gz", hash = "sha256:2798de800fa92780e33acca925945e9a19a133b715067cf165b8866c15a31687"}, 251 | ] 252 | 253 | [[package]] 254 | name = "platformdirs" 255 | version = "3.1.1" 256 | description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
257 | category = "dev" 258 | optional = false 259 | python-versions = ">=3.7" 260 | files = [ 261 | {file = "platformdirs-3.1.1-py3-none-any.whl", hash = "sha256:e5986afb596e4bb5bde29a79ac9061aa955b94fca2399b7aaac4090860920dd8"}, 262 | {file = "platformdirs-3.1.1.tar.gz", hash = "sha256:024996549ee88ec1a9aa99ff7f8fc819bb59e2c3477b410d90a16d32d6e707aa"}, 263 | ] 264 | 265 | [package.extras] 266 | docs = ["furo (>=2022.12.7)", "proselint (>=0.13)", "sphinx (>=6.1.3)", "sphinx-autodoc-typehints (>=1.22,!=1.23.4)"] 267 | test = ["appdirs (==1.4.4)", "covdefaults (>=2.2.2)", "pytest (>=7.2.1)", "pytest-cov (>=4)", "pytest-mock (>=3.10)"] 268 | 269 | [[package]] 270 | name = "pluggy" 271 | version = "1.0.0" 272 | description = "plugin and hook calling mechanisms for python" 273 | category = "dev" 274 | optional = false 275 | python-versions = ">=3.6" 276 | files = [ 277 | {file = "pluggy-1.0.0-py2.py3-none-any.whl", hash = "sha256:74134bbf457f031a36d68416e1509f34bd5ccc019f0bcc952c7b909d06b37bd3"}, 278 | {file = "pluggy-1.0.0.tar.gz", hash = "sha256:4224373bacce55f955a878bf9cfa763c1e360858e330072059e10bad68531159"}, 279 | ] 280 | 281 | [package.extras] 282 | dev = ["pre-commit", "tox"] 283 | testing = ["pytest", "pytest-benchmark"] 284 | 285 | [[package]] 286 | name = "pycodestyle" 287 | version = "2.10.0" 288 | description = "Python style guide checker" 289 | category = "dev" 290 | optional = false 291 | python-versions = ">=3.6" 292 | files = [ 293 | {file = "pycodestyle-2.10.0-py2.py3-none-any.whl", hash = "sha256:8a4eaf0d0495c7395bdab3589ac2db602797d76207242c17d470186815706610"}, 294 | {file = "pycodestyle-2.10.0.tar.gz", hash = "sha256:347187bdb476329d98f695c213d7295a846d1152ff4fe9bacb8a9590b8ee7053"}, 295 | ] 296 | 297 | [[package]] 298 | name = "pyflakes" 299 | version = "3.0.1" 300 | description = "passive checker of Python programs" 301 | category = "dev" 302 | optional = false 303 | python-versions = ">=3.6" 304 | files = [ 305 | {file = 
"pyflakes-3.0.1-py2.py3-none-any.whl", hash = "sha256:ec55bf7fe21fff7f1ad2f7da62363d749e2a470500eab1b555334b67aa1ef8cf"}, 306 | {file = "pyflakes-3.0.1.tar.gz", hash = "sha256:ec8b276a6b60bd80defed25add7e439881c19e64850afd9b346283d4165fd0fd"}, 307 | ] 308 | 309 | [[package]] 310 | name = "pytest" 311 | version = "7.2.2" 312 | description = "pytest: simple powerful testing with Python" 313 | category = "dev" 314 | optional = false 315 | python-versions = ">=3.7" 316 | files = [ 317 | {file = "pytest-7.2.2-py3-none-any.whl", hash = "sha256:130328f552dcfac0b1cec75c12e3f005619dc5f874f0a06e8ff7263f0ee6225e"}, 318 | {file = "pytest-7.2.2.tar.gz", hash = "sha256:c99ab0c73aceb050f68929bc93af19ab6db0558791c6a0715723abe9d0ade9d4"}, 319 | ] 320 | 321 | [package.dependencies] 322 | attrs = ">=19.2.0" 323 | colorama = {version = "*", markers = "sys_platform == \"win32\""} 324 | exceptiongroup = {version = ">=1.0.0rc8", markers = "python_version < \"3.11\""} 325 | iniconfig = "*" 326 | packaging = "*" 327 | pluggy = ">=0.12,<2.0" 328 | tomli = {version = ">=1.0.0", markers = "python_version < \"3.11\""} 329 | 330 | [package.extras] 331 | testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "pygments (>=2.7.2)", "requests", "xmlschema"] 332 | 333 | [[package]] 334 | name = "tomli" 335 | version = "2.0.1" 336 | description = "A lil' TOML parser" 337 | category = "dev" 338 | optional = false 339 | python-versions = ">=3.7" 340 | files = [ 341 | {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, 342 | {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, 343 | ] 344 | 345 | [[package]] 346 | name = "typing-extensions" 347 | version = "4.5.0" 348 | description = "Backported and Experimental Type Hints for Python 3.7+" 349 | category = "dev" 350 | optional = false 351 | python-versions = ">=3.7" 352 | files = [ 353 | {file = 
"typing_extensions-4.5.0-py3-none-any.whl", hash = "sha256:fb33085c39dd998ac16d1431ebc293a8b3eedd00fd4a32de0ff79002c19511b4"}, 354 | {file = "typing_extensions-4.5.0.tar.gz", hash = "sha256:5cb5f4a79139d699607b3ef622a1dedafa84e115ab0024e0d9c044a9479ca7cb"}, 355 | ] 356 | 357 | [metadata] 358 | lock-version = "2.0" 359 | python-versions = "^3.9" 360 | content-hash = "5849a531e965e4e50ba3687cd96233650a4a7ab685e9ddb2e5faf47b7237fc0b" 361 | -------------------------------------------------------------------------------- /py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xojoc/cleanurl/fd67294fb5ca71d541b30c7d5c143318c7b4fb45/py.typed -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "cleanurl" 3 | version = "0.1.15" 4 | description = "Remove clutter from URLs and return a canonicalized version" 5 | authors = ["Alexandru Cojocaru "] 6 | license = "AGPL-3.0-or-later" 7 | readme = "README.md" 8 | homepage = "https://github.com/xojoc/cleanurl" 9 | repository = "https://github.com/xojoc/cleanurl" 10 | keywords = ['url', 'canonical'] 11 | classifiers = [ 12 | "Environment :: Web Environment", 13 | "Intended Audience :: Developers", 14 | "Operating System :: OS Independent", 15 | "Typing :: Typed" ] 16 | 17 | [tool.poetry.dependencies] 18 | python = "^3.9" 19 | langcodes = "^3.3.0" 20 | 21 | [tool.poetry.dev-dependencies] 22 | mypy = "*" 23 | flake8 = "*" 24 | pytest = "*" 25 | black = {version = "*", allow-prereleases = true} 26 | 27 | 28 | [build-system] 29 | requires = ["poetry-core>=1.0.0"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /src/cleanurl.py: -------------------------------------------------------------------------------- 1 | from __future__ 
@dataclass
class Result:
    """Outcome of cleaning a URL: a thin read-only view over a ParseResult.

    Every property simply exposes (or derives a value from) ``parsed_url``.
    """

    # The cleaned URL, already split into its six components.
    parsed_url: urlparse.ParseResult

    @property
    def scheme(self) -> str | None:
        """Scheme component of the cleaned URL (empty when there is none)."""
        return self.parsed_url.scheme

    @property
    def hostname(self) -> str | None:
        """Host name as reported by ``ParseResult.hostname``."""
        return self.parsed_url.hostname

    @property
    def netloc(self) -> str | None:
        """Full network location component, exactly as stored."""
        return self.parsed_url.netloc

    @property
    def password(self) -> str | None:
        """Password embedded in the network location, if any."""
        return self.parsed_url.password

    @property
    def port(self) -> int | None:
        """Port number embedded in the network location, if any."""
        return self.parsed_url.port

    @property
    def path(self) -> str | None:
        """Path component of the cleaned URL."""
        return self.parsed_url.path

    @property
    def query(self) -> str | None:
        """Raw (still encoded) query string."""
        return self.parsed_url.query

    @property
    def parsed_query(self) -> list[tuple[str, str]]:
        """Query string decoded into ``(name, value)`` pairs.

        Parameters with an empty value are kept.
        """
        raw_query = self.parsed_url.query
        return urlparse.parse_qsl(raw_query, keep_blank_values=True)

    @property
    def fragment(self) -> str | None:
        """Fragment component of the cleaned URL."""
        return self.parsed_url.fragment

    @property
    def url(self) -> str:
        """The cleaned URL reassembled into a single string."""
        return urlparse.urlunparse(self.parsed_url)

    @property
    def schemeless_url(self) -> str:
        """The cleaned URL without its ``scheme:`` prefix and leading ``//``."""
        parsed = self.parsed_url
        text = urlparse.urlunparse(parsed)
        if parsed.scheme:
            # Drop the scheme together with the ":" that follows it.
            text = text[len(parsed.scheme) + 1 :]
        if parsed.netloc and text.startswith("//"):
            text = text[2:]
        return text
def __canonical_path(scheme, path, respect_semantics):
    """Normalise the path component of a URL.

    For hierarchical schemes ("", http, https, ftp, file) repeated slashes
    and "." segments are dropped and ".." segments are resolved, never
    removing the first kept segment.  Unless ``respect_semantics`` is set,
    the path is also lowercased and well-known trailing suffixes
    (``/index``, ``.html``, a trailing ``/`` ...) are stripped repeatedly
    until none is left.

    Returns "" for an empty or missing path.
    """
    if not path:
        return ""

    if scheme in ("", "http", "https", "ftp", "file"):
        pieces = path.split("/")
        kept = []
        for piece in pieces:
            if piece == ".":
                continue
            if piece == "":
                # Only the very first empty piece survives: it is what
                # gives an absolute path its leading "/".
                if not kept:
                    kept.append("")
                continue
            if piece == "..":
                if len(kept) > 1:
                    del kept[-1]
                continue
            kept.append(piece)
        # A path ending in "/", "/." or "/.." keeps a trailing slash.
        if pieces[-1] in ("", ".", ".."):
            kept.append("")
        path = "/".join(kept)

    if respect_semantics:
        return path

    path = path.lower()

    droppable = (
        "/default",
        "/index",
        ".htm",
        ".html",
        ".shtml",
        ".php",
        ".jsp",
        ".aspx",
        ".cms",
        ".md",
        ".pdf",
        ".stm",
        "/",
    )
    # Keep sweeping over the suffix list until a full pass removes nothing.
    changed = True
    while changed:
        changed = False
        for ending in droppable:
            if path.endswith(ending):
                path = path[: len(path) - len(ending)]
                changed = True

    return path
def __fragment_to_path(scheme, host, path, fragment):
    """Turn a fragment that really acts as a path into that path.

    Handles CNN video links, the legacy Google Groups ``/forum#!...`` URLs
    (topic, forum and msg forms) and generic ``#!`` hash-bang URLs whose
    path is empty or ``/``.

    Returns the replacement path, or ``None`` when the fragment should be
    left alone (no fragment, unsupported scheme, or no rule matches).
    """
    if not fragment or scheme not in ("", "http", "https"):
        return None

    if host == "cnn.com" and path == "/video" and fragment.startswith("/"):
        return fragment

    legacy_groups = host == "groups.google.com" and path.startswith("/forum")
    if legacy_groups:
        if fragment.startswith("!topic/"):
            rest = fragment.removeprefix("!topic/")
            # "<group>/<thread>" -> "/g/<group>/c/<thread>"
            return "/g/" + rest.replace("/", "/c/", 1)
        if fragment.startswith("!forum/"):
            return "/g/" + fragment.removeprefix("!forum/")
        if fragment.startswith("!msg/"):
            rest = fragment.removeprefix("!msg/")
            thread_path = "/g/" + rest.replace("/", "/c/", 1)
            # The last segment is the message id: ".../<id>" -> ".../m/<id>"
            return __replace_last(thread_path, "/", "/m/")

    if path in ("", "/") and fragment.startswith("!"):
        tail = fragment[1:]
        return tail if tail.startswith("/") else "/" + tail

    return None
def __canonical_youtube(
    host, path, parsed_query, fragment, respect_semantics, host_remap
):
    """Collapse YouTube watch/embed links (and dev.tube mirrors) to youtu.be.

    Args:
        host: canonical host of the URL being cleaned.
        path: canonical path of the URL being cleaned.
        parsed_query: list of ``(name, value)`` pairs, or ``None``.
        fragment: fragment of the URL (unused; always dropped on a match).
        respect_semantics: when true the video id keeps its original case.
            Video ids are case-sensitive, so folding the case would make
            the cleaned URL point at a different (or no) video.
        host_remap: when true, ``dev.tube/video/<id>`` is remapped too.

    Returns:
        ``("youtu.be", "/<id>", [], None)`` on a match, otherwise ``None``.
    """

    def short_link(video_id):
        # Case is only folded in the fuzzy (non-semantic) mode.
        if not respect_semantics:
            video_id = video_id.lower()
        return "youtu.be", "/" + video_id, [], None

    if host in ("youtube.com", "www.youtube.com"):
        video_id = None
        if path == "/watch":
            # The id is the first "v" query parameter.
            video_id = next(
                (q[1] for q in parsed_query or [] if q[0] == "v"), None
            )

        if path.startswith("/embed/"):
            path_parts = path.split("/")
            if len(path_parts) >= 3 and path_parts[-1] != "":
                video_id = path_parts[-1]

        if video_id:
            return short_link(video_id)

    if host_remap and host == "dev.tube" and path.startswith("/video/"):
        # "/video/<id>" -> "<id>"
        return short_link(path[len("/video/") :])

    return None
def __canonical_arstechnica(
    host, path, parsed_query, fragment, respect_semantics, host_remap
):
    """Drop the query string of Ars Technica URLs, except on forum topics.

    Forum topic pages (``viewtopic.php``) identify the topic through their
    query string, so it is kept there.  The check looks for ``viewtopic``
    rather than ``viewtopic.php`` because ``__canonical_path`` strips the
    ``.php`` suffix when semantics are not respected.

    Returns the (possibly updated) ``(host, path, parsed_query, fragment)``.
    """
    # The host is a full netloc such as "arstechnica.com"; the "www."
    # variant survives when respect_semantics keeps the host untouched.
    is_ars = host in ("arstechnica.com", "www.arstechnica.com")
    if is_ars and "viewtopic" not in path:
        parsed_query = []

    return host, path, parsed_query, fragment
def __canonical_mastodon(
    host, path, parsed_query, fragment, respect_semantics, host_remap
):
    """Canonicalise Mastodon-style status URLs.

    Two rewrites are applied, in order:

    * ``/web/@account/<id>`` loses its ``/web`` prefix and its query.
    * With ``host_remap``, a remote account ``/@user@instance/<id>`` is
      moved to its home instance: host ``instance``, path ``/@user/<id>``.

    No host check is made: any URL whose path has this shape is rewritten.

    Returns the (possibly updated) ``(host, path, parsed_query, fragment)``.
    """
    parts = path.split("/")
    if (
        len(parts) == 4
        and parts[0] == ""
        and parts[1] == "web"
        and parts[2].startswith("@")
        and __is_integer(parts[3])
    ):
        # parts is reused below, so the remap sees the shortened path.
        parts.pop(1)
        path = "/".join(parts)
        parsed_query = []

    if host_remap:
        if (
            len(parts) == 3
            and parts[0] == ""
            and parts[1].startswith("@")
            and parts[1].count("@") == 2
            and "." in parts[1]
            and __is_integer(parts[2])
        ):
            # "@user@instance" -> ["", "user", "instance"]
            account_parts = parts[1].split("@")
            if len(account_parts) == 3 and "." in account_parts[2]:
                host = account_parts[2]
                # Keep the path absolute, like every other rewrite.
                path = "/@" + account_parts[1] + "/" + parts[2]
                parsed_query = []

    return host, path, parsed_query, fragment
def __canonical_doi(
    host, path, parsed_query, fragment, respect_semantics, host_remap
):
    """Rewrite URLs that embed a DOI to ``doi.org/<prefix>/<suffix>``.

    Only active when ``host_remap`` is set.  Two shapes are recognised:

    * ``doi.org/10.xxxx/yyyy`` (also ``www.doi.org``);
    * any host whose path has a ``doi`` segment followed, at some point,
      by a ``10.xxxx`` segment and one more segment, e.g.
      ``/doi/pdf/10.1145/3371071``.

    The DOI is lowercased in the result.

    Returns ``("doi.org", path, [], None)`` on a match, otherwise ``None``.
    """
    if not host_remap:
        return None

    path_parts = [p for p in path.split("/") if p]

    if host in ("doi.org", "www.doi.org") and len(path_parts) >= 2:
        if path_parts[0].startswith("10."):
            return "doi.org", "/" + "/".join(path_parts[:2]).lower(), [], None

    # Most URLs have no "doi" segment at all: bail out instead of letting
    # list.index raise.
    if "doi" not in path_parts:
        return None

    doi_parts = path_parts[path_parts.index("doi") :]
    # Walk consecutive pairs so a trailing "10.xxxx" with nothing after it
    # is simply not a match.
    for prefix, suffix in zip(doi_parts, doi_parts[1:]):
        if prefix.startswith("10."):
            return (
                "doi.org",
                "/" + prefix.lower() + "/" + suffix.lower(),
                [],
                None,
            )

    return None
def __canonical_djangoproject(
    host, path, parsed_query, fragment, respect_semantics, host_remap
):
    """Strip the version segment from Django documentation URLs.

    On ``docs.djangoproject.com`` a leading ``<major>.<minor>`` or ``dev``
    path segment is removed, and the fragment is dropped.  Nothing is done
    when ``respect_semantics`` is set, since different versions are
    different documents.

    Returns ``(host, path, parsed_query, None)`` on a match, otherwise
    ``None``.
    """
    if respect_semantics or host != "docs.djangoproject.com":
        return None

    path_parts = [p for p in path.split("/") if p]
    if not path_parts:
        # Site root: there is no version segment to remove.
        return None

    version = path_parts[0]
    if version == "dev" or re.match(r"^\d+\.\d+$", version):
        return host, "/" + "/".join(path_parts[1:]), parsed_query, None

    return None
# Alias host -> host it should be reported as.
__host_map = {"edition.cnn.com": "cnn.com"}


def _remap_host(host):
    """Return the host that *host* is an alias of, or *host* itself."""
    if host in __host_map:
        return __host_map[host]
    return host
__canonical_path(scheme, new_path, respect_semantics) 817 | fragment = "" 818 | 819 | fragment = ( 820 | __canonical_fragment(scheme, host, path, fragment, respect_semantics) 821 | or "" 822 | ) 823 | 824 | result = __canonical_amp( 825 | host, path, parsed_query, respect_semantics, host_remap 826 | ) 827 | if result: 828 | host, path, parsed_query = result 829 | host = host or "" 830 | path = path or "" 831 | parsed_query = parsed_query or [] 832 | 833 | if not generic: 834 | host, path, parsed_query, fragment = __canonical_specific_websites( 835 | host, path, parsed_query, fragment, respect_semantics, host_remap 836 | ) 837 | if host_remap: 838 | host = _remap_host(host) 839 | 840 | u = urlparse.ParseResult( 841 | scheme=scheme, 842 | netloc=host, 843 | path=path, 844 | params=u.params, 845 | query=urlparse.urlencode(parsed_query or []), 846 | fragment=fragment, 847 | ) 848 | 849 | return Result(u) 850 | -------------------------------------------------------------------------------- /src/py.typed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xojoc/cleanurl/fd67294fb5ca71d541b30c7d5c143318c7b4fb45/src/py.typed -------------------------------------------------------------------------------- /src/test_cleanurl.py: -------------------------------------------------------------------------------- 1 | import cleanurl 2 | import unittest 3 | 4 | 5 | class Clean(unittest.TestCase): 6 | def test_result(self): 7 | r = cleanurl.cleanurl("hTTps://gnu.org") 8 | self.assertEqual(r.scheme, "https") 9 | self.assertEqual(r.url, "https://gnu.org") 10 | self.assertEqual(r.schemeless_url, "gnu.org") 11 | 12 | r = cleanurl.cleanurl("//gnu.org") 13 | self.assertEqual(r.scheme, "") 14 | self.assertEqual(r.url, "//gnu.org") 15 | self.assertEqual(r.schemeless_url, "gnu.org") 16 | 17 | r = cleanurl.cleanurl("gnu.org") 18 | self.assertEqual(r.scheme, "") 19 | self.assertEqual(r.parsed_url.netloc, "") 20 | 
self.assertEqual(r.parsed_url.path, "gnu.org") 21 | self.assertEqual(r.url, "gnu.org") 22 | self.assertEqual(r.schemeless_url, "gnu.org") 23 | 24 | def test_semantics(self): 25 | urls = [ 26 | "hTTps://www...xOjoC.pw./blog", 27 | "www.xojoc.pw/blog", 28 | "https://www.xojoc.pw/blog/////focus.html", 29 | "www.xojoc.pw/blog/focus.html", 30 | "https://www.xojoc.pw//.././///b/../a.html", 31 | "www.xojoc.pw/a.html", 32 | "https://www.xojoc.pw/blog/focus.html?utm_content=buffercf3b2&utm_medium=social&utm_source=snapchat.com&utm_campaign=buffe", 33 | "www.xojoc.pw/blog/focus.html", 34 | "https://web.archive.org/web/20200103092739/https://www.xojoc.pw/blog/focus.html", 35 | "www.xojoc.pw/blog/focus.html", 36 | "https://www.twitter.com/#!wikileaks/status/1255304335887646721", 37 | "www.twitter.com/i/status/1255304335887646721", 38 | "https://www.google.com/amp/www.example.com/amp/doc.html", 39 | "www.example.com/doc.html", 40 | "https://example-com.cdn.ampproject.org/c/s/example.com/g?value=Hello%20World", 41 | "example.com/g?value=Hello+World", 42 | "https://example-com.cdn.ampproject.org/i/example.com/logo.png", 43 | "example.com/logo.png", 44 | "https://mastodon.social/web/@gitea@social.gitea.io/107576792277055419", 45 | "social.gitea.io/@gitea/107576792277055419", 46 | "https://mastodon.social/web/@compsci_discussions/107795852426456992", 47 | "mastodon.social/@compsci_discussions/107795852426456992", 48 | "https://nitter.net/AdamCSharp/status/1473035981511180291", 49 | "twitter.com/i/status/1473035981511180291", 50 | "https://twitter.com/hashtag/swiftui?src=hash", 51 | "twitter.com/hashtag/swiftui", 52 | "https://en.m.wikipedia.org/wiki/Daphne_Caruana_Galizia", 53 | "en.wikipedia.org/wiki/Daphne_Caruana_Galizia", 54 | "https://stackoverflow.com/questions/69503317/bubble-sort-slower-with-o3-than-o2-with-gcc", 55 | "stackoverflow.com/q/69503317", 56 | 
"https://aviation.stackexchange.com/questions/71119/would-converting-a-lazair-ultralight-to-4-x-3-hp-engines-and-using-the-props-as", 57 | "aviation.stackexchange.com/q/71119", 58 | "https://example.com/#keep-fragment", 59 | "example.com/#keep-fragment", 60 | "https://old.reddit.com/r/wallstreetbets/comments/sv6clr/there_wont_be_a_war_in_ukraine_because_russia/", 61 | "reddit.com/r/wallstreetbets/comments/sv6clr", 62 | "https://www.amazon.it/Free-Freedom-Paperback-Stallmans-Software-ebook/dp/B006GCNP5S/ref=sr_1_2?__mk_it_IT=%C3%85M%C3%85%C5%BD%C3%95%C3%91&crid=A32SDX7PEZAW&keywords=richard+stallman&qid=1645805689&sprefix=richard+stallman%2Caps%2C154&sr=8-2", 63 | "amazon.it/dp/B006GCNP5S", 64 | "https://amazon.com/dp/B006GCNP5S?keywords=richard+stallman", 65 | "amazon.com/dp/B006GCNP5S", 66 | "https://amazon.com/dp/B006GCNP5S", 67 | "amazon.com/dp/B006GCNP5S", 68 | "about:reader?url=https%3A%2F%2Fmedium.com%2Fstories-from-fawrakh%2Ftales-of-a-hybrid-generation-8ccc853cbb77", 69 | "medium.com/p/8ccc853cbb77", 70 | "https://lwn.net/SubscriberLink/909887/c69ee127309e50e3/", 71 | "lwn.net/Articles/909887", 72 | "https://lwn.net/SubscriberLink/909887/14eac5b0b6f59131/", 73 | "lwn.net/Articles/909887", 74 | "https://store.google.com/category/phones?hl=en-US", 75 | "store.google.com/category/phones", 76 | "https://store.google.com/category/phones?hl=not-lang", 77 | "store.google.com/category/phones?hl=not-lang", 78 | "https://dl.acm.org/doi/pdf/10.1145/3371071", 79 | "doi.org/10.1145/3371071", 80 | "https://dl.acm.org/doi/10.1145/3371071", 81 | "doi.org/10.1145/3371071", 82 | "https://www.tandfonline.com/doi/abs/10.1080/03085147.2019.1678262", 83 | "doi.org/10.1080/03085147.2019.1678262", 84 | "https://doi.org/10.1000/182", 85 | "doi.org/10.1000/182", 86 | "https://arxiv.org/pdf/2210.07230.pdf", 87 | "arxiv.org/abs/2210.07230", 88 | "https://thenewstack.io/rust-vs-go-why-theyre-better-together/?s=09", 89 | "thenewstack.io/rust-vs-go-why-theyre-better-together", 90 | 
"https://groups.google.com/forum/#!topic/mozilla.dev.platform/1PHhxBxSehQ", 91 | "groups.google.com/g/mozilla.dev.platform/c/1PHhxBxSehQ", 92 | "https://groups.google.com/forum/?utm_term=0_62dc6ea1a0-4367aed1fd-246207570#!msg/mi.jobs/poxlcw8udk4/_ghzqb9sg9gj", 93 | "groups.google.com/g/mi.jobs/c/poxlcw8udk4/m/_ghzqb9sg9gj", 94 | "https://groups.google.com/forum/#!forum/golang-nuts", 95 | "groups.google.com/g/golang-nuts", 96 | "https://www.typescriptlang.org/play#code/JYOwLgpgTgZghgYwgAjMg3gWAFDL8uAIwQC4MBzMgZzClHIF8BuHfZAEwhjPXbJACuAW0LRmOBjhwIA9iBrJuyAApwoYYHAA2AHjAA+ZAF4MrfEVIUyAcgAW1hgBozeTkt5kA7JOzjs0uQU4Y2RrC2sCKmQAawgATxkYVBZ-bFl5NCgQmABtOABdFNyCkKgmIA", 97 | "www.typescriptlang.org/play?code=JYOwLgpgTgZghgYwgAjMg3gWAFDL8uAIwQC4MBzMgZzClHIF8BuHfZAEwhjPXbJACuAW0LRmOBjhwIA9iBrJuyAApwoYYHAA2AHjAA%2BZAF4MrfEVIUyAcgAW1hgBozeTkt5kA7JOzjs0uQU4Y2RrC2sCKmQAawgATxkYVBZ-bFl5NCgQmABtOABdFNyCkKgmIA", 98 | ] 99 | 100 | for u, r in zip(urls[0::2], urls[1::2]): 101 | c = cleanurl.cleanurl(u, generic=False, respect_semantics=True) 102 | self.assertEqual(c.schemeless_url, r, msg=u) 103 | 104 | self.assertEqual( 105 | c.schemeless_url, 106 | cleanurl.cleanurl( 107 | c.url, generic=False, respect_semantics=True 108 | ).schemeless_url, 109 | msg=f"Clean clean {c.schemeless_url}", 110 | ) 111 | 112 | def test_semantics_no_host_remap(self): 113 | urls = [ 114 | "https://mastodon.social/web/@gitea@social.gitea.io/107576792277055419", 115 | "mastodon.social/@gitea@social.gitea.io/107576792277055419", 116 | "https://nitter.net/AdamCSharp/status/1473035981511180291", 117 | "nitter.net/i/status/1473035981511180291", 118 | "https://bgolus.medium.com/the-quest-for-very-wide-outlines-ba82ed442cd9", 119 | "bgolus.medium.com/ba82ed442cd9", 120 | ] 121 | 122 | for u, r in zip(urls[0::2], urls[1::2]): 123 | c = cleanurl.cleanurl( 124 | u, generic=False, respect_semantics=True, host_remap=False 125 | ) 126 | self.assertEqual(c.schemeless_url, r, msg=u) 127 | 128 | self.assertEqual( 129 | 
c.schemeless_url, 130 | cleanurl.cleanurl( 131 | c.url, 132 | generic=False, 133 | respect_semantics=True, 134 | host_remap=False, 135 | ).schemeless_url, 136 | msg=f"Clean clean {c.schemeless_url}", 137 | ) 138 | 139 | def test_no_semantics(self): 140 | urls = [ 141 | "https://medium.com/swlh/caching-and-scaling-django-dc80a54012", 142 | "medium.com/p/dc80a54012", 143 | "https://bgolus.medium.com/the-quest-for-very-wide-outlines-ba82ed442cd9", 144 | "medium.com/p/ba82ed442cd9", 145 | "http://www.path-normalization.com/a///index.html////", 146 | "path-normalization.com/a", 147 | "https://www.youtube.com/watch?v=71SsVUmT1ys&ignore=query", 148 | "youtu.be/71ssvumt1ys", 149 | "https://www.xojoc.pw/blog/////focus.html", 150 | "xojoc.pw/blog/focus", 151 | "https://web.archive.org/web/20200103092739/https://www.xojoc.pw/blog/focus.html", 152 | "xojoc.pw/blog/focus", 153 | "https://twitter.com/#!wikileaks/status/1255304335887646721", 154 | "twitter.com/i/status/1255304335887646721", 155 | "https://threadreaderapp.com/thread/1453753924960219145", 156 | "twitter.com/i/status/1453753924960219145", 157 | "https://twitter.com/RustDiscussions/status/1448994137504686086?s=19", 158 | "twitter.com/i/status/1448994137504686086", 159 | "http://twitter.com/home", 160 | "twitter.com", 161 | "https://github.com/xojoc/discussions/tree/master", 162 | "github.com/xojoc/discussions", 163 | "https://github.com/satwikkansal/wtfpython/blob/master/readme.md", 164 | "github.com/satwikkansal/wtfpython", 165 | "https://www.nytimes.com/2006/10/11/technology/11yahoo.html?ex=1318219200&en=538f73d9faa9d263&ei=5090&partner=rssuserland&emc=rss", 166 | "nytimes.com/2006/10/11/technology/11yahoo", 167 | "https://open.nytimes.com/tracking-covid-19-from-hundreds-of-sources-one-extracted-record-at-a-time-dd8cbd31f9b4", 168 | "open.nytimes.com/dd8cbd31f9b4", 169 | 
"https://www.techcrunch.com/2009/05/30/vidoop-is-dead-employees-getting-computers-in-lieu-of-wages/?awesm=tcrn.ch_2t3&utm_campaign=techcrunch&utm_content=techcrunch-autopost&utm_medium=tcrn.ch-twitter&utm_source=direct-tcrn.ch", 170 | "techcrunch.com/2009/05/30/vidoop-is-dead-employees-getting-computers-in-lieu-of-wages", 171 | "https://dev.tube/video/EZ05e7EMOLM", 172 | "youtu.be/ez05e7emolm", 173 | "https://www.youtube.com/embed/71SsVUmT1ys?ignore=query", 174 | "youtu.be/71ssvumt1ys", 175 | "https://edition.cnn.com/2021/09/29/business/supply-chain-workers/index.html", 176 | "cnn.com/2021/09/29/business/supply-chain-workers", 177 | "https://www.google.com/amp/s/www.cnbc.com/amp/2021/04/27/archegos-hit-to-ubs-stuns-investors-as-shares-slide.html", 178 | "cnbc.com/2021/04/27/archegos-hit-to-ubs-stuns-investors-as-shares-slide", 179 | "https://en.m.wikipedia.org/wiki/Daphne_Caruana_Galizia", 180 | "wikipedia.org/wiki/daphne_caruana_galizia", 181 | "https://example.com/#remove-fragment", 182 | "example.com", 183 | "https://demo.tumblr.com/post/232/an-example-post", 184 | "demo.tumblr.com/post/232", 185 | "https://xojoc.pw/path#remove-fragment", 186 | "xojoc.pw/path", 187 | "http://www.sbcl.org/news.html#2.2.5", 188 | "sbcl.org/news#2.2.5", 189 | "https://www.cloudflare.com/it-it/learning/security/glossary/what-is-bgp/", 190 | "cloudflare.com/learning/security/glossary/what-is-bgp", 191 | "https://www.typescriptlang.org/play?#code/Base64", 192 | "typescriptlang.org/play?code=Base64", 193 | "https://docs.djangoproject.com/en/4.0/howto/deployment/asgi/", 194 | "docs.djangoproject.com/howto/deployment/asgi", 195 | ] 196 | 197 | for u, r in zip(urls[0::2], urls[1::2]): 198 | c = cleanurl.cleanurl(u, generic=False, respect_semantics=False) 199 | self.assertEqual(c.schemeless_url, r) 200 | 201 | self.assertEqual( 202 | c.schemeless_url, 203 | cleanurl.cleanurl( 204 | c.url, generic=False, respect_semantics=False 205 | ).schemeless_url, 206 | msg=f"Clean clean 
{c.schemeless_url}", 207 | ) 208 | --------------------------------------------------------------------------------