├── .github └── workflows │ ├── ci-build-and-install-gem.yml │ └── ci.yml ├── .gitignore ├── .rspec ├── CHANGELOG.md ├── Gemfile ├── LICENSE ├── README.md ├── Rakefile ├── examples └── url.rb ├── lib └── twingly │ ├── public_suffix_list.rb │ ├── url.rb │ ├── url │ ├── error.rb │ ├── hasher.rb │ ├── null_url.rb │ └── utilities.rb │ └── version.rb ├── profile ├── Gemfile ├── Rakefile └── profile.rb ├── spec ├── lib │ └── twingly │ │ ├── public_suffix_list_spec.rb │ │ ├── url │ │ ├── hasher_spec.rb │ │ ├── null_url_spec.rb │ │ └── utilities_spec.rb │ │ └── url_spec.rb └── spec_helper.rb └── twingly-url.gemspec /.github/workflows/ci-build-and-install-gem.yml: -------------------------------------------------------------------------------- 1 | name: CI build and install gem 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-22.04 10 | 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v3 14 | 15 | - name: Setup Ruby 16 | uses: ruby/setup-ruby@v1 17 | with: 18 | ruby-version: 3.1.2 19 | 20 | - name: Build and install gem 21 | run: gem build *.gemspec && gem install *.gem 22 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-22.04 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - { ruby: '2.6' } 16 | - { ruby: '2.7' } 17 | - { ruby: '3.0' } 18 | - { ruby: '3.1' } 19 | - { ruby: '3.2' } 20 | - { ruby: head, allow-failure: true } 21 | - { ruby: jruby-9.3 } 22 | - { ruby: jruby-head, allow-failure: true } 23 | 24 | steps: 25 | - name: Checkout code 26 | uses: actions/checkout@v3 27 | 28 | - name: Setup Ruby ${{ matrix.ruby }} 29 | uses: ruby/setup-ruby@v1 30 | with: 31 | ruby-version: ${{ matrix.ruby }} 32 | bundler-cache: true 33 | 34 | - name: Run tests 35 | run: 
bundle exec rake 36 | continue-on-error: ${{ matrix.allow-failure || false }} 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.gem 2 | *.rbc 3 | .bundle 4 | .config 5 | coverage 6 | InstalledFiles 7 | lib/bundler/man 8 | pkg 9 | rdoc 10 | spec/reports 11 | test/tmp 12 | test/version_tmp 13 | /tmp 14 | /profile/tmp 15 | 16 | # YARD artifacts 17 | .yardoc 18 | _yardoc 19 | doc/ 20 | 21 | Gemfile.lock 22 | -------------------------------------------------------------------------------- /.rspec: -------------------------------------------------------------------------------- 1 | --color 2 | --format documentation 3 | --require spec_helper 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [v7.0.1](https://github.com/twingly/twingly-url/tree/v7.0.1) (2022-11-01) 4 | 5 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v7.0.0...v7.0.1) 6 | 7 | **Merged pull requests:** 8 | 9 | - Add missing nil guard in `valid_hostname?` [\#160](https://github.com/twingly/twingly-url/pull/160) ([dentarg](https://github.com/dentarg)) 10 | 11 | ## [v7.0.0](https://github.com/twingly/twingly-url/tree/v7.0.0) (2022-10-14) 12 | 13 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.4...v7.0.0) 14 | 15 | **Implemented enhancements:** 16 | 17 | - Remove `Twingly::URL::Hasher.blogstream_hash` [\#152](https://github.com/twingly/twingly-url/issues/152) 18 | - Normalize URL with multiple consecutive dots [\#125](https://github.com/twingly/twingly-url/issues/125) 19 | 20 | **Fixed bugs:** 21 | 22 | - Bug in normalized\_host in Addressable \(ArgumentError: invalid byte sequence in UTF-8\) [\#62](https://github.com/twingly/twingly-url/issues/62) 23 | 24 | **Merged pull 
requests:** 25 | 26 | - Add more invalid URLs to specs [\#159](https://github.com/twingly/twingly-url/pull/159) ([dentarg](https://github.com/dentarg)) 27 | - Validate the normalized hostname [\#158](https://github.com/twingly/twingly-url/pull/158) ([dentarg](https://github.com/dentarg)) 28 | - Require Ruby \>= 2.6 [\#157](https://github.com/twingly/twingly-url/pull/157) ([Pontus4](https://github.com/Pontus4)) 29 | - CI housekeeping [\#156](https://github.com/twingly/twingly-url/pull/156) ([dentarg](https://github.com/dentarg)) 30 | - Allow use of `public_suffix` 5 [\#155](https://github.com/twingly/twingly-url/pull/155) ([dentarg](https://github.com/dentarg)) 31 | - Test with latest Rubies [\#154](https://github.com/twingly/twingly-url/pull/154) ([roback](https://github.com/roback)) 32 | - Remove `Twingly::URL::Hasher.blogstream_hash` [\#153](https://github.com/twingly/twingly-url/pull/153) ([Chrizpy](https://github.com/Chrizpy)) 33 | - Run CI on latest Rubies [\#151](https://github.com/twingly/twingly-url/pull/151) ([walro](https://github.com/walro)) 34 | 35 | ## [v6.0.4](https://github.com/twingly/twingly-url/tree/v6.0.4) (2021-04-14) 36 | 37 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.3...v6.0.4) 38 | 39 | **Implemented enhancements:** 40 | 41 | - Ruby 2.7 support [\#144](https://github.com/twingly/twingly-url/issues/144) 42 | 43 | **Merged pull requests:** 44 | 45 | - Run CI on Ruby 3.0.0 [\#150](https://github.com/twingly/twingly-url/pull/150) ([walro](https://github.com/walro)) 46 | - Add version constraint for Pry [\#149](https://github.com/twingly/twingly-url/pull/149) ([walro](https://github.com/walro)) 47 | - Run CI on GitHub actions [\#148](https://github.com/twingly/twingly-url/pull/148) ([walro](https://github.com/walro)) 48 | - Test with more recent Rubies [\#146](https://github.com/twingly/twingly-url/pull/146) ([walro](https://github.com/walro)) 49 | 50 | ## [v6.0.3](https://github.com/twingly/twingly-url/tree/v6.0.3) 
(2020-09-21) 51 | 52 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.2...v6.0.3) 53 | 54 | **Fixed bugs:** 55 | 56 | - Cannot published gem using "rake release" [\#139](https://github.com/twingly/twingly-url/issues/139) 57 | 58 | **Merged pull requests:** 59 | 60 | - Update to Ruby 2.7 and drop support for Ruby 2.4 [\#145](https://github.com/twingly/twingly-url/pull/145) ([Pontus4](https://github.com/Pontus4)) 61 | - Bump rake version \(10 -\> 12\) [\#142](https://github.com/twingly/twingly-url/pull/142) ([dentarg](https://github.com/dentarg)) 62 | - Test with latest Ruby versions on Travis [\#140](https://github.com/twingly/twingly-url/pull/140) ([roback](https://github.com/roback)) 63 | 64 | ## [v6.0.2](https://github.com/twingly/twingly-url/tree/v6.0.2) (2019-08-28) 65 | 66 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.1...v6.0.2) 67 | 68 | **Implemented enhancements:** 69 | 70 | - Make CI test gem installation [\#109](https://github.com/twingly/twingly-url/issues/109) 71 | 72 | **Fixed bugs:** 73 | 74 | - Handle some common representations of newlines [\#138](https://github.com/twingly/twingly-url/pull/138) ([dentarg](https://github.com/dentarg)) 75 | 76 | **Merged pull requests:** 77 | 78 | - Add a memory profiler task [\#137](https://github.com/twingly/twingly-url/pull/137) ([jage](https://github.com/jage)) 79 | - Test gem install on TravisCI [\#136](https://github.com/twingly/twingly-url/pull/136) ([roback](https://github.com/roback)) 80 | - Sign in to RubyGems.org before trying to publish [\#134](https://github.com/twingly/twingly-url/pull/134) ([dentarg](https://github.com/dentarg)) 81 | 82 | ## [v6.0.1](https://github.com/twingly/twingly-url/tree/v6.0.1) (2019-03-04) 83 | 84 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.0...v6.0.1) 85 | 86 | **Implemented enhancements:** 87 | 88 | - Update addressable to 2.6 [\#132](https://github.com/twingly/twingly-url/issues/132) 89 | 90 | 
**Merged pull requests:** 91 | 92 | - Accept that parsing downcases the scheme part, allow Addressable 2.6 [\#133](https://github.com/twingly/twingly-url/pull/133) ([dentarg](https://github.com/dentarg)) 93 | 94 | ## [v6.0.0](https://github.com/twingly/twingly-url/tree/v6.0.0) (2019-02-06) 95 | 96 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.1.1...v6.0.0) 97 | 98 | **Fixed bugs:** 99 | 100 | - Array\#uniq does not remove equal Twingly::URLs [\#123](https://github.com/twingly/twingly-url/issues/123) 101 | - uninitialized constant Addressable::IDNA::PunycodeBigOutput \(NameError\) [\#119](https://github.com/twingly/twingly-url/issues/119) 102 | 103 | **Merged pull requests:** 104 | 105 | - Strip input from both space and non-breaking space [\#131](https://github.com/twingly/twingly-url/pull/131) ([dentarg](https://github.com/dentarg)) 106 | - Implement uniqueness \(hash equality\) [\#129](https://github.com/twingly/twingly-url/pull/129) ([jage](https://github.com/jage)) 107 | - Freeze string literals and mutable constants [\#128](https://github.com/twingly/twingly-url/pull/128) ([jage](https://github.com/jage)) 108 | - Support: Ruby 2.6, 2.5, 2.4, drop 2.2, 2.3 [\#127](https://github.com/twingly/twingly-url/pull/127) ([jage](https://github.com/jage)) 109 | - Strip URLs of leading and trailing non-breaking space \(and space, but we already did\) [\#126](https://github.com/twingly/twingly-url/pull/126) ([dentarg](https://github.com/dentarg)) 110 | - Remove Twingly::URL::Hasher.pingloggerdb\_hash [\#124](https://github.com/twingly/twingly-url/pull/124) ([walro](https://github.com/walro)) 111 | - Fix various warnings [\#122](https://github.com/twingly/twingly-url/pull/122) ([walro](https://github.com/walro)) 112 | - Load the pure-Ruby IDNA implementation from Addressable [\#120](https://github.com/twingly/twingly-url/pull/120) ([dentarg](https://github.com/dentarg)) 113 | 114 | ## [v5.1.1](https://github.com/twingly/twingly-url/tree/v5.1.1) 
(2018-02-14) 115 | 116 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.1.0...v5.1.1) 117 | 118 | **Implemented enhancements:** 119 | 120 | - Rework exceptions [\#31](https://github.com/twingly/twingly-url/issues/31) 121 | 122 | **Merged pull requests:** 123 | 124 | - Allow future patch versions of dependencies [\#118](https://github.com/twingly/twingly-url/pull/118) ([dentarg](https://github.com/dentarg)) 125 | - Update PublicSuffix and Addressable [\#117](https://github.com/twingly/twingly-url/pull/117) ([roback](https://github.com/roback)) 126 | - Use latest rubies on Travis CI [\#116](https://github.com/twingly/twingly-url/pull/116) ([dentarg](https://github.com/dentarg)) 127 | - Use latest rubies on Travis CI [\#115](https://github.com/twingly/twingly-url/pull/115) ([walro](https://github.com/walro)) 128 | - Use latest rubies on Travis CI [\#114](https://github.com/twingly/twingly-url/pull/114) ([dentarg](https://github.com/dentarg)) 129 | - Do not blow up on Addressable::IDNA::PunycodeBigOutput [\#113](https://github.com/twingly/twingly-url/pull/113) ([dentarg](https://github.com/dentarg)) 130 | - Bump Ruby versions tested on Travis CI [\#111](https://github.com/twingly/twingly-url/pull/111) ([dentarg](https://github.com/dentarg)) 131 | - Tag exceptions [\#106](https://github.com/twingly/twingly-url/pull/106) ([dentarg](https://github.com/dentarg)) 132 | - Switch to logical requires [\#103](https://github.com/twingly/twingly-url/pull/103) ([dentarg](https://github.com/dentarg)) 133 | 134 | ## [v5.1.0](https://github.com/twingly/twingly-url/tree/v5.1.0) (2017-03-03) 135 | 136 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.0.1...v5.1.0) 137 | 138 | **Implemented enhancements:** 139 | 140 | - Release a new version \(without idn-ruby\) [\#110](https://github.com/twingly/twingly-url/issues/110) 141 | - Unfortunate require in Rakefile \(profile task\), affect specs [\#92](https://github.com/twingly/twingly-url/issues/92) 
142 | - JRuby compatibility \(drop libidn requirement\) [\#66](https://github.com/twingly/twingly-url/issues/66) 143 | 144 | **Fixed bugs:** 145 | 146 | - Dependencies not locked correctly [\#104](https://github.com/twingly/twingly-url/issues/104) 147 | - twingly-url doesn't support IDNA2008, only IDNA2003 \(libidn\) [\#101](https://github.com/twingly/twingly-url/issues/101) 148 | 149 | **Merged pull requests:** 150 | 151 | - Test the latest Ruby releases [\#108](https://github.com/twingly/twingly-url/pull/108) ([dentarg](https://github.com/dentarg)) 152 | - Depend on addressable 2.5.0 and public\_suffix 2.0.3 [\#107](https://github.com/twingly/twingly-url/pull/107) ([dentarg](https://github.com/dentarg)) 153 | - Remove ruby-prof from dev dependencies [\#105](https://github.com/twingly/twingly-url/pull/105) ([dentarg](https://github.com/dentarg)) 154 | - Drop libidn [\#102](https://github.com/twingly/twingly-url/pull/102) ([dentarg](https://github.com/dentarg)) 155 | 156 | ## [v5.0.1](https://github.com/twingly/twingly-url/tree/v5.0.1) (2016-09-19) 157 | 158 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.0.0...v5.0.1) 159 | 160 | **Fixed bugs:** 161 | 162 | - "ArgumentError: invalid byte sequence in US-ASCII" when parsing the public suffix list [\#98](https://github.com/twingly/twingly-url/issues/98) 163 | 164 | **Merged pull requests:** 165 | 166 | - Make sure we always read PSL data as UTF-8 [\#99](https://github.com/twingly/twingly-url/pull/99) ([dentarg](https://github.com/dentarg)) 167 | 168 | ## [v5.0.0](https://github.com/twingly/twingly-url/tree/v5.0.0) (2016-09-16) 169 | 170 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v4.2.0...v5.0.0) 171 | 172 | **Implemented enhancements:** 173 | 174 | - License file [\#91](https://github.com/twingly/twingly-url/issues/91) 175 | - Use PublicSuffix 2.0 [\#85](https://github.com/twingly/twingly-url/issues/85) 176 | - Changelog 
[\#33](https://github.com/twingly/twingly-url/issues/33) 177 | 178 | **Fixed bugs:** 179 | 180 | - NormalizedURL\#to\_s returns punycode, other instance methods does not [\#89](https://github.com/twingly/twingly-url/issues/89) 181 | 182 | **Merged pull requests:** 183 | 184 | - DRY up to urls example [\#95](https://github.com/twingly/twingly-url/pull/95) ([jage](https://github.com/jage)) 185 | - Add changelog [\#93](https://github.com/twingly/twingly-url/pull/93) ([dentarg](https://github.com/dentarg)) 186 | - Ensure normalized IDNA domains return ASCII strings [\#90](https://github.com/twingly/twingly-url/pull/90) ([dentarg](https://github.com/dentarg)) 187 | 188 | ## [v4.2.0](https://github.com/twingly/twingly-url/tree/v4.2.0) (2016-08-31) 189 | 190 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v4.1.0...v4.2.0) 191 | 192 | **Merged pull requests:** 193 | 194 | - Add Twingly::URL\#ttld, "true TLD" getter [\#88](https://github.com/twingly/twingly-url/pull/88) ([dentarg](https://github.com/dentarg)) 195 | - Add example usage to README [\#86](https://github.com/twingly/twingly-url/pull/86) ([dentarg](https://github.com/dentarg)) 196 | 197 | ## [v4.1.0](https://github.com/twingly/twingly-url/tree/v4.1.0) (2016-05-23) 198 | 199 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v4.0.0...v4.1.0) 200 | 201 | **Closed issues:** 202 | 203 | - Expose addressable's \#userinfo [\#73](https://github.com/twingly/twingly-url/issues/73) 204 | 205 | **Merged pull requests:** 206 | 207 | - Expose userinfo, user and password [\#84](https://github.com/twingly/twingly-url/pull/84) ([jage](https://github.com/jage)) 208 | 209 | ## [v4.0.0](https://github.com/twingly/twingly-url/tree/v4.0.0) (2016-02-03) 210 | 211 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v3.0.2...v4.0.0) 212 | 213 | **Implemented enhancements:** 214 | 215 | - Make more methods private [\#60](https://github.com/twingly/twingly-url/issues/60) 216 | 217 | **Fixed 
bugs:** 218 | 219 | - domain, sld, trd \(maybe others\) can be nil [\#52](https://github.com/twingly/twingly-url/issues/52) 220 | 221 | **Merged pull requests:** 222 | 223 | - Temporary fix for UTF-8 bug in addressable [\#79](https://github.com/twingly/twingly-url/pull/79) ([roback](https://github.com/roback)) 224 | - No part of a URL should be nil [\#78](https://github.com/twingly/twingly-url/pull/78) ([roback](https://github.com/roback)) 225 | - The gem should load the version constant [\#75](https://github.com/twingly/twingly-url/pull/75) ([dentarg](https://github.com/dentarg)) 226 | - Make things private [\#69](https://github.com/twingly/twingly-url/pull/69) ([dentarg](https://github.com/dentarg)) 227 | 228 | ## [v3.0.2](https://github.com/twingly/twingly-url/tree/v3.0.2) (2015-11-11) 229 | 230 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v3.0.1...v3.0.2) 231 | 232 | **Fixed bugs:** 233 | 234 | - IDN::Idna::IdnaError: Output would be too large or too small [\#64](https://github.com/twingly/twingly-url/issues/64) 235 | 236 | **Merged pull requests:** 237 | 238 | - Rescue IDN::Idna::IdnaError [\#65](https://github.com/twingly/twingly-url/pull/65) ([dentarg](https://github.com/dentarg)) 239 | 240 | ## [v3.0.1](https://github.com/twingly/twingly-url/tree/v3.0.1) (2015-11-11) 241 | 242 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v3.0.0...v3.0.1) 243 | 244 | **Fixed bugs:** 245 | 246 | - Do not blow up on broken punycode URLs [\#48](https://github.com/twingly/twingly-url/issues/48) 247 | 248 | **Merged pull requests:** 249 | 250 | - Improve punycode handling with libidn [\#63](https://github.com/twingly/twingly-url/pull/63) ([walro](https://github.com/walro)) 251 | 252 | ## [v3.0.0](https://github.com/twingly/twingly-url/tree/v3.0.0) (2015-11-02) 253 | 254 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v2.0.0...v3.0.0) 255 | 256 | **Implemented enhancements:** 257 | 258 | - New major release 
[\#38](https://github.com/twingly/twingly-url/issues/38) 259 | 260 | **Fixed bugs:** 261 | 262 | - require bug [\#56](https://github.com/twingly/twingly-url/issues/56) 263 | - \#valid? doesn't work for protocol-less urls [\#55](https://github.com/twingly/twingly-url/issues/55) 264 | - Drop support for older Ruby versions [\#53](https://github.com/twingly/twingly-url/issues/53) 265 | 266 | **Merged pull requests:** 267 | 268 | - We support 2.2.x [\#59](https://github.com/twingly/twingly-url/pull/59) ([walro](https://github.com/walro)) 269 | - Fix "\#valid? doesn't work for protocol-less urls" [\#58](https://github.com/twingly/twingly-url/pull/58) ([walro](https://github.com/walro)) 270 | - Refactor requires [\#57](https://github.com/twingly/twingly-url/pull/57) ([dentarg](https://github.com/dentarg)) 271 | 272 | ## [v2.0.0](https://github.com/twingly/twingly-url/tree/v2.0.0) (2015-10-26) 273 | 274 | [Full Changelog](https://github.com/twingly/twingly-url/compare/d7cecadd542ce5e7709833e1874d39d644d4d11f...v2.0.0) 275 | 276 | **Implemented enhancements:** 277 | 278 | - Move lib/version.rb to lib/twingly/version.rb [\#50](https://github.com/twingly/twingly-url/issues/50) 279 | - Prettier inspect output [\#43](https://github.com/twingly/twingly-url/issues/43) 280 | - Return objects instead of strings [\#40](https://github.com/twingly/twingly-url/issues/40) 281 | - Do not return nil [\#35](https://github.com/twingly/twingly-url/issues/35) 282 | - Method to extract URLs from text without normalizing [\#34](https://github.com/twingly/twingly-url/issues/34) 283 | - Turn is unmaintained [\#27](https://github.com/twingly/twingly-url/issues/27) 284 | - Should normalize IDN properly [\#17](https://github.com/twingly/twingly-url/issues/17) 285 | - Discrepancy with .NET normalization [\#12](https://github.com/twingly/twingly-url/issues/12) 286 | - Capability for extracting origin part of an URL [\#11](https://github.com/twingly/twingly-url/issues/11) 287 | - Always return 
normalized urls in lower case [\#8](https://github.com/twingly/twingly-url/issues/8) 288 | - Make gem more general [\#6](https://github.com/twingly/twingly-url/issues/6) 289 | 290 | **Fixed bugs:** 291 | 292 | - Ensure proper behaviour for edge-case input data [\#45](https://github.com/twingly/twingly-url/issues/45) 293 | - normalize method can't handle URLs with punycoded TLD [\#28](https://github.com/twingly/twingly-url/issues/28) 294 | - Shoulda-context does not seem to work with Ruby 2.2 [\#26](https://github.com/twingly/twingly-url/issues/26) 295 | - Digest is not threadsafe [\#20](https://github.com/twingly/twingly-url/issues/20) 296 | - Blogspot.com normalization error [\#13](https://github.com/twingly/twingly-url/issues/13) 297 | - Crashes if only a protocol is provided [\#10](https://github.com/twingly/twingly-url/issues/10) 298 | - Can not handle urls with international characters [\#2](https://github.com/twingly/twingly-url/issues/2) 299 | - Add tests [\#1](https://github.com/twingly/twingly-url/issues/1) 300 | 301 | **Closed issues:** 302 | 303 | - Release 1.3.3 [\#22](https://github.com/twingly/twingly-url/issues/22) 304 | - Encrypt HipChat API key in .travis.yml [\#16](https://github.com/twingly/twingly-url/issues/16) 305 | - Always return normalized URLs with lower case scheme [\#9](https://github.com/twingly/twingly-url/issues/9) 306 | - Add test for URL: feedville.com,2007-06-19:/blends/16171 [\#7](https://github.com/twingly/twingly-url/issues/7) 307 | - Make repo public [\#5](https://github.com/twingly/twingly-url/issues/5) 308 | - Add .ruby-version file? 
[\#4](https://github.com/twingly/twingly-url/issues/4) 309 | 310 | **Merged pull requests:** 311 | 312 | - Move version.rb to correct subdir [\#51](https://github.com/twingly/twingly-url/pull/51) ([jage](https://github.com/jage)) 313 | - Implement prettier \#inspect [\#47](https://github.com/twingly/twingly-url/pull/47) ([jage](https://github.com/jage)) 314 | - Work with Twingly::URL objects instead of strings [\#42](https://github.com/twingly/twingly-url/pull/42) ([twingly-mob](https://github.com/twingly-mob)) 315 | - New .extract\_url method which does not normalize [\#41](https://github.com/twingly/twingly-url/pull/41) ([twingly-mob](https://github.com/twingly-mob)) 316 | - Sync known behaviour with .NET [\#37](https://github.com/twingly/twingly-url/pull/37) ([roback](https://github.com/roback)) 317 | - Change from minitest to rspec [\#36](https://github.com/twingly/twingly-url/pull/36) ([roback](https://github.com/roback)) 318 | - Make sure Digest loading is thread-safe [\#32](https://github.com/twingly/twingly-url/pull/32) ([jage](https://github.com/jage)) 319 | - Ensure we have a tmp directory to dump result to [\#30](https://github.com/twingly/twingly-url/pull/30) ([walro](https://github.com/walro)) 320 | - Turn is unmaintained [\#29](https://github.com/twingly/twingly-url/pull/29) ([walro](https://github.com/walro)) 321 | - Downcase URLs in normalization [\#23](https://github.com/twingly/twingly-url/pull/23) ([jage](https://github.com/jage)) 322 | - Twingly::URL::Utilities.remove\_scheme [\#21](https://github.com/twingly/twingly-url/pull/21) ([jage](https://github.com/jage)) 323 | - Fix "gem build" warnings [\#19](https://github.com/twingly/twingly-url/pull/19) ([dentarg](https://github.com/dentarg)) 324 | - Rename gem to twingly-url [\#15](https://github.com/twingly/twingly-url/pull/15) ([jage](https://github.com/jage)) 325 | - Don't add www. 
to blogspot [\#14](https://github.com/twingly/twingly-url/pull/14) ([jage](https://github.com/jage)) 326 | - Tests [\#3](https://github.com/twingly/twingly-url/pull/3) ([jage](https://github.com/jage)) 327 | 328 | 329 | 330 | \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)* 331 | -------------------------------------------------------------------------------- /Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source 'https://rubygems.org/' 4 | 5 | gemspec 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Twingly AB 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Twingly::URL 2 | 3 | [![GitHub Build Status](https://github.com/twingly/twingly-url/workflows/CI/badge.svg?branch=master)](https://github.com/twingly/twingly-url/actions) 4 | 5 | Twingly URL tools. 6 | 7 | * `twingly/url` - Parse and validate URLs 8 | * `Twingly::URL.parse` - Returns one or more `Twingly::URL` instances 9 | * `twingly/url/hasher` - Generate URL hashes suitable for primary keys 10 | * `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest 11 | * `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest 12 | * `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest 13 | * `twingly/url/utilities` - Utilities to work with URLs 14 | * `Twingly::URL::Utilities.extract_valid_urls` - Returns Array of valid `Twingly::URL` 15 | 16 | ## Getting Started 17 | 18 | Install the gem: 19 | 20 | gem install twingly-url 21 | 22 | Usage (this output was created with [`examples/url.rb`][examples]): 23 | 24 | ```ruby 25 | require "twingly/url" 26 | 27 | url = Twingly::URL.parse("http://www.twingly.co.uk/search") 28 | url.scheme # => "http" 29 | url.normalized.scheme # => "http" 30 | url.trd # => "www" 31 | url.normalized.trd # => "www" 32 | url.sld # => "twingly" 33 | url.normalized.sld # => "twingly" 34 | url.tld # => "co.uk" 35 | url.normalized.tld # => "co.uk" 36 | url.ttld # => "uk" 37 | url.normalized.ttld # => "uk" 38 | url.domain # => "twingly.co.uk" 39 | url.normalized.domain # => "twingly.co.uk" 40 | url.host # => "www.twingly.co.uk" 41 | url.normalized.host # => "www.twingly.co.uk" 42 | url.origin # => "http://www.twingly.co.uk" 43 | url.normalized.origin # => "http://www.twingly.co.uk" 44 | url.path # => "/search" 45 | url.normalized.path # => "/search" 46 | url.without_scheme # => "//www.twingly.co.uk/search" 47 | 
url.normalized.without_scheme # => "//www.twingly.co.uk/search" 48 | url.userinfo # => "" 49 | url.normalized.userinfo # => "" 50 | url.user # => "" 51 | url.normalized.user # => "" 52 | url.password # => "" 53 | url.normalized.password # => "" 54 | url.valid? # => "true" 55 | url.normalized.valid? # => "true" 56 | url.to_s # => "http://www.twingly.co.uk/search" 57 | url.normalized.to_s # => "http://www.twingly.co.uk/search" 58 | 59 | url = Twingly::URL.parse("http://räksmörgås.макдональдс.рф/foo") 60 | url.scheme # => "http" 61 | url.normalized.scheme # => "http" 62 | url.trd # => "räksmörgås" 63 | url.normalized.trd # => "xn--rksmrgs-5wao1o" 64 | url.sld # => "макдональдс" 65 | url.normalized.sld # => "xn--80aalb1aicli8a5i" 66 | url.tld # => "рф" 67 | url.normalized.tld # => "xn--p1ai" 68 | url.ttld # => "рф" 69 | url.normalized.ttld # => "xn--p1ai" 70 | url.domain # => "макдональдс.рф" 71 | url.normalized.domain # => "xn--80aalb1aicli8a5i.xn--p1ai" 72 | url.host # => "räksmörgås.макдональдс.рф" 73 | url.normalized.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 74 | url.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 75 | url.normalized.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 76 | url.path # => "/foo" 77 | url.normalized.path # => "/foo" 78 | url.without_scheme # => "//räksmörgås.макдональдс.рф/foo" 79 | url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 80 | url.userinfo # => "" 81 | url.normalized.userinfo # => "" 82 | url.user # => "" 83 | url.normalized.user # => "" 84 | url.password # => "" 85 | url.normalized.password # => "" 86 | url.valid? # => "true" 87 | url.normalized.valid? 
# => "true" 88 | url.to_s # => "http://räksmörgås.макдональдс.рф/foo" 89 | url.normalized.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 90 | 91 | url = Twingly::URL.parse("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo") 92 | url.scheme # => "http" 93 | url.normalized.scheme # => "http" 94 | url.trd # => "xn--rksmrgs-5wao1o" 95 | url.normalized.trd # => "xn--rksmrgs-5wao1o" 96 | url.sld # => "xn--80aalb1aicli8a5i" 97 | url.normalized.sld # => "xn--80aalb1aicli8a5i" 98 | url.tld # => "xn--p1ai" 99 | url.normalized.tld # => "xn--p1ai" 100 | url.ttld # => "xn--p1ai" 101 | url.normalized.ttld # => "xn--p1ai" 102 | url.domain # => "xn--80aalb1aicli8a5i.xn--p1ai" 103 | url.normalized.domain # => "xn--80aalb1aicli8a5i.xn--p1ai" 104 | url.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 105 | url.normalized.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 106 | url.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 107 | url.normalized.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai" 108 | url.path # => "/foo" 109 | url.normalized.path # => "/foo" 110 | url.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 111 | url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 112 | url.userinfo # => "" 113 | url.normalized.userinfo # => "" 114 | url.user # => "" 115 | url.normalized.user # => "" 116 | url.password # => "" 117 | url.normalized.password # => "" 118 | url.valid? # => "true" 119 | url.normalized.valid? 
# => "true" 120 | url.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 121 | url.normalized.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 122 | 123 | url = Twingly::URL.parse("https://admin:correcthorsebatterystaple@example.com/") 124 | url.scheme # => "https" 125 | url.normalized.scheme # => "https" 126 | url.trd # => "" 127 | url.normalized.trd # => "www" 128 | url.sld # => "example" 129 | url.normalized.sld # => "example" 130 | url.tld # => "com" 131 | url.normalized.tld # => "com" 132 | url.ttld # => "com" 133 | url.normalized.ttld # => "com" 134 | url.domain # => "example.com" 135 | url.normalized.domain # => "example.com" 136 | url.host # => "example.com" 137 | url.normalized.host # => "www.example.com" 138 | url.origin # => "https://example.com" 139 | url.normalized.origin # => "https://www.example.com" 140 | url.path # => "/" 141 | url.normalized.path # => "/" 142 | url.without_scheme # => "//admin:correcthorsebatterystaple@example.com/" 143 | url.normalized.without_scheme # => "//admin:correcthorsebatterystaple@www.example.com/" 144 | url.userinfo # => "admin:correcthorsebatterystaple" 145 | url.normalized.userinfo # => "admin:correcthorsebatterystaple" 146 | url.user # => "admin" 147 | url.normalized.user # => "admin" 148 | url.password # => "correcthorsebatterystaple" 149 | url.normalized.password # => "correcthorsebatterystaple" 150 | url.valid? # => "true" 151 | url.normalized.valid? # => "true" 152 | url.to_s # => "https://admin:correcthorsebatterystaple@example.com/" 153 | url.normalized.to_s # => "https://admin:correcthorsebatterystaple@www.example.com/" 154 | ``` 155 | 156 | ### Dependencies 157 | 158 | Only the gems listed in the [Gem Specification](https://github.com/twingly/twingly-url/blob/master/twingly-url.gemspec). 159 | 160 | ## Development 161 | 162 | To inspect the [Public Suffix List], this handy command can be used (also works in projects that use `twingly-url` as a dependency). 
163 | 164 | open $(bundle show public_suffix)/data/list.txt 165 | 166 | [Public Suffix List]: https://github.com/weppos/publicsuffix-ruby 167 | 168 | ## Tests 169 | 170 | Run tests with 171 | 172 | bundle exec rake 173 | 174 | ### Profiling 175 | 176 | There are some profiling tasks available through Rake 177 | 178 | cd profile/ 179 | bundle # Install dependencies 180 | bundle exec rake -T # Show available tasks 181 | 182 | Note that this isn't a benchmark; we're using [ruby-prof] and [memory_profiler], which will slow things down. 183 | 184 | ## Release workflow 185 | 186 | * Update the [examples] in this README if needed, generate the output with 187 | 188 | ruby examples/url.rb 189 | 190 | * Bump the version in `lib/twingly/version.rb` in a commit, no need to push (the release task does that). 191 | 192 | * Ensure you are signed in to RubyGems.org as [twingly][twingly-rubygems] with `gem signin`. 193 | 194 | * Build and [publish](http://guides.rubygems.org/publishing/) the gem. This will create the proper tag in git, push the commit and tag and upload to RubyGems. 195 | 196 | bundle exec rake release 197 | 198 | * Update the changelog with [GitHub Changelog Generator](https://github.com/github-changelog-generator/github-changelog-generator) (`gem install github_changelog_generator` if you don't have it, set `CHANGELOG_GITHUB_TOKEN` to a personal access token to avoid rate limiting by GitHub). This command will update `CHANGELOG.md`. You need to commit and push manually.
199 | 200 | github_changelog_generator 201 | 202 | [twingly-rubygems]: https://rubygems.org/profiles/twingly 203 | [ruby-prof]: http://ruby-prof.rubyforge.org/ 204 | [memory_profiler]: https://github.com/SamSaffron/memory_profiler 205 | [examples]: examples/url.rb 206 | -------------------------------------------------------------------------------- /Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | # Bundler rake tasks to handle gem releases 4 | require "bundler/gem_tasks" 5 | 6 | begin 7 | require "rspec/core/rake_task" 8 | 9 | spec_files = Dir.glob(File.join("spec/**", "*_spec.rb")) 10 | spec_tasks = [] 11 | 12 | namespace(:spec) do 13 | spec_files.each do |spec_file| 14 | task_name = File.basename(spec_file, ".rb").to_sym 15 | 16 | spec_tasks << "spec:#{task_name}" 17 | 18 | RSpec::Core::RakeTask.new(task_name) do |task| 19 | task.pattern = spec_file 20 | end 21 | end 22 | end 23 | 24 | task default: spec_tasks.shuffle 25 | rescue LoadError 26 | end 27 | -------------------------------------------------------------------------------- /examples/url.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "bundler/setup" 4 | require_relative "../lib/twingly/url" 5 | 6 | def print_url_details(url_as_string) 7 | url = Twingly::URL.parse(url_as_string) 8 | 9 | puts "url = Twingly::URL.parse(\"#{url_as_string}\")" 10 | puts "url.scheme # => \"#{url.scheme}\"" 11 | puts "url.normalized.scheme # => \"#{url.normalized.scheme}\"" 12 | puts "url.trd # => \"#{url.trd}\"" 13 | puts "url.normalized.trd # => \"#{url.normalized.trd}\"" 14 | puts "url.sld # => \"#{url.sld}\"" 15 | puts "url.normalized.sld # => \"#{url.normalized.sld}\"" 16 | puts "url.tld # => \"#{url.tld}\"" 17 | puts "url.normalized.tld # => \"#{url.normalized.tld}\"" 18 | puts "url.ttld # => \"#{url.ttld}\"" 19 | puts "url.normalized.ttld # => 
\"#{url.normalized.ttld}\"" 20 | puts "url.domain # => \"#{url.domain}\"" 21 | puts "url.normalized.domain # => \"#{url.normalized.domain}\"" 22 | puts "url.host # => \"#{url.host}\"" 23 | puts "url.normalized.host # => \"#{url.normalized.host}\"" 24 | puts "url.origin # => \"#{url.origin}\"" 25 | puts "url.normalized.origin # => \"#{url.normalized.origin}\"" 26 | puts "url.path # => \"#{url.path}\"" 27 | puts "url.normalized.path # => \"#{url.normalized.path}\"" 28 | puts "url.without_scheme # => \"#{url.without_scheme}\"" 29 | puts "url.normalized.without_scheme # => \"#{url.normalized.without_scheme}\"" 30 | puts "url.userinfo # => \"#{url.userinfo}\"" 31 | puts "url.normalized.userinfo # => \"#{url.normalized.userinfo}\"" 32 | puts "url.user # => \"#{url.user}\"" 33 | puts "url.normalized.user # => \"#{url.normalized.user}\"" 34 | puts "url.password # => \"#{url.password}\"" 35 | puts "url.normalized.password # => \"#{url.normalized.password}\"" 36 | puts "url.valid? # => \"#{url.valid?}\"" 37 | puts "url.normalized.valid? 
# => \"#{url.normalized.valid?}\"" 38 | puts "url.to_s # => \"#{url.to_s}\"" 39 | puts "url.normalized.to_s # => \"#{url.normalized.to_s}\"" 40 | end 41 | 42 | puts "require \"twingly/url\"" 43 | puts 44 | print_url_details("http://www.twingly.co.uk/search") 45 | puts 46 | print_url_details("http://räksmörgås.макдональдс.рф/foo") 47 | puts 48 | print_url_details("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo") 49 | puts 50 | print_url_details("https://admin:correcthorsebatterystaple@example.com/") 51 | -------------------------------------------------------------------------------- /lib/twingly/public_suffix_list.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "addressable/idna" 4 | require "public_suffix" 5 | 6 | module Twingly 7 | class PublicSuffixList 8 | ACE_PREFIX = /\Axn\-\-/i.freeze 9 | 10 | private_constant :ACE_PREFIX 11 | 12 | # Extend the PSL with ASCII form of all internationalized domain names 13 | def self.with_punycoded_names(encoding: Encoding::UTF_8) 14 | list_path = PublicSuffix::List::DEFAULT_LIST_PATH 15 | list_data = File.read(list_path, encoding: encoding) 16 | list = PublicSuffix::List.parse(list_data, private_domains: false) 17 | 18 | punycoded_names(list).each do |punycoded_name| 19 | new_rule = PublicSuffix::Rule.factory(punycoded_name) 20 | list.add(new_rule) 21 | end 22 | 23 | list 24 | end 25 | 26 | private_class_method \ 27 | def self.punycoded_names(list) 28 | names = list.each.map { |rule| Addressable::IDNA.to_ascii(rule.value) } 29 | names.select { |name| punycoded_name?(name) } 30 | end 31 | 32 | private_class_method \ 33 | def self.punycoded_name?(name) 34 | PublicSuffix::Domain.name_to_labels(name).any? 
do |label| 35 | label =~ ACE_PREFIX 36 | end 37 | end 38 | end 39 | end 40 | -------------------------------------------------------------------------------- /lib/twingly/url.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "addressable/idna/pure" 4 | require "addressable/uri" 5 | require "public_suffix" 6 | 7 | require "twingly/public_suffix_list" 8 | require "twingly/url/null_url" 9 | require "twingly/url/error" 10 | require "twingly/version" 11 | 12 | module Twingly 13 | class URL 14 | include Comparable 15 | 16 | ACCEPTED_SCHEMES = /\Ahttps?\z/i.freeze 17 | CUSTOM_PSL = PublicSuffixList.with_punycoded_names 18 | ENDS_WITH_SLASH = /\/+$/.freeze 19 | STARTS_WITH_WWW = /\Awww\./i.freeze 20 | ERRORS_TO_EXTEND = [ 21 | Addressable::IDNA::PunycodeBigOutput, 22 | Addressable::URI::InvalidURIError, 23 | PublicSuffix::DomainInvalid, 24 | ].freeze 25 | DOT = "." 26 | HYPHEN = "-" 27 | CARRIAGE_RETURN = "\u000D" 28 | LINE_FEED = "\u000A" 29 | NBSP = "\u00A0" 30 | SPACE = "\u0020" 31 | WHITESPACE_CHARS = [ 32 | CARRIAGE_RETURN, 33 | LINE_FEED, 34 | NBSP, 35 | SPACE, 36 | ].join.freeze 37 | LEADING_AND_TRAILING_WHITESPACE = 38 | /\A[#{WHITESPACE_CHARS}]+|[#{WHITESPACE_CHARS}]+\z/.freeze 39 | LETTERS_DIGITS_HYPHEN = /\A[a-zA-Z0-9-]+\z/.freeze 40 | 41 | private_constant :ACCEPTED_SCHEMES 42 | private_constant :CUSTOM_PSL 43 | private_constant :STARTS_WITH_WWW 44 | private_constant :ENDS_WITH_SLASH 45 | private_constant :ERRORS_TO_EXTEND 46 | private_constant :DOT 47 | private_constant :HYPHEN 48 | private_constant :NBSP 49 | private_constant :SPACE 50 | private_constant :WHITESPACE_CHARS 51 | private_constant :LEADING_AND_TRAILING_WHITESPACE 52 | private_constant :LETTERS_DIGITS_HYPHEN 53 | 54 | class << self 55 | def parse(potential_url) 56 | internal_parse(potential_url) 57 | rescue Twingly::URL::Error, Twingly::URL::Error::ParseError => error 58 | NullURL.new 59 | rescue Exception => 
error 60 | error.extend(Twingly::URL::Error) 61 | raise 62 | end 63 | 64 | def internal_parse(input) 65 | potential_url = clean_input(input) 66 | addressable_uri = Addressable::URI.heuristic_parse(potential_url) 67 | raise Twingly::URL::Error::ParseError if addressable_uri.nil? 68 | 69 | scheme = addressable_uri.scheme 70 | raise Twingly::URL::Error::ParseError unless scheme =~ ACCEPTED_SCHEMES 71 | 72 | # URLs that can't be normalized should not be valid 73 | try_addressable_normalize(addressable_uri) 74 | 75 | host = addressable_uri.host 76 | public_suffix_domain = PublicSuffix.parse(host, list: CUSTOM_PSL, 77 | default_rule: nil) 78 | raise Twingly::URL::Error::ParseError if public_suffix_domain.nil? 79 | 80 | raise Twingly::URL::Error::ParseError if public_suffix_domain.sld.nil? 81 | 82 | new(addressable_uri, public_suffix_domain) 83 | rescue *ERRORS_TO_EXTEND => error 84 | error.extend(Twingly::URL::Error) 85 | raise 86 | end 87 | 88 | def clean_input(input) 89 | input = String(input) 90 | input = input.scrub 91 | input = strip_whitespace(input) 92 | end 93 | 94 | def strip_whitespace(input) 95 | return input unless input.encoding == Encoding::UTF_8 96 | 97 | input.gsub(LEADING_AND_TRAILING_WHITESPACE, "") 98 | end 99 | 100 | def try_addressable_normalize(addressable_uri) 101 | ascii_host = addressable_uri.normalize.host 102 | raise Twingly::URL::Error::ParseError unless valid_hostname?(ascii_host) 103 | rescue ArgumentError => error 104 | if error.message.include?("invalid byte sequence in UTF-8") 105 | raise Twingly::URL::Error::ParseError 106 | end 107 | 108 | raise 109 | end 110 | 111 | def valid_hostname?(hostname) 112 | return false if hostname.nil? 113 | 114 | # No need to check the TLD, the public suffix list does that 115 | labels = hostname.split(DOT)[0...-1].map(&:to_s) 116 | 117 | labels.all? 
{ |label| valid_label?(label) } 118 | end 119 | 120 | def valid_label?(label) 121 | return false if label.start_with?(HYPHEN) 122 | return false if label.end_with?(HYPHEN) 123 | 124 | label.match?(LETTERS_DIGITS_HYPHEN) 125 | end 126 | 127 | private :new 128 | private :internal_parse 129 | private :clean_input 130 | private :strip_whitespace 131 | private :try_addressable_normalize 132 | private :valid_hostname? 133 | private :valid_label? 134 | end 135 | 136 | def initialize(addressable_uri, public_suffix_domain) 137 | @addressable_uri = addressable_uri 138 | @public_suffix_domain = public_suffix_domain 139 | end 140 | 141 | def scheme 142 | addressable_uri.scheme 143 | end 144 | 145 | def trd 146 | public_suffix_domain.trd.to_s 147 | end 148 | 149 | def sld 150 | public_suffix_domain.sld 151 | end 152 | 153 | def tld 154 | public_suffix_domain.tld 155 | end 156 | 157 | # Many ccTLDs have a second level[1] underneath their ccTLD, use this when 158 | # you don't care about the second level. 159 | # 160 | # [1]: https://en.wikipedia.org/wiki/Second-level_domain 161 | def ttld 162 | tld.split(".").last 163 | end 164 | 165 | def domain 166 | public_suffix_domain.domain 167 | end 168 | 169 | def host 170 | addressable_uri.host 171 | end 172 | 173 | def origin 174 | addressable_uri.origin 175 | end 176 | 177 | def path 178 | addressable_uri.path 179 | end 180 | 181 | def without_scheme 182 | self.to_s.sub(/\A#{scheme}:/, "") 183 | end 184 | 185 | def normalized 186 | normalized_url = addressable_uri.dup 187 | 188 | normalized_url.scheme = normalized_scheme 189 | normalized_url.host = normalized_host 190 | normalized_url.path = normalized_path 191 | 192 | self.class.parse(normalized_url) 193 | end 194 | 195 | def normalized_scheme 196 | scheme.downcase 197 | end 198 | 199 | def normalized_host 200 | host = addressable_uri.normalized_host 201 | domain = public_suffix_domain 202 | 203 | unless domain.subdomain? 
204 | host = "www.#{host}" 205 | end 206 | 207 | host = normalize_blogspot(host, domain) 208 | 209 | host 210 | end 211 | 212 | def normalized_path 213 | path = strip_trailing_slashes(addressable_uri.path) 214 | 215 | (path.empty?) ? "/" : path 216 | end 217 | 218 | def userinfo 219 | addressable_uri.userinfo.to_s 220 | end 221 | 222 | def user 223 | addressable_uri.user.to_s 224 | end 225 | 226 | def password 227 | addressable_uri.password.to_s 228 | end 229 | 230 | def valid? 231 | true 232 | end 233 | 234 | def <=>(other) 235 | self.to_s <=> other.to_s 236 | end 237 | 238 | def eql?(other) 239 | return false unless other.is_a?(self.class) 240 | 241 | self.hash == other.hash 242 | end 243 | 244 | def hash 245 | self.to_s.hash 246 | end 247 | 248 | def to_s 249 | addressable_uri.to_s 250 | end 251 | 252 | def inspect 253 | sprintf("#<%s:0x%x %s>", self.class.name, __id__, self.to_s) 254 | end 255 | 256 | private 257 | 258 | attr_reader :addressable_uri, :public_suffix_domain 259 | 260 | def normalize_blogspot(host, domain) 261 | if domain.sld.downcase == "blogspot" 262 | host.sub(STARTS_WITH_WWW, "").sub(/#{domain.tld}\z/i, "com") 263 | else 264 | host 265 | end 266 | end 267 | 268 | def strip_trailing_slashes(path) 269 | path.sub(ENDS_WITH_SLASH, "") 270 | end 271 | end 272 | end 273 | -------------------------------------------------------------------------------- /lib/twingly/url/error.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Twingly 4 | class URL 5 | module Error 6 | class ParseError < StandardError 7 | end 8 | end 9 | end 10 | end 11 | -------------------------------------------------------------------------------- /lib/twingly/url/hasher.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require 'digest' 4 | 5 | require "twingly/url" 6 | 7 | module Twingly 8 | class URL 9 | module Hasher 10 | 
module_function 11 | 12 | # Instantiate digest classes in a thread-safe manner 13 | # This is important since we don't know how people will 14 | # use this gem (if they require it in a thread safe way) 15 | MD5_DIGEST = Digest(:MD5) 16 | SHA256_DIGEST = Digest(:SHA256) 17 | 18 | def taskdb_hash(url) 19 | MD5_DIGEST.hexdigest(url)[0..29].upcase 20 | end 21 | 22 | def documentdb_hash(url) 23 | SHA256_DIGEST.digest(url).unpack("L!")[0] 24 | end 25 | 26 | def autopingdb_hash(url) 27 | SHA256_DIGEST.digest(url).unpack("q")[0] 28 | end 29 | end 30 | end 31 | end 32 | -------------------------------------------------------------------------------- /lib/twingly/url/null_url.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Twingly 4 | class URL 5 | class NullURL 6 | include Comparable 7 | 8 | def method_missing(name, *) 9 | error = NoMethodError.new("undefined method `#{name}'") 10 | raise error unless Twingly::URL.instance_methods.include?(name) 11 | 12 | "" 13 | end 14 | 15 | def normalized 16 | self 17 | end 18 | 19 | def valid? 20 | false 21 | end 22 | 23 | def <=>(other) 24 | self.to_s <=> other.to_s 25 | end 26 | 27 | def eql?(other) 28 | return false unless other.is_a?(self.class) 29 | 30 | self.hash == other.hash 31 | end 32 | 33 | def hash 34 | self.to_s.hash 35 | end 36 | 37 | def to_s 38 | "" 39 | end 40 | end 41 | end 42 | end 43 | -------------------------------------------------------------------------------- /lib/twingly/url/utilities.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "twingly/url" 4 | 5 | module Twingly 6 | class URL 7 | module Utilities 8 | module_function 9 | 10 | def extract_valid_urls(text_or_array) 11 | potential_urls = Array(text_or_array).flat_map(&:split) 12 | potential_urls.map do |potential_url| 13 | url = Twingly::URL.parse(potential_url) 14 | url if url.valid? 
15 | end.compact 16 | end 17 | end 18 | end 19 | end 20 | -------------------------------------------------------------------------------- /lib/twingly/version.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | module Twingly 4 | class URL 5 | VERSION = "7.0.1" 6 | end 7 | end 8 | -------------------------------------------------------------------------------- /profile/Gemfile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | source "https://rubygems.org/" 4 | 5 | gem "rake" 6 | gem "ruby-prof" 7 | gem "memory_profiler" 8 | gem "twingly-url", path: "../" 9 | -------------------------------------------------------------------------------- /profile/Rakefile: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "twingly/url" 4 | require_relative "profile" 5 | 6 | namespace :profile do 7 | desc "Profile Twingly::URL.parse and #normalized (file reports)" 8 | task :normalize do |task| 9 | Profile.measure "normalizing a short URL", 1000 do 10 | Twingly::URL.parse('http://www.duh.se/').normalized 11 | end 12 | end 13 | end 14 | 15 | namespace :memory_profile do 16 | desc "Memory Profile Twingly::URL.parse (stdout report)" 17 | task :parse do |task| 18 | MemoryProfile.measure "parsing an URL", 1000 do 19 | Twingly::URL.parse('http://www.twingly.com/') 20 | end 21 | end 22 | end 23 | 24 | task default: "profile:normalize" 25 | -------------------------------------------------------------------------------- /profile/profile.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "ruby-prof" 4 | require "memory_profiler" 5 | 6 | class Profile 7 | def self.measure(name, count, &block) 8 | RubyProf.start 9 | 10 | count.times do 11 | block.call 12 | end 13 | 14 | result = 
RubyProf.stop 15 | result_directory = "tmp" 16 | Dir.mkdir(result_directory) unless File.exists?(result_directory) 17 | printer = RubyProf::MultiPrinter.new(result) 18 | printer.print(path: result_directory) 19 | 20 | puts "Measured #{name} #{count} times" 21 | puts "Generated reports:" 22 | Dir.entries(result_directory).reject { |entry| entry.end_with?(".") }.each do |file| 23 | puts " #{result_directory}/#{file}" 24 | end 25 | end 26 | end 27 | 28 | class MemoryProfile 29 | def self.measure(name, count, &block) 30 | report_options = { 31 | ignore_files: __FILE__ # Ignore this file 32 | } 33 | 34 | MemoryProfiler.start(report_options) 35 | 36 | count.times do 37 | block.call 38 | end 39 | 40 | report = MemoryProfiler.stop 41 | report.pretty_print 42 | end 43 | end 44 | -------------------------------------------------------------------------------- /spec/lib/twingly/public_suffix_list_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | require "twingly/public_suffix_list" 6 | 7 | describe Twingly::PublicSuffixList do 8 | describe ".with_punycoded_names" do 9 | subject { described_class.with_punycoded_names(encoding: encoding) } 10 | 11 | context "when the list is data is read with the default encoding" do 12 | subject { described_class.with_punycoded_names } 13 | 14 | it { is_expected.to be_a(PublicSuffix::List) } 15 | end 16 | 17 | context "when the list data is read as UTF-8" do 18 | let(:encoding) { Encoding::UTF_8 } 19 | 20 | it { is_expected.to be_a(PublicSuffix::List) } 21 | end 22 | 23 | context "when the list data is read as US-ASCII" do 24 | let(:encoding) { Encoding::US_ASCII } 25 | # https://github.com/ruby/ruby/commit/571d21fd4a2e877f49b4ff918832bda9a5e8f91c 26 | let(:expected_error) do 27 | if RUBY_VERSION >= "3.2.0" 28 | Encoding::CompatibilityError 29 | else 30 | ArgumentError 31 | end 32 | end 33 | 34 | it "parsing the data will fail" do 35 
| expect { subject }. 36 | to raise_error(expected_error, "invalid byte sequence in US-ASCII") 37 | end 38 | end 39 | end 40 | end 41 | -------------------------------------------------------------------------------- /spec/lib/twingly/url/hasher_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | require "twingly/url/hasher" 6 | 7 | describe Twingly::URL::Hasher do 8 | describe ".taskdb_hash" do 9 | it "returns a MD5 hexdigest" do 10 | expect(Twingly::URL::Hasher.taskdb_hash("http://blog.twingly.com/")).to eq "B1E2D5AECF6649C2E44D17AEA3E0F4" 11 | end 12 | end 13 | 14 | describe ".documentdb_hash" do 15 | it "returns a SHA256 unsigned long, native endian digest" do 16 | expect(Twingly::URL::Hasher.documentdb_hash("http://blog.twingly.com/")).to eq 15340752212397415993 17 | end 18 | end 19 | 20 | describe ".autopingdb_hash" do 21 | let(:expected) { -3105991861312135623 } 22 | 23 | it "returns a SHA256 64-bit signed, native endian digest" do 24 | expect(Twingly::URL::Hasher.autopingdb_hash("http://blog.twingly.com/")).to eq expected 25 | end 26 | end 27 | end 28 | -------------------------------------------------------------------------------- /spec/lib/twingly/url/null_url_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | require "twingly/url" 6 | 7 | describe Twingly::URL::NullURL do 8 | let(:url) { described_class.new } 9 | 10 | describe "#valid?" do 11 | subject { url.valid? 
} 12 | it { is_expected.to be(false) } 13 | end 14 | 15 | describe "#normalized" do 16 | subject { url.normalized } 17 | it { is_expected.to equal(subject) } 18 | end 19 | 20 | describe "#scheme" do 21 | subject { url.scheme } 22 | it { is_expected.to eq("") } 23 | end 24 | 25 | describe "#trd" do 26 | subject { url.trd } 27 | it { is_expected.to eq("") } 28 | end 29 | 30 | describe "#sld" do 31 | subject { url.sld } 32 | it { is_expected.to eq("") } 33 | end 34 | 35 | describe "#tld" do 36 | subject { url.tld } 37 | it { is_expected.to eq("") } 38 | end 39 | 40 | describe "#ttld" do 41 | subject { url.ttld } 42 | it { is_expected.to eq("") } 43 | end 44 | 45 | describe "#domain" do 46 | subject { url.domain } 47 | it { is_expected.to eq("") } 48 | end 49 | 50 | describe "#host" do 51 | subject { url.host } 52 | it { is_expected.to eq("") } 53 | end 54 | 55 | describe "#origin" do 56 | subject { url.origin } 57 | it { is_expected.to eq("") } 58 | end 59 | 60 | describe "#path" do 61 | subject { url.path } 62 | it { is_expected.to eq("") } 63 | end 64 | 65 | describe "#normalized_path" do 66 | subject { url.normalized_path } 67 | it { is_expected.to eq("") } 68 | end 69 | 70 | describe "#normalized_scheme" do 71 | subject { url.normalized_scheme } 72 | it { is_expected.to eq("") } 73 | end 74 | 75 | describe "#normalized_host" do 76 | subject { url.normalized_host } 77 | it { is_expected.to eq("") } 78 | end 79 | 80 | describe "#userinfo" do 81 | subject { url.userinfo } 82 | it { is_expected.to eq("") } 83 | end 84 | 85 | describe "#user" do 86 | subject { url.user } 87 | it { is_expected.to eq("") } 88 | end 89 | 90 | describe "#password" do 91 | subject { url.password } 92 | it { is_expected.to eq("") } 93 | end 94 | 95 | context "when receiving call for non-existing method on Twingly::URL" do 96 | it "raises an error" do 97 | expect { url.method_does_not_exist }.to raise_error(NoMethodError) 98 | end 99 | end 100 | 101 | describe "uniqueness" do 102 | context "a 
list with multiple NullURLs should only have one unique item" do 103 | subject(:list) { 10.times.map { described_class.new }.uniq } 104 | 105 | it { is_expected.to eq([described_class.new]) } 106 | end 107 | 108 | context "an object and its string representation" do 109 | let(:a) { described_class.new } 110 | let(:b) { described_class.new.to_s } 111 | 112 | it "should be two unique objects" do 113 | expect([a,b].uniq).to eq([a,b]) 114 | end 115 | 116 | describe "#eql?" do 117 | subject { a.eql?(b) } 118 | it { is_expected.to eq(false) } 119 | end 120 | end 121 | end 122 | end 123 | -------------------------------------------------------------------------------- /spec/lib/twingly/url/utilities_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | require "twingly/url/utilities" 6 | 7 | describe Twingly::URL::Utilities do 8 | describe ".extract_valid_urls" do 9 | context "when given a string with URLs" do 10 | it "returns an array of extracted URLs" do 11 | input = "hej hopp http://www.twingly.com banan https://www.wordpress.com/forums/sv äpplen/päron" 12 | actual = described_class.extract_valid_urls(input).map(&:to_s) 13 | expected = %w(http://www.twingly.com https://www.wordpress.com/forums/sv) 14 | 15 | expect(actual).to eq(expected) 16 | end 17 | end 18 | 19 | context "when given an array with URLs" do 20 | it "returns an array of extracted URLs" do 21 | input = %w(hej hopp http://www.twingly.com banan https://www.wordpress.com/forums/sv äpplen/päron) 22 | actual = described_class.extract_valid_urls(input).map(&:to_s) 23 | expected = %w(http://www.twingly.com https://www.wordpress.com/forums/sv) 24 | 25 | expect(actual).to eq(expected) 26 | end 27 | end 28 | 29 | it "always returns an Array" do 30 | response = described_class.extract_valid_urls(nil) 31 | 32 | expect(response).to eq([]) 33 | end 34 | end 35 | end 36 | 
-------------------------------------------------------------------------------- /spec/lib/twingly/url_spec.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | require "spec_helper" 4 | 5 | require "twingly/url" 6 | 7 | def invalid_urls 8 | [ 9 | "http://http", 10 | "http:///", 11 | "http:/", 12 | "http:", 13 | "htttp", 14 | "http://", 15 | "http:X", 16 | "a", 17 | "1", 18 | "?", 19 | 123, 20 | nil, 21 | false, 22 | "", 23 | //, 24 | "feedville.com,2007-06-19:/blends/16171", 25 | "ftp://blog.twingly.com/", 26 | "blablahttp://blog.twingly.com/", 27 | "gopher://blog.twingly.com/", 28 | "\n", 29 | "//www.twingly.com/", 30 | "http://xn--t...-/", 31 | "http://xn--...-", 32 | "leather beltsbelts for menleather beltmens beltsleather belts for menmens beltbelt bucklesblack l...", 33 | "https//.com", 34 | "http://xxx@.com/", 35 | "http://...com", 36 | "http://.ly/xxx", 37 | "http://.com.my/", 38 | "http://.net", 39 | "http://.com.", 40 | "http://.gl/xxx", 41 | "http://.twingly.com/", 42 | "http://www.twingly.", 43 | "http://www..twingly..com/", 44 | "http:// shouldfail.com", 45 | "http://-a.b.co", 46 | "http://a.b-.co", 47 | "http://.www.foo.bar./", 48 | "http://club].no/", 49 | "http://www,google.com", 50 | "http://some_site.net%C2", 51 | "http://+%D5d.some_site.net", 52 | 53 | # Triggers Addressable::IDNA::PunycodeBigOutput 54 | "http://40world-many.ru&passwd=pUXFGc0LS5&subject=%D0%B1%D0%B0%D0%BB%D0%B0%D0%BD%D1%81%D0%B8%D1%80%D0%BE%D0%B2%D0%BA%D0%B0+%D0%BA%D0%B0%D1%80%D0%B4%D0%B0%D0%BD%D0%BD%D0%BE%D0%B3%D0%BE+%D0%B2%D0%B0%D0%BB%D0%B0&commit=Predict&complex=true&complex=false&membrane=false&coil=false&msa_control=all&secStructPred=true&secStructPred=false&falseRate=5&output=opnone&modeller=&seqalign=yes&database=PfamA&eval=0.01&iterations=5&domssea=yes&secpro=yes&pp=yes", 55 | ] 56 | end 57 | 58 | def valid_urls 59 | [ 60 | "http://blog.twingly.com/", 61 | "http://blOg.tWingly.coM/", 62 | 
"https://blog.twingly.com", 63 | "http://3.bp.blogspot.com/_lRbEHeizXlQ/Sf4RdEqCqhI/AAAAAAAAAAw/Pl8nGPsyhXc/s1600-h/images[4].jpg", 64 | "http://xn--zckp1cyg1.sblo.jp/", 65 | "http://eleven.se/mason-pearson-pocket-bristle-nylon-dark-ruby-20683.html&gclid=CjwKEAiAvPGxBRCH3YCgpdbCtmYSJABqHRVw1ZLaelwjepCihWgKkoqgl2t7k0J6J8I1IFp3GYZmKxoCc-nw_wcB?gclid=CjwKEAiAvPGxBRCH3YCgpdbCtmYSJABqHRVw1ZLaelwjepCihWgKkoqgl2t7k0J6J8I1IFp3GYZmKxoCc-nw_wcB", 66 | "http://xn--rksmrgs-5wao1o.josefsson.org/", 67 | "http://räksmörgås.josefßon.org", 68 | "http://user:password@blog.twingly.com/", 69 | "http://:@blog.twingly.com/", 70 | "https://www.foo.ایران.ir/bar", 71 | "https://www.foo.xn--mgba3a4f16a.ir/bar", 72 | "http://AcinusFallumTrompetumNullunCreditumVisumEstAtCuadLongumEtCefallumEst.com", 73 | ] 74 | end 75 | 76 | def leading_and_trailing_whitespace 77 | line_feed = "\u000A" 78 | carriage_return = "\u000D" 79 | non_breaking_space = "\u00A0" 80 | space = "\u0020" 81 | 82 | { 83 | "non-breaking space and space" => [non_breaking_space, space].join, 84 | "non-breaking space" => [non_breaking_space].join, 85 | "non-breaking space, space, non-breaking space" => [non_breaking_space, space, non_breaking_space].join, 86 | "space and non-breaking space" => [space, non_breaking_space].join, 87 | "space, non-breaking space and space" => [space, non_breaking_space, space].join, 88 | 89 | "non-breaking space and line-feed" => [non_breaking_space, line_feed].join, 90 | "line-feed and non-breaking space" => [line_feed, non_breaking_space].join, 91 | "space and line-feed" => [space, line_feed].join, 92 | "line-feed and space" => [line_feed, space].join, 93 | 94 | "non-breaking space and carriage-return" => [non_breaking_space, carriage_return].join, 95 | "carriage-return and non-breaking space" => [carriage_return, non_breaking_space].join, 96 | "space and carriage-return" => [space, carriage_return].join, 97 | "carriage-return and space" => [carriage_return, space].join, 98 | 99 | "carriage-return 
and line-feed" => [carriage_return, line_feed].join, 100 | "line-feed and carriage-return" => [line_feed, carriage_return].join, 101 | } 102 | end 103 | 104 | describe Twingly::URL do 105 | let(:unicode_idn_test_url) do 106 | "http://räksmörgås.макдональдс.рф/foo" 107 | end 108 | 109 | let(:ascii_idn_test_url) do 110 | "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo" 111 | end 112 | 113 | let(:test_url) do 114 | "http://www.blog.twingly.co.uk/2015/07/01/language-detection-changes/" 115 | end 116 | let(:url) { described_class.parse(test_url) } 117 | 118 | describe ".parse" do 119 | subject { url } 120 | 121 | it { is_expected.to be_a(Twingly::URL) } 122 | 123 | context "when re-reraising errors" do 124 | let(:some_exception) { Exception } 125 | 126 | before do 127 | allow(described_class) 128 | .to receive(:internal_parse) 129 | .and_raise(some_exception) 130 | end 131 | 132 | it "always tags the error" do 133 | expect { subject }.to raise_error do |error| 134 | aggregate_failures do 135 | expect(error).to be_instance_of(some_exception) 136 | expect(error).to be_kind_of(Twingly::URL::Error) 137 | end 138 | end 139 | end 140 | end 141 | 142 | context "when given valid urls" do 143 | valid_urls.each do |valid_url| 144 | it "does not ruin the url \"#{valid_url}\"" do 145 | expect(described_class.parse(valid_url).to_s).to eq(valid_url) 146 | end 147 | end 148 | end 149 | 150 | context "when given bad input" do 151 | invalid_urls.each do |invalid_url| 152 | it "returns a NullURL for \"#{invalid_url}\"" do 153 | actual = described_class.parse(invalid_url) 154 | expect(actual).to be_a(Twingly::URL::NullURL) 155 | end 156 | end 157 | end 158 | 159 | context "when given URL with uppercase scheme" do 160 | let(:test_url) { "HTTPS://www.twingly.com/" } 161 | let(:expected) { "https://www.twingly.com/" } 162 | 163 | it "downcases the scheme part" do 164 | expect(subject).to eq(expected) 165 | end 166 | end 167 | 168 | context "when given badly encoded input" do 169 
| let(:badly_encoded_url) { "http://abc.se/öあ\x81b\xE3" } 170 | let(:expected) { "http://abc.se/öあ\uFFFDb\uFFFD" } 171 | let(:actual) { described_class.parse(badly_encoded_url) } 172 | 173 | it "will replace badly encoded characters with unicode replacement character (U+FFFD)" do 174 | expect(actual.to_s).to eq(expected) 175 | end 176 | end 177 | 178 | context "when given ASCII input" do 179 | let(:ascii_url) { (+"http://www.twingly.com/öあ").force_encoding("ASCII-8BIT") } 180 | let(:expected) { "http://www.twingly.com/öあ" } 181 | let(:actual) { described_class.parse(ascii_url).to_s } 182 | 183 | it "can handle it but returns UTF-8" do 184 | expect(actual).to eq(expected) 185 | end 186 | end 187 | 188 | context "with url containing leading and trailing new lines" do 189 | let(:test_url) { "\nhttp://www.twingly.com/blog-data/\r\n" } 190 | let(:expected) { "http://www.twingly.com/blog-data/" } 191 | 192 | it { is_expected.to eq(expected) } 193 | end 194 | 195 | context "with url containing leading and trailing whitespaces" do 196 | let(:test_url) { " http://www.twingly.com/blog-data/ " } 197 | let(:expected) { "http://www.twingly.com/blog-data/" } 198 | 199 | it { is_expected.to eq(expected) } 200 | end 201 | 202 | context "with url containing both newlines and whitespaces" do 203 | let(:test_url) { " \n\r https://anniaksa.wordpress.com/2014/05/19/privy-digging-blogg100/ \r \n " } 204 | let(:expected) { "https://anniaksa.wordpress.com/2014/05/19/privy-digging-blogg100/" } 205 | 206 | it { is_expected.to eq(expected) } 207 | end 208 | 209 | leading_and_trailing_whitespace.each do |whitespace_name, whitespace| 210 | context "with url containing leading and trailing: #{whitespace_name}" do 211 | let(:test_url) { "#{whitespace}https://www.example.com/#{whitespace}" } 212 | let(:expected) { "https://www.example.com/" } 213 | 214 | it { is_expected.to eq(expected) } 215 | end 216 | end 217 | end 218 | 219 | describe ".internal_parse" do 220 | context "when called from the 
outside" do 221 | it "raises an error" do 222 | expect { described_class.internal_parse("a") }. 223 | to raise_error(NoMethodError, /private method `internal_parse' called for/) 224 | end 225 | end 226 | end 227 | 228 | describe ".new" do 229 | context "when called from the outside" do 230 | it "raises an error" do 231 | expect { described_class.new("a", "b") }. 232 | to raise_error(NoMethodError, /private method `new' called for/) 233 | end 234 | end 235 | end 236 | 237 | describe "#scheme" do 238 | subject { url.scheme } 239 | it { is_expected.to eq("http") } 240 | end 241 | 242 | describe "#trd" do 243 | subject { url.trd } 244 | it { is_expected.to eq("www.blog") } 245 | 246 | context "when the url contains no trd" do 247 | let(:test_url){ "http://twingly.com" } 248 | it { is_expected.to eq("") } 249 | end 250 | 251 | context "internationalized domain name" do 252 | describe "given in Unicode" do 253 | let(:test_url) { unicode_idn_test_url } 254 | it { is_expected.to eq("räksmörgås") } 255 | end 256 | 257 | describe "given in ASCII" do 258 | let(:test_url) { ascii_idn_test_url } 259 | it { is_expected.to eq("xn--rksmrgs-5wao1o") } 260 | end 261 | end 262 | end 263 | 264 | describe "#sld" do 265 | subject { url.sld } 266 | it { is_expected.to eq("twingly") } 267 | 268 | context "internationalized domain name" do 269 | describe "given in Unicode" do 270 | let(:test_url) { unicode_idn_test_url } 271 | it { is_expected.to eq("макдональдс") } 272 | end 273 | 274 | describe "given in ASCII" do 275 | let(:test_url) { ascii_idn_test_url } 276 | it { is_expected.to eq("xn--80aalb1aicli8a5i") } 277 | end 278 | end 279 | end 280 | 281 | describe "#tld" do 282 | subject { url.tld } 283 | it { is_expected.to eq("co.uk") } 284 | 285 | context "internationalized domain name" do 286 | describe "given in Unicode" do 287 | let(:test_url) { unicode_idn_test_url } 288 | it { is_expected.to eq("рф") } 289 | end 290 | 291 | describe "given in ASCII" do 292 | let(:test_url) { 
ascii_idn_test_url } 293 | it { is_expected.to eq("xn--p1ai") } 294 | end 295 | 296 | describe "punycoded TLD with multiple levels" do 297 | let(:test_url) { "https://foo.sande.xn--mre-og-romsdal-qqb.no/bar" } 298 | it { is_expected.to eq("sande.xn--mre-og-romsdal-qqb.no") } 299 | end 300 | end 301 | end 302 | 303 | describe "#ttld" do 304 | subject { url.ttld } 305 | it { is_expected.to eq("uk") } 306 | 307 | context "when the TLD is just one level" do 308 | let(:test_url){ "http://twingly.com" } 309 | 310 | it { is_expected.to eq("com") } 311 | end 312 | 313 | context "internationalized domain name" do 314 | describe "given in Unicode" do 315 | let(:test_url) { unicode_idn_test_url } 316 | it { is_expected.to eq("рф") } 317 | end 318 | 319 | describe "given in ASCII" do 320 | let(:test_url) { ascii_idn_test_url } 321 | it { is_expected.to eq("xn--p1ai") } 322 | end 323 | end 324 | end 325 | 326 | describe "#domain" do 327 | subject { url.domain } 328 | it { is_expected.to eq("twingly.co.uk") } 329 | 330 | context "internationalized domain name" do 331 | describe "given in Unicode" do 332 | let(:test_url) { unicode_idn_test_url } 333 | it { is_expected.to eq("макдональдс.рф") } 334 | end 335 | 336 | describe "given in ASCII" do 337 | let(:test_url) { ascii_idn_test_url } 338 | it { is_expected.to eq("xn--80aalb1aicli8a5i.xn--p1ai") } 339 | end 340 | end 341 | end 342 | 343 | describe "#host" do 344 | subject { url.host } 345 | it { is_expected.to eq("www.blog.twingly.co.uk") } 346 | 347 | context "internationalized domain name" do 348 | describe "given in Unicode" do 349 | let(:test_url) { unicode_idn_test_url } 350 | it { is_expected.to eq("räksmörgås.макдональдс.рф") } 351 | end 352 | 353 | describe "given in ASCII" do 354 | let(:test_url) { ascii_idn_test_url } 355 | it { is_expected.to eq("xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai") } 356 | end 357 | end 358 | end 359 | 360 | describe "#origin" do 361 | subject { url.origin } 362 | it { is_expected.to 
eq("http://www.blog.twingly.co.uk") } 363 | 364 | context "internationalized domain name" do 365 | describe "given in Unicode" do 366 | let(:test_url) { unicode_idn_test_url } 367 | it { is_expected.to eq("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai") } 368 | end 369 | 370 | describe "given in ASCII" do 371 | let(:test_url) { ascii_idn_test_url } 372 | it { is_expected.to eq("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai") } 373 | end 374 | end 375 | end 376 | 377 | describe "#path" do 378 | subject { url.path } 379 | it { is_expected.to eq("/2015/07/01/language-detection-changes/") } 380 | end 381 | 382 | describe "#normalized_path" do 383 | subject { url.normalized_path } 384 | it { is_expected.to eq("/2015/07/01/language-detection-changes") } 385 | end 386 | 387 | describe "#normalized_scheme" do 388 | subject { url.normalized_scheme } 389 | it { is_expected.to eq("http") } 390 | end 391 | 392 | describe "#normalized_host" do 393 | subject { url.normalized_host } 394 | it { is_expected.to eq("www.blog.twingly.co.uk") } 395 | end 396 | 397 | describe "#valid?" 
do 398 | invalid_urls.each do |invalid_url| 399 | it "returns false for an invalid URL \"#{invalid_url}\"" do 400 | expect(described_class.parse(invalid_url).valid?).to be false 401 | end 402 | end 403 | 404 | valid_urls.each do |valid_url| 405 | it "returns true for the valid url \"#{valid_url}\"" do 406 | expect(described_class.parse(valid_url).valid?).to be true 407 | end 408 | end 409 | 410 | context "when given nil input" do 411 | it "it returns false" do 412 | expect(described_class.parse(nil).valid?).to be false 413 | end 414 | end 415 | end 416 | 417 | describe "#normalized" do 418 | context "when given valid urls" do 419 | valid_urls.each do |valid_url| 420 | it "does not raise an error for \"#{valid_url}\"" do 421 | actual = described_class.parse(valid_url).normalized 422 | expect(actual).to be_a(Twingly::URL) 423 | end 424 | end 425 | end 426 | 427 | context "when given bad input" do 428 | invalid_urls.each do |invalid_url| 429 | it "returns NullURL for \"#{invalid_url}\"" do 430 | actual = described_class.parse(invalid_url).normalized 431 | expect(actual).to be_a(Twingly::URL::NullURL) 432 | end 433 | end 434 | end 435 | 436 | subject { described_class.parse(url).normalized.to_s } 437 | 438 | context "when given IDN URL with the domain \"straße.de\"" do 439 | let(:test_url) { "http://straße.de" } 440 | let(:normalized_url) { described_class.parse(url).normalized } 441 | 442 | it "does conform to the IDNA2008 protocol" do 443 | expect(normalized_url.domain).to eq("xn--strae-oqa.de") 444 | end 445 | end 446 | 447 | context "with URL that has an internationalized TLD in Unicode" do 448 | let(:test_url) { "https://www.foo.ایران.ir/bar" } 449 | let(:normalized_url) { described_class.parse(url).normalized } 450 | 451 | describe "#scheme" do 452 | subject { normalized_url.scheme } 453 | it { is_expected.to eq("https") } 454 | end 455 | 456 | describe "#trd" do 457 | subject { normalized_url.trd } 458 | it { is_expected.to eq("www") } 459 | end 460 | 461 | 
describe "#sld" do 462 | subject { normalized_url.sld } 463 | it { is_expected.to eq("foo") } 464 | end 465 | 466 | describe "#tld" do 467 | subject { normalized_url.tld } 468 | it { is_expected.to eq("xn--mgba3a4f16a.ir") } 469 | end 470 | 471 | describe "#ttld" do 472 | subject { normalized_url.ttld } 473 | it { is_expected.to eq("ir") } 474 | end 475 | 476 | describe "#domain" do 477 | subject { normalized_url.domain } 478 | it { is_expected.to eq("foo.xn--mgba3a4f16a.ir") } 479 | end 480 | 481 | describe "#host" do 482 | subject { normalized_url.host } 483 | it { is_expected.to eq("www.foo.xn--mgba3a4f16a.ir") } 484 | end 485 | 486 | describe "#origin" do 487 | subject { normalized_url.origin } 488 | it { is_expected.to eq("https://www.foo.xn--mgba3a4f16a.ir") } 489 | end 490 | 491 | describe "#path" do 492 | subject { normalized_url.path } 493 | it { is_expected.to eq("/bar") } 494 | end 495 | end 496 | 497 | context "adds www if host is missing a subdomain" do 498 | let(:url) { "http://twingly.com/" } 499 | let(:expected) { "http://www.twingly.com/" } 500 | 501 | it { is_expected.to eq(expected) } 502 | end 503 | 504 | context "does not add www if the host has a subdomain" do 505 | let(:url) { "http://blog.twingly.com/" } 506 | 507 | it { is_expected.to eq(url) } 508 | end 509 | 510 | context "does not remove www if the host has a subdomain" do 511 | let(:url) { "http://www.blog.twingly.com/" } 512 | 513 | it { is_expected.to eq(url) } 514 | end 515 | 516 | context "keeps www if the host already has it" do 517 | let(:url) { "http://www.twingly.com/" } 518 | 519 | it { is_expected.to eq(url) } 520 | end 521 | 522 | context "ensures that path starts with slash" do 523 | let(:url) { "http://www.twingly.com" } 524 | let(:expected) { "http://www.twingly.com/" } 525 | 526 | it { is_expected.to eq(expected) } 527 | end 528 | 529 | context "ensures that path only starts with single slash" do 530 | let(:url) { "http://www.twingly.com//" } 531 | let(:expected) { 
"http://www.twingly.com/" } 532 | 533 | it { is_expected.to eq(expected) } 534 | end 535 | 536 | context "removes trailing slash from end of path unless path becomes empty" do 537 | let(:url) { "http://www.twingly.com/blog-data/" } 538 | let(:expected) { "http://www.twingly.com/blog-data" } 539 | 540 | it { is_expected.to eq(expected) } 541 | end 542 | 543 | context "does not remove whitespaces from middle of path" do 544 | let(:url) { "http://www.twingly.com/blo g-data/" } 545 | let(:expected) { "http://www.twingly.com/blo g-data" } 546 | 547 | it { is_expected.to eq(expected) } 548 | end 549 | 550 | context "is able to normalize a url with double slash in path" do 551 | let(:url) { "www.twingly.com/path//" } 552 | let(:expected) { "http://www.twingly.com/path" } 553 | 554 | it { is_expected.to eq(expected) } 555 | end 556 | 557 | context "is able to normalize a url without the scheme part" do 558 | let(:url) { "www.twingly.com/" } 559 | let(:expected) { "http://www.twingly.com/" } 560 | 561 | it { is_expected.to eq(expected) } 562 | end 563 | 564 | context "does not return broken URLs" do 565 | let(:url) { "http://www.twingly." } 566 | let(:expected) { "" } 567 | 568 | it { is_expected.to eq(expected) } 569 | end 570 | 571 | context "does not add www. to blogspot URLs" do 572 | let(:url) { "http://jlchen1026.blogspot.com/" } 573 | 574 | it { is_expected.to eq(url) } 575 | end 576 | 577 | context "removes www. 
from blogspot URLs" do 578 | let(:url) { "http://www.jlchen1026.blogspot.com/" } 579 | let(:expected) { "http://jlchen1026.blogspot.com/" } 580 | 581 | it { is_expected.to eq(expected) } 582 | end 583 | 584 | context "rewrites blogspot TLDs to .com" do 585 | let(:url) { "http://WWW.jlchen1026.blogspot.CO.UK/" } 586 | let(:expected) { "http://jlchen1026.blogspot.com/" } 587 | 588 | it { is_expected.to eq(expected) } 589 | end 590 | 591 | context "downcases the scheme part" do 592 | let(:url) { "HTTPS://www.twingly.com/" } 593 | let(:expected) { "https://www.twingly.com/" } 594 | 595 | it { is_expected.to eq(expected) } 596 | end 597 | 598 | context "downcases the domain" do 599 | let(:url) { "http://WWW.TWINGLY.COM/" } 600 | let(:expected) { "http://www.twingly.com/" } 601 | 602 | it { is_expected.to eq(expected) } 603 | end 604 | 605 | context "does not downcase the path" do 606 | let(:url) { "http://www.twingly.com/PaTH" } 607 | 608 | it { is_expected.to eq(url) } 609 | end 610 | 611 | context "does not downcase fragment" do 612 | let(:url) { "http://www.twingly.com/#FRAGment" } 613 | 614 | it { is_expected.to eq(url) } 615 | end 616 | 617 | context "handles URL with ] in it" do 618 | let(:url) { "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy" } 619 | 620 | it { is_expected.to eq(url) } 621 | end 622 | 623 | context "handles URL with reference to another URL in it" do 624 | let(:url) { "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun" } 625 | 626 | it { is_expected.to eq(url) } 627 | end 628 | 629 | context "handles URL with umlauts in host" do 630 | let(:url) { "http://www.åäö.se/" } 631 | let(:expected) { "http://www.xn--4cab6c.se/" } 632 | 633 | it { is_expected.to eq(expected) } 634 | end 635 | 636 | context "handles URL with umlauts in path" do 637 | let(:url) { "http://www.aoo.se/öö" } 638 | 639 | it { is_expected.to 
eq(url) } 640 | end 641 | 642 | context "handles URL with punycoded SLD" do 643 | let(:url) { "http://www.xn--4cab6c.se/" } 644 | 645 | it { is_expected.to eq(url) } 646 | end 647 | 648 | context "handles URL with punycoded TLD" do 649 | let(:url) { "http://example.xn--p1ai/" } 650 | let(:expected) { "http://www.example.xn--p1ai/" } 651 | 652 | it { is_expected.to eq(expected) } 653 | end 654 | 655 | context "converts to a punycoded URL" do 656 | let(:url) { "скраповыймир.рф" } 657 | let(:expected) { "http://www.xn--80aesdcplhhhb0k.xn--p1ai/" } 658 | 659 | it { is_expected.to eq(expected) } 660 | end 661 | 662 | context "does not blow up when there's no URL in the text" do 663 | let(:url) { "Just some text" } 664 | let(:expected) { "" } 665 | 666 | it { is_expected.to eq(expected) } 667 | end 668 | end 669 | 670 | describe "#without_scheme" do 671 | subject { described_class.parse(url).without_scheme } 672 | 673 | context "removes scheme from mixed case HTTP URL" do 674 | let(:url) { "HttP://www.duh.se/" } 675 | let(:expected) { "//www.duh.se/" } 676 | 677 | it { is_expected.to eq(expected) } 678 | end 679 | 680 | context "removes scheme from mixed case HTTPS URL" do 681 | let(:url) { "hTTpS://www.duh.se/" } 682 | let(:expected) { "//www.duh.se/" } 683 | 684 | it { is_expected.to eq(expected) } 685 | end 686 | 687 | context "removes scheme from lowercase HTTP URL" do 688 | let(:url) { "http://www.duh.se/" } 689 | let(:expected) { "//www.duh.se/" } 690 | 691 | it { is_expected.to eq(expected) } 692 | end 693 | 694 | context "removes scheme from lowercase HTTPS URL" do 695 | let(:url) { "https://www.duh.se/" } 696 | let(:expected) { "//www.duh.se/" } 697 | 698 | it { is_expected.to eq(expected) } 699 | end 700 | 701 | context "removes scheme from uppercase HTTP URL" do 702 | let(:url) { "HTTP://WWW.DUH.SE/" } 703 | let(:expected) { "//WWW.DUH.SE/" } 704 | 705 | it { is_expected.to eq(expected) } 706 | end 707 | 708 | context "removes scheme from uppercase HTTPS URL" 
do 709 | let(:url) { "HTTPS://WWW.DUH.SE/" } 710 | let(:expected) { "//WWW.DUH.SE/" } 711 | 712 | it { is_expected.to eq(expected) } 713 | end 714 | 715 | context "removes scheme from URL with non ASCII characters" do 716 | let(:url) { "http://www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα" } 717 | let(:expected) { "//www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα" } 718 | 719 | it { is_expected.to eq(expected) } 720 | end 721 | 722 | context "only removes scheme from HTTP URL" do 723 | let(:url) { "http://feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml" } 724 | let(:expected) { "//feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml" } 725 | 726 | it { is_expected.to eq(expected) } 727 | end 728 | 729 | context "only removes scheme from HTTPS URL" do 730 | let(:url) { "https://feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss" } 731 | let(:expected) { "//feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss" } 732 | 733 | it { is_expected.to eq(expected) } 734 | end 735 | end 736 | 737 | describe "#userinfo" do 738 | subject { described_class.parse(url).userinfo } 739 | 740 | context "without authorisation part in URL" do 741 | let(:url) { "https://blog.twingly.com/" } 742 | 743 | it { is_expected.to eq("") } 744 | end 745 | 746 | context "with user and password part in URL" do 747 | let(:url) { "https://user:password@blog.twingly.com/" } 748 | 749 | it { is_expected.to eq("user:password") } 750 | end 751 | 752 | context "with empty user and empty password in URL" do 753 | let(:url) { "https://:@blog.twingly.com/" } 754 | 755 | it { is_expected.to eq(":") } 756 | end 757 | 758 | context "with user but empty password in URL" do 759 | let(:url) { "https://user:@blog.twingly.com/" } 760 | 761 | it { is_expected.to eq("user:") } 762 | end 763 | 764 | context "with empty user but password in URL" do 765 | let(:url) { "https://:password@blog.twingly.com/" } 766 | 767 | it { is_expected.to 
eq(":password") } 768 | end 769 | end 770 | 771 | describe "#user" do 772 | subject { described_class.parse(url).user } 773 | 774 | context "without authorisation part in URL" do 775 | let(:url) { "https://blog.twingly.com/" } 776 | 777 | it { is_expected.to eq("") } 778 | end 779 | 780 | context "with user and password part in URL" do 781 | let(:url) { "https://user:password@blog.twingly.com/" } 782 | 783 | it { is_expected.to eq("user") } 784 | end 785 | 786 | context "with empty user and empty password in URL" do 787 | let(:url) { "https://:@blog.twingly.com/" } 788 | 789 | it { is_expected.to eq("") } 790 | end 791 | 792 | context "with user but empty password in URL" do 793 | let(:url) { "https://user:@blog.twingly.com/" } 794 | 795 | it { is_expected.to eq("user") } 796 | end 797 | 798 | context "with empty user but password in URL" do 799 | let(:url) { "https://:password@blog.twingly.com/" } 800 | 801 | it { is_expected.to eq("") } 802 | end 803 | end 804 | 805 | describe "#password" do 806 | subject { described_class.parse(url).password } 807 | 808 | context "without authorisation part in URL" do 809 | let(:url) { "https://blog.twingly.com/" } 810 | 811 | it { is_expected.to eq("") } 812 | end 813 | 814 | context "with user and password part in URL" do 815 | let(:url) { "https://user:password@blog.twingly.com/" } 816 | 817 | it { is_expected.to eq("password") } 818 | end 819 | 820 | context "with empty user and empty password in URL" do 821 | let(:url) { "https://:@blog.twingly.com/" } 822 | 823 | it { is_expected.to eq("") } 824 | end 825 | 826 | context "with user but empty password in URL" do 827 | let(:url) { "https://user:@blog.twingly.com/" } 828 | 829 | it { is_expected.to eq("") } 830 | end 831 | 832 | context "with empty user but password in URL" do 833 | let(:url) { "https://:password@blog.twingly.com/" } 834 | 835 | it { is_expected.to eq("password") } 836 | end 837 | end 838 | 839 | describe "#to_s" do 840 | subject { url.to_s } 841 | it { 
is_expected.to eq(test_url) } 842 | end 843 | 844 | describe "comparable methods" do 845 | let(:a) { "http://a.com" } 846 | let(:b) { "http://b.com" } 847 | 848 | describe "#<=>" do 849 | let(:test_urls) { [b, a, b, a, a] } 850 | 851 | subject do 852 | test_urls.map { |url| described_class.parse(url) }.sort.map(&:to_s) 853 | end 854 | 855 | it { is_expected.to eq(test_urls.sort) } 856 | end 857 | 858 | describe "#==" do 859 | context "when parsing the same URLs" do 860 | subject { described_class.parse(a) == described_class.parse(a) } 861 | it { is_expected.to be(true) } 862 | end 863 | 864 | context "when parsing different URLs" do 865 | subject { described_class.parse(a) == described_class.parse(b) } 866 | it { is_expected.to be(false) } 867 | end 868 | end 869 | 870 | describe "#===" do 871 | context "when parsing the same URLs" do 872 | subject { described_class.parse(a) === described_class.parse(a) } 873 | it { is_expected.to be(true) } 874 | end 875 | 876 | context "when parsing different URLs" do 877 | subject { described_class.parse(a) === described_class.parse(b) } 878 | it { is_expected.to be(false) } 879 | end 880 | end 881 | 882 | context "with invalid and valid URLs" do 883 | let(:test_urls) { [b, "", a] } 884 | 885 | subject do 886 | test_urls.map { |url| described_class.parse(url) }.sort.map(&:to_s) 887 | end 888 | 889 | it { is_expected.to eq(test_urls.sort) } 890 | end 891 | end 892 | 893 | describe "uniqueness" do 894 | context do "with the same URL twice" 895 | let(:a) { described_class.parse("https://www.twingly.com/") } 896 | let(:b) { described_class.parse("https://www.google.com/") } 897 | let(:c) { described_class.parse("https://www.twingly.com/") } 898 | 899 | it "should give only unique URLs" do 900 | expect([a,b,c].uniq).to eq([a,b]) 901 | end 902 | end 903 | 904 | context do "two similar URLs, but not exactly the same" 905 | let(:a) { described_class.parse("https://www.twingly.com") } 906 | let(:b) { 
described_class.parse("https://www.twingly.com/") } 907 | 908 | it "should be two unique URLs" do 909 | expect([a,b].uniq).to eq([a,b]) 910 | end 911 | end 912 | 913 | context do "the same URL but with some whitespace should be the same" 914 | let(:a) { described_class.parse(" https://www.twingly.com/") } 915 | let(:b) { described_class.parse("https://www.twingly.com/ ") } 916 | 917 | it "should be one unique URL" do 918 | expect([a,b].uniq).to eq([a]) 919 | end 920 | 921 | describe ".eql?" do 922 | subject { a.eql?(b) } 923 | it { is_expected.to eq(true) } 924 | end 925 | end 926 | 927 | context "an object and its string representation" do 928 | let(:url) { "https://www.twingy.com/" } 929 | let(:a) { described_class.parse(url) } 930 | let(:b) { described_class.parse(url).to_s } 931 | 932 | it "should be two unique objects" do 933 | expect([a,b].uniq).to eq([a,b]) 934 | end 935 | 936 | describe "#eql?" do 937 | subject { a.eql?(b) } 938 | it { is_expected.to eq(false) } 939 | end 940 | end 941 | end 942 | 943 | describe "#inspect" do 944 | let(:url_object) { described_class.parse(url) } 945 | subject { url_object.inspect } 946 | 947 | it { is_expected.to include(url_object.to_s) } 948 | end 949 | end 950 | -------------------------------------------------------------------------------- /spec/spec_helper.rb: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: true 2 | 3 | RSpec.configure do |config| 4 | config.expect_with :rspec do |expectations| 5 | expectations.include_chain_clauses_in_custom_matcher_descriptions = true 6 | end 7 | 8 | config.mock_with :rspec do |mocks| 9 | mocks.verify_partial_doubles = true 10 | end 11 | 12 | config.order = :random 13 | 14 | Kernel.srand config.seed 15 | end 16 | -------------------------------------------------------------------------------- /twingly-url.gemspec: -------------------------------------------------------------------------------- 1 | # frozen_string_literal: 
true 2 | 3 | require File.expand_path('../lib/twingly/version', __FILE__) 4 | 5 | Gem::Specification.new do |s| 6 | s.name = "twingly-url" 7 | s.version = Twingly::URL::VERSION 8 | s.platform = Gem::Platform::RUBY 9 | s.authors = ["Twingly AB"] 10 | s.email = ["support@twingly.com"] 11 | s.homepage = "http://github.com/twingly/twingly-url" 12 | s.summary = "Ruby library for URL handling" 13 | s.description = "Twingly URL tools" 14 | s.license = "MIT" 15 | s.required_ruby_version = ">= 2.6" 16 | 17 | s.add_dependency "addressable", "~> 2.6" 18 | s.add_dependency "public_suffix", ">= 3.0.1", "< 6.0" 19 | 20 | s.add_development_dependency "rake", "~> 12" 21 | s.add_development_dependency "rspec", "~> 3" 22 | s.add_development_dependency "pry", "~> 0" 23 | 24 | s.files = Dir.glob("{lib}/**/*") + %w(README.md) 25 | s.require_path = 'lib' 26 | end 27 | --------------------------------------------------------------------------------