├── .github
    └── workflows
    │   ├── ci-build-and-install-gem.yml
    │   └── ci.yml
├── .gitignore
├── .rspec
├── CHANGELOG.md
├── Gemfile
├── LICENSE
├── README.md
├── Rakefile
├── examples
    └── url.rb
├── lib
    └── twingly
    │   ├── public_suffix_list.rb
    │   ├── url.rb
    │   ├── url
    │       ├── error.rb
    │       ├── hasher.rb
    │       ├── null_url.rb
    │       └── utilities.rb
    │   └── version.rb
├── profile
    ├── Gemfile
    ├── Rakefile
    └── profile.rb
├── spec
    ├── lib
    │   └── twingly
    │   │   ├── public_suffix_list_spec.rb
    │   │   ├── url
    │   │       ├── hasher_spec.rb
    │   │       ├── null_url_spec.rb
    │   │       └── utilities_spec.rb
    │   │   └── url_spec.rb
    └── spec_helper.rb
└── twingly-url.gemspec


/.github/workflows/ci-build-and-install-gem.yml:
--------------------------------------------------------------------------------
 1 | name: CI build and install gem  # checks that the gem builds from its gemspec and installs
 2 | 
 3 | on:  # no branch or path filters: runs for every push and every pull request
 4 |   push:
 5 |   pull_request:
 6 | 
 7 | jobs:
 8 |   build:
 9 |     runs-on: ubuntu-22.04
10 | 
11 |     steps:
12 |     - name: Checkout code
13 |       uses: actions/checkout@v3
14 | 
15 |     - name: Setup Ruby
16 |       uses: ruby/setup-ruby@v1
17 |       with:
18 |         ruby-version: 3.1.2  # one fixed Ruby here; ci.yml covers the version matrix
19 | 
20 |     - name: Build and install gem
21 |       run: gem build *.gemspec && gem install *.gem  # install runs only if the build succeeds
22 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI  # runs `bundle exec rake` once per Ruby listed in the matrix below
 2 | 
 3 | on:  # no branch or path filters: runs for every push and every pull request
 4 |   push:
 5 |   pull_request:
 6 | 
 7 | jobs:
 8 |   build:
 9 |     runs-on: ubuntu-22.04
10 | 
11 |     strategy:
12 |       fail-fast: false  # keep the other matrix jobs running when one of them fails
13 |       matrix:
14 |         include:
15 |           - { ruby: '2.6' }
16 |           - { ruby: '2.7' }
17 |           - { ruby: '3.0' }
18 |           - { ruby: '3.1' }
19 |           - { ruby: '3.2' }
20 |           - { ruby: head, allow-failure: true }  # allow-failure is a custom key read by the "Run tests" step
21 |           - { ruby: jruby-9.3 }
22 |           - { ruby: jruby-head, allow-failure: true }
23 | 
24 |     steps:
25 |     - name: Checkout code
26 |       uses: actions/checkout@v3
27 | 
28 |     - name: Setup Ruby ${{ matrix.ruby }}
29 |       uses: ruby/setup-ruby@v1
30 |       with:
31 |         ruby-version: ${{ matrix.ruby }}
32 |         bundler-cache: true  # setup-ruby runs bundle install and caches the installed gems
33 | 
34 |     - name: Run tests
35 |       run: bundle exec rake
36 |       continue-on-error: ${{ matrix.allow-failure || false }}  # step-level only; false when the entry sets no allow-failure
37 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.gem
 2 | *.rbc
 3 | .bundle
 4 | .config
 5 | coverage
 6 | InstalledFiles
 7 | lib/bundler/man
 8 | pkg
 9 | rdoc
10 | spec/reports
11 | test/tmp
12 | test/version_tmp
13 | /tmp
14 | /profile/tmp
15 | 
16 | # YARD artifacts
17 | .yardoc
18 | _yardoc
19 | doc/
20 | 
21 | Gemfile.lock
22 | 


--------------------------------------------------------------------------------
/.rspec:
--------------------------------------------------------------------------------
1 | --color
2 | --format documentation
3 | --require spec_helper
4 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | ## [v7.0.1](https://github.com/twingly/twingly-url/tree/v7.0.1) (2022-11-01)
  4 | 
  5 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v7.0.0...v7.0.1)
  6 | 
  7 | **Merged pull requests:**
  8 | 
  9 | - Add missing nil guard in `valid_hostname?` [\#160](https://github.com/twingly/twingly-url/pull/160) ([dentarg](https://github.com/dentarg))
 10 | 
 11 | ## [v7.0.0](https://github.com/twingly/twingly-url/tree/v7.0.0) (2022-10-14)
 12 | 
 13 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.4...v7.0.0)
 14 | 
 15 | **Implemented enhancements:**
 16 | 
 17 | - Remove `Twingly::URL::Hasher.blogstream_hash` [\#152](https://github.com/twingly/twingly-url/issues/152)
 18 | - Normalize URL with multiple consecutive dots [\#125](https://github.com/twingly/twingly-url/issues/125)
 19 | 
 20 | **Fixed bugs:**
 21 | 
 22 | - Bug in normalized\_host in Addressable \(ArgumentError: invalid byte sequence in UTF-8\) [\#62](https://github.com/twingly/twingly-url/issues/62)
 23 | 
 24 | **Merged pull requests:**
 25 | 
 26 | - Add more invalid URLs to specs [\#159](https://github.com/twingly/twingly-url/pull/159) ([dentarg](https://github.com/dentarg))
 27 | - Validate the normalized hostname [\#158](https://github.com/twingly/twingly-url/pull/158) ([dentarg](https://github.com/dentarg))
 28 | - Require Ruby \>= 2.6 [\#157](https://github.com/twingly/twingly-url/pull/157) ([Pontus4](https://github.com/Pontus4))
 29 | - CI housekeeping [\#156](https://github.com/twingly/twingly-url/pull/156) ([dentarg](https://github.com/dentarg))
 30 | - Allow use of `public_suffix` 5 [\#155](https://github.com/twingly/twingly-url/pull/155) ([dentarg](https://github.com/dentarg))
 31 | - Test with latest Rubies [\#154](https://github.com/twingly/twingly-url/pull/154) ([roback](https://github.com/roback))
 32 | - Remove `Twingly::URL::Hasher.blogstream_hash` [\#153](https://github.com/twingly/twingly-url/pull/153) ([Chrizpy](https://github.com/Chrizpy))
 33 | - Run CI on latest Rubies [\#151](https://github.com/twingly/twingly-url/pull/151) ([walro](https://github.com/walro))
 34 | 
 35 | ## [v6.0.4](https://github.com/twingly/twingly-url/tree/v6.0.4) (2021-04-14)
 36 | 
 37 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.3...v6.0.4)
 38 | 
 39 | **Implemented enhancements:**
 40 | 
 41 | - Ruby 2.7 support [\#144](https://github.com/twingly/twingly-url/issues/144)
 42 | 
 43 | **Merged pull requests:**
 44 | 
 45 | - Run CI on Ruby 3.0.0 [\#150](https://github.com/twingly/twingly-url/pull/150) ([walro](https://github.com/walro))
 46 | - Add version constraint for Pry [\#149](https://github.com/twingly/twingly-url/pull/149) ([walro](https://github.com/walro))
 47 | - Run CI on GitHub actions [\#148](https://github.com/twingly/twingly-url/pull/148) ([walro](https://github.com/walro))
 48 | - Test with more recent Rubies [\#146](https://github.com/twingly/twingly-url/pull/146) ([walro](https://github.com/walro))
 49 | 
 50 | ## [v6.0.3](https://github.com/twingly/twingly-url/tree/v6.0.3) (2020-09-21)
 51 | 
 52 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.2...v6.0.3)
 53 | 
 54 | **Fixed bugs:**
 55 | 
 56 | - Cannot published gem using "rake release" [\#139](https://github.com/twingly/twingly-url/issues/139)
 57 | 
 58 | **Merged pull requests:**
 59 | 
 60 | - Update to Ruby 2.7 and drop support for Ruby 2.4 [\#145](https://github.com/twingly/twingly-url/pull/145) ([Pontus4](https://github.com/Pontus4))
 61 | - Bump rake version \(10 -\> 12\) [\#142](https://github.com/twingly/twingly-url/pull/142) ([dentarg](https://github.com/dentarg))
 62 | - Test with latest Ruby versions on Travis [\#140](https://github.com/twingly/twingly-url/pull/140) ([roback](https://github.com/roback))
 63 | 
 64 | ## [v6.0.2](https://github.com/twingly/twingly-url/tree/v6.0.2) (2019-08-28)
 65 | 
 66 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.1...v6.0.2)
 67 | 
 68 | **Implemented enhancements:**
 69 | 
 70 | - Make CI test gem installation [\#109](https://github.com/twingly/twingly-url/issues/109)
 71 | 
 72 | **Fixed bugs:**
 73 | 
 74 | - Handle some common representations of newlines [\#138](https://github.com/twingly/twingly-url/pull/138) ([dentarg](https://github.com/dentarg))
 75 | 
 76 | **Merged pull requests:**
 77 | 
 78 | - Add a memory profiler task [\#137](https://github.com/twingly/twingly-url/pull/137) ([jage](https://github.com/jage))
 79 | - Test gem install on TravisCI [\#136](https://github.com/twingly/twingly-url/pull/136) ([roback](https://github.com/roback))
 80 | - Sign in to RubyGems.org before trying to publish [\#134](https://github.com/twingly/twingly-url/pull/134) ([dentarg](https://github.com/dentarg))
 81 | 
 82 | ## [v6.0.1](https://github.com/twingly/twingly-url/tree/v6.0.1) (2019-03-04)
 83 | 
 84 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v6.0.0...v6.0.1)
 85 | 
 86 | **Implemented enhancements:**
 87 | 
 88 | - Update addressable to 2.6 [\#132](https://github.com/twingly/twingly-url/issues/132)
 89 | 
 90 | **Merged pull requests:**
 91 | 
 92 | - Accept that parsing downcases the scheme part, allow Addressable 2.6 [\#133](https://github.com/twingly/twingly-url/pull/133) ([dentarg](https://github.com/dentarg))
 93 | 
 94 | ## [v6.0.0](https://github.com/twingly/twingly-url/tree/v6.0.0) (2019-02-06)
 95 | 
 96 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.1.1...v6.0.0)
 97 | 
 98 | **Fixed bugs:**
 99 | 
100 | - Array\#uniq does not remove equal Twingly::URLs [\#123](https://github.com/twingly/twingly-url/issues/123)
101 | - uninitialized constant Addressable::IDNA::PunycodeBigOutput \(NameError\) [\#119](https://github.com/twingly/twingly-url/issues/119)
102 | 
103 | **Merged pull requests:**
104 | 
105 | - Strip input from both space and non-breaking space [\#131](https://github.com/twingly/twingly-url/pull/131) ([dentarg](https://github.com/dentarg))
106 | - Implement uniqueness \(hash equality\) [\#129](https://github.com/twingly/twingly-url/pull/129) ([jage](https://github.com/jage))
107 | - Freeze string literals and mutable constants [\#128](https://github.com/twingly/twingly-url/pull/128) ([jage](https://github.com/jage))
108 | - Support: Ruby 2.6, 2.5, 2.4, drop 2.2, 2.3 [\#127](https://github.com/twingly/twingly-url/pull/127) ([jage](https://github.com/jage))
109 | - Strip URLs of leading and trailing non-breaking space \(and space, but we already did\) [\#126](https://github.com/twingly/twingly-url/pull/126) ([dentarg](https://github.com/dentarg))
110 | - Remove Twingly::URL::Hasher.pingloggerdb\_hash [\#124](https://github.com/twingly/twingly-url/pull/124) ([walro](https://github.com/walro))
111 | - Fix various warnings [\#122](https://github.com/twingly/twingly-url/pull/122) ([walro](https://github.com/walro))
112 | - Load the pure-Ruby IDNA implementation from Addressable [\#120](https://github.com/twingly/twingly-url/pull/120) ([dentarg](https://github.com/dentarg))
113 | 
114 | ## [v5.1.1](https://github.com/twingly/twingly-url/tree/v5.1.1) (2018-02-14)
115 | 
116 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.1.0...v5.1.1)
117 | 
118 | **Implemented enhancements:**
119 | 
120 | - Rework exceptions [\#31](https://github.com/twingly/twingly-url/issues/31)
121 | 
122 | **Merged pull requests:**
123 | 
124 | - Allow future patch versions of dependencies [\#118](https://github.com/twingly/twingly-url/pull/118) ([dentarg](https://github.com/dentarg))
125 | - Update PublicSuffix and Addressable [\#117](https://github.com/twingly/twingly-url/pull/117) ([roback](https://github.com/roback))
126 | - Use latest rubies on Travis CI [\#116](https://github.com/twingly/twingly-url/pull/116) ([dentarg](https://github.com/dentarg))
127 | - Use latest rubies on Travis CI [\#115](https://github.com/twingly/twingly-url/pull/115) ([walro](https://github.com/walro))
128 | - Use latest rubies on Travis CI [\#114](https://github.com/twingly/twingly-url/pull/114) ([dentarg](https://github.com/dentarg))
129 | - Do not blow up on Addressable::IDNA::PunycodeBigOutput [\#113](https://github.com/twingly/twingly-url/pull/113) ([dentarg](https://github.com/dentarg))
130 | - Bump Ruby versions tested on Travis CI [\#111](https://github.com/twingly/twingly-url/pull/111) ([dentarg](https://github.com/dentarg))
131 | - Tag exceptions [\#106](https://github.com/twingly/twingly-url/pull/106) ([dentarg](https://github.com/dentarg))
132 | - Switch to logical requires [\#103](https://github.com/twingly/twingly-url/pull/103) ([dentarg](https://github.com/dentarg))
133 | 
134 | ## [v5.1.0](https://github.com/twingly/twingly-url/tree/v5.1.0) (2017-03-03)
135 | 
136 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.0.1...v5.1.0)
137 | 
138 | **Implemented enhancements:**
139 | 
140 | - Release a new version \(without idn-ruby\) [\#110](https://github.com/twingly/twingly-url/issues/110)
141 | - Unfortunate require in Rakefile \(profile task\), affect specs [\#92](https://github.com/twingly/twingly-url/issues/92)
142 | - JRuby compatibility \(drop libidn requirement\) [\#66](https://github.com/twingly/twingly-url/issues/66)
143 | 
144 | **Fixed bugs:**
145 | 
146 | - Dependencies not locked correctly [\#104](https://github.com/twingly/twingly-url/issues/104)
147 | - twingly-url doesn't support IDNA2008, only IDNA2003 \(libidn\) [\#101](https://github.com/twingly/twingly-url/issues/101)
148 | 
149 | **Merged pull requests:**
150 | 
151 | - Test the latest Ruby releases [\#108](https://github.com/twingly/twingly-url/pull/108) ([dentarg](https://github.com/dentarg))
152 | - Depend on addressable 2.5.0 and public\_suffix 2.0.3 [\#107](https://github.com/twingly/twingly-url/pull/107) ([dentarg](https://github.com/dentarg))
153 | - Remove ruby-prof from dev dependencies [\#105](https://github.com/twingly/twingly-url/pull/105) ([dentarg](https://github.com/dentarg))
154 | - Drop libidn [\#102](https://github.com/twingly/twingly-url/pull/102) ([dentarg](https://github.com/dentarg))
155 | 
156 | ## [v5.0.1](https://github.com/twingly/twingly-url/tree/v5.0.1) (2016-09-19)
157 | 
158 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v5.0.0...v5.0.1)
159 | 
160 | **Fixed bugs:**
161 | 
162 | - "ArgumentError: invalid byte sequence in US-ASCII" when parsing the public suffix list [\#98](https://github.com/twingly/twingly-url/issues/98)
163 | 
164 | **Merged pull requests:**
165 | 
166 | - Make sure we always read PSL data as UTF-8 [\#99](https://github.com/twingly/twingly-url/pull/99) ([dentarg](https://github.com/dentarg))
167 | 
168 | ## [v5.0.0](https://github.com/twingly/twingly-url/tree/v5.0.0) (2016-09-16)
169 | 
170 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v4.2.0...v5.0.0)
171 | 
172 | **Implemented enhancements:**
173 | 
174 | - License file [\#91](https://github.com/twingly/twingly-url/issues/91)
175 | - Use PublicSuffix 2.0 [\#85](https://github.com/twingly/twingly-url/issues/85)
176 | - Changelog [\#33](https://github.com/twingly/twingly-url/issues/33)
177 | 
178 | **Fixed bugs:**
179 | 
180 | - NormalizedURL\#to\_s returns punycode, other instance methods does not [\#89](https://github.com/twingly/twingly-url/issues/89)
181 | 
182 | **Merged pull requests:**
183 | 
184 | - DRY up to urls example [\#95](https://github.com/twingly/twingly-url/pull/95) ([jage](https://github.com/jage))
185 | - Add changelog [\#93](https://github.com/twingly/twingly-url/pull/93) ([dentarg](https://github.com/dentarg))
186 | - Ensure normalized IDNA domains return ASCII strings [\#90](https://github.com/twingly/twingly-url/pull/90) ([dentarg](https://github.com/dentarg))
187 | 
188 | ## [v4.2.0](https://github.com/twingly/twingly-url/tree/v4.2.0) (2016-08-31)
189 | 
190 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v4.1.0...v4.2.0)
191 | 
192 | **Merged pull requests:**
193 | 
194 | - Add Twingly::URL\#ttld, "true TLD" getter [\#88](https://github.com/twingly/twingly-url/pull/88) ([dentarg](https://github.com/dentarg))
195 | - Add example usage to README [\#86](https://github.com/twingly/twingly-url/pull/86) ([dentarg](https://github.com/dentarg))
196 | 
197 | ## [v4.1.0](https://github.com/twingly/twingly-url/tree/v4.1.0) (2016-05-23)
198 | 
199 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v4.0.0...v4.1.0)
200 | 
201 | **Closed issues:**
202 | 
203 | - Expose addressable's \#userinfo [\#73](https://github.com/twingly/twingly-url/issues/73)
204 | 
205 | **Merged pull requests:**
206 | 
207 | - Expose userinfo, user and password [\#84](https://github.com/twingly/twingly-url/pull/84) ([jage](https://github.com/jage))
208 | 
209 | ## [v4.0.0](https://github.com/twingly/twingly-url/tree/v4.0.0) (2016-02-03)
210 | 
211 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v3.0.2...v4.0.0)
212 | 
213 | **Implemented enhancements:**
214 | 
215 | - Make more methods private [\#60](https://github.com/twingly/twingly-url/issues/60)
216 | 
217 | **Fixed bugs:**
218 | 
219 | - domain, sld, trd \(maybe others\) can be nil [\#52](https://github.com/twingly/twingly-url/issues/52)
220 | 
221 | **Merged pull requests:**
222 | 
223 | - Temporary fix for UTF-8 bug in addressable [\#79](https://github.com/twingly/twingly-url/pull/79) ([roback](https://github.com/roback))
224 | - No part of a URL should be nil [\#78](https://github.com/twingly/twingly-url/pull/78) ([roback](https://github.com/roback))
225 | - The gem should load the version constant [\#75](https://github.com/twingly/twingly-url/pull/75) ([dentarg](https://github.com/dentarg))
226 | - Make things private [\#69](https://github.com/twingly/twingly-url/pull/69) ([dentarg](https://github.com/dentarg))
227 | 
228 | ## [v3.0.2](https://github.com/twingly/twingly-url/tree/v3.0.2) (2015-11-11)
229 | 
230 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v3.0.1...v3.0.2)
231 | 
232 | **Fixed bugs:**
233 | 
234 | - IDN::Idna::IdnaError: Output would be too large or too small [\#64](https://github.com/twingly/twingly-url/issues/64)
235 | 
236 | **Merged pull requests:**
237 | 
238 | - Rescue IDN::Idna::IdnaError [\#65](https://github.com/twingly/twingly-url/pull/65) ([dentarg](https://github.com/dentarg))
239 | 
240 | ## [v3.0.1](https://github.com/twingly/twingly-url/tree/v3.0.1) (2015-11-11)
241 | 
242 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v3.0.0...v3.0.1)
243 | 
244 | **Fixed bugs:**
245 | 
246 | - Do not blow up on broken punycode URLs  [\#48](https://github.com/twingly/twingly-url/issues/48)
247 | 
248 | **Merged pull requests:**
249 | 
250 | - Improve punycode handling with libidn [\#63](https://github.com/twingly/twingly-url/pull/63) ([walro](https://github.com/walro))
251 | 
252 | ## [v3.0.0](https://github.com/twingly/twingly-url/tree/v3.0.0) (2015-11-02)
253 | 
254 | [Full Changelog](https://github.com/twingly/twingly-url/compare/v2.0.0...v3.0.0)
255 | 
256 | **Implemented enhancements:**
257 | 
258 | - New major release [\#38](https://github.com/twingly/twingly-url/issues/38)
259 | 
260 | **Fixed bugs:**
261 | 
262 | - require bug [\#56](https://github.com/twingly/twingly-url/issues/56)
263 | - \#valid? doesn't work for protocol-less urls [\#55](https://github.com/twingly/twingly-url/issues/55)
264 | - Drop support for older Ruby versions [\#53](https://github.com/twingly/twingly-url/issues/53)
265 | 
266 | **Merged pull requests:**
267 | 
268 | - We support 2.2.x [\#59](https://github.com/twingly/twingly-url/pull/59) ([walro](https://github.com/walro))
269 | - Fix "\#valid? doesn't work for protocol-less urls" [\#58](https://github.com/twingly/twingly-url/pull/58) ([walro](https://github.com/walro))
270 | - Refactor requires [\#57](https://github.com/twingly/twingly-url/pull/57) ([dentarg](https://github.com/dentarg))
271 | 
272 | ## [v2.0.0](https://github.com/twingly/twingly-url/tree/v2.0.0) (2015-10-26)
273 | 
274 | [Full Changelog](https://github.com/twingly/twingly-url/compare/d7cecadd542ce5e7709833e1874d39d644d4d11f...v2.0.0)
275 | 
276 | **Implemented enhancements:**
277 | 
278 | - Move lib/version.rb to lib/twingly/version.rb [\#50](https://github.com/twingly/twingly-url/issues/50)
279 | - Prettier inspect output [\#43](https://github.com/twingly/twingly-url/issues/43)
280 | - Return objects instead of strings [\#40](https://github.com/twingly/twingly-url/issues/40)
281 | - Do not return nil [\#35](https://github.com/twingly/twingly-url/issues/35)
282 | - Method to extract URLs from text without normalizing [\#34](https://github.com/twingly/twingly-url/issues/34)
283 | - Turn is unmaintained [\#27](https://github.com/twingly/twingly-url/issues/27)
284 | - Should normalize IDN properly [\#17](https://github.com/twingly/twingly-url/issues/17)
285 | - Discrepancy with .NET normalization [\#12](https://github.com/twingly/twingly-url/issues/12)
286 | - Capability for extracting origin part of an URL [\#11](https://github.com/twingly/twingly-url/issues/11)
287 | - Always return normalized urls in lower case [\#8](https://github.com/twingly/twingly-url/issues/8)
288 | - Make gem more general [\#6](https://github.com/twingly/twingly-url/issues/6)
289 | 
290 | **Fixed bugs:**
291 | 
292 | - Ensure proper behaviour for edge-case input data [\#45](https://github.com/twingly/twingly-url/issues/45)
293 | - normalize method can't handle URLs with punycoded TLD [\#28](https://github.com/twingly/twingly-url/issues/28)
294 | - Shoulda-context does not seem to work with Ruby 2.2 [\#26](https://github.com/twingly/twingly-url/issues/26)
295 | - Digest is not threadsafe [\#20](https://github.com/twingly/twingly-url/issues/20)
296 | - Blogspot.com normalization error [\#13](https://github.com/twingly/twingly-url/issues/13)
297 | - Crashes if only a protocol is provided [\#10](https://github.com/twingly/twingly-url/issues/10)
298 | - Can not handle urls with international characters [\#2](https://github.com/twingly/twingly-url/issues/2)
299 | - Add tests [\#1](https://github.com/twingly/twingly-url/issues/1)
300 | 
301 | **Closed issues:**
302 | 
303 | - Release 1.3.3 [\#22](https://github.com/twingly/twingly-url/issues/22)
304 | - Encrypt HipChat API key in .travis.yml [\#16](https://github.com/twingly/twingly-url/issues/16)
305 | - Always return normalized URLs with lower case scheme [\#9](https://github.com/twingly/twingly-url/issues/9)
306 | - Add test for URL: feedville.com,2007-06-19:/blends/16171 [\#7](https://github.com/twingly/twingly-url/issues/7)
307 | - Make repo public [\#5](https://github.com/twingly/twingly-url/issues/5)
308 | - Add .ruby-version file? [\#4](https://github.com/twingly/twingly-url/issues/4)
309 | 
310 | **Merged pull requests:**
311 | 
312 | - Move version.rb to correct subdir [\#51](https://github.com/twingly/twingly-url/pull/51) ([jage](https://github.com/jage))
313 | - Implement prettier \#inspect [\#47](https://github.com/twingly/twingly-url/pull/47) ([jage](https://github.com/jage))
314 | - Work with Twingly::URL objects instead of strings [\#42](https://github.com/twingly/twingly-url/pull/42) ([twingly-mob](https://github.com/twingly-mob))
315 | - New .extract\_url method which does not normalize [\#41](https://github.com/twingly/twingly-url/pull/41) ([twingly-mob](https://github.com/twingly-mob))
316 | - Sync known behaviour with .NET [\#37](https://github.com/twingly/twingly-url/pull/37) ([roback](https://github.com/roback))
317 | - Change from minitest to rspec [\#36](https://github.com/twingly/twingly-url/pull/36) ([roback](https://github.com/roback))
318 | - Make sure Digest loading is thread-safe [\#32](https://github.com/twingly/twingly-url/pull/32) ([jage](https://github.com/jage))
319 | - Ensure we have a tmp directory to dump result to [\#30](https://github.com/twingly/twingly-url/pull/30) ([walro](https://github.com/walro))
320 | - Turn is unmaintained [\#29](https://github.com/twingly/twingly-url/pull/29) ([walro](https://github.com/walro))
321 | - Downcase URLs in normalization [\#23](https://github.com/twingly/twingly-url/pull/23) ([jage](https://github.com/jage))
322 | - Twingly::URL::Utilities.remove\_scheme [\#21](https://github.com/twingly/twingly-url/pull/21) ([jage](https://github.com/jage))
323 | - Fix "gem build" warnings [\#19](https://github.com/twingly/twingly-url/pull/19) ([dentarg](https://github.com/dentarg))
324 | - Rename gem to twingly-url [\#15](https://github.com/twingly/twingly-url/pull/15) ([jage](https://github.com/jage))
325 | - Don't add www. to blogspot [\#14](https://github.com/twingly/twingly-url/pull/14) ([jage](https://github.com/jage))
326 | - Tests [\#3](https://github.com/twingly/twingly-url/pull/3) ([jage](https://github.com/jage))
327 | 
328 | 
329 | 
330 | \* *This Changelog was automatically generated by [github_changelog_generator](https://github.com/github-changelog-generator/github-changelog-generator)*
331 | 


--------------------------------------------------------------------------------
/Gemfile:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | source 'https://rubygems.org/'
4 | 
5 | gemspec  # take the dependency list from the gemspec in this directory
6 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2013 Twingly AB
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
 6 | this software and associated documentation files (the "Software"), to deal in
 7 | the Software without restriction, including without limitation the rights to
 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
 9 | the Software, and to permit persons to whom the Software is furnished to do so,
10 | subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Twingly::URL
  2 | 
  3 | [![GitHub Build Status](https://github.com/twingly/twingly-url/workflows/CI/badge.svg?branch=master)](https://github.com/twingly/twingly-url/actions)
  4 | 
  5 | Twingly URL tools.
  6 | 
  7 | * `twingly/url` - Parse and validate URLs
  8 |     * `Twingly::URL.parse` - Returns one or more `Twingly::URL` instances
  9 | * `twingly/url/hasher` - Generate URL hashes suitable for primary keys
 10 |     * `Twingly::URL::Hasher.taskdb_hash(url)` - MD5 hexdigest
 11 |     * `Twingly::URL::Hasher.documentdb_hash(url)` - SHA256 unsigned long, native endian digest
 12 |     * `Twingly::URL::Hasher.autopingdb_hash(url)` - SHA256 64-bit signed, native endian digest
 13 | * `twingly/url/utilities` - Utilities to work with URLs
 14 |     * `Twingly::URL::Utilities.extract_valid_urls` - Returns Array of valid `Twingly::URL`
 15 | 
 16 | ## Getting Started
 17 | 
 18 | Install the gem:
 19 | 
 20 |     gem install twingly-url
 21 | 
 22 | Usage (this output was created with [`examples/url.rb`][examples]):
 23 | 
 24 | ```ruby
 25 | require "twingly/url"
 26 | 
 27 | url = Twingly::URL.parse("http://www.twingly.co.uk/search")
 28 | url.scheme                    # => "http"
 29 | url.normalized.scheme         # => "http"
 30 | url.trd                       # => "www"
 31 | url.normalized.trd            # => "www"
 32 | url.sld                       # => "twingly"
 33 | url.normalized.sld            # => "twingly"
 34 | url.tld                       # => "co.uk"
 35 | url.normalized.tld            # => "co.uk"
 36 | url.ttld                      # => "uk"
 37 | url.normalized.ttld           # => "uk"
 38 | url.domain                    # => "twingly.co.uk"
 39 | url.normalized.domain         # => "twingly.co.uk"
 40 | url.host                      # => "www.twingly.co.uk"
 41 | url.normalized.host           # => "www.twingly.co.uk"
 42 | url.origin                    # => "http://www.twingly.co.uk"
 43 | url.normalized.origin         # => "http://www.twingly.co.uk"
 44 | url.path                      # => "/search"
 45 | url.normalized.path           # => "/search"
 46 | url.without_scheme            # => "//www.twingly.co.uk/search"
 47 | url.normalized.without_scheme # => "//www.twingly.co.uk/search"
 48 | url.userinfo                  # => ""
 49 | url.normalized.userinfo       # => ""
 50 | url.user                      # => ""
 51 | url.normalized.user           # => ""
 52 | url.password                  # => ""
 53 | url.normalized.password       # => ""
 54 | url.valid?                    # => "true"
 55 | url.normalized.valid?         # => "true"
 56 | url.to_s                      # => "http://www.twingly.co.uk/search"
 57 | url.normalized.to_s           # => "http://www.twingly.co.uk/search"
 58 | 
 59 | url = Twingly::URL.parse("http://räksmörgås.макдональдс.рф/foo")
 60 | url.scheme                    # => "http"
 61 | url.normalized.scheme         # => "http"
 62 | url.trd                       # => "räksmörgås"
 63 | url.normalized.trd            # => "xn--rksmrgs-5wao1o"
 64 | url.sld                       # => "макдональдс"
 65 | url.normalized.sld            # => "xn--80aalb1aicli8a5i"
 66 | url.tld                       # => "рф"
 67 | url.normalized.tld            # => "xn--p1ai"
 68 | url.ttld                      # => "рф"
 69 | url.normalized.ttld           # => "xn--p1ai"
 70 | url.domain                    # => "макдональдс.рф"
 71 | url.normalized.domain         # => "xn--80aalb1aicli8a5i.xn--p1ai"
 72 | url.host                      # => "räksmörgås.макдональдс.рф"
 73 | url.normalized.host           # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
 74 | url.origin                    # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
 75 | url.normalized.origin         # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
 76 | url.path                      # => "/foo"
 77 | url.normalized.path           # => "/foo"
 78 | url.without_scheme            # => "//räksmörgås.макдональдс.рф/foo"
 79 | url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
 80 | url.userinfo                  # => ""
 81 | url.normalized.userinfo       # => ""
 82 | url.user                      # => ""
 83 | url.normalized.user           # => ""
 84 | url.password                  # => ""
 85 | url.normalized.password       # => ""
 86 | url.valid?                    # => "true"
 87 | url.normalized.valid?         # => "true"
 88 | url.to_s                      # => "http://räksmörgås.макдональдс.рф/foo"
 89 | url.normalized.to_s           # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
 90 | 
 91 | url = Twingly::URL.parse("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo")
 92 | url.scheme                    # => "http"
 93 | url.normalized.scheme         # => "http"
 94 | url.trd                       # => "xn--rksmrgs-5wao1o"
 95 | url.normalized.trd            # => "xn--rksmrgs-5wao1o"
 96 | url.sld                       # => "xn--80aalb1aicli8a5i"
 97 | url.normalized.sld            # => "xn--80aalb1aicli8a5i"
 98 | url.tld                       # => "xn--p1ai"
 99 | url.normalized.tld            # => "xn--p1ai"
100 | url.ttld                      # => "xn--p1ai"
101 | url.normalized.ttld           # => "xn--p1ai"
102 | url.domain                    # => "xn--80aalb1aicli8a5i.xn--p1ai"
103 | url.normalized.domain         # => "xn--80aalb1aicli8a5i.xn--p1ai"
104 | url.host                      # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
105 | url.normalized.host           # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
106 | url.origin                    # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
107 | url.normalized.origin         # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
108 | url.path                      # => "/foo"
109 | url.normalized.path           # => "/foo"
110 | url.without_scheme            # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
111 | url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
112 | url.userinfo                  # => ""
113 | url.normalized.userinfo       # => ""
114 | url.user                      # => ""
115 | url.normalized.user           # => ""
116 | url.password                  # => ""
117 | url.normalized.password       # => ""
118 | url.valid?                    # => "true"
119 | url.normalized.valid?         # => "true"
120 | url.to_s                      # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
121 | url.normalized.to_s           # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
122 | 
123 | url = Twingly::URL.parse("https://admin:correcthorsebatterystaple@example.com/")
124 | url.scheme                    # => "https"
125 | url.normalized.scheme         # => "https"
126 | url.trd                       # => ""
127 | url.normalized.trd            # => "www"
128 | url.sld                       # => "example"
129 | url.normalized.sld            # => "example"
130 | url.tld                       # => "com"
131 | url.normalized.tld            # => "com"
132 | url.ttld                      # => "com"
133 | url.normalized.ttld           # => "com"
134 | url.domain                    # => "example.com"
135 | url.normalized.domain         # => "example.com"
136 | url.host                      # => "example.com"
137 | url.normalized.host           # => "www.example.com"
138 | url.origin                    # => "https://example.com"
139 | url.normalized.origin         # => "https://www.example.com"
140 | url.path                      # => "/"
141 | url.normalized.path           # => "/"
142 | url.without_scheme            # => "//admin:correcthorsebatterystaple@example.com/"
143 | url.normalized.without_scheme # => "//admin:correcthorsebatterystaple@www.example.com/"
144 | url.userinfo                  # => "admin:correcthorsebatterystaple"
145 | url.normalized.userinfo       # => "admin:correcthorsebatterystaple"
146 | url.user                      # => "admin"
147 | url.normalized.user           # => "admin"
148 | url.password                  # => "correcthorsebatterystaple"
149 | url.normalized.password       # => "correcthorsebatterystaple"
150 | url.valid?                    # => "true"
151 | url.normalized.valid?         # => "true"
152 | url.to_s                      # => "https://admin:correcthorsebatterystaple@example.com/"
153 | url.normalized.to_s           # => "https://admin:correcthorsebatterystaple@www.example.com/"
154 | ```
155 | 
156 | ### Dependencies
157 | 
158 | Only the gems listed in the [Gem Specification](https://github.com/twingly/twingly-url/blob/master/twingly-url.gemspec).
159 | 
160 | ## Development
161 | 
162 | To inspect the [Public Suffix List], this handy command can be used (also works in projects that use `twingly-url` as a dependency).
163 | 
164 |     open $(bundle show public_suffix)/data/list.txt
165 | 
166 | [Public Suffix List]: https://github.com/weppos/publicsuffix-ruby
167 | 
168 | ## Tests
169 | 
170 | Run tests with
171 | 
172 |     bundle exec rake
173 | 
174 | ### Profiling
175 | 
176 | There are some profiling tasks available through Rake
177 | 
178 |     cd profile/
179 |     bundle # Install dependencies
180 |     bundle exec rake -T # Show available tasks
181 | 
182 | Note that this isn't a benchmark, we're using [ruby-prof] and [memory_profiler] which will slow things down.
183 | 
184 | ## Release workflow
185 | 
186 | * Update the [examples] in this README if needed, generate the output with
187 | 
188 |         ruby examples/url.rb
189 | 
190 | * Bump the version in `lib/twingly/version.rb` in a commit, no need to push (the release task does that).
191 | 
192 | * Ensure you are signed in to RubyGems.org as [twingly][twingly-rubygems] with `gem signin`.
193 | 
194 | * Build and [publish](http://guides.rubygems.org/publishing/) the gem. This will create the proper tag in git, push the commit and tag and upload to RubyGems.
195 | 
196 |         bundle exec rake release
197 | 
198 | * Update the changelog with [GitHub Changelog Generator](https://github.com/github-changelog-generator/github-changelog-generator) (`gem install github_changelog_generator` if you don't have it, set `CHANGELOG_GITHUB_TOKEN` to a personal access token to avoid rate limiting by GitHub). This command will update `CHANGELOG.md`. You need to commit and push manually.
199 | 
200 |         github_changelog_generator
201 | 
202 | [twingly-rubygems]: https://rubygems.org/profiles/twingly
203 | [ruby-prof]: http://ruby-prof.rubyforge.org/
204 | [memory_profiler]: https://github.com/SamSaffron/memory_profiler
205 | [examples]: examples/url.rb
206 | 


--------------------------------------------------------------------------------
/Rakefile:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | # Bundler rake tasks to handle gem releases
 4 | require "bundler/gem_tasks"
 5 | 
 6 | begin
 7 |   require "rspec/core/rake_task"
 8 |   # Define one `spec:<file basename>` task per spec file found under spec/
 9 |   spec_files = Dir.glob(File.join("spec/**", "*_spec.rb"))
10 |   spec_tasks = []
11 | 
12 |   namespace(:spec) do
13 |     spec_files.each do |spec_file|
14 |       task_name = File.basename(spec_file, ".rb").to_sym
15 | 
16 |       spec_tasks << "spec:#{task_name}"
17 | 
18 |       RSpec::Core::RakeTask.new(task_name) do |task|
19 |         task.pattern = spec_file
20 |       end
21 |     end
22 |   end
23 |   # The default task depends on every spec task, in a random order
24 |   task default: spec_tasks.shuffle
25 | rescue LoadError # e.g. rspec is not installed: the spec tasks are left undefined
26 | end
27 | 


--------------------------------------------------------------------------------
/examples/url.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "bundler/setup"
 4 | require_relative "../lib/twingly/url"
 5 | 
 6 | def print_url_details(url_as_string)
 7 |   url = Twingly::URL.parse(url_as_string)
 8 | 
 9 |   puts "url = Twingly::URL.parse(\"#{url_as_string}\")"
10 |   puts "url.scheme                    # => \"#{url.scheme}\""
11 |   puts "url.normalized.scheme         # => \"#{url.normalized.scheme}\""
12 |   puts "url.trd                       # => \"#{url.trd}\""
13 |   puts "url.normalized.trd            # => \"#{url.normalized.trd}\""
14 |   puts "url.sld                       # => \"#{url.sld}\""
15 |   puts "url.normalized.sld            # => \"#{url.normalized.sld}\""
16 |   puts "url.tld                       # => \"#{url.tld}\""
17 |   puts "url.normalized.tld            # => \"#{url.normalized.tld}\""
18 |   puts "url.ttld                      # => \"#{url.ttld}\""
19 |   puts "url.normalized.ttld           # => \"#{url.normalized.ttld}\""
20 |   puts "url.domain                    # => \"#{url.domain}\""
21 |   puts "url.normalized.domain         # => \"#{url.normalized.domain}\""
22 |   puts "url.host                      # => \"#{url.host}\""
23 |   puts "url.normalized.host           # => \"#{url.normalized.host}\""
24 |   puts "url.origin                    # => \"#{url.origin}\""
25 |   puts "url.normalized.origin         # => \"#{url.normalized.origin}\""
26 |   puts "url.path                      # => \"#{url.path}\""
27 |   puts "url.normalized.path           # => \"#{url.normalized.path}\""
28 |   puts "url.without_scheme            # => \"#{url.without_scheme}\""
29 |   puts "url.normalized.without_scheme # => \"#{url.normalized.without_scheme}\""
30 |   puts "url.userinfo                  # => \"#{url.userinfo}\""
31 |   puts "url.normalized.userinfo       # => \"#{url.normalized.userinfo}\""
32 |   puts "url.user                      # => \"#{url.user}\""
33 |   puts "url.normalized.user           # => \"#{url.normalized.user}\""
34 |   puts "url.password                  # => \"#{url.password}\""
35 |   puts "url.normalized.password       # => \"#{url.normalized.password}\""
36 |   puts "url.valid?                    # => \"#{url.valid?}\""
37 |   puts "url.normalized.valid?         # => \"#{url.normalized.valid?}\""
38 |   puts "url.to_s                      # => \"#{url.to_s}\""
39 |   puts "url.normalized.to_s           # => \"#{url.normalized.to_s}\""
40 | end
41 | 
42 | puts "require \"twingly/url\""
43 | puts
44 | print_url_details("http://www.twingly.co.uk/search")
45 | puts
46 | print_url_details("http://räksmörgås.макдональдс.рф/foo")
47 | puts
48 | print_url_details("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo")
49 | puts
50 | print_url_details("https://admin:correcthorsebatterystaple@example.com/")
51 | 


--------------------------------------------------------------------------------
/lib/twingly/public_suffix_list.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "addressable/idna"
 4 | require "public_suffix"
 5 | 
 6 | module Twingly
 7 |   class PublicSuffixList
 8 |     ACE_PREFIX = /\Axn\-\-/i.freeze
 9 | 
10 |     private_constant :ACE_PREFIX
11 | 
12 |     # Extend the PSL with ASCII form of all internationalized domain names
13 |     def self.with_punycoded_names(encoding: Encoding::UTF_8)
14 |       list_path = PublicSuffix::List::DEFAULT_LIST_PATH
15 |       list_data = File.read(list_path, encoding: encoding)
16 |       list = PublicSuffix::List.parse(list_data, private_domains: false)
17 | 
18 |       punycoded_names(list).each do |punycoded_name|
19 |         new_rule = PublicSuffix::Rule.factory(punycoded_name)
20 |         list.add(new_rule)
21 |       end
22 | 
23 |       list
24 |     end
25 | 
26 |     private_class_method \
27 |     def self.punycoded_names(list)
28 |       names = list.each.map { |rule| Addressable::IDNA.to_ascii(rule.value) }
29 |       names.select { |name| punycoded_name?(name) }
30 |     end
31 | 
32 |     private_class_method \
33 |     def self.punycoded_name?(name)
34 |       PublicSuffix::Domain.name_to_labels(name).any? do |label|
35 |         label =~ ACE_PREFIX
36 |       end
37 |     end
38 |   end
39 | end
40 | 


--------------------------------------------------------------------------------
/lib/twingly/url.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require "addressable/idna/pure"
  4 | require "addressable/uri"
  5 | require "public_suffix"
  6 | 
  7 | require "twingly/public_suffix_list"
  8 | require "twingly/url/null_url"
  9 | require "twingly/url/error"
 10 | require "twingly/version"
 11 | 
 12 | module Twingly
 13 |   class URL
 14 |     include Comparable
 15 | 
 16 |     ACCEPTED_SCHEMES = /\Ahttps?\z/i.freeze
 17 |     CUSTOM_PSL = PublicSuffixList.with_punycoded_names
 18 |     ENDS_WITH_SLASH = /\/+$/.freeze
 19 |     STARTS_WITH_WWW = /\Awww\./i.freeze
 20 |     ERRORS_TO_EXTEND = [
 21 |       Addressable::IDNA::PunycodeBigOutput,
 22 |       Addressable::URI::InvalidURIError,
 23 |       PublicSuffix::DomainInvalid,
 24 |     ].freeze
 25 |     DOT = "."
 26 |     HYPHEN = "-"
 27 |     CARRIAGE_RETURN = "\u000D"
 28 |     LINE_FEED = "\u000A"
 29 |     NBSP = "\u00A0"
 30 |     SPACE = "\u0020"
 31 |     WHITESPACE_CHARS = [
 32 |       CARRIAGE_RETURN,
 33 |       LINE_FEED,
 34 |       NBSP,
 35 |       SPACE,
 36 |     ].join.freeze
 37 |     LEADING_AND_TRAILING_WHITESPACE =
 38 |       /\A[#{WHITESPACE_CHARS}]+|[#{WHITESPACE_CHARS}]+\z/.freeze
 39 |     LETTERS_DIGITS_HYPHEN = /\A[a-zA-Z0-9-]+\z/.freeze
 40 | 
 41 |     private_constant :ACCEPTED_SCHEMES
 42 |     private_constant :CUSTOM_PSL
 43 |     private_constant :STARTS_WITH_WWW
 44 |     private_constant :ENDS_WITH_SLASH
 45 |     private_constant :ERRORS_TO_EXTEND
 46 |     private_constant :DOT
 47 |     private_constant :HYPHEN
 48 |     private_constant :NBSP
 49 |     private_constant :SPACE
 50 |     private_constant :WHITESPACE_CHARS
 51 |     private_constant :LEADING_AND_TRAILING_WHITESPACE
 52 |     private_constant :LETTERS_DIGITS_HYPHEN
 53 | 
 54 |     class << self
 55 |       def parse(potential_url)
 56 |         internal_parse(potential_url)
 57 |       rescue Twingly::URL::Error, Twingly::URL::Error::ParseError => error
 58 |         NullURL.new # known parse failures yield a null object instead of raising
 59 |       rescue Exception => error # anything else is tagged and re-raised
 60 |         error.extend(Twingly::URL::Error)
 61 |         raise
 62 |       end
 63 | 
 64 |       def internal_parse(input)
 65 |         potential_url   = clean_input(input)
 66 |         addressable_uri = Addressable::URI.heuristic_parse(potential_url)
 67 |         raise Twingly::URL::Error::ParseError if addressable_uri.nil?
 68 |         # Only http and https URLs are accepted (see ACCEPTED_SCHEMES)
 69 |         scheme = addressable_uri.scheme
 70 |         raise Twingly::URL::Error::ParseError unless scheme =~ ACCEPTED_SCHEMES
 71 | 
 72 |         # URLs that can't be normalized should not be valid
 73 |         try_addressable_normalize(addressable_uri)
 74 |         # Split the host into its parts using the list with punycoded names
 75 |         host = addressable_uri.host
 76 |         public_suffix_domain = PublicSuffix.parse(host, list: CUSTOM_PSL,
 77 |           default_rule: nil)
 78 |         raise Twingly::URL::Error::ParseError if public_suffix_domain.nil?
 79 |         # A host without a second-level domain is not accepted
 80 |         raise Twingly::URL::Error::ParseError if public_suffix_domain.sld.nil?
 81 | 
 82 |         new(addressable_uri, public_suffix_domain)
 83 |       rescue *ERRORS_TO_EXTEND => error
 84 |         error.extend(Twingly::URL::Error) # tagged so .parse can rescue it as a parse failure
 85 |         raise
 86 |       end
 87 | 
 88 |       def clean_input(input)
 89 |         input = String(input)
 90 |         input = input.scrub
 91 |         input = strip_whitespace(input)
 92 |       end
 93 | 
 94 |       def strip_whitespace(input)
 95 |         return input unless input.encoding == Encoding::UTF_8
 96 | 
 97 |         input.gsub(LEADING_AND_TRAILING_WHITESPACE, "")
 98 |       end
 99 | 
100 |       def try_addressable_normalize(addressable_uri)
101 |         ascii_host = addressable_uri.normalize.host
102 |         raise Twingly::URL::Error::ParseError unless valid_hostname?(ascii_host)
103 |       rescue ArgumentError => error
104 |         if error.message.include?("invalid byte sequence in UTF-8")
105 |           raise Twingly::URL::Error::ParseError # reported as a parse failure
106 |         end
107 |         # Any other ArgumentError is re-raised unchanged
108 |         raise
109 |       end
110 | 
111 |       def valid_hostname?(hostname)
112 |         return false if hostname.nil?
113 | 
114 |         # No need to check the TLD, the public suffix list does that
115 |         labels = hostname.split(DOT)[0...-1].map(&:to_s)
116 | 
117 |         labels.all? { |label| valid_label?(label) }
118 |       end
119 | 
120 |       def valid_label?(label)
121 |         return false if label.start_with?(HYPHEN)
122 |         return false if label.end_with?(HYPHEN)
123 | 
124 |         label.match?(LETTERS_DIGITS_HYPHEN)
125 |       end
126 | 
127 |       private :new
128 |       private :internal_parse
129 |       private :clean_input
130 |       private :strip_whitespace
131 |       private :try_addressable_normalize
132 |       private :valid_hostname?
133 |       private :valid_label?
134 |     end
135 | 
136 |     def initialize(addressable_uri, public_suffix_domain)
137 |       @addressable_uri      = addressable_uri
138 |       @public_suffix_domain = public_suffix_domain
139 |     end
140 | 
141 |     def scheme
142 |       addressable_uri.scheme
143 |     end
144 | 
145 |     def trd
146 |       public_suffix_domain.trd.to_s
147 |     end
148 | 
149 |     def sld
150 |       public_suffix_domain.sld
151 |     end
152 | 
153 |     def tld
154 |       public_suffix_domain.tld
155 |     end
156 | 
157 |     # Many ccTLDs have a second level[1] underneath their ccTLD, use this when
158 |     # you don't care about the second level.
159 |     #
160 |     # [1]: https://en.wikipedia.org/wiki/Second-level_domain
161 |     def ttld
162 |       tld.split(".").last
163 |     end
164 | 
165 |     def domain
166 |       public_suffix_domain.domain
167 |     end
168 | 
169 |     def host
170 |       addressable_uri.host
171 |     end
172 | 
173 |     def origin
174 |       addressable_uri.origin
175 |     end
176 | 
177 |     def path
178 |       addressable_uri.path
179 |     end
180 | 
181 |     def without_scheme
182 |       self.to_s.sub(/\A#{scheme}:/, "")
183 |     end
184 | 
185 |     def normalized
186 |       normalized_url = addressable_uri.dup
187 |       # Changes are applied to the copy, not to the URI wrapped by this object
188 |       normalized_url.scheme = normalized_scheme
189 |       normalized_url.host   = normalized_host
190 |       normalized_url.path   = normalized_path
191 |       # Parse again so the result is a new URL object (or a NullURL)
192 |       self.class.parse(normalized_url)
193 |     end
194 | 
195 |     def normalized_scheme
196 |       scheme.downcase
197 |     end
198 | 
199 |     def normalized_host
200 |       host   = addressable_uri.normalized_host
201 |       domain = public_suffix_domain
202 | 
203 |       unless domain.subdomain?
204 |         host = "www.#{host}"
205 |       end
206 | 
207 |       host = normalize_blogspot(host, domain)
208 | 
209 |       host
210 |     end
211 | 
212 |     def normalized_path
213 |       path = strip_trailing_slashes(addressable_uri.path)
214 | 
215 |       (path.empty?) ? "/" : path
216 |     end
217 | 
218 |     def userinfo
219 |       addressable_uri.userinfo.to_s
220 |     end
221 | 
222 |     def user
223 |       addressable_uri.user.to_s
224 |     end
225 | 
226 |     def password
227 |       addressable_uri.password.to_s
228 |     end
229 | 
230 |     def valid?
231 |       true
232 |     end
233 | 
234 |     def <=>(other)
235 |       self.to_s <=> other.to_s
236 |     end
237 | 
238 |     def eql?(other)
239 |       return false unless other.is_a?(self.class)
240 | 
241 |       self.hash == other.hash
242 |     end
243 | 
244 |     def hash
245 |       self.to_s.hash
246 |     end
247 | 
248 |     def to_s
249 |       addressable_uri.to_s
250 |     end
251 | 
252 |     def inspect
253 |       sprintf("#<%s:0x%x %s>", self.class.name, __id__, self.to_s)
254 |     end
255 | 
256 |     private
257 | 
258 |     attr_reader :addressable_uri, :public_suffix_domain
259 | 
260 |     def normalize_blogspot(host, domain)
261 |       if domain.sld.downcase == "blogspot"
262 |         host.sub(STARTS_WITH_WWW, "").sub(/#{domain.tld}\z/i, "com")
263 |       else
264 |         host
265 |       end
266 |     end
267 | 
268 |     def strip_trailing_slashes(path)
269 |       path.sub(ENDS_WITH_SLASH, "")
270 |     end
271 |   end
272 | end
273 | 


--------------------------------------------------------------------------------
/lib/twingly/url/error.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | module Twingly
 4 |   class URL
 5 |     module Error
 6 |       class ParseError < StandardError
 7 |       end
 8 |     end
 9 |   end
10 | end
11 | 


--------------------------------------------------------------------------------
/lib/twingly/url/hasher.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require 'digest'
 4 | 
 5 | require "twingly/url"
 6 | 
 7 | module Twingly
 8 |   class URL
 9 |     module Hasher
10 |       module_function
11 | 
12 |       # Instantiate digest classes in a thread-safe manner
13 |       # This is important since we don't know how people will
14 |       # use this gem (if they require it in a thread safe way)
15 |       MD5_DIGEST = Digest(:MD5)
16 |       SHA256_DIGEST = Digest(:SHA256)
17 | 
18 |       def taskdb_hash(url)
19 |         MD5_DIGEST.hexdigest(url)[0..29].upcase
20 |       end
21 | 
22 |       def documentdb_hash(url)
23 |         SHA256_DIGEST.digest(url).unpack("L!")[0]
24 |       end
25 | 
26 |       def autopingdb_hash(url)
27 |         SHA256_DIGEST.digest(url).unpack("q")[0]
28 |       end
29 |     end
30 |   end
31 | end
32 | 


--------------------------------------------------------------------------------
/lib/twingly/url/null_url.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | module Twingly
 4 |   class URL
 5 |     class NullURL
 6 |       include Comparable
 7 | 
 8 |       def method_missing(name, *)
 9 |         error = NoMethodError.new("undefined method `#{name}'")
10 |         raise error unless Twingly::URL.instance_methods.include?(name)
11 | 
12 |         ""
13 |       end
14 | 
15 |       def normalized
16 |         self
17 |       end
18 | 
19 |       def valid?
20 |         false
21 |       end
22 | 
23 |       def <=>(other)
24 |         self.to_s <=> other.to_s
25 |       end
26 | 
27 |       def eql?(other)
28 |         return false unless other.is_a?(self.class)
29 | 
30 |         self.hash == other.hash
31 |       end
32 | 
33 |       def hash
34 |         self.to_s.hash
35 |       end
36 | 
37 |       def to_s
38 |         ""
39 |       end
40 |     end
41 |   end
42 | end
43 | 


--------------------------------------------------------------------------------
/lib/twingly/url/utilities.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "twingly/url"
 4 | 
 5 | module Twingly
 6 |   class URL
 7 |     module Utilities
 8 |       module_function
 9 | 
10 |       def extract_valid_urls(text_or_array)
11 |         potential_urls = Array(text_or_array).flat_map(&:split)
12 |         potential_urls.map do |potential_url|
13 |           url = Twingly::URL.parse(potential_url)
14 |           url if url.valid?
15 |         end.compact
16 |       end
17 |     end
18 |   end
19 | end
20 | 


--------------------------------------------------------------------------------
/lib/twingly/version.rb:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | module Twingly
4 |   class URL
5 |     VERSION = "7.0.1"
6 |   end
7 | end
8 | 


--------------------------------------------------------------------------------
/profile/Gemfile:
--------------------------------------------------------------------------------
1 | # frozen_string_literal: true
2 | 
3 | source "https://rubygems.org/"
4 | 
5 | gem "rake"
6 | gem "ruby-prof"
7 | gem "memory_profiler"
8 | gem "twingly-url", path: "../"
9 | 


--------------------------------------------------------------------------------
/profile/Rakefile:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "twingly/url"
 4 | require_relative "profile"
 5 | 
 6 | namespace :profile do
 7 |   desc "Profile Twingly::URL.parse and #normalized (file reports)"
 8 |   task :normalize do |task|
 9 |     Profile.measure "normalizing a short URL", 1000 do
10 |       Twingly::URL.parse('http://www.duh.se/').normalized
11 |     end
12 |   end
13 | end
14 | 
15 | namespace :memory_profile do
16 |   desc "Memory Profile Twingly::URL.parse (stdout report)"
17 |   task :parse do |task|
18 |     MemoryProfile.measure "parsing an URL", 1000 do
19 |       Twingly::URL.parse('http://www.twingly.com/')
20 |     end
21 |   end
22 | end
23 | 
24 | task default: "profile:normalize"
25 | 


--------------------------------------------------------------------------------
/profile/profile.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "ruby-prof"
 4 | require "memory_profiler"
 5 | 
 6 | class Profile
 7 |   def self.measure(name, count, &block)
 8 |     RubyProf.start
 9 | 
10 |     count.times do
11 |       block.call
12 |     end
13 | 
14 |     result = RubyProf.stop
15 |     result_directory = "tmp"
16 |     Dir.mkdir(result_directory) unless File.exists?(result_directory)
17 |     printer = RubyProf::MultiPrinter.new(result)
18 |     printer.print(path: result_directory)
19 | 
20 |     puts "Measured #{name} #{count} times"
21 |     puts "Generated reports:"
22 |     Dir.entries(result_directory).reject { |entry| entry.end_with?(".") }.each do |file|
23 |       puts "  #{result_directory}/#{file}"
24 |     end
25 |   end
26 | end
27 | 
28 | class MemoryProfile
29 |   def self.measure(name, count, &block)
30 |     report_options = {
31 |       ignore_files: __FILE__ # Ignore this file
32 |     }
33 | 
34 |     MemoryProfiler.start(report_options)
35 | 
36 |     count.times do
37 |       block.call
38 |     end
39 | 
40 |     report = MemoryProfiler.stop
41 |     report.pretty_print
42 |   end
43 | end
44 | 


--------------------------------------------------------------------------------
/spec/lib/twingly/public_suffix_list_spec.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "spec_helper"
 4 | 
 5 | require "twingly/public_suffix_list"
 6 | 
 7 | describe Twingly::PublicSuffixList do
 8 |   describe ".with_punycoded_names" do
 9 |     subject { described_class.with_punycoded_names(encoding: encoding) }
10 | 
11 |     context "when the list is data is read with the default encoding" do
12 |       subject { described_class.with_punycoded_names }
13 | 
14 |       it { is_expected.to be_a(PublicSuffix::List) }
15 |     end
16 | 
17 |     context "when the list data is read as UTF-8" do
18 |       let(:encoding) { Encoding::UTF_8 }
19 | 
20 |       it { is_expected.to be_a(PublicSuffix::List) }
21 |     end
22 | 
23 |     context "when the list data is read as US-ASCII" do
24 |       let(:encoding) { Encoding::US_ASCII }
25 |       # https://github.com/ruby/ruby/commit/571d21fd4a2e877f49b4ff918832bda9a5e8f91c
26 |       let(:expected_error) do
27 |         if RUBY_VERSION >= "3.2.0"
28 |           Encoding::CompatibilityError
29 |         else
30 |           ArgumentError
31 |         end
32 |       end
33 | 
34 |       it "parsing the data will fail" do
35 |         expect { subject }.
36 |           to raise_error(expected_error, "invalid byte sequence in US-ASCII")
37 |       end
38 |     end
39 |   end
40 | end
41 | 


--------------------------------------------------------------------------------
/spec/lib/twingly/url/hasher_spec.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "spec_helper"
 4 | 
 5 | require "twingly/url/hasher"
 6 | 
 7 | describe Twingly::URL::Hasher do
 8 |   describe ".taskdb_hash" do
 9 |     it "returns a MD5 hexdigest" do
10 |       expect(Twingly::URL::Hasher.taskdb_hash("http://blog.twingly.com/")).to eq "B1E2D5AECF6649C2E44D17AEA3E0F4"
11 |     end
12 |   end
13 | 
14 |   describe ".documentdb_hash" do
15 |     it "returns a SHA256 unsigned long, native endian digest" do
16 |       expect(Twingly::URL::Hasher.documentdb_hash("http://blog.twingly.com/")).to eq 15340752212397415993
17 |     end
18 |   end
19 | 
20 |   describe ".autopingdb_hash" do
21 |     let(:expected) { -3105991861312135623 }
22 | 
23 |     it "returns a SHA256 64-bit signed, native endian digest" do
24 |       expect(Twingly::URL::Hasher.autopingdb_hash("http://blog.twingly.com/")).to eq expected
25 |     end
26 |   end
27 | end
28 | 


--------------------------------------------------------------------------------
/spec/lib/twingly/url/null_url_spec.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require "spec_helper"
  4 | 
  5 | require "twingly/url"
  6 | 
  7 | describe Twingly::URL::NullURL do
  8 |   let(:url) { described_class.new }
  9 | 
 10 |   describe "#valid?" do
 11 |     subject { url.valid? }
 12 |     it { is_expected.to be(false) }
 13 |   end
 14 | 
 15 |   describe "#normalized" do
 16 |     subject { url.normalized }
 17 |     it { is_expected.to equal(subject) }
 18 |   end
 19 | 
 20 |   describe "#scheme" do
 21 |     subject { url.scheme }
 22 |     it { is_expected.to eq("") }
 23 |   end
 24 | 
 25 |   describe "#trd" do
 26 |     subject { url.trd }
 27 |     it { is_expected.to eq("") }
 28 |   end
 29 | 
 30 |   describe "#sld" do
 31 |     subject { url.sld }
 32 |     it { is_expected.to eq("") }
 33 |   end
 34 | 
 35 |   describe "#tld" do
 36 |     subject { url.tld }
 37 |     it { is_expected.to eq("") }
 38 |   end
 39 | 
 40 |   describe "#ttld" do
 41 |     subject { url.ttld }
 42 |     it { is_expected.to eq("") }
 43 |   end
 44 | 
 45 |   describe "#domain" do
 46 |     subject { url.domain }
 47 |     it { is_expected.to eq("") }
 48 |   end
 49 | 
 50 |   describe "#host" do
 51 |     subject { url.host }
 52 |     it { is_expected.to eq("") }
 53 |   end
 54 | 
 55 |   describe "#origin" do
 56 |     subject { url.origin }
 57 |     it { is_expected.to eq("") }
 58 |   end
 59 | 
 60 |   describe "#path" do
 61 |     subject { url.path }
 62 |     it { is_expected.to eq("") }
 63 |   end
 64 | 
 65 |   describe "#normalized_path" do
 66 |     subject { url.normalized_path }
 67 |     it { is_expected.to eq("") }
 68 |   end
 69 | 
 70 |   describe "#normalized_scheme" do
 71 |     subject { url.normalized_scheme }
 72 |     it { is_expected.to eq("") }
 73 |   end
 74 | 
 75 |   describe "#normalized_host" do
 76 |     subject { url.normalized_host }
 77 |     it { is_expected.to eq("") }
 78 |   end
 79 | 
 80 |   describe "#userinfo" do
 81 |     subject { url.userinfo }
 82 |     it { is_expected.to eq("") }
 83 |   end
 84 | 
 85 |   describe "#user" do
 86 |     subject { url.user }
 87 |     it { is_expected.to eq("") }
 88 |   end
 89 | 
 90 |   describe "#password" do
 91 |     subject { url.password }
 92 |     it { is_expected.to eq("") }
 93 |   end
 94 | 
 95 |   context "when receiving call for non-existing method on Twingly::URL" do
 96 |     it "raises an error" do
 97 |       expect { url.method_does_not_exist }.to raise_error(NoMethodError)
 98 |     end
 99 |   end
100 | 
101 |   describe "uniqueness" do
102 |     context "a list with multiple NullURLs should only have one unique item" do
103 |       subject(:list) { 10.times.map { described_class.new }.uniq }
104 | 
105 |       it { is_expected.to eq([described_class.new]) }
106 |     end
107 | 
108 |     context "an object and its string representation" do
109 |       let(:a) { described_class.new }
110 |       let(:b) { described_class.new.to_s }
111 | 
112 |       it "should be two unique objects" do
113 |         expect([a,b].uniq).to eq([a,b])
114 |       end
115 | 
116 |       describe "#eql?" do
117 |         subject { a.eql?(b) }
118 |         it { is_expected.to eq(false) }
119 |       end
120 |     end
121 |   end
122 | end
123 | 


--------------------------------------------------------------------------------
/spec/lib/twingly/url/utilities_spec.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require "spec_helper"
 4 | 
 5 | require "twingly/url/utilities"
 6 | 
 7 | describe Twingly::URL::Utilities do
 8 |   describe ".extract_valid_urls" do
 9 |     context "when given a string with URLs" do
10 |       it "returns an array of extracted URLs" do
11 |         input    = "hej hopp http://www.twingly.com banan https://www.wordpress.com/forums/sv äpplen/päron"
12 |         actual   = described_class.extract_valid_urls(input).map(&:to_s)
13 |         expected = %w(http://www.twingly.com https://www.wordpress.com/forums/sv)
14 | 
15 |         expect(actual).to eq(expected)
16 |       end
17 |     end
18 | 
19 |     context "when given an array with URLs" do
20 |       it "returns an array of extracted URLs" do
21 |         input    = %w(hej hopp http://www.twingly.com banan https://www.wordpress.com/forums/sv äpplen/päron)
22 |         actual   = described_class.extract_valid_urls(input).map(&:to_s)
23 |         expected = %w(http://www.twingly.com https://www.wordpress.com/forums/sv)
24 | 
25 |         expect(actual).to eq(expected)
26 |       end
27 |     end
28 | 
29 |     it "always returns an Array" do
30 |       response = described_class.extract_valid_urls(nil)
31 | 
32 |       expect(response).to eq([])
33 |     end
34 |   end
35 | end
36 | 


--------------------------------------------------------------------------------
/spec/lib/twingly/url_spec.rb:
--------------------------------------------------------------------------------
  1 | # frozen_string_literal: true
  2 | 
  3 | require "spec_helper"
  4 | 
  5 | require "twingly/url"
  6 | 
  7 | def invalid_urls
  8 |   [
  9 |     "http://http",
 10 |     "http:///",
 11 |     "http:/",
 12 |     "http:",
 13 |     "htttp",
 14 |     "http://",
 15 |     "http:X",
 16 |     "a",
 17 |     "1",
 18 |     "?",
 19 |     123,
 20 |     nil,
 21 |     false,
 22 |     "",
 23 |     //,
 24 |     "feedville.com,2007-06-19:/blends/16171",
 25 |     "ftp://blog.twingly.com/",
 26 |     "blablahttp://blog.twingly.com/",
 27 |     "gopher://blog.twingly.com/",
 28 |     "\n",
 29 |     "//www.twingly.com/",
 30 |     "http://xn--t...-/",
 31 |     "http://xn--...-",
 32 |     "leather beltsbelts for menleather beltmens beltsleather belts for menmens beltbelt bucklesblack l...",
 33 |     "https//.com",
 34 |     "http://xxx@.com/",
 35 |     "http://...com",
 36 |     "http://.ly/xxx",
 37 |     "http://.com.my/",
 38 |     "http://.net",
 39 |     "http://.com.",
 40 |     "http://.gl/xxx",
 41 |     "http://.twingly.com/",
 42 |     "http://www.twingly.",
 43 |     "http://www..twingly..com/",
 44 |     "http:// shouldfail.com",
 45 |     "http://-a.b.co",
 46 |     "http://a.b-.co",
 47 |     "http://.www.foo.bar./",
 48 |     "http://club].no/",
 49 |     "http://www,google.com",
 50 |     "http://some_site.net%C2",
 51 |     "http://+%D5d.some_site.net",
 52 | 
 53 |     # Triggers Addressable::IDNA::PunycodeBigOutput
 54 |     "http://40world-many.ru&amp;passwd=pUXFGc0LS5&amp;subject=%D0%B1%D0%B0%D0%BB%D0%B0%D0%BD%D1%81%D0%B8%D1%80%D0%BE%D0%B2%D0%BA%D0%B0+%D0%BA%D0%B0%D1%80%D0%B4%D0%B0%D0%BD%D0%BD%D0%BE%D0%B3%D0%BE+%D0%B2%D0%B0%D0%BB%D0%B0&amp;commit=Predict&amp;complex=true&amp;complex=false&amp;membrane=false&amp;coil=false&amp;msa_control=all&amp;secStructPred=true&amp;secStructPred=false&amp;falseRate=5&amp;output=opnone&amp;modeller=&amp;seqalign=yes&amp;database=PfamA&amp;eval=0.01&amp;iterations=5&amp;domssea=yes&amp;secpro=yes&amp;pp=yes",
 55 |   ]
 56 | end
 57 | 
 58 | def valid_urls
 59 |   [
 60 |     "http://blog.twingly.com/",
 61 |     "http://blOg.tWingly.coM/",
 62 |     "https://blog.twingly.com",
 63 |     "http://3.bp.blogspot.com/_lRbEHeizXlQ/Sf4RdEqCqhI/AAAAAAAAAAw/Pl8nGPsyhXc/s1600-h/images[4].jpg",
 64 |     "http://xn--zckp1cyg1.sblo.jp/",
 65 |     "http://eleven.se/mason-pearson-pocket-bristle-nylon-dark-ruby-20683.html&gclid=CjwKEAiAvPGxBRCH3YCgpdbCtmYSJABqHRVw1ZLaelwjepCihWgKkoqgl2t7k0J6J8I1IFp3GYZmKxoCc-nw_wcB?gclid=CjwKEAiAvPGxBRCH3YCgpdbCtmYSJABqHRVw1ZLaelwjepCihWgKkoqgl2t7k0J6J8I1IFp3GYZmKxoCc-nw_wcB",
 66 |     "http://xn--rksmrgs-5wao1o.josefsson.org/",
 67 |     "http://räksmörgås.josefßon.org",
 68 |     "http://user:password@blog.twingly.com/",
 69 |     "http://:@blog.twingly.com/",
 70 |     "https://www.foo.ایران.ir/bar",
 71 |     "https://www.foo.xn--mgba3a4f16a.ir/bar",
 72 |     "http://AcinusFallumTrompetumNullunCreditumVisumEstAtCuadLongumEtCefallumEst.com",
 73 |   ]
 74 | end
 75 | 
 76 | def leading_and_trailing_whitespace
 77 |   line_feed          = "\u000A"
 78 |   carriage_return    = "\u000D"
 79 |   non_breaking_space = "\u00A0"
 80 |   space              = "\u0020"
 81 | 
 82 |   {
 83 |     "non-breaking space and space"                  => [non_breaking_space, space].join,
 84 |     "non-breaking space"                            => [non_breaking_space].join,
 85 |     "non-breaking space, space, non-breaking space" => [non_breaking_space, space, non_breaking_space].join,
 86 |     "space and non-breaking space"                  => [space, non_breaking_space].join,
 87 |     "space, non-breaking space and space"           => [space, non_breaking_space, space].join,
 88 | 
 89 |     "non-breaking space and line-feed"              => [non_breaking_space, line_feed].join,
 90 |     "line-feed and non-breaking space"              => [line_feed, non_breaking_space].join,
 91 |     "space and line-feed"                           => [space, line_feed].join,
 92 |     "line-feed and space"                           => [line_feed, space].join,
 93 | 
 94 |     "non-breaking space and carriage-return"        => [non_breaking_space, carriage_return].join,
 95 |     "carriage-return and non-breaking space"        => [carriage_return, non_breaking_space].join,
 96 |     "space and carriage-return"                     => [space, carriage_return].join,
 97 |     "carriage-return and space"                     => [carriage_return, space].join,
 98 | 
 99 |     "carriage-return and line-feed"                 => [carriage_return, line_feed].join,
100 |     "line-feed and carriage-return"                 => [line_feed, carriage_return].join,
101 |   }
102 | end
103 | 
104 | describe Twingly::URL do
105 |   let(:unicode_idn_test_url) do
106 |     "http://räksmörgås.макдональдс.рф/foo"
107 |   end
108 | 
109 |   let(:ascii_idn_test_url) do
110 |     "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
111 |   end
112 | 
113 |   let(:test_url) do
114 |     "http://www.blog.twingly.co.uk/2015/07/01/language-detection-changes/"
115 |   end
116 |   let(:url) { described_class.parse(test_url) }
117 | 
118 |   describe ".parse" do
119 |     subject { url }
120 | 
121 |     it { is_expected.to be_a(Twingly::URL) }
122 | 
123 |     context "when re-reraising errors" do
124 |       let(:some_exception) { Exception }
125 | 
126 |       before do
127 |         allow(described_class)
128 |           .to receive(:internal_parse)
129 |           .and_raise(some_exception)
130 |       end
131 | 
132 |       it "always tags the error" do
133 |         expect { subject }.to raise_error do |error|
134 |           aggregate_failures do
135 |             expect(error).to be_instance_of(some_exception)
136 |             expect(error).to be_kind_of(Twingly::URL::Error)
137 |           end
138 |         end
139 |       end
140 |     end
141 | 
142 |     context "when given valid urls" do
143 |       valid_urls.each do |valid_url|
144 |         it "does not ruin the url \"#{valid_url}\"" do
145 |           expect(described_class.parse(valid_url).to_s).to eq(valid_url)
146 |         end
147 |       end
148 |     end
149 | 
150 |     context "when given bad input" do
151 |       invalid_urls.each do |invalid_url|
152 |         it "returns a NullURL for \"#{invalid_url}\"" do
153 |           actual = described_class.parse(invalid_url)
154 |           expect(actual).to be_a(Twingly::URL::NullURL)
155 |         end
156 |       end
157 |     end
158 | 
159 |     context "when given URL with uppercase scheme" do
160 |       let(:test_url) { "HTTPS://www.twingly.com/" }
161 |       let(:expected) { "https://www.twingly.com/" }
162 | 
163 |       it "downcases the scheme part" do
164 |         expect(subject).to eq(expected)
165 |       end
166 |     end
167 | 
168 |     context "when given badly encoded input" do
169 |       let(:badly_encoded_url) { "http://abc.se/öあ\x81b\xE3" }
170 |       let(:expected)          { "http://abc.se/öあ\uFFFDb\uFFFD" }
171 |       let(:actual)            { described_class.parse(badly_encoded_url) }
172 | 
173 |       it "will replace badly encoded characters with unicode replacement character (U+FFFD)" do
174 |         expect(actual.to_s).to eq(expected)
175 |       end
176 |     end
177 | 
178 |     context "when given ASCII input" do
179 |       let(:ascii_url) { (+"http://www.twingly.com/öあ").force_encoding("ASCII-8BIT") }
180 |       let(:expected)  { "http://www.twingly.com/öあ" }
181 |       let(:actual)    { described_class.parse(ascii_url).to_s }
182 | 
183 |       it "can handle it but returns UTF-8" do
184 |         expect(actual).to eq(expected)
185 |       end
186 |     end
187 | 
188 |     context "with url containing leading and trailing new lines" do
189 |       let(:test_url) { "\nhttp://www.twingly.com/blog-data/\r\n" }
190 |       let(:expected) { "http://www.twingly.com/blog-data/" }
191 | 
192 |       it { is_expected.to eq(expected) }
193 |     end
194 | 
195 |     context "with url containing leading and trailing whitespaces" do
196 |       let(:test_url) { "   http://www.twingly.com/blog-data/     " }
197 |       let(:expected) { "http://www.twingly.com/blog-data/" }
198 | 
199 |       it { is_expected.to eq(expected) }
200 |     end
201 | 
202 |     context "with url containing both newlines and whitespaces" do
203 |       let(:test_url) { "  \n\r   https://anniaksa.wordpress.com/2014/05/19/privy-digging-blogg100/   \r   \n   " }
204 |       let(:expected) { "https://anniaksa.wordpress.com/2014/05/19/privy-digging-blogg100/" }
205 | 
206 |       it { is_expected.to eq(expected) }
207 |     end
208 | 
209 |     leading_and_trailing_whitespace.each do |whitespace_name, whitespace|
210 |       context "with url containing leading and trailing: #{whitespace_name}" do
211 |         let(:test_url) { "#{whitespace}https://www.example.com/#{whitespace}" }
212 |         let(:expected) { "https://www.example.com/" }
213 | 
214 |         it { is_expected.to eq(expected) }
215 |       end
216 |     end
217 |   end
218 | 
219 |   describe ".internal_parse" do # guards that .internal_parse stays private
220 |     context "when called from the outside" do
221 |       it "raises an error" do
222 |         expect { described_class.internal_parse("a") }.
223 |           to raise_error(NoMethodError, /private method [`']internal_parse' called for/) # Ruby 3.4+ opens the quote with ' instead of a backtick
224 |       end
225 |     end
226 |   end
227 | 
228 |   describe ".new" do # guards that .new stays private
229 |     context "when called from the outside" do
230 |       it "raises an error" do
231 |         expect { described_class.new("a", "b") }.
232 |           to raise_error(NoMethodError, /private method [`']new' called for/) # Ruby 3.4+ opens the quote with ' instead of a backtick
233 |       end
234 |     end
235 |   end
236 | 
237 |   describe "#scheme" do
238 |     subject { url.scheme }
239 |     it { is_expected.to eq("http") }
240 |   end
241 | 
242 |   describe "#trd" do
243 |     subject { url.trd }
244 |     it { is_expected.to eq("www.blog") }
245 | 
246 |     context "when the url contains no trd" do
247 |       let(:test_url){ "http://twingly.com" }
248 |       it { is_expected.to eq("") }
249 |     end
250 | 
251 |     context "internationalized domain name" do
252 |       describe "given in Unicode" do
253 |         let(:test_url) { unicode_idn_test_url }
254 |         it { is_expected.to eq("räksmörgås") }
255 |       end
256 | 
257 |       describe "given in ASCII" do
258 |         let(:test_url) { ascii_idn_test_url }
259 |         it { is_expected.to eq("xn--rksmrgs-5wao1o") }
260 |       end
261 |     end
262 |   end
263 | 
264 |   describe "#sld" do
265 |     subject { url.sld }
266 |     it { is_expected.to eq("twingly") }
267 | 
268 |     context "internationalized domain name" do
269 |       describe "given in Unicode" do
270 |         let(:test_url) { unicode_idn_test_url }
271 |         it { is_expected.to eq("макдональдс") }
272 |       end
273 | 
274 |       describe "given in ASCII" do
275 |         let(:test_url) { ascii_idn_test_url }
276 |         it { is_expected.to eq("xn--80aalb1aicli8a5i") }
277 |       end
278 |     end
279 |   end
280 | 
281 |   describe "#tld" do
282 |     subject { url.tld }
283 |     it { is_expected.to eq("co.uk") }
284 | 
285 |     context "internationalized domain name" do
286 |       describe "given in Unicode" do
287 |         let(:test_url) { unicode_idn_test_url }
288 |         it { is_expected.to eq("рф") }
289 |       end
290 | 
291 |       describe "given in ASCII" do
292 |         let(:test_url) { ascii_idn_test_url }
293 |         it { is_expected.to eq("xn--p1ai") }
294 |       end
295 | 
296 |       describe "punycoded TLD with multiple levels" do
297 |         let(:test_url) { "https://foo.sande.xn--mre-og-romsdal-qqb.no/bar" }
298 |         it { is_expected.to eq("sande.xn--mre-og-romsdal-qqb.no") }
299 |       end
300 |     end
301 |   end
302 | 
303 |   describe "#ttld" do
304 |     subject { url.ttld }
305 |     it { is_expected.to eq("uk") }
306 | 
307 |     context "when the TLD is just one level" do
308 |       let(:test_url){ "http://twingly.com" }
309 | 
310 |       it { is_expected.to eq("com") }
311 |     end
312 | 
313 |     context "internationalized domain name" do
314 |       describe "given in Unicode" do
315 |         let(:test_url) { unicode_idn_test_url }
316 |         it { is_expected.to eq("рф") }
317 |       end
318 | 
319 |       describe "given in ASCII" do
320 |         let(:test_url) { ascii_idn_test_url }
321 |         it { is_expected.to eq("xn--p1ai") }
322 |       end
323 |     end
324 |   end
325 | 
326 |   describe "#domain" do
327 |     subject { url.domain }
328 |     it { is_expected.to eq("twingly.co.uk") }
329 | 
330 |     context "internationalized domain name" do
331 |       describe "given in Unicode" do
332 |         let(:test_url) { unicode_idn_test_url }
333 |         it { is_expected.to eq("макдональдс.рф") }
334 |       end
335 | 
336 |       describe "given in ASCII" do
337 |         let(:test_url) { ascii_idn_test_url }
338 |         it { is_expected.to eq("xn--80aalb1aicli8a5i.xn--p1ai") }
339 |       end
340 |     end
341 |   end
342 | 
343 |   describe "#host" do
344 |     subject { url.host }
345 |     it { is_expected.to eq("www.blog.twingly.co.uk") }
346 | 
347 |     context "internationalized domain name" do
348 |       describe "given in Unicode" do
349 |         let(:test_url) { unicode_idn_test_url }
350 |         it { is_expected.to eq("räksmörgås.макдональдс.рф") }
351 |       end
352 | 
353 |       describe "given in ASCII" do
354 |         let(:test_url) { ascii_idn_test_url }
355 |         it { is_expected.to eq("xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai") }
356 |       end
357 |     end
358 |   end
359 | 
360 |   describe "#origin" do
361 |     subject { url.origin }
362 |     it { is_expected.to eq("http://www.blog.twingly.co.uk") }
363 | 
364 |     context "internationalized domain name" do
365 |       describe "given in Unicode" do
366 |         let(:test_url) { unicode_idn_test_url }
367 |         it { is_expected.to eq("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai") }
368 |       end
369 | 
370 |       describe "given in ASCII" do
371 |         let(:test_url) { ascii_idn_test_url }
372 |         it { is_expected.to eq("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai") }
373 |       end
374 |     end
375 |   end
376 | 
377 |   describe "#path" do
378 |     subject { url.path }
379 |     it { is_expected.to eq("/2015/07/01/language-detection-changes/") }
380 |   end
381 | 
382 |   describe "#normalized_path" do
383 |     subject { url.normalized_path }
384 |     it { is_expected.to eq("/2015/07/01/language-detection-changes") }
385 |   end
386 | 
387 |   describe "#normalized_scheme" do
388 |     subject { url.normalized_scheme }
389 |     it { is_expected.to eq("http") }
390 |   end
391 | 
392 |   describe "#normalized_host" do
393 |     subject { url.normalized_host }
394 |     it { is_expected.to eq("www.blog.twingly.co.uk") }
395 |   end
396 | 
397 |   describe "#valid?" do
398 |     invalid_urls.each do |invalid_url|
399 |       it "returns false for an invalid URL \"#{invalid_url}\"" do
400 |         expect(described_class.parse(invalid_url).valid?).to be false
401 |       end
402 |     end
403 | 
404 |     valid_urls.each do |valid_url|
405 |       it "returns true for the valid url \"#{valid_url}\"" do
406 |         expect(described_class.parse(valid_url).valid?).to be true
407 |       end
408 |     end
409 | 
410 |     context "when given nil input" do
411 |       it "it returns false" do
412 |         expect(described_class.parse(nil).valid?).to be false
413 |       end
414 |     end
415 |   end
416 | 
417 |   describe "#normalized" do
418 |     context "when given valid urls" do
419 |       valid_urls.each do |valid_url|
420 |         it "does not raise an error for \"#{valid_url}\"" do
421 |           actual = described_class.parse(valid_url).normalized
422 |           expect(actual).to be_a(Twingly::URL)
423 |         end
424 |       end
425 |     end
426 | 
427 |     context "when given bad input" do
428 |       invalid_urls.each do |invalid_url|
429 |         it "returns NullURL for \"#{invalid_url}\"" do
430 |           actual = described_class.parse(invalid_url).normalized
431 |           expect(actual).to be_a(Twingly::URL::NullURL)
432 |         end
433 |       end
434 |     end
435 | 
436 |     subject { described_class.parse(url).normalized.to_s }
437 | 
438 |     context "when given IDN URL with the domain \"straße.de\"" do
439 |       let(:test_url) { "http://straße.de" }
440 |       let(:normalized_url) { described_class.parse(url).normalized }
441 | 
442 |       it "does conform to the IDNA2008 protocol" do
443 |         expect(normalized_url.domain).to eq("xn--strae-oqa.de")
444 |       end
445 |     end
446 | 
447 |     context "with URL that has an internationalized TLD in Unicode" do
448 |       let(:test_url) { "https://www.foo.ایران.ir/bar" }
449 |       let(:normalized_url) { described_class.parse(url).normalized }
450 | 
451 |       describe "#scheme" do
452 |         subject { normalized_url.scheme }
453 |         it { is_expected.to eq("https") }
454 |       end
455 | 
456 |       describe "#trd" do
457 |         subject { normalized_url.trd }
458 |         it { is_expected.to eq("www") }
459 |       end
460 | 
461 |       describe "#sld" do
462 |         subject { normalized_url.sld }
463 |         it { is_expected.to eq("foo") }
464 |       end
465 | 
466 |       describe "#tld" do
467 |         subject { normalized_url.tld }
468 |         it { is_expected.to eq("xn--mgba3a4f16a.ir") }
469 |       end
470 | 
471 |       describe "#ttld" do
472 |         subject { normalized_url.ttld }
473 |         it { is_expected.to eq("ir") }
474 |       end
475 | 
476 |       describe "#domain" do
477 |         subject { normalized_url.domain }
478 |         it { is_expected.to eq("foo.xn--mgba3a4f16a.ir") }
479 |       end
480 | 
481 |       describe "#host" do
482 |         subject { normalized_url.host }
483 |         it { is_expected.to eq("www.foo.xn--mgba3a4f16a.ir") }
484 |       end
485 | 
486 |       describe "#origin" do
487 |         subject { normalized_url.origin }
488 |         it { is_expected.to eq("https://www.foo.xn--mgba3a4f16a.ir") }
489 |       end
490 | 
491 |       describe "#path" do
492 |         subject { normalized_url.path }
493 |         it { is_expected.to eq("/bar") }
494 |       end
495 |     end
496 | 
497 |     context "adds www if host is missing a subdomain" do
498 |       let(:url)      { "http://twingly.com/" }
499 |       let(:expected) { "http://www.twingly.com/" }
500 | 
501 |       it { is_expected.to eq(expected) }
502 |     end
503 | 
504 |     context "does not add www if the host has a subdomain" do
505 |       let(:url) { "http://blog.twingly.com/" }
506 | 
507 |       it { is_expected.to eq(url) }
508 |     end
509 | 
510 |     context "does not remove www if the host has a subdomain" do
511 |       let(:url) { "http://www.blog.twingly.com/" }
512 | 
513 |       it { is_expected.to eq(url) }
514 |     end
515 | 
516 |     context "keeps www if the host already has it" do
517 |       let(:url) { "http://www.twingly.com/" }
518 | 
519 |       it { is_expected.to eq(url) }
520 |     end
521 | 
522 |     context "ensures that path starts with slash" do
523 |       let(:url)      { "http://www.twingly.com" }
524 |       let(:expected) { "http://www.twingly.com/" }
525 | 
526 |       it { is_expected.to eq(expected) }
527 |     end
528 | 
529 |     context "ensures that path only starts with single slash" do
530 |       let(:url)      { "http://www.twingly.com//" }
531 |       let(:expected) { "http://www.twingly.com/" }
532 | 
533 |       it { is_expected.to eq(expected) }
534 |     end
535 | 
536 |     context "removes trailing slash from end of path unless path becomes empty" do
537 |       let(:url)      { "http://www.twingly.com/blog-data/" }
538 |       let(:expected) { "http://www.twingly.com/blog-data" }
539 | 
540 |       it { is_expected.to eq(expected) }
541 |     end
542 | 
543 |     context "does not remove whitespaces from middle of path" do
544 |       let(:url)      { "http://www.twingly.com/blo g-data/" }
545 |       let(:expected) { "http://www.twingly.com/blo g-data" }
546 | 
547 |       it { is_expected.to eq(expected) }
548 |     end
549 | 
550 |     context "is able to normalize a url with double slash in path" do
551 |       let(:url)      { "www.twingly.com/path//" }
552 |       let(:expected) { "http://www.twingly.com/path" }
553 | 
554 |       it { is_expected.to eq(expected) }
555 |     end
556 | 
557 |     context "is able to normalize a url without the scheme part" do
558 |       let(:url)      { "www.twingly.com/" }
559 |       let(:expected) { "http://www.twingly.com/" }
560 | 
561 |       it { is_expected.to eq(expected) }
562 |     end
563 | 
564 |     context "does not return broken URLs" do
565 |       let(:url)      { "http://www.twingly." }
566 |       let(:expected) { "" }
567 | 
568 |       it { is_expected.to eq(expected) }
569 |     end
570 | 
571 |     context "does not add www. to blogspot URLs" do
572 |       let(:url) { "http://jlchen1026.blogspot.com/" }
573 | 
574 |       it { is_expected.to eq(url) }
575 |     end
576 | 
577 |     context "removes www. from blogspot URLs" do
578 |       let(:url)      { "http://www.jlchen1026.blogspot.com/" }
579 |       let(:expected) { "http://jlchen1026.blogspot.com/" }
580 | 
581 |       it { is_expected.to eq(expected) }
582 |     end
583 | 
584 |     context "rewrites blogspot TLDs to .com" do
585 |       let(:url)      { "http://WWW.jlchen1026.blogspot.CO.UK/" }
586 |       let(:expected) { "http://jlchen1026.blogspot.com/" }
587 | 
588 |       it { is_expected.to eq(expected) }
589 |     end
590 | 
591 |     context "downcases the scheme part" do
592 |       let(:url)      { "HTTPS://www.twingly.com/" }
593 |       let(:expected) { "https://www.twingly.com/" }
594 | 
595 |       it { is_expected.to eq(expected) }
596 |     end
597 | 
598 |     context "downcases the domain" do
599 |       let(:url)      { "http://WWW.TWINGLY.COM/" }
600 |       let(:expected) { "http://www.twingly.com/" }
601 | 
602 |       it { is_expected.to eq(expected) }
603 |     end
604 | 
605 |     context "does not downcase the path" do
606 |       let(:url) { "http://www.twingly.com/PaTH" }
607 | 
608 |       it { is_expected.to eq(url) }
609 |     end
610 | 
611 |     context "does not downcase fragment" do
612 |       let(:url) { "http://www.twingly.com/#FRAGment" }
613 | 
614 |       it { is_expected.to eq(url) }
615 |     end
616 | 
617 |     context "handles URL with ] in it" do
618 |       let(:url) { "http://www.iwaseki.co.jp/cgi/yybbs/yybbs.cgi/%DEuropean]buy" }
619 | 
620 |       it { is_expected.to eq(url) }
621 |     end
622 | 
623 |     context "handles URL with reference to another URL in it" do
624 |       let(:url) { "http://news.google.com/news/url?sa=t&fd=R&usg=AFQjCNGc4A_sfGS6fMMqggiK_8h6yk2miw&url=http:%20%20%20//fansided.com/2013/08/02/nike-decides-to-drop-milwaukee-brewers-ryan-braun" }
625 | 
626 |       it { is_expected.to eq(url) }
627 |     end
628 | 
629 |     context "handles URL with umlauts in host" do
630 |       let(:url)      { "http://www.åäö.se/" }
631 |       let(:expected) { "http://www.xn--4cab6c.se/" }
632 | 
633 |       it { is_expected.to eq(expected) }
634 |     end
635 | 
636 |     context "handles URL with umlauts in path" do
637 |       let(:url) { "http://www.aoo.se/öö" }
638 | 
639 |       it { is_expected.to eq(url) }
640 |     end
641 | 
642 |     context "handles URL with punycoded SLD" do
643 |       let(:url) { "http://www.xn--4cab6c.se/" }
644 | 
645 |       it { is_expected.to eq(url) }
646 |     end
647 | 
648 |     context "handles URL with punycoded TLD" do
649 |       let(:url)      { "http://example.xn--p1ai/" }
650 |       let(:expected) { "http://www.example.xn--p1ai/" }
651 | 
652 |       it { is_expected.to eq(expected) }
653 |     end
654 | 
655 |     context "converts to a punycoded URL" do
656 |       let(:url)      { "скраповыймир.рф" }
657 |       let(:expected) { "http://www.xn--80aesdcplhhhb0k.xn--p1ai/" }
658 | 
659 |       it { is_expected.to eq(expected) }
660 |     end
661 | 
662 |     context "does not blow up when there's no URL in the text" do
663 |       let(:url)      { "Just some text" }
664 |       let(:expected) { "" }
665 | 
666 |       it { is_expected.to eq(expected) }
667 |     end
668 |   end
669 | 
670 |   describe "#without_scheme" do
671 |     subject { described_class.parse(url).without_scheme }
672 | 
673 |     context "removes scheme from mixed case HTTP URL" do
674 |       let(:url)      { "HttP://www.duh.se/" }
675 |       let(:expected) { "//www.duh.se/" }
676 | 
677 |       it { is_expected.to eq(expected) }
678 |     end
679 | 
680 |     context "removes scheme from mixed case HTTPS URL" do
681 |       let(:url)      { "hTTpS://www.duh.se/" }
682 |       let(:expected) { "//www.duh.se/" }
683 | 
684 |       it { is_expected.to eq(expected) }
685 |     end
686 | 
687 |     context "removes scheme from lowercase HTTP URL" do
688 |       let(:url)      { "http://www.duh.se/" }
689 |       let(:expected) { "//www.duh.se/" }
690 | 
691 |       it { is_expected.to eq(expected) }
692 |     end
693 | 
694 |     context "removes scheme from lowercase HTTPS URL" do
695 |       let(:url)      { "https://www.duh.se/" }
696 |       let(:expected) { "//www.duh.se/" }
697 | 
698 |       it { is_expected.to eq(expected) }
699 |     end
700 | 
701 |     context "removes scheme from uppercase HTTP URL" do
702 |       let(:url)      { "HTTP://WWW.DUH.SE/" }
703 |       let(:expected) { "//WWW.DUH.SE/" }
704 | 
705 |       it { is_expected.to eq(expected) }
706 |     end
707 | 
708 |     context "removes scheme from uppercase HTTPS URL" do
709 |       let(:url)      { "HTTPS://WWW.DUH.SE/" }
710 |       let(:expected) { "//WWW.DUH.SE/" }
711 | 
712 |       it { is_expected.to eq(expected) }
713 |     end
714 | 
715 |     context "removes scheme from URL with non ASCII characters" do
716 |       let(:url)      { "http://www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα" }
717 |       let(:expected) { "//www.thecloset.gr/people/bloggers-pick-ιωάννα-τσιγαρίδα" }
718 | 
719 |       it { is_expected.to eq(expected) }
720 |     end
721 | 
722 |     context "only removes scheme from HTTP URL" do
723 |       let(:url)      { "http://feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml" }
724 |       let(:expected) { "//feedjira.herokuapp.com/?url=http://developer.twingly.com/feed.xml" }
725 | 
726 |       it { is_expected.to eq(expected) }
727 |     end
728 | 
729 |     context "only removes scheme from HTTPS URL" do
730 |       let(:url)      { "https://feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss" }
731 |       let(:expected) { "//feedjira.herokuapp.com/?url=https://signalvnoise.com/posts.rss" }
732 | 
733 |       it { is_expected.to eq(expected) }
734 |     end
735 |   end
736 | 
737 |   describe "#userinfo" do
738 |     subject { described_class.parse(url).userinfo }
739 | 
740 |     context "without authorisation part in URL" do
741 |       let(:url) { "https://blog.twingly.com/" }
742 | 
743 |       it { is_expected.to eq("") }
744 |     end
745 | 
746 |     context "with user and password part in URL" do
747 |       let(:url) { "https://user:password@blog.twingly.com/" }
748 | 
749 |       it { is_expected.to eq("user:password") }
750 |     end
751 | 
752 |     context "with empty user and empty password in URL" do
753 |       let(:url) { "https://:@blog.twingly.com/" }
754 | 
755 |       it { is_expected.to eq(":") }
756 |     end
757 | 
758 |     context "with user but empty password in URL" do
759 |       let(:url) { "https://user:@blog.twingly.com/" }
760 | 
761 |       it { is_expected.to eq("user:") }
762 |     end
763 | 
764 |     context "with empty user but password in URL" do
765 |       let(:url) { "https://:password@blog.twingly.com/" }
766 | 
767 |       it { is_expected.to eq(":password") }
768 |     end
769 |   end
770 | 
771 |   describe "#user" do
772 |     subject { described_class.parse(url).user }
773 | 
774 |     context "without authorisation part in URL" do
775 |       let(:url) { "https://blog.twingly.com/" }
776 | 
777 |       it { is_expected.to eq("") }
778 |     end
779 | 
780 |     context "with user and password part in URL" do
781 |       let(:url) { "https://user:password@blog.twingly.com/" }
782 | 
783 |       it { is_expected.to eq("user") }
784 |     end
785 | 
786 |     context "with empty user and empty password in URL" do
787 |       let(:url) { "https://:@blog.twingly.com/" }
788 | 
789 |       it { is_expected.to eq("") }
790 |     end
791 | 
792 |     context "with user but empty password in URL" do
793 |       let(:url) { "https://user:@blog.twingly.com/" }
794 | 
795 |       it { is_expected.to eq("user") }
796 |     end
797 | 
798 |     context "with empty user but password in URL" do
799 |       let(:url) { "https://:password@blog.twingly.com/" }
800 | 
801 |       it { is_expected.to eq("") }
802 |     end
803 |   end
804 | 
805 |   describe "#password" do
806 |     subject { described_class.parse(url).password }
807 | 
808 |     context "without authorisation part in URL" do
809 |       let(:url) { "https://blog.twingly.com/" }
810 | 
811 |       it { is_expected.to eq("") }
812 |     end
813 | 
814 |     context "with user and password part in URL" do
815 |       let(:url) { "https://user:password@blog.twingly.com/" }
816 | 
817 |       it { is_expected.to eq("password") }
818 |     end
819 | 
820 |     context "with empty user and empty password in URL" do
821 |       let(:url) { "https://:@blog.twingly.com/" }
822 | 
823 |       it { is_expected.to eq("") }
824 |     end
825 | 
826 |     context "with user but empty password in URL" do
827 |       let(:url) { "https://user:@blog.twingly.com/" }
828 | 
829 |       it { is_expected.to eq("") }
830 |     end
831 | 
832 |     context "with empty user but password in URL" do
833 |       let(:url) { "https://:password@blog.twingly.com/" }
834 | 
835 |       it { is_expected.to eq("password") }
836 |     end
837 |   end
838 | 
839 |   describe "#to_s" do
840 |     subject { url.to_s }
841 |     it { is_expected.to eq(test_url) }
842 |   end
843 | 
844 |   describe "comparable methods" do
845 |     let(:a) { "http://a.com" }
846 |     let(:b) { "http://b.com" }
847 | 
848 |     describe "#<=>" do
849 |       let(:test_urls) { [b, a, b, a, a] }
850 | 
851 |       subject do
852 |         test_urls.map { |url| described_class.parse(url) }.sort.map(&:to_s)
853 |       end
854 | 
855 |       it { is_expected.to eq(test_urls.sort) }
856 |     end
857 | 
858 |     describe "#==" do
859 |       context "when parsing the same URLs" do
860 |         subject { described_class.parse(a) == described_class.parse(a) }
861 |         it { is_expected.to be(true) }
862 |       end
863 | 
864 |       context "when parsing different URLs" do
865 |         subject { described_class.parse(a) == described_class.parse(b) }
866 |         it { is_expected.to be(false) }
867 |       end
868 |     end
869 | 
870 |     describe "#===" do
871 |       context "when parsing the same URLs" do
872 |         subject { described_class.parse(a) === described_class.parse(a) }
873 |         it { is_expected.to be(true) }
874 |       end
875 | 
876 |       context "when parsing different URLs" do
877 |         subject { described_class.parse(a) === described_class.parse(b) }
878 |         it { is_expected.to be(false) }
879 |       end
880 |     end
881 | 
882 |     context "with invalid and valid URLs" do
883 |       let(:test_urls) { [b, "", a] }
884 | 
885 |       subject do
886 |         test_urls.map { |url| described_class.parse(url) }.sort.map(&:to_s)
887 |       end
888 | 
889 |       it { is_expected.to eq(test_urls.sort) }
890 |     end
891 |   end
892 | 
893 |   describe "uniqueness" do
894 |     context do "with the same URL twice"
895 |       let(:a) { described_class.parse("https://www.twingly.com/") }
896 |       let(:b) { described_class.parse("https://www.google.com/") }
897 |       let(:c) { described_class.parse("https://www.twingly.com/") }
898 | 
899 |       it "should give only unique URLs" do
900 |         expect([a,b,c].uniq).to eq([a,b])
901 |       end
902 |     end
903 | 
904 |     context do "two similar URLs, but not exactly the same"
905 |       let(:a) { described_class.parse("https://www.twingly.com") }
906 |       let(:b) { described_class.parse("https://www.twingly.com/") }
907 | 
908 |       it "should be two unique URLs" do
909 |         expect([a,b].uniq).to eq([a,b])
910 |       end
911 |     end
912 | 
913 |     context do "the same URL but with some whitespace should be the same"
914 |       let(:a) { described_class.parse(" https://www.twingly.com/") }
915 |       let(:b) { described_class.parse("https://www.twingly.com/ ") }
916 | 
917 |       it "should be one unique URL" do
918 |         expect([a,b].uniq).to eq([a])
919 |       end
920 | 
921 |       describe ".eql?" do
922 |         subject { a.eql?(b) }
923 |         it { is_expected.to eq(true) }
924 |       end
925 |     end
926 | 
927 |     context "an object and its string representation" do
928 |       let(:url) { "https://www.twingy.com/" }
929 |       let(:a) { described_class.parse(url) }
930 |       let(:b) { described_class.parse(url).to_s }
931 | 
932 |       it "should be two unique objects" do
933 |         expect([a,b].uniq).to eq([a,b])
934 |       end
935 | 
936 |       describe "#eql?" do
937 |         subject { a.eql?(b) }
938 |         it { is_expected.to eq(false) }
939 |       end
940 |     end
941 |   end
942 | 
943 |   describe "#inspect" do
944 |     let(:url_object) { described_class.parse(url) }
945 |     subject { url_object.inspect }
946 | 
947 |     it { is_expected.to include(url_object.to_s) }
948 |   end
949 | end
950 | 


--------------------------------------------------------------------------------
/spec/spec_helper.rb:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | RSpec.configure do |config|
 4 |   config.expect_with :rspec do |expectations|
 5 |     expectations.include_chain_clauses_in_custom_matcher_descriptions = true
 6 |   end
 7 | 
 8 |   config.mock_with :rspec do |mocks|
 9 |     mocks.verify_partial_doubles = true
10 |   end
11 | 
12 |   config.order = :random
13 | 
14 |   Kernel.srand config.seed
15 | end
16 | 


--------------------------------------------------------------------------------
/twingly-url.gemspec:
--------------------------------------------------------------------------------
 1 | # frozen_string_literal: true
 2 | 
 3 | require File.expand_path('../lib/twingly/version', __FILE__)
 4 | 
 5 | Gem::Specification.new do |s|
 6 |   s.name        = "twingly-url"
 7 |   s.version     = Twingly::URL::VERSION
 8 |   s.platform    = Gem::Platform::RUBY
 9 |   s.authors     = ["Twingly AB"]
10 |   s.email       = ["support@twingly.com"]
11 |   s.homepage    = "http://github.com/twingly/twingly-url"
12 |   s.summary     = "Ruby library for URL handling"
13 |   s.description = "Twingly URL tools"
14 |   s.license     = "MIT"
15 |   s.required_ruby_version = ">= 2.6"
16 | 
17 |   s.add_dependency "addressable", "~> 2.6"
18 |   s.add_dependency "public_suffix", ">= 3.0.1", "< 6.0"
19 | 
20 |   s.add_development_dependency "rake", "~> 12"
21 |   s.add_development_dependency "rspec", "~> 3"
22 |   s.add_development_dependency "pry", "~> 0"
23 | 
24 |   s.files        = Dir.glob("{lib}/**/*") + %w(README.md)
25 |   s.require_path = 'lib'
26 | end
27 | 


--------------------------------------------------------------------------------