├── .gitignore ├── LICENSE.txt ├── README.md ├── dub.json ├── dub.selections.json └── source └── url.d /.gitignore: -------------------------------------------------------------------------------- 1 | .dub/ 2 | __test__library__ 3 | docs.json 4 | docs/ 5 | __dummy.html 6 | liburld.a 7 | *.swp 8 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Christopher Wright 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and 4 | associated documentation files (the "Software"), to deal in the Software without restriction, 5 | including without limitation the rights to use, copy, modify, merge, publish, distribute, 6 | sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is 7 | furnished to do so, subject to the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be included in all copies or substantial 10 | portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT 13 | NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 14 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES 15 | OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # urld 2 | URL handling for D 3 | 4 | # Motivation 5 | D's standard library has nothing for working with URLs. 6 | 7 | Vibe.d can work with URLs. However, Vibe is big. 
Also, we want to work easily with query strings, 8 | which vibe.d doesn't allow. 9 | 10 | # Recent Breaking Changes 11 | 12 | - v3.0: 13 | * urld does no automatic URL decoding 14 | * urld only automatically encodes non-ASCII characters 15 | * `URL(string)` constructor parses a URL rather than assigning the scheme only 16 | 17 | 18 | # Installation 19 | Add `"urld": "~>3.0.0"` to your `dub.json`. 20 | 21 | # Usage 22 | 23 | Parse a URL: 24 | 25 | ```D 26 | auto url = "ircs://irc.freenode.com/#d".parseURL; 27 | auto url = URL("ircs://irc.freenode.com/#d"); 28 | ``` 29 | 30 | Construct one from scratch, laboriously: 31 | 32 | ```D 33 | URL url; 34 | with (url) { 35 | scheme = "soap.beep"; 36 | host = "beep.example.net"; 37 | port = 1772; 38 | path = "/serverinfo/info"; 39 | queryParams.add("token", "my-api-token"); 40 | } 41 | curl.get(url); 42 | ``` 43 | 44 | Unicode domain names: 45 | 46 | ```D 47 | auto url = "http://☃.com/".parseURL; 48 | writeln(url.toString); // http://xn--n3h.com/ 49 | writeln(url.toHumanReadableString); // http://☃.com/ 50 | ``` 51 | 52 | Implicit conversion to strings for use with other libraries that expect URLs as strings: 53 | 54 | ```D 55 | import std.net.curl; 56 | auto couchdbURL = "http://couch.local:8815".parseURL; 57 | writeln(get(couchdbURL ~ "users/bob.dobbs@subgenius.org")); 58 | ``` 59 | 60 | Autodetect ports: 61 | 62 | ```D 63 | assert(parseURL("http://example.org").port == 80); 64 | assert(parseURL("http://example.org:5326").port == 5326); 65 | ``` 66 | 67 | URLs of maximum complexity: 68 | 69 | ```D 70 | auto url = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment"); 71 | assert(url.scheme == "redis"); 72 | assert(url.user == "admin"); 73 | assert(url.pass == "password"); 74 | // etc 75 | ``` 76 | 77 | URLs of minimum complexity: 78 | 79 | ```D 80 | assert(parseURL("example.org").toString == "http://example.org/"); 81 | ``` 82 | 83 | Canonicalization: 84 | 85 | ```D 86 | 
assert(parseURL("http://example.org:80").toString == "http://example.org/"); 87 | ``` 88 | -------------------------------------------------------------------------------- /dub.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "urld", 3 | "description": "A URL parsing library", 4 | "copyright": "Copyright © 2015, dhasenan", 5 | "authors": ["dhasenan"], 6 | "license": "MIT", 7 | "dependencies": { 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /dub.selections.json: -------------------------------------------------------------------------------- 1 | { 2 | "fileVersion": 1, 3 | "versions": {} 4 | } 5 | -------------------------------------------------------------------------------- /source/url.d: -------------------------------------------------------------------------------- 1 | /** 2 | * A URL handling library. 3 | * 4 | * URLs are Uniform Resource Locators. They consist of a scheme and a host, with some optional 5 | * elements like port, path, username, and password. 6 | * 7 | * This module aims to make it simple to muck about with them. 8 | * 9 | * Example usage: 10 | * --- 11 | * auto url = "ssh://me:password@192.168.0.8/".parseURL; 12 | * auto files = system("ssh", url.toString, "ls").splitLines; 13 | * foreach (file; files) { 14 | * system("scp", url ~ file, "."); 15 | * } 16 | * --- 17 | * 18 | * License: The MIT license. 19 | */ 20 | module url; 21 | 22 | import std.conv; 23 | import std.string; 24 | 25 | pure: 26 | @safe: 27 | 28 | /// An exception thrown when something bad happens with URLs. 29 | class URLException : Exception 30 | { 31 | this(string msg) pure { super(msg); } 32 | } 33 | 34 | /** 35 | * A mapping from schemes to their default ports. 36 | * 37 | * This is not exhaustive. Not all schemes use ports. Not all schemes uniquely identify a port to 38 | * use even if they use ports. Entries here should be treated as best guesses. 
/**
 * A collection of query parameters.
 *
 * This is effectively a multimap of string -> strings: parameters are stored as an ordered list
 * of key/value pairs, and the same key may appear more than once.
 */
struct QueryParams
{
    /// Hash of the parameter list. Declared above the `pure:` label that follows.
    hash_t toHash() const nothrow @safe
    {
        return typeid(params).getHash(&params);
    }

pure:
    import std.typecons;

    /// A single key/value pair.
    alias Tuple!(string, "key", string, "value") Param;

    /// The stored parameters; `add` appends to the end of this list.
    Param[] params;

    /// The number of stored parameters (duplicate keys are counted individually).
    @property size_t length() const
    {
        return params.length;
    }

    /**
     * Get a range over the query parameter values for the given key.
     *
     * Only parameters whose key equals `key` contribute a value. The result is a lazy input
     * range; it is empty when no parameter has that key.
     */
    auto opIndex(string key) const
    {
        import std.algorithm.iteration : filter, map;
        // `filter`, not `find`: `find` would return every parameter from the first match
        // onward, including parameters stored under other keys.
        return params.filter!(p => p.key == key).map!(p => p.value);
    }

    /// Add a query parameter with the given key and value.
    /// If one already exists, there will now be two query parameters with the given name.
    void add(string key, string value)
    {
        params ~= Param(key, value);
    }

    /// Add a query parameter with the given key and value.
    /// If there are any existing parameters with the same key, they are removed and overwritten.
    void overwrite(string key, string value)
    {
        // Build a fresh list rather than swap-removing in place: swap-removal skips the element
        // that was swapped into the current slot (so a duplicate key could survive) and also
        // scrambles the order of the remaining parameters.
        Param[] kept;
        foreach (param; params)
        {
            if (param.key != key)
            {
                kept ~= param;
            }
        }
        kept ~= Param(key, value);
        params = kept;
    }

    /// Input range over a snapshot of the parameter list.
    private struct QueryParamRange
    {
pure:
        size_t i;
        const(Param)[] params;
        bool empty() { return i >= params.length; }
        void popFront() { i++; }
        Param front() { return params[i]; }
    }

    /**
     * A range over the query parameters.
     *
     * Usage:
     * ---
     * foreach (key, value; url.queryParams) {}
     * ---
     */
    auto range() const
    {
        return QueryParamRange(0, this.params);
    }
    /// ditto
    alias range this;

    /**
     * Convert this set of query parameters into a query string.
     *
     * Pairs are joined with '&'. A parameter whose value is empty is written as the bare key,
     * without '='. Keys and values are passed through percentEncodeUnicodeOnly.
     */
    string toString() const
    {
        import std.array : Appender;
        Appender!string s;
        foreach (index, param; params)
        {
            if (index > 0)
            {
                s ~= '&';
            }
            s ~= param.key.percentEncodeUnicodeOnly;
            if (param.value.length > 0)
            {
                s ~= '=';
                s ~= param.value.percentEncodeUnicodeOnly;
            }
        }
        return s.data;
    }

    /// Clone this set of query parameters. The copy owns its own parameter array.
    QueryParams dup()
    {
        QueryParams other = this;
        other.params = params.dup;
        return other;
    }

    /**
     * Order two parameter sets.
     *
     * Parameters are compared pairwise in stored order, key first and then value. If one list
     * is a prefix of the other, the shorter one sorts first.
     */
    int opCmp(const ref QueryParams other) const
    {
        immutable shared_ = params.length < other.params.length
            ? params.length : other.params.length;
        foreach (i; 0 .. shared_)
        {
            auto c = cmp(params[i].key, other.params[i].key);
            if (c != 0) return c;
            c = cmp(params[i].value, other.params[i].value);
            if (c != 0) return c;
        }
        if (params.length > other.params.length) return 1;
        if (params.length < other.params.length) return -1;
        return 0;
    }
}
271 | * 272 | * This sets the providedPort field and is provided for convenience. 273 | */ 274 | @property ushort port(ushort value) nothrow 275 | { 276 | return providedPort = value; 277 | } 278 | 279 | /// The port that was explicitly provided in the URL. 280 | ushort providedPort; 281 | 282 | /** 283 | * The path. 284 | * 285 | * For instance, in the URL https://cnn.com/news/story/17774?visited=false, the path is 286 | * "/news/story/17774". 287 | */ 288 | string path; 289 | 290 | /** 291 | * The query parameters associated with this URL. 292 | */ 293 | QueryParams queryParams; 294 | 295 | /** 296 | * The fragment. In web documents, this typically refers to an anchor element. 297 | * For instance, in the URL https://cnn.com/news/story/17774#header2, the fragment is "header2". 298 | */ 299 | string fragment; 300 | 301 | /** 302 | * Convert this URL to a string. 303 | * The string is properly formatted and usable for, eg, a web request. 304 | */ 305 | string toString() const 306 | { 307 | return toString(false); 308 | } 309 | 310 | /** 311 | * Convert this URL to a string. 312 | * 313 | * The string is intended to be human-readable rather than machine-readable. 314 | */ 315 | string toHumanReadableString() const 316 | { 317 | return toString(true); 318 | } 319 | 320 | /// 321 | unittest 322 | { 323 | auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL; 324 | assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye", url.toString); 325 | assert(url.toHumanReadableString == "https://☂.☃.org/?hi=bye", url.toString); 326 | } 327 | 328 | unittest 329 | { 330 | assert("http://example.org/some_path".parseURL.toHumanReadableString == 331 | "http://example.org/some_path"); 332 | } 333 | 334 | /** 335 | * Convert the path and query string of this URL to a string. 336 | */ 337 | string toPathAndQueryString() const 338 | { 339 | if (queryParams.length > 0) 340 | { 341 | return path ~ '?' 
/**
 * Build the string form of this URL.
 *
 * Params:
 *   humanReadable = when true, user, password, host, path and fragment are written as stored;
 *                   when false, the host goes through toPuny and the other parts through
 *                   percentEncodeUnicodeOnly.
 *
 * The explicit port is written only when it differs from the scheme's entry in
 * schemeToDefaultPort (or the scheme has no entry). An empty path is written as "/".
 * The query string always comes from queryParams.toString.
 */
private string toString(bool humanReadable) const
{
    import std.array : Appender;
    Appender!string s;
    s ~= scheme;
    s ~= "://";
    // Truthiness test on the string itself, as before (not a length check).
    if (user) {
        s ~= humanReadable ? user : user.percentEncodeUnicodeOnly;
        s ~= ":";
        s ~= humanReadable ? pass : pass.percentEncodeUnicodeOnly;
        s ~= "@";
    }
    s ~= humanReadable ? host : host.toPuny;
    if (providedPort) {
        // One lookup instead of `in` followed by an index expression.
        auto defaultPort = scheme in schemeToDefaultPort;
        if (defaultPort is null || *defaultPort != providedPort) {
            s ~= ":";
            s ~= providedPort.to!string;
        }
    }
    string p = path;
    if (p.length == 0 || p == "/") {
        s ~= '/';
    } else if (humanReadable) {
        // A path assigned without a leading slash must not be glued onto the host;
        // the machine-readable branch below already inserts the separator.
        if (p[0] != '/') {
            s ~= '/';
        }
        s ~= p;
    } else {
        if (p[0] == '/') {
            p = p[1..$];
        }
        foreach (part; p.split('/')) {
            s ~= '/';
            s ~= part.percentEncodeUnicodeOnly;
        }
    }
    if (queryParams.length) {
        s ~= '?';
        s ~= queryParams.toString;
    }
    // Truthiness test on the string itself, as before (not a length check).
    if (fragment) {
        s ~= '#';
        s ~= humanReadable ? fragment : fragment.percentEncodeUnicodeOnly;
    }
    return s.data;
}

/// Implicitly convert URLs to strings.
alias toString this;
/**
    Compare two URLs.

    The comparison is meant to produce a sort order that seems natural, so it's not identical
    to sorting based on .toString(). Fields are compared in the order used by asTuple: host,
    then scheme, port, user, password, path and finally query parameters. The fragment takes
    no part in the comparison.

    While the output of this is guaranteed to provide a total ordering, and it is intended to be
    human-friendly, it isn't guaranteed to be consistent between versions. The implementation
    and its results can change without a minor version increase.
*/
int opCmp(const URL other) const
{
    auto mine = asTuple();
    auto theirs = other.asTuple();
    return mine.opCmp(theirs);
}

/// The fields used for comparison, equality and hashing, in priority order.
private auto asTuple() const nothrow
{
    import std.typecons : tuple;
    return tuple(host, scheme, port, user, pass, path, queryParams);
}

/// Equality checks. A string that cannot be parsed as a URL is never equal to this URL.
bool opEquals(string other) const
{
    URL parsed;
    return tryParseURL(other, parsed) && asTuple() == parsed.asTuple();
}

/// Ditto
bool opEquals(ref const URL other) const
{
    auto mine = asTuple();
    return mine == other.asTuple();
}

/// Ditto
bool opEquals(const URL other) const
{
    auto mine = asTuple();
    return mine == other.asTuple();
}

unittest
{
    import std.algorithm, std.array, std.format;
    assert("http://example.org/some_path".parseURL > "http://example.org/other_path".parseURL);
    alias sorted = std.algorithm.sort;
    auto inputs =
    [
        "http://example.org/some_path",
        "http://example.org:81/other_path",
        "http://example.org/other_path",
        "https://example.org/first_path",
        "http://example.xyz/other_other_path",
        "http://me:secret@blog.ikeran.org/wp_admin",
    ];
    auto parsedURLs = inputs.map!(x => x.parseURL).array;
    auto urls = sorted(parsedURLs).map!(x => x.toHumanReadableString).array;
    auto expected =
    [
        "http://me:secret@blog.ikeran.org/wp_admin",
        "http://example.org/other_path",
        "http://example.org/some_path",
        "http://example.org:81/other_path",
        "https://example.org/first_path",
        "http://example.xyz/other_other_path",
    ];
    assert(cmp(urls, expected) == 0, "expected:\n%s\ngot:\n%s".format(expected, urls));
}

unittest
{
    auto a = "http://x.org/a?b=c".parseURL;
    auto b = "http://x.org/a?d=e".parseURL;
    auto c = "http://x.org/a?b=a".parseURL;
    assert(a < b);
    assert(c < b);
    assert(c < a);
}

/**
 * The append operator (~).
 *
 * Returns a new URL whose path is this URL's path with the given string appended as a path
 * element (or sequence of path elements). This URL is left unchanged.
 *
 * Path separators are handled for you, whether or not you include them.
 *
 * The query parameters are duplicated, so the result does not share them with this URL.
 *
 * Examples:
 * ---
 * auto random = "http://testdata.org/random".parseURL;
 * auto randInt = random ~ "int";
 * writeln(randInt); // prints "http://testdata.org/random/int"
 * ---
 */
URL opBinary(string op : "~")(string subsequentPath)
{
    URL result = this;
    result.queryParams = queryParams.dup;
    result ~= subsequentPath;
    return result;
}
/**
 * The append-in-place operator (~=).
 *
 * Appends a path element (or sequence of path elements) to this URL's path and returns the
 * updated URL.
 *
 * Exactly one '/' ends up between the existing path and the appended text: a separator is
 * inserted when neither side supplies one, and a duplicate is dropped when both do.
 *
 * Examples:
 * ---
 * auto random = "http://testdata.org/random".parseURL;
 * random ~= "int";
 * writeln(random); // prints "http://testdata.org/random/int"
 * ---
 */
URL opOpAssign(string op : "~")(string subsequentPath)
{
    immutable pathEndsWithSlash = path.endsWith("/");
    immutable extraStartsWithSlash = subsequentPath.startsWith("/");

    if (pathEndsWithSlash && extraStartsWithSlash)
    {
        // Both sides bring a separator; keep only the one already on the path.
        path ~= subsequentPath[1..$];
        return this;
    }

    if (!pathEndsWithSlash && !extraStartsWithSlash)
    {
        // Neither side brings a separator; supply it.
        path ~= '/';
    }
    path ~= subsequentPath;
    return this;
}
555 | * 556 | * Examples: 557 | * --- 558 | * auto base = "https://example.org/passworddb?secure=false".parseURL; 559 | * 560 | * // Download https://example.org/passworddb/by-username/dhasenan 561 | * download(base.resolve("by-username/dhasenan")); 562 | * 563 | * // Download https://example.org/static/style.css 564 | * download(base.resolve("/static/style.css")); 565 | * 566 | * // Download https://cdn.example.net/jquery.js 567 | * download(base.resolve("https://cdn.example.net/jquery.js")); 568 | * --- 569 | */ 570 | URL resolve(string other) 571 | { 572 | if (other.length == 0) return this; 573 | if (other[0] == '/') 574 | { 575 | if (other.length > 1 && other[1] == '/') 576 | { 577 | // Uncommon syntax: a link like "//wikimedia.org" means "same scheme, switch URL" 578 | return parseURL(this.scheme ~ ':' ~ other); 579 | } 580 | } 581 | else 582 | { 583 | auto schemeSep = other.indexOf("://"); 584 | if (schemeSep >= 0 && schemeSep < other.indexOf("/")) 585 | // separate URL 586 | { 587 | return other.parseURL; 588 | } 589 | } 590 | 591 | URL ret = this; 592 | ret.path = ""; 593 | ret.queryParams = ret.queryParams.init; 594 | if (other[0] != '/') 595 | { 596 | // relative to something 597 | if (!this.path.length) 598 | { 599 | // nothing to be relative to 600 | other = "/" ~ other; 601 | } 602 | else if (this.path[$-1] == '/') 603 | { 604 | // directory-style path for the current thing 605 | // resolve relative to this directory 606 | other = this.path ~ other; 607 | } 608 | else 609 | { 610 | // this is a file-like thing 611 | // find the 'directory' and relative to that 612 | other = this.path[0..this.path.lastIndexOf('/') + 1] ~ other; 613 | } 614 | } 615 | // collapse /foo/../ to / 616 | if (other.indexOf("/../") >= 0) 617 | { 618 | import std.array : Appender, array; 619 | import std.string : split; 620 | import std.algorithm.iteration : joiner, filter; 621 | string[] parts = other.split('/'); 622 | for (int i = 0; i < parts.length; i++) 623 | { 624 | if 
/**
 * Parse a URL from a string.
 *
 * This attempts to parse a wide range of URLs as people might actually type them. Some mistakes
 * may be made. However, any URL in a correct format will be parsed correctly.
 *
 * Accepted shape: scheme:[//[user[:password]@]host[:port]][/]path[?query][#fragment]
 * The scheme is optional; "http" is assumed when the input has no "//" authority marker.
 *
 * Params:
 *   value = the text to parse.
 *   url   = receives the parsed URL. It is reset to URL.init before parsing starts, so on
 *           failure it may hold a partially filled value.
 *
 * Returns: true on success; false if an IPv6 host is unterminated or followed by unexpected
 *          text, or if the port is not a valid ushort.
 */
bool tryParseURL(string value, out URL url)
{
    // Characters that terminate the authority (user, host, port) section.
    enum authorityEnd = "/?#";

    url = URL.init;

    // Scheme is optional in common use. We infer 'http' if it's not given.
    // "//" only marks the authority when it starts the input or directly follows "scheme:";
    // a "//" further along (inside a path or query) must not be mistaken for it.
    auto sep = value.indexOf("//");
    immutable startsAuthority = sep == 0
        || (sep > 0 && value[sep - 1] == ':'
            && value[0 .. sep - 1].indexOfAny(authorityEnd) == -1);
    if (startsAuthority) {
        if (sep > 1) {
            url.scheme = value[0 .. sep - 1];
        }
        value = value[sep + 2 .. $];
    } else {
        url.scheme = "http";
    }

    // [user[:password]@]host[:port]][/]path[?query][#fragment
    // Look for userinfo only inside the authority, so an '@' in the path or query is ignored.
    // The first ':' in the userinfo splits user from password; the password is optional.
    auto authorityLength = value.indexOfAny(authorityEnd);
    auto authority = authorityLength == -1 ? value : value[0 .. authorityLength];
    auto at = authority.indexOf('@');
    if (at > -1) {
        auto userinfo = authority[0 .. at];
        auto colon = userinfo.indexOf(':');
        if (colon > -1) {
            url.user = userinfo[0 .. colon];
            url.pass = userinfo[colon + 1 .. $];
        } else {
            url.user = userinfo;
        }
        value = value[at + 1 .. $];
    }

    // Find the hostname. It's either an ipv6 address (which has special rules) or not (which
    // doesn't have special rules). -- The main sticking point is that ipv6 addresses have
    // colons, which we handle specially, and are offset with square brackets.
    auto i = value.indexOfAny(":/[?#");
    if (i == -1) {
        // Just a hostname.
        url.host = value.fromPuny;
        return true;
    }

    if (value[i] == '[') {
        auto j = value[i..$].indexOf(']');
        if (j < 0) {
            // unterminated ipv6 addr
            return false;
        }
        // includes square brackets
        url.host = value[i .. i+j+1];
        value = value[i+j+1 .. $];
        if (value.length == 0) {
            // read to end of string; we finished parse
            return true;
        }
        if (value[0] != ':' && value[0] != '/' && value[0] != '?' && value[0] != '#') {
            return false;
        }
    } else {
        // Normal host.
        url.host = value[0..i].fromPuny;
        value = value[i .. $];
    }

    if (value[0] == ':') {
        // The port runs up to the path, query or fragment, whichever comes first.
        auto end = value.indexOfAny(authorityEnd);
        if (end == -1) {
            end = value.length;
        }
        try {
            url.port = value[1 .. end].to!ushort;
        } catch (ConvException) {
            return false;
        }
        value = value[end .. $];
        if (value.length == 0) {
            return true;
        }
    }
    return parsePathAndQuery(url, value);
}

/**
 * Split `value` into path, query parameters and fragment and store them on `url`.
 *
 * The path and fragment are overwritten (the fragment becomes null when `value` has none).
 * Query parameters are appended to `url.queryParams`, one per '&'-separated segment; a segment
 * without '=' is stored with a null value. Nothing is percent-decoded.
 *
 * Returns: always true; the bool is kept so callers can return the result directly.
 */
private bool parsePathAndQuery(ref URL url, string value)
{
    auto i = value.indexOfAny("?#");
    if (i == -1)
    {
        url.path = value;
        // Match the branches below, which always overwrite the fragment; otherwise a fragment
        // already present on `url` would survive.
        url.fragment = null;
        return true;
    }

    url.path = value[0..i];

    auto c = value[i];
    value = value[i + 1 .. $];
    if (c == '?')
    {
        i = value.indexOf('#');
        string query;
        if (i < 0)
        {
            query = value;
            value = null;
        }
        else
        {
            query = value[0..i];
            value = value[i + 1 .. $];
        }
        foreach (q; query.split('&'))
        {
            auto j = q.indexOf('=');
            string key, val;
            if (j < 0)
            {
                key = q;
            }
            else
            {
                key = q[0..j];
                val = q[j + 1 .. $];
            }
            url.queryParams.add(key, val);
        }
    }

    // Whatever remains after the '#' (or null if there was none).
    url.fragment = value;
    return true;
}
858 | URL url; 859 | with (url) { 860 | scheme = "https"; 861 | host = "example.org"; 862 | path = "/f☃o"; 863 | queryParams.add("❄", "❀"); 864 | fragment = "ş"; 865 | } 866 | assert( 867 | url.toString == "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80#%C5%9F", 868 | url.toString); 869 | } 870 | { 871 | // Port, user, pass. 872 | URL url; 873 | with (url) { 874 | scheme = "https"; 875 | host = "example.org"; 876 | user = "dhasenan"; 877 | pass = "itsasecret"; 878 | port = 17; 879 | } 880 | assert( 881 | url.toString == "https://dhasenan:itsasecret@example.org:17/", 882 | url.toString); 883 | } 884 | { 885 | // Query with no path. 886 | URL url; 887 | with (url) { 888 | scheme = "https"; 889 | host = "example.org"; 890 | queryParams.add("hi", "bye"); 891 | } 892 | assert( 893 | url.toString == "https://example.org/?hi=bye", 894 | url.toString); 895 | } 896 | } 897 | 898 | unittest 899 | { 900 | auto url = "//foo/bar".parseURL; 901 | assert(url.host == "foo", "expected host foo, got " ~ url.host); 902 | assert(url.path == "/bar"); 903 | } 904 | 905 | unittest 906 | { 907 | import std.stdio : writeln; 908 | auto url = "file:///foo/bar".parseURL; 909 | assert(url.host == null); 910 | assert(url.port == 0); 911 | assert(url.scheme == "file"); 912 | assert(url.path == "/foo/bar"); 913 | assert(url.toString == "file:///foo/bar"); 914 | assert(url.queryParams.empty); 915 | assert(url.fragment == null); 916 | } 917 | 918 | unittest 919 | { 920 | // ipv6 hostnames! 
{
    // Full range of data: scheme, credentials, bracketed IPv6 host, port and path.
    auto url = parseURL("https://bob:secret@[::1]:2771/foo/bar");
    assert(url.scheme == "https", url.scheme);
    assert(url.user == "bob", url.user);
    assert(url.pass == "secret", url.pass);
    assert(url.host == "[::1]", url.host);
    assert(url.port == 2771, url.port.to!string);
    assert(url.path == "/foo/bar", url.path);
}

// Minimal: nothing but a bracketed IPv6 host.
{
    auto url = parseURL("[::1]");
    assert(url.host == "[::1]", url.host);
}

// Some random bits: scheme, IPv6 host and path, no port.
{
    auto url = parseURL("http://[::1]/foo");
    assert(url.scheme == "http", url.scheme);
    assert(url.host == "[::1]", url.host);
    assert(url.path == "/foo", url.path);
}

// Full-length IPv6 literal followed by a query string and a fragment.
{
    auto url = parseURL("https://[2001:0db8:0:0:0:0:1428:57ab]/?login=true#justkidding");
    assert(url.scheme == "https");
    assert(url.host == "[2001:0db8:0:0:0:0:1428:57ab]");
    assert(url.path == "/");
    assert(url.fragment == "justkidding");
}
}

// Appending path segments with ~ to a URL parsed from a bare "host:port" string.
unittest
{
    auto url = "localhost:5984".parseURL;
    auto url2 = url ~ "db1";
    assert(url2.toString == "http://localhost:5984/db1", url2.toString);
    auto url3 = url2 ~ "_all_docs";
    assert(url3.toString == "http://localhost:5984/db1/_all_docs", url3.toString);
}

///
unittest {
    {
        // Basic.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("gibe", "clay");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&gibe=clay#frag" ||
                url.toString == "https://example.org/foo/bar?gibe=clay&hello=world#frag",
                url.toString);
    }
    {
        // Passing an array of query values.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/foo/bar";
            queryParams.add("hello", "world");
            queryParams.add("hello", "aether");
            fragment = "frag";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString == "https://example.org/foo/bar?hello=world&hello=aether#frag" ||
                url.toString == "https://example.org/foo/bar?hello=aether&hello=world#frag",
                url.toString);
    }
    {
        // Percent encoded.
        // The non-ASCII characters come out percent-encoded; the ASCII '[' and ']' are
        // expected to pass through toString untouched.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            path = "/f☃o";
            queryParams.add("❄", "❀");
            queryParams.add("[", "]");
            fragment = "ş";
        }
        assert(
                // Not sure what order it'll come out in.
                url.toString ==
                "https://example.org/f%E2%98%83o?%E2%9D%84=%E2%9D%80&[=]#%C5%9F" ||
                url.toString == "https://example.org/f%E2%98%83o?[=]&%E2%9D%84=%E2%9D%80#%C5%9F",
                url.toString);
    }
    {
        // Port, user, pass.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            user = "dhasenan";
            pass = "itsasecret";
            port = 17;
        }
        assert(
                url.toString == "https://dhasenan:itsasecret@example.org:17/",
                url.toString);
    }
    {
        // Query with no path.
        // toString is expected to supply the "/" between the host and the query.
        URL url;
        with (url) {
            scheme = "https";
            host = "example.org";
            queryParams.add("hi", "bye");
        }
        assert(
                url.toString == "https://example.org/?hi=bye",
                url.toString);
    }
}

unittest {
    // Percent encoding shouldn't happen until .toString
    auto url = "http://example.org/á".parseURL;
    assert(url.path == "/á", url.path);
}

unittest {
    // Percent-encoded input: the parsed components are expected to keep their
    // percent escapes verbatim (no automatic decoding), and the URL must round-trip.

    // The user info below is the percent-encoded form of:
    // http://#:!:@
    auto urlString = "http://%23:%21%3A@example.org/%7B?%3B&%26=%3D#%23hash";
    auto url = urlString.parseURL;
    assert(url.user == "%23");
    assert(url.pass == "%21%3A");
    assert(url.host == "example.org");
    assert(url.path == "/%7B");
    assert(url.queryParams["%26"].front == "%3D");
    // A query key without "=value" is expected to map to an empty string.
    assert(url.queryParams["%3B"].front == "");
    assert(url.fragment == "%23hash");

    // Round trip.
    assert(urlString == urlString.parseURL.toString, urlString.parseURL.toString);
    assert(urlString == urlString.parseURL.toString.parseURL.toString);
}

unittest {
    // A Punycode host is expected to be exposed in its Unicode form after parsing.
    auto url = "https://xn--m3h.xn--n3h.org/?hi=bye".parseURL;
    assert(url.host == "☂.☃.org", url.host);
}

unittest {
    // A Unicode host is expected to be Punycode-encoded by toString.
    auto url = "https://☂.☃.org/?hi=bye".parseURL;
    assert(url.toString == "https://xn--m3h.xn--n3h.org/?hi=bye");
}

///
unittest {
    // There's an existing path.
    auto url = parseURL("http://example.org/foo");
    URL url2;
    // No slash? Assume it needs a slash.
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // With slash? Don't add another.
    url2 = url ~ "/bar";
    assert(url2.toString == "http://example.org/foo/bar", url2.toString);
    url ~= "bar";
    assert(url.toString == "http://example.org/foo/bar");

    // Path already ends with a slash; don't add another.
    url = parseURL("http://example.org/foo/");
    assert((url ~ "bar").toString == "http://example.org/foo/bar");
    // Still don't add one even if you're appending with a slash.
    assert((url ~ "/bar").toString == "http://example.org/foo/bar");
    url ~= "/bar";
    assert(url.toString == "http://example.org/foo/bar");

    // No path.
    url = parseURL("http://example.org");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar");

    // Path is just a slash.
    url = parseURL("http://example.org/");
    assert((url ~ "bar").toString == "http://example.org/bar");
    assert((url ~ "/bar").toString == "http://example.org/bar");
    url ~= "bar";
    assert(url.toString == "http://example.org/bar", url.toString);

    // No path, just fragment.
    url = "ircs://irc.freenode.com/#d".parseURL;
    assert(url.toString == "ircs://irc.freenode.com/#d", url.toString);
}

unittest
{
    // basic resolve()
    {
        auto base = "https://example.org/this/".parseURL;
        assert(base.resolve("that") == "https://example.org/this/that");
        assert(base.resolve("/that") == "https://example.org/that");
        // A scheme-relative reference keeps the base scheme but replaces the host.
        assert(base.resolve("//example.net/that") == "https://example.net/that");
    }

    // ensure we don't preserve query params
    {
        auto base = "https://example.org/this?query=value&other=value2".parseURL;
        assert(base.resolve("that") == "https://example.org/that");
        assert(base.resolve("/that") == "https://example.org/that");
        assert(base.resolve("tother/that") == "https://example.org/tother/that");
        assert(base.resolve("//example.net/that") == "https://example.net/that");
    }
}


unittest
{
    // Compile-time check only: a URL must be accepted where std.net.curl.get
    // takes its argument. Nothing is actually fetched.
    import std.net.curl;
    auto url = "http://example.org".parseURL;
    assert(is(typeof(std.net.curl.get(url))));
}

/**
 * Parse the input string as a URL.
 *
 * Throws:
 *      URLException if the string was in an incorrect format.
*/
URL parseURL(string value) {
    URL parsed;
    // Guard clause: bail out with the library's exception type on any parse failure.
    if (!tryParseURL(value, parsed)) {
        throw new URLException("failed to parse URL " ~ value);
    }
    return parsed;
}

///
unittest {
    {
        // Infer scheme
        auto parsed = parseURL("example.org");
        assert(parsed.scheme == "http");
        assert(parsed.host == "example.org");
        assert(parsed.path == "");
        assert(parsed.port == 80);
        assert(parsed.providedPort == 0);
        assert(parsed.fragment == "");
    }
    {
        // Simple host and scheme
        auto parsed = parseURL("https://example.org");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "");
        assert(parsed.port == 443);
        assert(parsed.providedPort == 0);
    }
    {
        // With path
        auto parsed = parseURL("https://example.org/foo/bar");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/foo/bar", "expected /foo/bar but got " ~ parsed.path);
        assert(parsed.port == 443);
        assert(parsed.providedPort == 0);
    }
    {
        // With explicit port
        auto parsed = parseURL("https://example.org:1021/foo/bar");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/foo/bar", "expected /foo/bar but got " ~ parsed.path);
        assert(parsed.port == 1021);
        assert(parsed.providedPort == 1021);
    }
    {
        // With user
        auto parsed = parseURL("https://bob:secret@example.org/foo/bar");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/foo/bar");
        assert(parsed.port == 443);
        assert(parsed.user == "bob");
        assert(parsed.pass == "secret");
    }
    {
        // With user, URL-encoded (the escapes are kept as written)
        auto parsed = parseURL("https://bob%21:secret%21%3F@example.org/foo/bar");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/foo/bar");
        assert(parsed.port == 443);
        assert(parsed.user == "bob%21");
        assert(parsed.pass == "secret%21%3F");
    }
    {
        // With user and port and path
        auto parsed = parseURL("https://bob:secret@example.org:2210/foo/bar");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/foo/bar");
        assert(parsed.port == 2210);
        assert(parsed.user == "bob");
        assert(parsed.pass == "secret");
        assert(parsed.fragment == "");
    }
    {
        // With query string
        auto parsed = parseURL("https://example.org/?login=true");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/", "expected path: / actual path: " ~ parsed.path);
        assert(parsed.queryParams["login"].front == "true");
        assert(parsed.fragment == "");
    }
    {
        // With query string and fragment
        auto parsed = parseURL("https://example.org/?login=true#justkidding");
        assert(parsed.scheme == "https");
        assert(parsed.host == "example.org");
        assert(parsed.path == "/", "expected path: / actual path: " ~ parsed.path);
        assert(parsed.queryParams["login"].front == "true");
        assert(parsed.fragment == "justkidding");
    }
}

unittest {
    // Default port is inferred from the scheme; an explicit port wins.
    assert(parseURL("http://example.org").port == 80);
    assert(parseURL("http://example.org:5326").port == 5326);

    // Every component present at once.
    auto redis = parseURL("redis://admin:password@redisbox.local:2201/path?query=value#fragment");
    assert(redis.scheme == "redis");
    assert(redis.user == "admin");
    assert(redis.pass == "password");

    // Canonical string form: scheme supplied, default port omitted, root path added.
    assert(parseURL("example.org").toString == "http://example.org/");
    assert(parseURL("http://example.org:80").toString == "http://example.org/");

    assert(parseURL("localhost:8070").toString == "http://localhost:8070/");
}

/**
 * Percent-encode non-ASCII characters in a string.
*/
string percentEncodeUnicodeOnly(string raw) {
    import std.array : Appender;
    Appender!string app;
    // Iterate by UTF-8 code unit: every byte with the high bit set is part of a
    // non-ASCII sequence and is emitted as %XX; ASCII bytes are copied unchanged.
    foreach (char c; raw) {
        if (cast(ubyte)c >= 0b1000_0000) {
            app ~= format("%%%02X", cast(ubyte)c);
        } else {
            app ~= c;
        }
    }
    return app.data;
}

/**
 * Percent-encode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * Params:
 *      raw = the text to encode.
 *
 * Returns:
 *      raw with every character other than ASCII letters, digits and "-._~" replaced by the
 *      %XX escapes of its UTF-8 bytes (uppercase hex).
 */
string percentEncode(string raw) {
    // We *must* encode these characters: :/?#[]@!$&'()*+,;="
    // We *can* encode any other characters.
    // We *should not* encode alpha, numeric, or -._~.
    import std.utf : encode;
    import std.array : Appender;
    Appender!string app;
    foreach (dchar d; raw) {
        if (('a' <= d && 'z' >= d) ||
                ('A' <= d && 'Z' >= d) ||
                ('0' <= d && '9' >= d) ||
                d == '-' || d == '.' || d == '_' || d == '~') {
            app ~= d;
            continue;
        }
        // Something simple like a space character? Still in 7-bit ASCII?
        // Then we get a single code unit out of it and just encode that one byte.
        // Something not in 7-bit ASCII? Then we percent-encode each octet
        // in the UTF-8 encoding (and hope the server understands UTF-8).
        // A fixed stack buffer avoids allocating a fresh array per character.
        char[4] buf;
        immutable len = encode(buf, d);
        foreach (b; cast(const(ubyte)[]) buf[0 .. len]) {
            app ~= format("%%%02X", b);
        }
    }
    return app.data;
}

///
unittest {
    assert(percentEncode("IDontNeedNoPercentEncoding") == "IDontNeedNoPercentEncoding");
    assert(percentEncode("~~--..__") == "~~--..__");
    assert(percentEncode("0123456789") == "0123456789");

    string e;

    e = percentEncode("☃");
    assert(e == "%E2%98%83", "expected %E2%98%83 but got" ~ e);
}

/**
 * Percent-decode a string.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This explicitly ensures that the result is a valid UTF-8 string: if the decoded bytes do not
 * form valid UTF-8, the original (still encoded) input is returned unchanged.
 *
 * Throws:
 *      URLException if a '%' is not followed by two hex digits.
 */
string percentDecode(string encoded)
{
    import std.utf : validate, UTFException;
    auto raw = percentDecodeRaw(encoded);
    auto s = cast(string) raw;
    try
    {
        validate(s);
    }
    catch (UTFException e)
    {
        // Not valid UTF-8 once decoded: hand back the input rather than a broken string.
        return encoded;
    }
    return s;
}

///
unittest {
    assert(percentDecode("IDontNeedNoPercentDecoding") == "IDontNeedNoPercentDecoding");
    assert(percentDecode("~~--..__") == "~~--..__");
    assert(percentDecode("0123456789") == "0123456789");

    string e;

    e = percentDecode("%E2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    e = percentDecode("%e2%98%83");
    assert(e == "☃", "expected a snowman but got" ~ e);

    try {
        // %ES is an invalid percent sequence: 'S' is not a hex digit.
        percentDecode("%es");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        percentDecode("%e");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }

    try {
        // A lone '%' must be rejected as well, not read past the end of the input.
        percentDecode("%");
        assert(false, "expected exception not thrown");
    } catch (URLException) {
    }
}

/**
 * Percent-decode a string into a ubyte array.
 *
 * URL components cannot contain non-ASCII characters, and there are very few characters that are
 * safe to include as URL components. Domain names using Unicode values use Punycode. For
 * everything else, there is percent encoding.
 *
 * This yields a ubyte array and will not perform validation on the output. However, an improperly
 * formatted input string will result in a URLException.
 *
 * Throws:
 *      URLException if a '%' is not followed by two hex digits.
 */
immutable(ubyte)[] percentDecodeRaw(string encoded)
{
    // We're dealing with possibly incorrectly encoded UTF-8. Mark it down as ubyte[] for now.
    import std.array : Appender;
    Appender!(immutable(ubyte)[]) app;
    for (size_t i = 0; i < encoded.length; i++) {
        if (encoded[i] != '%') {
            app ~= encoded[i];
            continue;
        }
        // Written as an addition on purpose: `encoded.length - 2` wraps around for inputs
        // shorter than two characters and would let a lone "%" index out of bounds.
        if (i + 2 >= encoded.length) {
            throw new URLException("Invalid percent encoded value: expected two characters after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        if (isHex(encoded[i + 1]) && isHex(encoded[i + 2])) {
            auto b = fromHex(encoded[i + 1]);
            auto c = fromHex(encoded[i + 2]);
            app ~= cast(ubyte)((b << 4) | c);
        } else {
            throw new URLException("Invalid percent encoded value: expected two hex digits after " ~
                    "percent symbol. Error at index " ~ i.to!string);
        }
        // Skip the two hex digits just consumed.
        i += 2;
    }
    return app.data;
}

// True for the ASCII hex digits 0-9, a-f and A-F.
private bool isHex(char c) {
    return ('0' <= c && '9' >= c) ||
        ('a' <= c && 'f' >= c) ||
        ('A' <= c && 'F' >= c);
}

// Value of a single hex digit. Callers are expected to check isHex first; for any other
// character indexOf yields -1, which the cast turns into 255.
private ubyte fromHex(char s) {
    enum caseDiff = 'a' - 'A';
    if (s >= 'a' && s <= 'z') {
        s -= caseDiff;
    }
    return cast(ubyte)("0123456789ABCDEF".indexOf(s));
}

// Convert a host name to its ASCII (Punycode) form.
// Empty names and bracketed IPv6 literals are returned unchanged. A purely ASCII name is
// validated and returned as-is; otherwise each dot-separated label is puny-encoded.
// Throws URLException when an illegal ASCII character is found before the first
// character above 0x80 (scanning stops at that character).
private string toPuny(string unicodeHostname)
{
    if (unicodeHostname.length == 0) return "";
    if (unicodeHostname[0] == '[')
    {
        // It's an ipv6 name.
        return unicodeHostname;
    }
    bool mustEncode = false;
    foreach (i, dchar d; unicodeHostname) {
        auto c = cast(uint) d;
        if (c > 0x80) {
            mustEncode = true;
            break;
        }
        // Rejected ranges: below ',', ':' through '@', '[' through '`', and '{' upward.
        // (The second range previously compared against decimal 40 and never matched.)
        if (c < 0x2C || (c >= 0x3A && c <= 0x40) || (c >= 0x5B && c <= 0x60) || (c >= 0x7B)) {
            throw new URLException(
                    format(
                        "domain name '%s' contains illegal character '%s' at position %s",
                        unicodeHostname, d, i));
        }
    }
    if (!mustEncode) {
        return unicodeHostname;
    }
    import std.algorithm.iteration : map;
    return unicodeHostname.split('.').map!punyEncode.join(".");
}

// Convert a host name from Punycode back to Unicode, label by label.
private string fromPuny(string hostname)
{
    import std.algorithm.iteration : map;
    return hostname.split('.').map!punyDecode.join(".");
}

private {
    // Punycode parameters (the values RFC 3492 specifies for IDNA).
    enum delimiter = '-';
    enum marker = "xn--";
    enum ulong damp = 700;
    enum ulong tmin = 1;
    enum ulong tmax = 26;
    enum ulong skew = 38;
    enum ulong base = 36;
    enum ulong initialBias = 72;
    enum dchar initialN = cast(dchar)128;

    // Bias adaptation step of the Punycode algorithm.
    // numPoints must be non-zero; both callers in this file pass a count of at least 1.
    ulong adapt(ulong delta, ulong numPoints, bool firstTime) {
        if (firstTime) {
            delta /= damp;
        } else {
            delta /= 2;
        }
        delta += delta / numPoints;
        ulong k = 0;
        while (delta > ((base - tmin) * tmax) / 2) {
            delta /= (base - tmin);
            k += base;
        }
        return k + (((base - tmin + 1) * delta) / (delta + skew));
    }
}

/**
 * Encode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-encode a domain name, you must split it into its components. The following will
 * typically suffice:
 * ---
 * auto domain = "☂.☃.com";
 * auto encodedDomain = domain.splitter(".").map!(punyEncode).join(".");
 * ---
 *
 * Returns:
 *      The input unchanged when it contains only ASCII; otherwise the "xn--"-prefixed encoding.
 *
 * Throws:
 *      URLException if the running delta overflows 32 bits.
 */
string punyEncode(string input)
{
    import std.array : Appender;
    ulong delta = 0;
    dchar n = initialN;
    auto bias = initialBias;
    Appender!string output;
    output ~= marker;
    auto pushed = 0;
    auto codePoints = 0;
    // First pass: copy the basic (ASCII) code points straight to the output.
    foreach (dchar c; input) {
        codePoints++;
        // Basic code points are strictly below initialN (128); U+0080 itself is not ASCII.
        if (c < initialN) {
            output ~= c;
            pushed++;
        }
    }
    if (pushed < codePoints) {
        if (pushed > 0) {
            output ~= delimiter;
        }
    } else {
        // No encoding to do.
        return input;
    }
    bool first = true;
    while (pushed < codePoints) {
        // Find the smallest code point not handled yet.
        auto best = dchar.max;
        foreach (dchar c; input) {
            if (n <= c && c < best) {
                best = c;
            }
        }
        if (best == dchar.max) {
            throw new URLException("failed to find a new codepoint to process during punyencode");
        }
        // Widen before multiplying so the overflow check below sees the real product.
        delta += cast(ulong)(best - n) * (pushed + 1);
        if (delta > uint.max) {
            // TODO better error message
            throw new URLException("overflow during punyencode");
        }
        n = best;
        foreach (dchar c; input) {
            if (c < n) {
                delta++;
            }
            if (c == n) {
                // Emit delta as a generalized variable-length integer.
                ulong q = delta;
                auto k = base;
                while (true) {
                    ulong t;
                    if (k <= bias) {
                        t = tmin;
                    } else if (k >= bias + tmax) {
                        t = tmax;
                    } else {
                        t = k - bias;
                    }
                    if (q < t) {
                        break;
                    }
                    output ~= digitToBasic(t + ((q - t) % (base - t)));
                    q = (q - t) / (base - t);
                    k += base;
                }
                output ~= digitToBasic(q);
                pushed++;
                bias = adapt(delta, pushed, first);
                first = false;
                delta = 0;
            }
        }
        delta++;
        n++;
    }
    return output.data;
}

/**
 * Decode the input string using the Punycode algorithm.
 *
 * Punycode is used to encode UTF domain name segment. A Punycode-encoded segment will be marked
 * with "xn--". Each segment is encoded separately. For instance, if you wish to encode "☂.☃.com"
 * in Punycode, you will get "xn--m3h.xn--n3h.com".
 *
 * In order to puny-decode a domain name, you must split it into its components.
The following will
 * typically suffice:
 * ---
 * auto domain = "xn--m3h.xn--n3h.com";
 * auto decodedDomain = domain.splitter(".").map!(punyDecode).join(".");
 * ---
 *
 * Returns:
 *      The input unchanged when it does not start with "xn--"; otherwise the decoded text.
 *
 * Throws:
 *      URLException if the encoded part is truncated, contains a character that is not a
 *      Punycode digit, or decodes to a value outside the Unicode range.
 */
string punyDecode(string input) {
    if (!input.startsWith(marker)) {
        return input;
    }
    input = input[marker.length..$];

    // let n = initial_n
    dchar n = initialN;

    // let i = 0
    // let bias = initial_bias
    // let output = an empty string indexed from 0
    size_t i = 0;
    auto bias = initialBias;
    dchar[] output;
    // This reserves a bit more than necessary, but it should be more efficient overall than just
    // appending and inserting volo-nolo.
    output.reserve(input.length);

    // consume all code points before the last delimiter (if there is one)
    //  and copy them to output, fail on any non-basic code point
    // if more than zero code points were consumed then consume one more
    //  (which will be the last delimiter)
    auto end = input.lastIndexOf(delimiter);
    if (end > -1) {
        foreach (dchar c; input[0..end]) {
            output ~= c;
        }
        input = input[end+1 .. $];
    }

    // while the input is not exhausted do begin
    size_t pos = 0;
    while (pos < input.length) {
        // let oldi = i
        // let w = 1
        auto oldi = i;
        ulong w = 1;
        // for k = base to infinity in steps of base do begin
        for (ulong k = base; k < uint.max; k += base) {
            // consume a code point, or fail if there was none to consume
            if (pos >= input.length) {
                throw new URLException("invalid punycode input: unexpected end of input");
            }
            // Note that the input is all ASCII, so we can simply index the input string bytewise.
            auto c = input[pos];
            pos++;
            // let digit = the code point's digit-value, fail if it has none
            // (basicToDigit reports "no digit value" by returning base)
            auto digit = basicToDigit(c);
            if (digit >= base) {
                throw new URLException("invalid punycode input: bad digit at index " ~
                        (pos - 1).to!string);
            }
            // let i = i + digit * w, fail on overflow
            if (digit > (uint.max - i) / w) {
                throw new URLException("overflow during punydecode");
            }
            i += cast(size_t)(digit * w);
            // let t = tmin if k <= bias {+ tmin}, or
            //     tmax if k >= bias + tmax, or k - bias otherwise
            ulong t;
            if (k <= bias) {
                t = tmin;
            } else if (k >= bias + tmax) {
                t = tmax;
            } else {
                t = k - bias;
            }
            // if digit < t then break
            if (digit < t) {
                break;
            }
            // let w = w * (base - t), fail on overflow
            if (w > uint.max / (base - t)) {
                throw new URLException("overflow during punydecode");
            }
            w *= (base - t);
            // end
        }
        // let bias = adapt(i - oldi, length(output) + 1, test oldi is 0?)
        bias = adapt(i - oldi, output.length + 1, oldi == 0);
        // let n = n + i div (length(output) + 1), fail on overflow
        immutable ulong next = cast(ulong) n + i / (output.length + 1);
        if (next > dchar.max) {
            throw new URLException("invalid punycode input: code point out of range");
        }
        n = cast(dchar) next;
        // let i = i mod (length(output) + 1)
        i %= (output.length + 1);
        // {if n is a basic code point then fail}
        // (We aren't actually going to fail here; it's clear what this means.)
        // insert n into output at position i
        import std.array : insertInPlace;
        (() @trusted { output.insertInPlace(i, cast(dchar)n); })();  // should be @safe but isn't marked
        // increment i
        i++;
        // end
    }
    return output.to!string;
}

// Lifted from punycode.js.
// Maps a digit value to its basic code point: 0..25 -> 'a'..'z', 26..35 -> '0'..'9'.
private dchar digitToBasic(ulong digit) {
    return cast(dchar)(digit + 22 + 75 * (digit < 26));
}

// Lifted from punycode.js.
private uint basicToDigit(char c) {
    // Maps a basic code point to its digit value:
    // '0'..'9' -> 26..35, letters of either case -> 0..25, anything else -> base.
    switch (c) {
        case '0': .. case '9':
            return c - '0' + 26;
        case 'A': .. case 'Z':
            return c - 'A';
        case 'a': .. case 'z':
            return c - 'a';
        default:
            return base;
    }
}

unittest {
    // Encoding of single labels.
    assert(punyEncode("b\u00FCcher") == "xn--bcher-kva");
    assert(punyEncode("b\u00FCc\u00FCher") == "xn--bcher-kvab");
    {
        auto encoded = punyEncode("ýbücher");
        assert(encoded == "xn--bcher-kvaf", encoded);
    }

    assert(punyEncode("mañana") == "xn--maana-pta");

    {
        // A label with no ASCII code points at all.
        auto label = "\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
            ~ "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F";
        auto encoded = punyEncode(label);
        assert(encoded == "xn--egbpdaj6bu4bxfgehfvwxn", encoded);
    }
    import std.stdio;
}

unittest {
    // Decoding of single labels.
    {
        auto decoded = punyDecode("xn--egbpdaj6bu4bxfgehfvwxn");
        assert(decoded == "ليهمابتكلموشعربي؟", decoded);
    }
    {
        assert(punyDecode("xn--maana-pta") == "mañana");
    }
}

unittest {
    // Whole domain names are processed one dot-separated label at a time.
    import std.string, std.algorithm, std.array, std.range;
    {
        auto punyDomain = "xn--m3h.xn--n3h.com";
        auto unicodeDomain = punyDomain.splitter(".").map!(punyDecode).join(".");
        assert(unicodeDomain == "☂.☃.com", unicodeDomain);
    }
    {
        auto unicodeDomain = "☂.☃.com";
        auto punyDomain = unicodeDomain.splitter(".").map!(punyEncode).join(".");
        assert(punyDomain == "xn--m3h.xn--n3h.com", punyDomain);
    }
}

unittest {
    // this has percent-encoded non-unicode data
    auto url = "http://domain.example/%E9%E9%E9".parseURL;
    assert(url.toString == "http://domain.example/%E9%E9%E9", "toString: " ~ url.toString);
    assert(url.toHumanReadableString == "http://domain.example/%E9%E9%E9",
            "toHumanReadableString: " ~ url.toHumanReadableString);
}

unittest {
    // The string constructor and parseURL must produce equal values.
    auto constructed = URL("http://example.org");
    auto parsed = parseURL("http://example.org");
    assert(constructed == parsed);
}