├── .editorconfig ├── .gitattributes ├── .github ├── funding.yml ├── security.md └── workflows │ └── main.yml ├── .gitignore ├── .npmrc ├── fixture.txt ├── index.d.ts ├── index.js ├── index.test-d.ts ├── license ├── package.json ├── readme.md └── test.js /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = tab 5 | end_of_line = lf 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | 10 | [*.yml] 11 | indent_style = space 12 | indent_size = 2 13 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | -------------------------------------------------------------------------------- /.github/funding.yml: -------------------------------------------------------------------------------- 1 | github: sindresorhus 2 | open_collective: sindresorhus 3 | tidelift: npm/get-urls 4 | custom: https://sindresorhus.com/donate 5 | -------------------------------------------------------------------------------- /.github/security.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | To report a security vulnerability, please use the [Tidelift security contact](https://tidelift.com/security). Tidelift will coordinate the fix and disclosure. 
4 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | - push 4 | - pull_request 5 | jobs: 6 | test: 7 | name: Node.js ${{ matrix.node-version }} 8 | runs-on: ubuntu-latest 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | node-version: 13 | - 20 14 | - 18 15 | - 16 16 | steps: 17 | - uses: actions/checkout@v3 18 | - uses: actions/setup-node@v3 19 | with: 20 | node-version: ${{ matrix.node-version }} 21 | - run: npm install 22 | - run: npm test 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | yarn.lock 3 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | package-lock=false 2 | -------------------------------------------------------------------------------- /fixture.txt: -------------------------------------------------------------------------------- 1 | #foo 2 | 3 | http://google.com 4 | 5 | sdsjan fas lasdf dsa\ ds\a fds 6 | sda 7 | sda 8 | fsda 9 | aaa //www.todomvc.com f 10 | sad 11 | asdf 12 | asdf http://yeoman.io ~~~~ 13 | 14 | http://twitter.com/sindresorhus. asdfdsa fsdasad sfdasjkfadhsfkjhsdalkf 15 | 16 | 17 | sadsafda sas ifdos;fjewqrp23i4k231 412 http://yeoman.io 18 | 19 | asfasfsad ssadf https://tastejs.com 20 | 21 | 22 | sdssdfss www.example.com 23 | 24 | unt in culpa http://github.com (ullamco laboris) 25 | 26 | asdffsdfa sdafas adfsad fa sd 27 | 28 | Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod 29 | tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, 30 | quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo 31 | consequat. 
Duis aute irure dolor in reprehenderit in voluptate velit esse 32 | cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non 33 | proident, sunt in culpa qui officia deserunt mollit anim id est laborum. 34 | -------------------------------------------------------------------------------- /index.d.ts: -------------------------------------------------------------------------------- 1 | import type {Options as NormalizeUrlOptions} from 'normalize-url'; 2 | 3 | export type Options = { 4 | /** 5 | Extract URLs that appear as query parameters in the found URLs. 6 | 7 | @default false 8 | */ 9 | readonly extractFromQueryString?: boolean; 10 | 11 | /** 12 | Exclude URLs that match URLs in the given array. 13 | 14 | @default [] 15 | */ 16 | readonly exclude?: string[]; 17 | 18 | /** 19 | Require URLs to have a scheme or leading `www.` to be considered an URL. When `false`, matches against a list of valid TLDs, so it will match URLs like `unicorn.education`. 20 | 21 | Does not affect URLs in query parameters if using the `extractFromQueryString` option. 22 | 23 | @default false 24 | */ 25 | readonly requireSchemeOrWww?: boolean; 26 | } & NormalizeUrlOptions; 27 | 28 | /** 29 | Get all URLs in a string. 30 | 31 | The URLs will be [normalized](https://github.com/sindresorhus/normalize-url). 32 | 33 | @returns A `Set` of URLs. 
import urlRegex from 'url-regex-safe';
import normalizeUrl from 'normalize-url';
import {isMatch, matches} from 'super-regex';

/**
Collect the query parameter values of `url` that are themselves URLs.

@param {string} url - A URL as matched in the text. It may lack a scheme (`//example.com`, `www.example.com`, or `example.com`).
@returns {Set<string>} The raw (not yet normalized) parameter values that match `url-regex-safe` in `exact` mode. Empty when `url` cannot be parsed.
*/
const getUrlsFromQueryParameters = url => {
	const returnValue = new Set();

	// `new URL()` rejects input without a scheme, so scheme-less matches get a placeholder `http://`.
	// Only the query string is read from the parsed URL, so the placeholder never reaches the result.
	const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(url);
	const absoluteUrl = hasScheme ? url : `http://${url.replace(/^\/\//, '')}`;

	let searchParams;
	try {
		({searchParams} = new URL(absoluteUrl));
	} catch {
		// Best effort, like `add()` in `getUrls`: a match that cannot be parsed has no nested URLs to offer.
		return returnValue;
	}

	for (const [, value] of searchParams) {
		// `timeout` bounds the time spent on a single regex test (see `super-regex`).
		if (isMatch(urlRegex({exact: true}), value, {timeout: 500})) {
			returnValue.add(value);
		}
	}

	return returnValue;
};

/**
Get all URLs in a string. The URLs are normalized with `normalize-url`.

@param {string} text - The text to search.
@param {object} [options] - Passed through to `normalize-url`, in addition to the options below.
@param {boolean} [options.extractFromQueryString] - Also collect URLs found in the query parameters of matched URLs.
@param {string[]} [options.exclude] - Regular expression sources; any resulting URL matching one of them is removed.
@param {boolean} [options.requireSchemeOrWww] - Forwarded to `url-regex-safe` as `strict`. When left undefined, `url-regex-safe` runs with its own defaults.
@returns {Set<string>} The normalized URLs.
@throws {TypeError} When `text` is not a string or `exclude` is given but is not an array.
*/
export default function getUrls(text, options = {}) {
	if (typeof text !== 'string') {
		throw new TypeError(`The \`text\` argument should be a string, got ${typeof text}`);
	}

	if (options.exclude !== undefined && !Array.isArray(options.exclude)) {
		throw new TypeError('The `exclude` option must be an array');
	}

	const returnValue = new Set();

	const add = url => {
		try {
			// Trailing dots are dropped so a sentence-ending period is not kept as part of the URL.
			returnValue.add(normalizeUrl(url.trim().replace(/\.+$/, ''), options));
		} catch {
			// Anything `normalizeUrl` rejects is deliberately skipped.
		}
	};

	const results = matches(
		urlRegex(options.requireSchemeOrWww === undefined ? undefined : {
			re2: false,
			strict: options.requireSchemeOrWww,
			parens: true,
		}),
		text,
		{
			matchTimeout: 500,
		},
	);

	for (const {match: url} of results) {
		add(url);

		if (options.extractFromQueryString) {
			const queryStringUrls = getUrlsFromQueryParameters(url);
			for (const queryStringUrl of queryStringUrls) {
				add(queryStringUrl);
			}
		}
	}

	// Each `exclude` entry is compiled as a regular expression and tested against the normalized URLs.
	for (const excludedItem of options.exclude ?? []) {
		const regex = new RegExp(excludedItem);

		for (const item of returnValue) {
			if (isMatch(regex, item, {timeout: 500})) {
				returnValue.delete(item);
			}
		}
	}

	return returnValue;
}
conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "get-urls", 3 | "version": "12.1.0", 4 | "description": "Get all URLs in a string", 5 | "license": "MIT", 6 | "repository": "sindresorhus/get-urls", 7 | "funding": "https://github.com/sponsors/sindresorhus", 8 | "author": { 9 | "name": "Sindre Sorhus", 10 | "email": "sindresorhus@gmail.com", 11 | "url": "https://sindresorhus.com" 12 | }, 13 | "type": "module", 14 | "exports": { 15 | "types": "./index.d.ts", 16 | "default": "./index.js" 17 | }, 18 | "engines": { 19 | "node": ">=16" 20 | }, 21 | "scripts": { 22 | "test": "xo && ava && tsd" 23 | }, 24 | "files": [ 25 | "index.js", 26 | "index.d.ts" 27 | ], 28 | "keywords": [ 29 | "get", 30 | "urls", 31 | "url", 32 | "extract", 33 | "find", 34 | "scrape", 35 | "text", 36 | "string" 37 | ], 38 | "dependencies": { 39 | "normalize-url": "^8.0.0", 40 | "super-regex": "^0.2.0", 41 | "url-regex-safe": "^4.0.0" 42 | }, 43 | "devDependencies": { 44 | "ava": "^5.3.1", 45 | "tsd": "^0.28.1", 46 | "xo": "^0.56.0" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # get-urls 2 | 3 | > 
Get all URLs in a string 4 | 5 | The URLs will be [normalized](https://github.com/sindresorhus/normalize-url). 6 | 7 | *Do not use this for any kind of security-related validation.* 8 | 9 | Please note the [known limitation](https://github.com/niftylettuce/url-regex-safe#limitations). You can work around this by setting `requireSchemeOrWww` to `true`. 10 | 11 | ## Install 12 | 13 | ```sh 14 | npm install get-urls 15 | ``` 16 | 17 | ## Usage 18 | 19 | ```js 20 | import getUrls from 'get-urls'; 21 | 22 | const text = 'Lorem ipsum dolor sit amet, //sindresorhus.com consectetuer adipiscing http://yeoman.io elit.'; 23 | 24 | getUrls(text); 25 | //=> Set {'http://sindresorhus.com', 'http://yeoman.io'} 26 | ``` 27 | 28 | ## API 29 | 30 | ### getUrls(text, options?) 31 | 32 | Returns a `Set` of URLs. 33 | 34 | ### text 35 | 36 | Type: `string` 37 | 38 | ### options 39 | 40 | Type: `object` 41 | 42 | All the `normalize-url` [options](https://github.com/sindresorhus/normalize-url#options) in addition to: 43 | 44 | #### extractFromQueryString 45 | 46 | Type: `boolean`\ 47 | Default: `false` 48 | 49 | Extract URLs that appear as query parameters in the found URLs. 50 | 51 | #### exclude 52 | 53 | Type: `string[]`\ 54 | Default: `[]` 55 | 56 | Exclude URLs that match any of the regular expression patterns in the given array. 57 | 58 | #### requireSchemeOrWww 59 | 60 | Type: `boolean`\ 61 | Default: `false` 62 | 63 | Require URLs to have a scheme or leading `www.` to be considered a URL. When `false`, matches against a list of valid TLDs, so it will match URLs like `unicorn.education`. 64 | 65 | Does not affect URLs in query parameters if using the `extractFromQueryString` option. 
import fs from 'node:fs';
import test from 'ava';
import getUrls from './index.js';

// URLs expected from `fixture.txt`, in order of first appearance.
const fixtureUrls = [
	'http://google.com',
	'http://todomvc.com',
	'http://yeoman.io',
	'http://twitter.com/sindresorhus',
	'https://tastejs.com',
	'http://example.com',
	'http://github.com',
];

// Read lazily inside each test, as the fixture is only needed by two of them.
const readFixture = () => fs.readFileSync('fixture.txt', 'utf8');

// A URL carrying a percent-encoded absolute URL in its `p` query parameter.
const nestedUrlText = 'You can read http://www.awin1.com/cread.php?a=b&p=https%3A%2F%2Fuk.hotels.com%2Fhotel%2Fdetails.html%3Ftab%3Ddescription%26hotelId%3D287452%26q-localised-check-in%3D15%2F12%2F2017%26q-localised-check-out%3D19%2F12%2F2017%26q-room-0-adults%3D2%26q-room-0-children%3D0%26locale%3Den_GB%26pos%3DHCOM_UK for more info';
const nestedUrlOuter = 'http://awin1.com/cread.php?a=b&p=https://uk.hotels.com/hotel/details.html?tab=description&hotelId=287452&q-localised-check-in=15/12/2017&q-localised-check-out=19/12/2017&q-room-0-adults=2&q-room-0-children=0&locale=en_GB&pos=HCOM_UK';
const nestedUrlInner = 'https://uk.hotels.com/hotel/details.html?hotelId=287452&locale=en_GB&pos=HCOM_UK&q-localised-check-in=15/12/2017&q-localised-check-out=19/12/2017&q-room-0-adults=2&q-room-0-children=0&tab=description';

// Sentences shared by the `stripHash` and `stripWWW` cases.
const hashText = 'You can read http://www.foobar.com/document.html#about for more info';
const wwwText = 'You can read http://www.foobar.com/document.html for more info';

test('get unique cleaned-up urls from a string', t => {
	const found = getUrls(readFixture());
	t.deepEqual(found, new Set(fixtureUrls));
});

test('do not get nested urls from query strings', t => {
	const found = getUrls(nestedUrlText);
	t.deepEqual(found, new Set([nestedUrlOuter]));
});

test('get nested urls from query strings', t => {
	const found = getUrls(nestedUrlText, {extractFromQueryString: true});
	t.deepEqual(found, new Set([nestedUrlOuter, nestedUrlInner]));
});

test('don\'t strip hash when stripHash is set to false', t => {
	const found = getUrls(hashText, {stripHash: false});
	t.deepEqual(found, new Set(['http://foobar.com/document.html#about']));
});

test('strip hash when stripHash is set to true', t => {
	const found = getUrls(hashText, {stripHash: true});
	t.deepEqual(found, new Set(['http://foobar.com/document.html']));
});

test('don\'t strip hash by default if stripHash is not in options', t => {
	const found = getUrls(hashText);
	t.deepEqual(found, new Set(['http://foobar.com/document.html#about']));
});

test('don\'t strip www when stripWWW is set to false', t => {
	const found = getUrls(wwwText, {stripWWW: false});
	t.deepEqual(found, new Set(['http://www.foobar.com/document.html']));
});

test('strip www when stripWWW is set to true', t => {
	const found = getUrls(wwwText, {stripWWW: true});
	t.deepEqual(found, new Set(['http://foobar.com/document.html']));
});

test('strip www by default if stripWWW is not in options', t => {
	const found = getUrls(wwwText);
	t.deepEqual(found, new Set(['http://foobar.com/document.html']));
});

test('finds urls beginning with `www`', t => {
	const found = getUrls('You can read www.foobar.com/document.html for more info');
	t.deepEqual(found, new Set(['http://foobar.com/document.html']));
});

test('exclude matching urls', t => {
	const input = `${readFixture()} http://w3.org/2000/svg, http://foobar.com/document.html, https://www.w3schools.com/`;
	const exclude = ['http://w3.org/2000/svg', 'foobar.com', 'w3schools'];

	// Only the fixture URLs survive; the three appended URLs are all excluded.
	t.deepEqual(getUrls(input, {exclude}), new Set(fixtureUrls));
});

test('throw TypeError for non-array `exclude` option', t => {
	const callWithBadExclude = () => getUrls('http://w3.org/2000/svg', {exclude: ''});
	t.throws(callWithBadExclude, {message: 'The `exclude` option must be an array'});
});

test('get urls without scheme', t => {
	const input = 'Lorem ipsum dolor sit amet, //sindresorhus.com consectetuer adipiscing http://yeoman.io elit. www.github.com';
	const expected = new Set([
		'http://sindresorhus.com',
		'http://yeoman.io',
		'http://github.com',
	]);

	t.deepEqual(getUrls(input, {extractFromQueryString: true}), expected);
});

test('get schemeless url from query string', t => {
	const input = 'You can read http://www.awin1.com/cread.php?a=b&p=%2F%2Fuk.hotels.com%2Fhotel%2Fdetails.html%3Ftab%3Ddescription%26hotelId%3D287452%26q-localised-check-in%3D15%2F12%2F2017%26q-localised-check-out%3D19%2F12%2F2017%26q-room-0-adults%3D2%26q-room-0-children%3D0%26locale%3Den_GB%26pos%3DHCOM_UK for more info';
	const expected = new Set([
		'http://awin1.com/cread.php?a=b&p=//uk.hotels.com/hotel/details.html?tab=description&hotelId=287452&q-localised-check-in=15/12/2017&q-localised-check-out=19/12/2017&q-room-0-adults=2&q-room-0-children=0&locale=en_GB&pos=HCOM_UK',
		'http://uk.hotels.com/hotel/details.html?hotelId=287452&locale=en_GB&pos=HCOM_UK&q-localised-check-in=15/12/2017&q-localised-check-out=19/12/2017&q-room-0-adults=2&q-room-0-children=0&tab=description',
	]);

	t.deepEqual(getUrls(input, {extractFromQueryString: true}), expected);
});

test('requireSchemeOrWww turned off', t => {
	const input = 'Here is a URL: sindresorhus.com here is another: unicorn.education';
	const expected = new Set([
		'http://sindresorhus.com',
		'http://unicorn.education',
	]);

	t.deepEqual(getUrls(input, {requireSchemeOrWww: false}), expected);
});

test('supports upper case URL', t => {
	const upperCaseUrl = 'WWW.POS.COM';
	t.notThrows(() => {
		getUrls(upperCaseUrl, {extractFromQueryString: true});
	});
});

test('filter all items from options.exclude', t => {
	const input = `
		http://domain.com/pic/uploadimg/2019-3/PS/818201903010604.jpg
		http://domain.com/81820190301/818201903010604/index.m3u8
		http://domain.com/pic/uploadimg/2019-3/PS/818201903010606.jpg
		http://domain.com/81820190301/818201903010606/index.m3u8
		http://domain.com/pic/uploadimg/2019-3/PS/818201903010615.jpg
	`;
	const expected = new Set([
		'http://domain.com/81820190301/818201903010604/index.m3u8',
		'http://domain.com/81820190301/818201903010606/index.m3u8',
	]);

	t.deepEqual(getUrls(input, {exclude: ['.*jpg']}), expected);
});

test('throw an error when the text argument is not a string', t => {
	const callWithoutText = () => getUrls();
	t.throws(callWithoutText, {instanceOf: TypeError});
});

test('handles parens', t => {
	const found = getUrls('foo https://sindresorhus.com/some/example) foo');
	t.deepEqual(found, new Set(['https://sindresorhus.com/some/example']));
});

test('handles Markdown', t => {
	const found = getUrls('foo [![](https://sindresorhus.com/unicorn.png)](https://sindresorhus.com/?foo=bar) foo');
	const expected = new Set([
		'https://sindresorhus.com/unicorn.png',
		'https://sindresorhus.com/?foo=bar',
	]);

	t.deepEqual(found, expected);
});