├── .gitignore ├── .cargo └── config ├── squeeze ├── tests │ ├── fixtures │ │ └── readme.md │ └── uri.rs ├── mirror.rs ├── Cargo.toml ├── lib.rs ├── codetag.rs └── uri.rs ├── Cargo.toml ├── squeeze-cli ├── Cargo.toml └── main.rs ├── .github └── workflows │ └── ci.yml ├── readme.md └── Cargo.lock /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | -------------------------------------------------------------------------------- /.cargo/config: -------------------------------------------------------------------------------- 1 | [build] 2 | rustflags = ["-C", "target-cpu=native"] 3 | -------------------------------------------------------------------------------- /squeeze/tests/fixtures/readme.md: -------------------------------------------------------------------------------- 1 | - https://github.com/mitchellkrogza/The-Big-List-of-Hacked-Malware-Web-Sites 2 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = [ 3 | "squeeze", 4 | "squeeze-cli", 5 | ] 6 | 7 | [profile.dev] 8 | opt-level = 0 9 | 10 | [profile.release] 11 | codegen-units = 1 12 | lto = "fat" 13 | opt-level = 3 14 | -------------------------------------------------------------------------------- /squeeze/mirror.rs: -------------------------------------------------------------------------------- 1 | use super::Finder; 2 | use std::ops::Range; 3 | 4 | #[derive(Default)] 5 | pub struct Mirror {} 6 | 7 | impl Finder for Mirror { 8 | fn id(&self) -> &'static str { 9 | "mirror" 10 | } 11 | 12 | fn find(&self, s: &str) -> Option> { 13 | Some(0..s.len()) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /squeeze-cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "squeeze-cli" 3 | version = "0.1.0" 4 | authors = ["Aymeric Beaumet "] 5 | edition = "2021" 6 | 7 | [[bin]] 8 | name = "squeeze" 9 | path = "main.rs" 10 | 11 | [dependencies] 12 | clap = { version = "3.0.0-rc.0", features = ["derive"] } 13 | log = "0.4.14" 14 | pretty_env_logger = "0.4.0" 15 | squeeze = { path = "../squeeze" } 16 | -------------------------------------------------------------------------------- /squeeze/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "squeeze" 3 | version = "0.1.0" 4 | authors = ["Aymeric Beaumet "] 5 | edition = "2021" 6 | 7 | [lib] 8 | name = "squeeze" 9 | path = "lib.rs" 10 | 11 | [dependencies] 12 | lazy_static = "1.4.0" 13 | phf = { version = "0.10.0", features = ["macros"] } 14 | regex = "1.5.4" 15 | regex-syntax = "0.6.25" 16 | 17 | [dev-dependencies] 18 | glob = "0.3.0" 19 | -------------------------------------------------------------------------------- /squeeze/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod codetag; 2 | pub mod mirror; 3 | pub mod uri; 4 | 5 | use std::ops::Range; 6 | 7 | /// `Finder` must be implemented by all the finders. A finder implementation must be stateless, 8 | /// it's up to the caller to call it until no more results can be extracted. 9 | pub trait Finder { 10 | /// `id` must return a unique id for the finder. 11 | fn id(&self) -> &'static str; 12 | 13 | /// `find` should return the range of the first result it finds. None shall only be returned if 14 | /// the input string is exhausted. 15 | fn find(&self, s: &str) -> Option>; 16 | } 17 | -------------------------------------------------------------------------------- /squeeze/tests/uri.rs: -------------------------------------------------------------------------------- 1 | use squeeze::{uri, Finder}; 2 | use std::fs::File; 3 | use std::io::{prelude::*, BufReader}; 4 | 5 | #[test] 6 | fn it_should_succeed_to_mirror_the_fixtures_uris() { 7 | let mut finder = uri::URI::default(); 8 | finder.strict = true; 9 | let fixtures_glob = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")) 10 | .join("tests") 11 | .join("fixtures") 12 | .join("uri-*"); 13 | 14 | for filepath in glob::glob(fixtures_glob.to_str().unwrap()).unwrap() { 15 | let file = File::open(filepath.unwrap()).unwrap(); 16 | let reader = BufReader::new(file); 17 | for line in reader.lines() { 18 | let input = &line.unwrap(); 19 | assert_eq!(Some(0..input.len()), finder.find(input), "{}", input); 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | push: 4 | branches: 5 | - master 6 | schedule: 7 | - cron: '00 00 * * *' 8 | 9 | name: ci 10 | 11 | jobs: 12 | ci: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - uses: actions-rs/toolchain@v1 19 | with: 20 | profile: minimal 21 | toolchain: stable 22 | components: rustfmt, clippy 23 | 24 | - name: cargo fmt 25 | uses: actions-rs/cargo@v1 26 | with: 27 | command: fmt 28 | args: --all -- --check 29 | 30 | - name: cargo clippy 31 | uses: actions-rs/cargo@v1 32 | with: 33 | command: clippy 34 | args: -- --deny warnings 35 | 36 | - name: cargo build 37 | uses: actions-rs/cargo@v1 38 | with: 39 | command: build 40 | args: --release 41 | 42 | - name: cargo test 43 | uses: actions-rs/cargo@v1 44 | with: 45 | command: test 46 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # squeeze [![GitHub Actions](https://github.com/aymericbeaumet/squeeze/actions/workflows/ci.yml/badge.svg)](https://github.com/aymericbeaumet/squeeze/actions/workflows/ci.yml) 2 | 3 | [squeeze](https://github.com/aymericbeaumet/squeeze) enables to extract rich 4 | information from any text (raw, JSON, HTML, YAML, etc). 5 | 6 | Currently supported: 7 | 8 | - Codetags (as defined per [PEP 350](https://www.python.org/dev/peps/pep-0350/)) 9 | - URIs/URLs/URNs (as defined per [RFC 3986](https://tools.ietf.org/html/rfc3986/)) 10 | 11 | See [integrations](#integrations) for some practical uses. Continue reading for 12 | the install and getting started instructions. 13 | 14 | ## Install 15 | 16 | ### Using git 17 | 18 | _This method requires the [Rust 19 | toolchain](https://www.rust-lang.org/tools/install) to be installed on your 20 | machine._ 21 | 22 | ```shell 23 | git clone --depth=1 https://github.com/aymericbeaumet/squeeze.git /tmp/squeeze 24 | cargo install --path=/tmp/squeeze/squeeze-cli 25 | ``` 26 | 27 | ## Getting Started 28 | 29 | Let's start by extracting a URL, `squeeze` expects the text to be searched on 30 | its standard input, with the results being placed on its standard output: 31 | 32 | ```shell 33 | echo 'lorem https://github.com ipsum' | squeeze -1 --url 34 | ``` 35 | 36 | ``` 37 | https://github.com 38 | ``` 39 | 40 | > The `-1` flag allows to immediately abort after one result has been found. 41 | 42 | If you want to print all the URLs, just omit the `-1` flag: 43 | 44 | ```shell 45 | squeeze --url << EOF 46 | this a domain: github.com, but this is a url: https://aymericbeaumet.com 47 | this is some markdown: [link](https://wikipedia.com) 48 | EOF 49 | ``` 50 | 51 | ``` 52 | https://aymericbeaumet.com 53 | https://wikipedia.com 54 | ``` 55 | 56 | It is also possible to extract other types of information, like codetags 57 | (`TODO:`, `FIXME:`, etc). The usage remains very similar: 58 | 59 | ```shell 60 | squeeze --codetag=todo << EOF 61 | // TODO: implement the main function 62 | fn main {} 63 | EOF 64 | ``` 65 | 66 | ``` 67 | TODO: implement the main function 68 | ``` 69 | 70 | > Note that for convenience some aliases are defined. In this case, you can use 71 | `--todo` instead of `--codetag=todo`. In the same vein, `--url` is an alias to 72 | limit the search to specific URI schemes. 73 | 74 | It is possible to enable several finders at the same time, they will be run 75 | sequentially for each line: 76 | 77 | ```shell 78 | squeeze --uri=http,https --codetag=todo,fixme << EOF 79 | // TODO: update with a better example 80 | // FIXME: all of https://github.com/aymericbeaumet/squeeze/issues 81 | // Some random comment to be ignored 82 | ftp://localhost 83 | http://localhost 84 | EOF 85 | ``` 86 | 87 | ``` 88 | TODO: update with a better example 89 | FIXME: all of https://github.com/aymericbeaumet/squeeze/issues 90 | https://github.com/aymericbeaumet/squeeze/issues 91 | http://localhost 92 | ``` 93 | 94 | This getting started should give you an overview of what's possible with 95 | `squeeze`. Have a look at all the possibilities with `squeeze --help`. 96 | 97 | ## Integrations 98 | 99 | Integrations with some popular tools. 100 | 101 | ### vim/nvim 102 | 103 | Press `Enter` in visual mode to extract the first URL from the current 104 | selection and open it: 105 | 106 | ```vim 107 | " ~/.vimrc 108 | vnoremap :'<,'>w !squeeze -1 --url --open 109 | ``` 110 | 111 | ### tmux 112 | 113 | Press `Enter` in copy mode to extract the first URL from the current selection 114 | and open it: 115 | 116 | ```tmux 117 | # ~/.tmux.conf 118 | bind -T copy-mode-vi enter send -X copy-pipe-and-cancel "squeeze -1 --url --open" 119 | ``` 120 | 121 | ### shell (bash, zsh) 122 | 123 | Define a `urls` function to list all the URLs in your shell history: 124 | 125 | ```shell 126 | # ~/.bashrc ~/.zshrc 127 | urls() { fc -rl 1 | squeeze --url | sort -u; } 128 | ``` 129 | 130 | ## Development 131 | 132 | ### Run binary 133 | 134 | ```shell 135 | echo 'http://localhost' | cargo run -- --url 136 | ``` 137 | 138 | ### Run tests 139 | 140 | ```shell 141 | cargo test 142 | watchexec --clear --restart 'cargo test' 143 | ``` 144 | -------------------------------------------------------------------------------- /squeeze-cli/main.rs: -------------------------------------------------------------------------------- 1 | use clap::Parser; 2 | use squeeze::{codetag::Codetag, mirror::Mirror, uri::URI, Finder}; 3 | use std::convert::{TryFrom, TryInto}; 4 | use std::io::{self, BufRead}; 5 | 6 | #[derive(Parser)] 7 | #[clap( 8 | name = "squeeze", 9 | version = "1.0", 10 | author = "Aymeric Beaumet " 11 | )] 12 | struct Opts { 13 | // flags 14 | #[clap(short = '1', long = "--first", help = "only show the first result")] 15 | first: bool, 16 | #[clap(long = "--open", help = "open the results")] 17 | open: bool, 18 | 19 | // codetag 20 | #[clap(long = "codetag", help = "search for codetags")] 21 | mnemonic: Option>, 22 | #[clap( 23 | long = "hide-mnemonic", 24 | help = "whether to show the mnemonics in the results" 25 | )] 26 | hide_mnemonic: bool, 27 | #[clap(long = "fixme", help = "alias for: --codetag=fixme")] 28 | fixme: bool, 29 | #[clap(long = "todo", help = "alias for: --codetag=todo")] 30 | todo: bool, 31 | 32 | // mirror 33 | #[clap(long = "mirror", help = "[debug] mirror the input")] 34 | mirror: bool, 35 | 36 | // uri 37 | #[clap(long = "uri", help = "search for uris")] 38 | scheme: Option>, 39 | #[clap( 40 | long = "strict", 41 | help = "strictly respect the URI RFC in regards to closing ' and )" 42 | )] 43 | strict: bool, 44 | #[clap( 45 | long = "url", 46 | help = "alias for: --uri=data,ftp,ftps,http,https,mailto,sftp,ws,wss" 47 | )] 48 | url: bool, 49 | #[clap(long = "http", help = "alias for: --uri=http")] 50 | http: bool, 51 | #[clap(long = "https", help = "alias for: --uri=https")] 52 | https: bool, 53 | } 54 | 55 | impl TryFrom<&Opts> for Codetag { 56 | type Error = (); 57 | 58 | fn try_from(opts: &Opts) -> Result { 59 | if !(opts.mnemonic.is_some() || opts.fixme || opts.todo) { 60 | return Err(()); 61 | } 62 | 63 | let mut finder = Codetag::default(); 64 | finder.hide_mnemonic = opts.hide_mnemonic; 65 | if let Some(Some(ref mnemonic)) = opts.mnemonic { 66 | for m in mnemonic.split(',') { 67 | finder.add_mnemonic(m); 68 | } 69 | } 70 | if opts.fixme { 71 | finder.add_mnemonic("fixme"); 72 | } 73 | if opts.todo { 74 | finder.add_mnemonic("todo"); 75 | } 76 | finder.build_mnemonics_regex().unwrap(); 77 | Ok(finder) 78 | } 79 | } 80 | 81 | impl TryFrom<&Opts> for Mirror { 82 | type Error = (); 83 | 84 | fn try_from(opts: &Opts) -> Result { 85 | if !opts.mirror { 86 | return Err(()); 87 | } 88 | 89 | let finder = Mirror::default(); 90 | Ok(finder) 91 | } 92 | } 93 | 94 | impl TryFrom<&Opts> for URI { 95 | type Error = (); 96 | 97 | fn try_from(opts: &Opts) -> Result { 98 | if !(opts.scheme.is_some() || opts.url || opts.http || opts.https) { 99 | return Err(()); 100 | } 101 | 102 | let mut finder = URI::default(); 103 | finder.strict = opts.strict; 104 | if let Some(Some(ref scheme)) = opts.scheme { 105 | for s in scheme.split(',') { 106 | finder.add_scheme(s); 107 | } 108 | } 109 | if opts.url { 110 | finder.add_scheme("data"); 111 | finder.add_scheme("ftp"); 112 | finder.add_scheme("ftps"); 113 | finder.add_scheme("http"); 114 | finder.add_scheme("https"); 115 | finder.add_scheme("mailto"); 116 | finder.add_scheme("sftp"); 117 | finder.add_scheme("ws"); 118 | finder.add_scheme("wss"); 119 | } 120 | if opts.http { 121 | finder.add_scheme("http"); 122 | } 123 | if opts.https { 124 | finder.add_scheme("https"); 125 | } 126 | Ok(finder) 127 | } 128 | } 129 | 130 | fn main() { 131 | pretty_env_logger::init(); 132 | 133 | let opts = Opts::parse(); 134 | let codetag = TryInto::::try_into(&opts); 135 | let mirror = TryInto::::try_into(&opts); 136 | let uri = TryInto::::try_into(&opts); 137 | 138 | let finders: Vec<_> = [ 139 | codetag.as_ref().map(|f| f as &dyn Finder), 140 | mirror.as_ref().map(|f| f as &dyn Finder), 141 | uri.as_ref().map(|f| f as &dyn Finder), 142 | ] 143 | .into_iter() 144 | .filter_map(|finder| finder.ok()) 145 | .collect(); 146 | 147 | if finders.is_empty() { 148 | return; 149 | } 150 | 151 | for line in io::stdin().lock().lines() { 152 | for finder in &finders { 153 | let line = line.as_ref().unwrap(); 154 | log::debug!("[{}] line \"{}\"", finder.id(), line); 155 | let mut idx = 0; 156 | while idx < line.len() { 157 | let segment = &line[idx..]; 158 | log::debug!("[{}] searching in \"{}\"", finder.id(), segment); 159 | if let Some(range) = finder.find(segment) { 160 | log::debug!("[{}] found at [{};{}[", finder.id(), range.start, range.end); 161 | idx += range.end; 162 | let found = &segment[range].trim(); 163 | if !found.is_empty() { 164 | println!("{}", found); 165 | if opts.open { 166 | open(found).expect("failed to open result"); 167 | } 168 | if opts.first { 169 | return; 170 | } 171 | } 172 | } else { 173 | break; 174 | } 175 | } 176 | } 177 | } 178 | } 179 | 180 | #[cfg(target_os = "macos")] 181 | fn open(arg: &str) -> io::Result { 182 | std::process::Command::new("open").arg(arg).spawn() 183 | } 184 | 185 | #[cfg(not(target_os = "macos"))] 186 | fn open(_: &str) -> io::Result { 187 | unimplemented!("The --open flag is not yet available on your platform. In the meantime, `... | squeeze | xargs xdg-open` might be used as a workaround (YMMV)."); 188 | } 189 | -------------------------------------------------------------------------------- /squeeze/codetag.rs: -------------------------------------------------------------------------------- 1 | // https://www.python.org/dev/peps/pep-0350/ 2 | 3 | use super::Finder; 4 | use lazy_static::lazy_static; 5 | use regex::Regex; 6 | use std::collections::HashSet; 7 | use std::ops::Range; 8 | 9 | lazy_static! { 10 | static ref DEFAULT_MNEMONICS: HashSet = { 11 | [ 12 | // todo 13 | "TODO", 14 | "MILESTONE", 15 | "MLSTN", 16 | "DONE", 17 | "YAGNI", 18 | "TBD", 19 | "TOBEDONE", 20 | // fixme 21 | "FIXME", 22 | "XXX", 23 | "DEBUG", 24 | "BROKEN", 25 | "REFACTOR", 26 | "REFACT", 27 | "RFCTR", 28 | "OOPS", 29 | "SMELL", 30 | "NEEDSWORK", 31 | "INSPECT", 32 | // bug 33 | "BUG", 34 | "BUGFIX", 35 | // nobug 36 | "NOBUG", 37 | "NOFIX", 38 | "WONTFIX", 39 | "DONTFIX", 40 | "NEVERFIX", 41 | "UNFIXABLE", 42 | "CANTFIX", 43 | // req 44 | "REQ", 45 | "REQUIREMENT", 46 | "STORY", 47 | // rfe 48 | "RFE", 49 | "FEETCH", 50 | "NYI", 51 | "FR", 52 | "FTRQ", 53 | "FTR", 54 | // idea 55 | "IDEA", 56 | // ??? 57 | "???", 58 | "QUESTION", 59 | "QUEST", 60 | "QSTN", 61 | "WTF", 62 | // !!! 63 | "!!!", 64 | "ALERT", 65 | // hack 66 | "HACK", 67 | "CLEVER", 68 | "MAGIC", 69 | // port 70 | "PORT", 71 | "PORTABILITY", 72 | "WKRD", 73 | // caveat 74 | "CAVEAT", 75 | "CAV", 76 | "CAVT", 77 | "WARNING", 78 | "CAUTION", 79 | // note 80 | "NOTE", 81 | "HELP", 82 | // faq 83 | "FAQ", 84 | // gloss 85 | "GLOSS", 86 | "GLOSSARY", 87 | // see 88 | "SEE", 89 | "REF", 90 | "REFERENCE", 91 | // todoc 92 | "TODOC", 93 | "DOCDO", 94 | "DODOC", 95 | "NEEDSDOC", 96 | "EXPLAIN", 97 | "DOCUMENT", 98 | // cred 99 | "CRED", 100 | "CREDIT", 101 | "THANKS", 102 | // stat 103 | "STAT", 104 | "STATUS", 105 | // rvd 106 | "RVD", 107 | "REVIEWED", 108 | "REVIEW", 109 | ].iter().map(|s| s.to_string()).collect() 110 | }; 111 | } 112 | 113 | #[derive(Default)] 114 | pub struct Codetag { 115 | pub hide_mnemonic: bool, 116 | mnemonics: HashSet, 117 | mnemonics_regex: Option, 118 | } 119 | 120 | impl Finder for Codetag { 121 | fn id(&self) -> &'static str { 122 | "codetag" 123 | } 124 | 125 | fn find(&self, s: &str) -> Option> { 126 | let m = self 127 | .mnemonics_regex 128 | .as_ref() 129 | .expect( 130 | "implementation error: please call .build_mnemonics_regex() on the codetag instance", 131 | ) 132 | .find(s)?; 133 | let from = if self.hide_mnemonic { 134 | m.end() 135 | } else { 136 | m.start() 137 | }; 138 | let to = s.len(); 139 | if from >= to { 140 | None 141 | } else { 142 | Some(from..to) 143 | } 144 | } 145 | } 146 | 147 | impl Codetag { 148 | pub fn add_mnemonic(&mut self, mnemonic: &str) { 149 | self.mnemonics.insert(mnemonic.to_uppercase()); 150 | } 151 | 152 | pub fn build_mnemonics_regex(&mut self) -> Result<(), regex::Error> { 153 | let mnemonics = if self.mnemonics.is_empty() { 154 | DEFAULT_MNEMONICS.iter() 155 | } else { 156 | self.mnemonics.iter() 157 | }; 158 | let mut r = String::with_capacity(mnemonics.len() * 16); 159 | r.push_str("(?i)(?:"); 160 | for (i, m) in mnemonics.enumerate() { 161 | if i > 0 { 162 | r.push('|'); 163 | } 164 | regex_syntax::escape_into(m, &mut r); 165 | } 166 | r.push_str(")(?:\\([^)]*\\))?:"); 167 | self.mnemonics_regex = Some(Regex::new(&r)?); 168 | Ok(()) 169 | } 170 | } 171 | 172 | #[cfg(test)] 173 | mod tests { 174 | use super::*; 175 | 176 | #[test] 177 | fn it_should_find_at_start_of_line() { 178 | let mut finder = Codetag::default(); 179 | finder.build_mnemonics_regex().unwrap(); 180 | let input = "TODO: check if cmd is installed"; 181 | assert_eq!( 182 | Some("TODO: check if cmd is installed"), 183 | finder.find(input).map(|r| &input[r]) 184 | ); 185 | } 186 | 187 | #[test] 188 | fn it_should_find_at_middle_of_line() { 189 | let mut finder = Codetag::default(); 190 | finder.build_mnemonics_regex().unwrap(); 191 | let input = "foobar // TODO: check if cmd is installed"; 192 | assert_eq!( 193 | Some("TODO: check if cmd is installed"), 194 | finder.find(input).map(|r| &input[r]) 195 | ); 196 | } 197 | 198 | #[test] 199 | fn it_should_find_uppercase() { 200 | let mut finder = Codetag::default(); 201 | finder.build_mnemonics_regex().unwrap(); 202 | let input = "TODO: check if cmd is installed"; 203 | assert_eq!( 204 | Some("TODO: check if cmd is installed"), 205 | finder.find(input).map(|r| &input[r]) 206 | ); 207 | } 208 | 209 | #[test] 210 | fn it_should_find_lowercase() { 211 | let mut finder = Codetag::default(); 212 | finder.build_mnemonics_regex().unwrap(); 213 | let input = "todo: check if cmd is installed"; 214 | assert_eq!( 215 | Some("todo: check if cmd is installed"), 216 | finder.find(input).map(|r| &input[r]) 217 | ); 218 | } 219 | 220 | #[test] 221 | fn it_should_find_mnemonics_with_empty_description() { 222 | let mut finder = Codetag::default(); 223 | finder.build_mnemonics_regex().unwrap(); 224 | let input = "todo:"; 225 | assert_eq!(Some("todo:"), finder.find(input).map(|r| &input[r])); 226 | } 227 | 228 | #[test] 229 | fn it_should_hide_mnemonics_if_asked_to() { 230 | let mut finder = Codetag::default(); 231 | finder.hide_mnemonic = true; 232 | finder.build_mnemonics_regex().unwrap(); 233 | let input = "todo: foobar"; 234 | assert_eq!(Some(" foobar"), finder.find(input).map(|r| &input[r])); 235 | } 236 | 237 | #[test] 238 | fn it_should_limit_results_to_the_given_mnemonics() { 239 | let mut finder = Codetag::default(); 240 | finder.add_mnemonic("test"); 241 | finder.build_mnemonics_regex().unwrap(); 242 | let input = "test: check if cmd is installed"; 243 | assert_eq!( 244 | Some("test: check if cmd is installed"), 245 | finder.find(input).map(|r| &input[r]) 246 | ); 247 | let input = "test2: check if cmd is installed"; 248 | assert_eq!(None, finder.find(input).map(|r| &input[r])); 249 | } 250 | 251 | #[test] 252 | fn it_should_ignore_invalid_inputs() { 253 | let mut finder = Codetag::default(); 254 | finder.build_mnemonics_regex().unwrap(); 255 | for input in vec!["", " "] { 256 | assert_eq!(None, finder.find(input)); 257 | } 258 | } 259 | } 260 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.7.18" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "atty" 16 | version = "0.2.14" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" 19 | dependencies = [ 20 | "hermit-abi", 21 | "libc", 22 | "winapi", 23 | ] 24 | 25 | [[package]] 26 | name = "autocfg" 27 | version = "1.0.1" 28 | source = "registry+https://github.com/rust-lang/crates.io-index" 29 | checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" 30 | 31 | [[package]] 32 | name = "bitflags" 33 | version = "1.3.2" 34 | source = "registry+https://github.com/rust-lang/crates.io-index" 35 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 36 | 37 | [[package]] 38 | name = "cfg-if" 39 | version = "1.0.0" 40 | source = "registry+https://github.com/rust-lang/crates.io-index" 41 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 42 | 43 | [[package]] 44 | name = "clap" 45 | version = "3.0.0-rc.0" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "79b70f999da60e6619a29b131739d2211ed4d4301f40372e94a8081422e9d6c7" 48 | dependencies = [ 49 | "atty", 50 | "bitflags", 51 | "clap_derive", 52 | "indexmap", 53 | "lazy_static", 54 | "os_str_bytes", 55 | "strsim", 56 | "termcolor", 57 | "textwrap", 58 | ] 59 | 60 | [[package]] 61 | name = "clap_derive" 62 | version = "3.0.0-rc.0" 63 | source = "registry+https://github.com/rust-lang/crates.io-index" 64 | checksum = "fe8c0f28022faaef0387fa54f8e33fee22b804a88bbd91303197da2ff8ca6a5d" 65 | dependencies = [ 66 | "heck", 67 | "proc-macro-error", 68 | "proc-macro2", 69 | "quote", 70 | "syn", 71 | ] 72 | 73 | [[package]] 74 | name = "env_logger" 75 | version = "0.7.1" 76 | source = "registry+https://github.com/rust-lang/crates.io-index" 77 | checksum = "44533bbbb3bb3c1fa17d9f2e4e38bbbaf8396ba82193c4cb1b6445d711445d36" 78 | dependencies = [ 79 | "atty", 80 | "humantime", 81 | "log", 82 | "regex", 83 | "termcolor", 84 | ] 85 | 86 | [[package]] 87 | name = "getrandom" 88 | version = "0.2.3" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" 91 | dependencies = [ 92 | "cfg-if", 93 | "libc", 94 | "wasi", 95 | ] 96 | 97 | [[package]] 98 | name = "glob" 99 | version = "0.3.0" 100 | source = "registry+https://github.com/rust-lang/crates.io-index" 101 | checksum = "9b919933a397b79c37e33b77bb2aa3dc8eb6e165ad809e58ff75bc7db2e34574" 102 | 103 | [[package]] 104 | name = "hashbrown" 105 | version = "0.11.2" 106 | source = "registry+https://github.com/rust-lang/crates.io-index" 107 | checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" 108 | 109 | [[package]] 110 | name = "heck" 111 | version = "0.3.3" 112 | source = "registry+https://github.com/rust-lang/crates.io-index" 113 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 114 | dependencies = [ 115 | "unicode-segmentation", 116 | ] 117 | 118 | [[package]] 119 | name = "hermit-abi" 120 | version = "0.1.19" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" 123 | dependencies = [ 124 | "libc", 125 | ] 126 | 127 | [[package]] 128 | name = "humantime" 129 | version = "1.3.0" 130 | source = "registry+https://github.com/rust-lang/crates.io-index" 131 | checksum = "df004cfca50ef23c36850aaaa59ad52cc70d0e90243c3c7737a4dd32dc7a3c4f" 132 | dependencies = [ 133 | "quick-error", 134 | ] 135 | 136 | [[package]] 137 | name = "indexmap" 138 | version = "1.7.0" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "bc633605454125dec4b66843673f01c7df2b89479b32e0ed634e43a91cff62a5" 141 | dependencies = [ 142 | "autocfg", 143 | "hashbrown", 144 | ] 145 | 146 | [[package]] 147 | name = "lazy_static" 148 | version = "1.4.0" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 151 | 152 | [[package]] 153 | name = "libc" 154 | version = "0.2.109" 155 | source = "registry+https://github.com/rust-lang/crates.io-index" 156 | checksum = "f98a04dce437184842841303488f70d0188c5f51437d2a834dc097eafa909a01" 157 | 158 | [[package]] 159 | name = "log" 160 | version = "0.4.14" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" 163 | dependencies = [ 164 | "cfg-if", 165 | ] 166 | 167 | [[package]] 168 | name = "memchr" 169 | version = "2.4.1" 170 | source = "registry+https://github.com/rust-lang/crates.io-index" 171 | checksum = "308cc39be01b73d0d18f82a0e7b2a3df85245f84af96fdddc5d202d27e47b86a" 172 | 173 | [[package]] 174 | name = "os_str_bytes" 175 | version = "6.0.0" 176 | source = "registry+https://github.com/rust-lang/crates.io-index" 177 | checksum = "8e22443d1643a904602595ba1cd8f7d896afe56d26712531c5ff73a15b2fbf64" 178 | dependencies = [ 179 | "memchr", 180 | ] 181 | 182 | [[package]] 183 | name = "phf" 184 | version = "0.10.0" 185 | source = "registry+https://github.com/rust-lang/crates.io-index" 186 | checksum = "b9fc3db1018c4b59d7d582a739436478b6035138b6aecbce989fc91c3e98409f" 187 | dependencies = [ 188 | "phf_macros", 189 | "phf_shared", 190 | "proc-macro-hack", 191 | ] 192 | 193 | [[package]] 194 | name = "phf_generator" 195 | version = "0.10.0" 196 | source = "registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" 198 | dependencies = [ 199 | "phf_shared", 200 | "rand", 201 | ] 202 | 203 | [[package]] 204 | name = "phf_macros" 205 | version = "0.10.0" 206 | source = "registry+https://github.com/rust-lang/crates.io-index" 207 | checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" 208 | dependencies = [ 209 | "phf_generator", 210 | "phf_shared", 211 | "proc-macro-hack", 212 | "proc-macro2", 213 | "quote", 214 | "syn", 215 | ] 216 | 217 | [[package]] 218 | name = "phf_shared" 219 | version = "0.10.0" 220 | source = "registry+https://github.com/rust-lang/crates.io-index" 221 | checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" 222 | dependencies = [ 223 | "siphasher", 224 | ] 225 | 226 | [[package]] 227 | name = "ppv-lite86" 228 | version = "0.2.15" 229 | source = "registry+https://github.com/rust-lang/crates.io-index" 230 | checksum = "ed0cfbc8191465bed66e1718596ee0b0b35d5ee1f41c5df2189d0fe8bde535ba" 231 | 232 | [[package]] 233 | name = "pretty_env_logger" 234 | version = "0.4.0" 235 | source = "registry+https://github.com/rust-lang/crates.io-index" 236 | checksum = "926d36b9553851b8b0005f1275891b392ee4d2d833852c417ed025477350fb9d" 237 | dependencies = [ 238 | "env_logger", 239 | "log", 240 | ] 241 | 242 | [[package]] 243 | name = "proc-macro-error" 244 | version = "1.0.4" 245 | source = "registry+https://github.com/rust-lang/crates.io-index" 246 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 247 | dependencies = [ 248 | "proc-macro-error-attr", 249 | "proc-macro2", 250 | "quote", 251 | "syn", 252 | "version_check", 253 | ] 254 | 255 | [[package]] 256 | name = "proc-macro-error-attr" 257 | version = "1.0.4" 258 | source = "registry+https://github.com/rust-lang/crates.io-index" 259 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 260 | dependencies = [ 261 | "proc-macro2", 262 | "quote", 263 | "version_check", 264 | ] 265 | 266 | [[package]] 267 | name = "proc-macro-hack" 268 | version = "0.5.19" 269 | source = "registry+https://github.com/rust-lang/crates.io-index" 270 | checksum = "dbf0c48bc1d91375ae5c3cd81e3722dff1abcf81a30960240640d223f59fe0e5" 271 | 272 | [[package]] 273 | name = "proc-macro2" 274 | version = "1.0.33" 275 | source = "registry+https://github.com/rust-lang/crates.io-index" 276 | checksum = "fb37d2df5df740e582f28f8560cf425f52bb267d872fe58358eadb554909f07a" 277 | dependencies = [ 278 | "unicode-xid", 279 | ] 280 | 281 | [[package]] 282 | name = "quick-error" 283 | version = "1.2.3" 284 | source = "registry+https://github.com/rust-lang/crates.io-index" 285 | checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" 286 | 287 | [[package]] 288 | name = "quote" 289 | version = "1.0.10" 290 | source = "registry+https://github.com/rust-lang/crates.io-index" 291 | checksum = "38bc8cc6a5f2e3655e0899c1b848643b2562f853f114bfec7be120678e3ace05" 292 | dependencies = [ 293 | "proc-macro2", 294 | ] 295 | 296 | [[package]] 297 | name = "rand" 298 | version = "0.8.4" 299 | source = "registry+https://github.com/rust-lang/crates.io-index" 300 | checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" 301 | dependencies = [ 302 | "libc", 303 | "rand_chacha", 304 | "rand_core", 305 | "rand_hc", 306 | ] 307 | 308 | [[package]] 309 | name = "rand_chacha" 310 | version = "0.3.1" 311 | source = "registry+https://github.com/rust-lang/crates.io-index" 312 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 313 | dependencies = [ 314 | "ppv-lite86", 315 | "rand_core", 316 | ] 317 | 318 | [[package]] 319 | name = "rand_core" 320 | version = "0.6.3" 321 | source = "registry+https://github.com/rust-lang/crates.io-index" 322 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" 323 | dependencies = [ 324 | "getrandom", 325 | ] 326 | 327 | [[package]] 328 | name = "rand_hc" 329 | version = "0.3.1" 330 | source = "registry+https://github.com/rust-lang/crates.io-index" 331 | checksum = "d51e9f596de227fda2ea6c84607f5558e196eeaf43c986b724ba4fb8fdf497e7" 332 | dependencies = [ 333 | "rand_core", 334 | ] 335 | 336 | [[package]] 337 | name = "regex" 338 | version = "1.5.4" 339 | source = "registry+https://github.com/rust-lang/crates.io-index" 340 | checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" 341 | dependencies = [ 342 | "aho-corasick", 343 | "memchr", 344 | "regex-syntax", 345 | ] 346 | 347 | [[package]] 348 | name = "regex-syntax" 349 | version = "0.6.25" 350 | source = "registry+https://github.com/rust-lang/crates.io-index" 351 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" 352 | 353 | [[package]] 354 | name = "siphasher" 355 | version = "0.3.7" 356 | source = "registry+https://github.com/rust-lang/crates.io-index" 357 | checksum = "533494a8f9b724d33625ab53c6c4800f7cc445895924a8ef649222dcb76e938b" 358 | 359 | [[package]] 360 | name = "squeeze" 361 | version = "0.1.0" 362 | dependencies = [ 363 | "glob", 364 | "lazy_static", 365 | "phf", 366 | "regex", 367 | "regex-syntax", 368 | ] 369 | 370 | [[package]] 371 | name = "squeeze-cli" 372 | version = "0.1.0" 373 | dependencies = [ 374 | "clap", 375 | "log", 376 | "pretty_env_logger", 377 | "squeeze", 378 | ] 379 | 380 | [[package]] 381 | name = "strsim" 382 | version = "0.10.0" 383 | source = "registry+https://github.com/rust-lang/crates.io-index" 384 | checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" 385 | 386 | [[package]] 387 | name = "syn" 388 | version = "1.0.82" 389 | source = "registry+https://github.com/rust-lang/crates.io-index" 390 | checksum = "8daf5dd0bb60cbd4137b1b587d2fc0ae729bc07cf01cd70b36a1ed5ade3b9d59" 391 | dependencies = [ 392 | "proc-macro2", 393 | "quote", 394 | "unicode-xid", 395 | ] 396 | 397 | [[package]] 398 | name = "termcolor" 399 | version = "1.1.2" 400 | source = "registry+https://github.com/rust-lang/crates.io-index" 401 | checksum = "2dfed899f0eb03f32ee8c6a0aabdb8a7949659e3466561fc0adf54e26d88c5f4" 402 | dependencies = [ 403 | "winapi-util", 404 | ] 405 | 406 | [[package]] 407 | name = "textwrap" 408 | version = "0.14.2" 409 | source = "registry+https://github.com/rust-lang/crates.io-index" 410 | checksum = "0066c8d12af8b5acd21e00547c3797fde4e8677254a7ee429176ccebbe93dd80" 411 | 412 | [[package]] 413 | name = "unicode-segmentation" 414 | version = "1.8.0" 415 | source = "registry+https://github.com/rust-lang/crates.io-index" 416 | checksum = "8895849a949e7845e06bd6dc1aa51731a103c42707010a5b591c0038fb73385b" 417 | 418 | [[package]] 419 | name = "unicode-xid" 420 | version = "0.2.2" 421 | source = "registry+https://github.com/rust-lang/crates.io-index" 422 | checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" 423 | 424 | [[package]] 425 | name = "version_check" 426 | version = "0.9.3" 427 | source = "registry+https://github.com/rust-lang/crates.io-index" 428 | checksum = "5fecdca9a5291cc2b8dcf7dc02453fee791a280f3743cb0905f8822ae463b3fe" 429 | 430 | [[package]] 431 | name = "wasi" 432 | version = "0.10.2+wasi-snapshot-preview1" 433 | source = "registry+https://github.com/rust-lang/crates.io-index" 434 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" 435 | 436 | [[package]] 437 | name = "winapi" 438 | version = "0.3.9" 439 | source = "registry+https://github.com/rust-lang/crates.io-index" 440 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 441 | dependencies = [ 442 | "winapi-i686-pc-windows-gnu", 443 | "winapi-x86_64-pc-windows-gnu", 444 | ] 445 | 446 | [[package]] 447 | name = "winapi-i686-pc-windows-gnu" 448 | version = "0.4.0" 449 | source = "registry+https://github.com/rust-lang/crates.io-index" 450 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 451 | 452 | [[package]] 453 | name = "winapi-util" 454 | version = "0.1.5" 455 | source = "registry+https://github.com/rust-lang/crates.io-index" 456 | checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" 457 | dependencies = [ 458 | "winapi", 459 | ] 460 | 461 | [[package]] 462 | name = "winapi-x86_64-pc-windows-gnu" 463 | version = "0.4.0" 464 | source = "registry+https://github.com/rust-lang/crates.io-index" 465 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 466 | -------------------------------------------------------------------------------- /squeeze/uri.rs: -------------------------------------------------------------------------------- 1 | // https://tools.ietf.org/html/rfc3986#appendix-A 2 | 3 | use super::Finder; 4 | use std::collections::HashSet; 5 | use std::ops::Range; 6 | 7 | #[derive(Default, Clone, Copy)] 8 | struct SchemeConfig(u8); 9 | 10 | impl SchemeConfig { 11 | fn has(&self, flag: u8) -> bool { 12 | (self.0 & flag) != 0 13 | } 14 | } 15 | 16 | struct SchemeConfigs(phf::Map<&'static str, SchemeConfig>); 17 | 18 | impl SchemeConfigs { 19 | fn get(&self, key: &str) -> SchemeConfig { 20 | if let Some(sc) = self.0.get(key) { 21 | *sc 22 | } else { 23 | SchemeConfig::default() 24 | } 25 | } 26 | } 27 | 28 | const DISALLOW_EMPTY_HOST: u8 = 1 << 0; 29 | 30 | static SCHEMES_CONFIGS: SchemeConfigs = SchemeConfigs(phf::phf_map! { 31 | "ftp" => SchemeConfig(DISALLOW_EMPTY_HOST), 32 | "http" => SchemeConfig(DISALLOW_EMPTY_HOST), 33 | "https" => SchemeConfig(DISALLOW_EMPTY_HOST), 34 | }); 35 | 36 | #[derive(Default)] 37 | pub struct URI { 38 | schemes: HashSet, 39 | pub strict: bool, 40 | } 41 | 42 | impl Finder for URI { 43 | fn id(&self) -> &'static str { 44 | "uri" 45 | } 46 | 47 | // scheme ":" hier-part [ "?" query ] [ "#" fragment ] 48 | fn find(&self, s: &str) -> Option> { 49 | let input = s.as_bytes(); 50 | let mut idx = 0; 51 | 52 | while idx < input.len() { 53 | let start = idx; 54 | 55 | let colon_idx = start + input[start..].iter().position(|&b| b == b':')?; 56 | idx = colon_idx + 1; 57 | 58 | let scheme_idx = match self.rlook_scheme(&input[start..colon_idx]) { 59 | Some(i) => start + i, 60 | None => continue, 61 | }; 62 | let scheme = &s[scheme_idx..colon_idx]; 63 | let scheme_config = SCHEMES_CONFIGS.get(scheme); 64 | 65 | idx += self.look_hier_part(&input[idx..], scheme_config)?; 66 | idx += self.look_question_mark_query(&input[idx..]).unwrap_or(0); 67 | idx += self.look_sharp_fragment(&input[idx..]).unwrap_or(0); 68 | 69 | // we cannot early exit as soon as we know the scheme as we need to advance idx even if the 70 | // uri should be discarded 71 | if self.schemes.is_empty() || self.schemes.contains(scheme) { 72 | return Some(scheme_idx..idx); 73 | } 74 | } 75 | 76 | None 77 | } 78 | } 79 | 80 | impl URI { 81 | pub fn add_scheme(&mut self, s: &str) { 82 | self.schemes.insert(s.to_lowercase()); 83 | } 84 | 85 | // ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) 86 | fn rlook_scheme(&self, input: &[u8]) -> Option { 87 | let mut idx = None; 88 | for (i, &c) in input.iter().enumerate().rev() { 89 | if self.is_alpha(c) { 90 | idx = Some(i); 91 | } else if self.is_digit(c) || [b'+', b'-', b'.'].contains(&c) { 92 | // noop 93 | } else { 94 | break; 95 | } 96 | } 97 | idx 98 | } 99 | 100 | // hier-part = "//" authority path-abempty 101 | // / path-absolute 102 | // / path-rootless 103 | // / path-empty 104 | fn look_hier_part(&self, input: &[u8], sc: SchemeConfig) -> Option { 105 | // "//" authority path-abempty 106 | if let Some(idx) = self 107 | .look_slash_slash(input) 108 | .and_then(|idx| Some(idx + self.look_authority(&input[idx..], sc)?)) 109 | .map(|idx| idx + self.look_path_abempty(&input[idx..])) 110 | { 111 | return Some(idx); 112 | } 113 | 114 | // Some schemes disallow empty hosts 115 | if sc.has(DISALLOW_EMPTY_HOST) { 116 | return None; 117 | } 118 | 119 | // "/" [ segment-nz path-abempty ] 120 | if let Some(idx) = self.look_slash(input).map(|idx| { 121 | idx + self 122 | .look_segment_nz(&input[idx..]) 123 | .map(|i| i + self.look_path_abempty(&input[idx + i..])) 124 | .unwrap_or(0) 125 | }) { 126 | return Some(idx); 127 | } 128 | 129 | // segment-nz path-abempty 130 | if let Some(idx) = self 131 | .look_segment_nz(input) 132 | .map(|idx| idx + self.look_path_abempty(&input[idx..])) 133 | { 134 | return Some(idx); 135 | } 136 | 137 | // 0 138 | Some(0) 139 | } 140 | 141 | // [ userinfo "@" ] host [ ":" port ] 142 | fn look_authority(&self, input: &[u8], sc: SchemeConfig) -> Option { 143 | let mut idx = 0; 144 | idx += self.look_userinfo_at(&input[idx..]).unwrap_or(0); 145 | idx += self.look_host(&input[idx..]).and_then(|i| { 146 | if i == 0 && sc.has(DISALLOW_EMPTY_HOST) { 147 | None 148 | } else { 149 | Some(i) 150 | } 151 | })?; 152 | idx += self.look_colon_port(&input[idx..]).unwrap_or(0); 153 | Some(idx) 154 | } 155 | 156 | fn look_colon_port(&self, input: &[u8]) -> Option { 157 | let mut idx = 0; 158 | idx += self.look_colon(&input[idx..])?; 159 | idx += self.look_port(&input[idx..]); 160 | Some(idx) 161 | } 162 | 163 | // *( "/" segment ) 164 | fn look_path_abempty(&self, input: &[u8]) -> usize { 165 | let mut idx = 0; 166 | while idx < input.len() { 167 | idx += match self 168 | .look_slash(&input[idx..]) 169 | .map(|i| i + self.look_segment(&input[idx + i..])) 170 | { 171 | Some(n) => n, 172 | None => break, 173 | }; 174 | } 175 | idx 176 | } 177 | 178 | // *pchar 179 | fn look_segment(&self, input: &[u8]) -> usize { 180 | let mut idx = 0; 181 | while idx < input.len() { 182 | idx += match self.look_pchar(&input[idx..]) { 183 | Some(n) => n, 184 | None => break, 185 | }; 186 | } 187 | idx 188 | } 189 | 190 | // 1*pchar 191 | fn look_segment_nz(&self, input: &[u8]) -> Option { 192 | match self.look_segment(input) { 193 | 0 => None, 194 | n => Some(n), 195 | } 196 | } 197 | 198 | // userinfo "@" 199 | fn look_userinfo_at(&self, input: &[u8]) -> Option { 200 | let arobase_idx = input.iter().take(256).position(|&b| b == b'@')?; 201 | if self.is_userinfo(&input[..arobase_idx]) { 202 | Some(arobase_idx + 1) 203 | } else { 204 | None 205 | } 206 | } 207 | 208 | // IP-literal / IPv4address / reg-name 209 | fn look_host(&self, input: &[u8]) -> Option { 210 | self.look_ip_literal(input) 211 | .or_else(|| self.look_ipv4_address(input)) 212 | .or_else(|| self.look_hostname(input)) 213 | } 214 | 215 | // "[" ( IPv6address / IPvFuture ) "]" 216 | fn look_ip_literal(&self, input: &[u8]) -> Option { 217 | let mut idx = 0; 218 | idx += self.look_left_bracket(&input[idx..])?; 219 | let right_bracket_index = (&input[idx..]).iter().take(64).position(|&b| b == b']')?; 220 | if right_bracket_index > 0 { 221 | let end = idx + right_bracket_index; 222 | let slice = &input[idx..end]; 223 | if self.is_ipv6address(slice) || self.is_ipvfuture(slice) { 224 | return Some(end + 1); 225 | } 226 | } 227 | None 228 | } 229 | 230 | // https://tools.ietf.org/html/rfc4291#section-2.2 231 | fn is_ipv6address(&self, input: &[u8]) -> bool { 232 | let mut idx = 0; 233 | 234 | let mut bytes_count = 0; 235 | let mut double_colon_found = false; 236 | 237 | while idx < input.len() { 238 | let mut last_is_colon = false; 239 | while let Some(i) = self.look_colon(&input[idx..]) { 240 | if last_is_colon { 241 | if double_colon_found { 242 | return false; 243 | } 244 | double_colon_found = true; 245 | bytes_count += 2; 246 | } 247 | last_is_colon = true; 248 | idx += i; 249 | } 250 | 251 | if last_is_colon || idx == 0 { 252 | if bytes_count == 12 || double_colon_found { 253 | if let Some(i) = self.look_ipv4_address(&input[idx..]) { 254 | bytes_count += 4; 255 | idx += i; 256 | break; 257 | } 258 | } 259 | if let Some(i) = self.look_h16(&input[idx..]) { 260 | bytes_count += 2; 261 | idx += i; 262 | continue; 263 | } 264 | } 265 | 266 | break; 267 | } 268 | 269 | idx == input.len() && (bytes_count == 16 || (double_colon_found && bytes_count <= 12)) 270 | } 271 | 272 | // 1*4HEXDIG 273 | fn look_h16(&self, input: &[u8]) -> Option { 274 | let idx = input 275 | .iter() 276 | .take_while(|&&b| self.is_hexdig(b)) 277 | .take(4) 278 | .count(); 279 | if idx >= 1 { 280 | Some(idx) 281 | } else { 282 | None 283 | } 284 | } 285 | 286 | // "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) 287 | fn is_ipvfuture(&self, _input: &[u8]) -> bool { 288 | // TODO: implementation 289 | false 290 | } 291 | 292 | // dec-octet "." dec-octet "." dec-octet "." dec-octet 293 | fn look_ipv4_address(&self, input: &[u8]) -> Option { 294 | let mut idx = 0; 295 | idx += self.look_dec_octet(&input[idx..])?; 296 | idx += self.look_period(&input[idx..])?; 297 | idx += self.look_dec_octet(&input[idx..])?; 298 | idx += self.look_period(&input[idx..])?; 299 | idx += self.look_dec_octet(&input[idx..])?; 300 | idx += self.look_period(&input[idx..])?; 301 | idx += self.look_dec_octet(&input[idx..])?; 302 | Some(idx) 303 | } 304 | 305 | // dec-octet = DIGIT ; 0-9 306 | // / %x31-39 DIGIT ; 10-99 307 | // / "1" 2DIGIT ; 100-199 308 | // / "2" %x30-34 DIGIT ; 200-249 309 | // / "25" %x30-35 ; 250-255 310 | fn look_dec_octet(&self, input: &[u8]) -> Option { 311 | if input.len() >= 3 312 | && input[0] == b'2' 313 | && input[1] == b'5' 314 | && self.is_digit_0_to_5(input[2]) 315 | { 316 | return Some(3); 317 | } 318 | 319 | if input.len() >= 3 320 | && input[0] == b'2' 321 | && self.is_digit_0_to_4(input[1]) 322 | && self.is_digit(input[2]) 323 | { 324 | return Some(3); 325 | } 326 | 327 | if input.len() >= 3 328 | && input[0] == b'1' 329 | && self.is_digit(input[1]) 330 | && self.is_digit(input[2]) 331 | { 332 | return Some(3); 333 | } 334 | 335 | if input.len() >= 2 && self.is_digit_1_to_9(input[0]) && self.is_digit(input[1]) { 336 | return Some(2); 337 | } 338 | 339 | if !input.is_empty() && self.is_digit(input[0]) { 340 | return Some(1); 341 | } 342 | 343 | None 344 | } 345 | 346 | // https://en.wikipedia.org/wiki/Hostname#Restrictions_on_valid_hostnames 347 | fn look_hostname(&self, input: &[u8]) -> Option { 348 | let mut idx = 0; 349 | while idx < input.len() && idx < 253 { 350 | if idx > 0 { 351 | if let Some(i) = self.look_dot(&input[idx..]) { 352 | idx += i; 353 | } else { 354 | break; 355 | } 356 | } 357 | if let Some(i) = self.look_label(&input[idx..]) { 358 | idx += i; 359 | } else { 360 | break; 361 | } 362 | } 363 | Some(idx) 364 | } 365 | 366 | fn look_label(&self, input: &[u8]) -> Option { 367 | let mut idx = 0; 368 | if idx < input.len() 369 | && (self.is_alpha(input[idx]) || self.is_digit(input[idx]) || input[idx] == b'_') 370 | { 371 | idx += 1; 372 | } else { 373 | return None; 374 | } 375 | while idx < input.len() 376 | && idx < 62 377 | && (self.is_alpha(input[idx]) 378 | || self.is_digit(input[idx]) 379 | || input[idx] == b'_' 380 | || input[idx] == b'-') 381 | { 382 | idx += 1; 383 | } 384 | Some(idx) 385 | } 386 | 387 | fn look_dot(&self, input: &[u8]) -> Option { 388 | if !input.is_empty() && input[0] == b'.' { 389 | Some(1) 390 | } else { 391 | None 392 | } 393 | } 394 | 395 | // *DIGIT 396 | fn look_port(&self, input: &[u8]) -> usize { 397 | input.iter().take_while(|&&c| self.is_digit(c)).count() 398 | } 399 | 400 | fn look_question_mark_query(&self, input: &[u8]) -> Option { 401 | let mut idx = 0; 402 | idx += self.look_question_mark(&input[idx..])?; 403 | idx += self.look_query(&input[idx..]); 404 | Some(idx) 405 | } 406 | 407 | // *( pchar / "/" / "?" ) 408 | fn look_query(&self, input: &[u8]) -> usize { 409 | let mut idx = 0; 410 | while idx < input.len() { 411 | if let Some(i) = self.look_pchar(&input[idx..]) { 412 | idx += i; 413 | continue; 414 | } 415 | if [b'/', b'?'].contains(&input[idx]) { 416 | idx += 1; 417 | continue; 418 | } 419 | break; 420 | } 421 | idx 422 | } 423 | 424 | fn look_sharp_fragment(&self, input: &[u8]) -> Option { 425 | let mut idx = 0; 426 | idx += self.look_sharp(&input[idx..])?; 427 | idx += self.look_fragment(&input[idx..]); 428 | Some(idx) 429 | } 430 | 431 | // *( pchar / "/" / "?" ) 432 | fn look_fragment(&self, input: &[u8]) -> usize { 433 | let mut idx = 0; 434 | while idx < input.len() { 435 | if let Some(i) = self.look_pchar(&input[idx..]) { 436 | idx += i; 437 | continue; 438 | } 439 | if [b'/', b'?'].contains(&input[idx]) { 440 | idx += 1; 441 | continue; 442 | } 443 | break; 444 | } 445 | idx 446 | } 447 | 448 | // unreserved / pct-encoded / sub-delims / ":" / "@" 449 | fn look_pchar(&self, input: &[u8]) -> Option { 450 | self.look_pct_encoded(input).or_else(|| { 451 | if !input.is_empty() 452 | && (self.is_unreserved(input[0]) 453 | || self.is_sub_delim(input[0]) 454 | || [b':', b'@'].contains(&input[0])) 455 | { 456 | Some(1) 457 | } else { 458 | None 459 | } 460 | }) 461 | } 462 | 463 | // "%" HEXDIG HEXDIG 464 | fn look_pct_encoded(&self, input: &[u8]) -> Option { 465 | if input.len() >= 3 466 | && input[0] == b'%' 467 | && self.is_hexdig(input[1]) 468 | && self.is_hexdig(input[2]) 469 | { 470 | Some(3) 471 | } else { 472 | None 473 | } 474 | } 475 | 476 | fn look_period(&self, input: &[u8]) -> Option { 477 | if !input.is_empty() && input[0] == b'.' { 478 | Some(1) 479 | } else { 480 | None 481 | } 482 | } 483 | 484 | fn look_left_bracket(&self, input: &[u8]) -> Option { 485 | if !input.is_empty() && input[0] == b'[' { 486 | Some(1) 487 | } else { 488 | None 489 | } 490 | } 491 | 492 | fn look_colon(&self, input: &[u8]) -> Option { 493 | if !input.is_empty() && input[0] == b':' { 494 | Some(1) 495 | } else { 496 | None 497 | } 498 | } 499 | 500 | fn look_question_mark(&self, input: &[u8]) -> Option { 501 | if !input.is_empty() && input[0] == b'?' { 502 | Some(1) 503 | } else { 504 | None 505 | } 506 | } 507 | 508 | fn look_sharp(&self, input: &[u8]) -> Option { 509 | if !input.is_empty() && input[0] == b'#' { 510 | Some(1) 511 | } else { 512 | None 513 | } 514 | } 515 | 516 | fn look_slash(&self, input: &[u8]) -> Option { 517 | if !input.is_empty() && input[0] == b'/' { 518 | Some(1) 519 | } else { 520 | None 521 | } 522 | } 523 | 524 | fn look_slash_slash(&self, input: &[u8]) -> Option { 525 | if input.len() >= 2 && input[0] == b'/' && input[1] == b'/' { 526 | Some(2) 527 | } else { 528 | None 529 | } 530 | } 531 | 532 | // *( unreserved / pct-encoded / sub-delims / ":" ) 533 | fn is_userinfo(&self, input: &[u8]) -> bool { 534 | let mut idx = 0; 535 | while idx < input.len() { 536 | if let Some(i) = self.look_pct_encoded(&input[idx..]) { 537 | idx += i; 538 | continue; 539 | } 540 | let c = input[idx]; 541 | if self.is_unreserved(c) || self.is_sub_delim(c) || c == b':' { 542 | idx += 1; 543 | continue; 544 | } 545 | return false; 546 | } 547 | true 548 | } 549 | 550 | // ALPHA / DIGIT / "-" / "." / "_" / "~" 551 | fn is_unreserved(&self, c: u8) -> bool { 552 | self.is_alpha(c) || self.is_digit(c) || c == b'-' || c == b'.' || c == b'_' || c == b'~' 553 | } 554 | 555 | // "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 556 | fn is_sub_delim(&self, c: u8) -> bool { 557 | if self.strict { 558 | [ 559 | b'!', b'$', b'&', b'\'', b'(', b')', b'*', b'+', b',', b';', b'=', 560 | ] 561 | .contains(&c) 562 | } else { 563 | [b'!', b'$', b'&', b'(', b'*', b'+', b',', b';', b'='].contains(&c) // without ' and ) 564 | } 565 | } 566 | 567 | // ALPHA 568 | fn is_alpha(&self, c: u8) -> bool { 569 | (b'a'..=b'z').contains(&c) || (b'A'..=b'Z').contains(&c) 570 | } 571 | 572 | // DIGIT 573 | fn is_digit(&self, c: u8) -> bool { 574 | (b'0'..=b'9').contains(&c) 575 | } 576 | fn is_digit_1_to_9(&self, c: u8) -> bool { 577 | (b'1'..=b'9').contains(&c) 578 | } 579 | fn is_digit_0_to_4(&self, c: u8) -> bool { 580 | (b'0'..=b'4').contains(&c) 581 | } 582 | fn is_digit_0_to_5(&self, c: u8) -> bool { 583 | (b'0'..=b'5').contains(&c) 584 | } 585 | 586 | // HEXDIG 587 | fn is_hexdig(&self, c: u8) -> bool { 588 | self.is_digit(c) || (b'a'..=b'f').contains(&c) || (b'A'..=b'F').contains(&c) 589 | } 590 | } 591 | 592 | #[cfg(test)] 593 | mod tests { 594 | use super::*; 595 | 596 | #[test] 597 | fn is_ipv6address_should_identify_valid_ipv6s() { 598 | let finder = URI::default(); 599 | for input in vec![ 600 | "::", 601 | "::1", 602 | "1::", 603 | "1:2:3:4:5:6:7:8", 604 | "1:2:3:4:5:6::7", 605 | "1:2:3:4:5:6:127.0.0.1", 606 | "1::127.0.0.1", 607 | ] { 608 | assert_eq!(true, finder.is_ipv6address(input.as_bytes()), "{}", input); 609 | } 610 | } 611 | 612 | #[test] 613 | fn is_ipv6address_should_identify_invalid_ipv6s() { 614 | let finder = URI::default(); 615 | for input in vec![ 616 | " ", 617 | " ::", 618 | ":: ", 619 | " :: ", 620 | ":::", 621 | "::1::", 622 | ":1:", 623 | "1:2:3:4:5:6:7:8:9", 624 | "1:2:3:4:5:6:7:127.0.0.1", 625 | "1:2:3:4:5:6::7:8", 626 | "1:2:3:4:5:6::127.0.0.1", 627 | "1:127.0.0.1::", 628 | ] { 629 | assert_eq!(false, finder.is_ipv6address(input.as_bytes()), "{}", input); 630 | } 631 | } 632 | 633 | #[test] 634 | fn look_path_abempty_should_mirror_the_len_of_valid_inputs() { 635 | let finder = URI::default(); 636 | for input in vec![ 637 | "", 638 | "/", 639 | "//", 640 | "///", 641 | "/foo/bar", 642 | "/rfc/rfc1808.txt", 643 | "/with/trailing/", 644 | ] { 645 | assert_eq!( 646 | input.len(), 647 | finder.look_path_abempty(input.as_bytes()), 648 | "{}", 649 | input 650 | ); 651 | } 652 | } 653 | 654 | #[test] 655 | fn look_path_abempty_should_skip_invalid_inputs() { 656 | let finder = URI::default(); 657 | for input in vec!["foobar"] { 658 | assert_eq!(0, finder.look_path_abempty(input.as_bytes()), "{}", input); 659 | } 660 | } 661 | 662 | #[test] 663 | fn it_should_mirror_valid_uris() { 664 | let finder = URI::default(); 665 | for input in vec![ 666 | // basic 667 | "http://localhost", 668 | // userinfo 669 | "http://foobar:@localhost", 670 | "http://foobar:baz@localhost", 671 | // port 672 | "http://foobar:@localhost:", 673 | "http://foobar:@localhost:8080", 674 | // path 675 | "http://localhost/lorem", 676 | // query 677 | "http://foobar:@localhost:8080?", 678 | "http://foobar:@localhost:8080?a=b", 679 | // fragment 680 | "http://foobar:@localhost:8080#", 681 | "http://foobar:@localhost:8080?#", 682 | "http://foobar:@localhost:8080?a=b#", 683 | "http://foobar:@localhost:8080?a=b#c=d", 684 | // meh 685 | "http://:@localhost:/?#", 686 | // ipv4 687 | "http://127.0.0.0", 688 | "http://127.0.0.10", 689 | "http://127.0.0.100", 690 | "http://127.0.0.200", 691 | "http://127.0.0.250", 692 | "http://192.0.2.235", 693 | // ipv6 694 | "http://[::]", 695 | "http://[::1]", 696 | "http://[2001:db8::1]", 697 | "http://[2001:0db8::0001]", 698 | "http://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]", 699 | "http://[::ffff:192.0.2.128]", 700 | "http://[::ffff:c000:0280]", 701 | // scheme only 702 | "foobar:", 703 | // rfc examples 704 | "file:///etc/hosts", 705 | "http://localhost/", 706 | "mailto:fred@example.com", 707 | "foo://info.example.com?fred", 708 | "ftp://ftp.is.co.za/rfc/rfc1808.txt", 709 | "http://www.ietf.org/rfc/rfc2396.txt", 710 | "ldap://[2001:db8::7]/c=GB?objectClass?one", 711 | "mailto:John.Doe@example.com", 712 | "news:comp.infosystems.www.servers.unix", 713 | "tel:+1-816-555-1212", 714 | "telnet://192.0.2.16:80/", 715 | "urn:oasis:names:specification:docbook:dtd:xml:4.1.2", 716 | ] { 717 | for i in vec![ 718 | input.to_owned(), 719 | format!(" {} ", input), 720 | format!("<{}>", input), 721 | format!("[{}]", input), 722 | format!("link", input), 723 | format!("{{{}}}", input), 724 | format!("\"{}\"", input), 725 | format!("[link]({})", input), 726 | format!("'{}'", input), 727 | ] { 728 | assert_eq!(Some(input), finder.find(&i).map(|r| &i[r]), "{}", input); 729 | } 730 | } 731 | } 732 | 733 | #[test] 734 | fn it_should_properly_behave_in_strict_mode() { 735 | let mut finder = URI::default(); 736 | finder.strict = true; 737 | for &input in &["http://localhost/)", "http://localhost/'"] { 738 | assert_eq!(Some(input), finder.find(input).map(|r| &input[r])); 739 | } 740 | } 741 | 742 | #[test] 743 | fn it_should_ignore_invalid_uris() { 744 | let finder = URI::default(); 745 | for input in vec![ 746 | // some protocols require a host 747 | "ftp:///test", 748 | "http:///test", 749 | "https:///test", 750 | ] { 751 | assert_eq!(None, finder.find(input), "{}", input); 752 | } 753 | } 754 | } 755 | --------------------------------------------------------------------------------