370 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
371 | library("rvest")
372 | library("stringr")
373 |
374 | minimal <- read_html("<html><body>blah foo</body></html>")
375 |
376 | bodytext <- minimal %>%
377 | html_node("body") %>%
378 | html_text()
379 |
380 | re_substitutes(bodytext, rex(spaces), "", global = TRUE)
381 |
382 | #' ###
383 | #+ message=FALSE
384 | string <- "this\\(system) {is} [full]."
385 | library(Hmisc)
386 | gsub("\\\\(.)", "\\1", escapeRegex(string))
387 |
388 | #' Alternatively [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
389 | library(rex)
390 | re_substitutes(escape(string), rex("\\", capture(any)), "\\1", global = TRUE)
391 |
392 | #' ###
393 | #' [rex](http://cran.r-project.org/web/packages/rex/) has a [vignette for parsing server logs](http://cran.r-project.org/web/packages/rex/vignettes/log_parsing.html). While the format is not exactly the same as your log, you should be able to adapt it to your case fairly easily.
394 | #' As for reading the log in: assuming the file fits in memory, your best bet is to read the whole file first with `readLines()`, and then the following will put each field into a `data.frame` column.
395 | x <- "Feb 6 12:14:14 localhost haproxy[14389]: 10.0.1.2:33317 [06/Feb/2009:12:14:14.655] http-in static/srv1 10/0/30/69/109 200 2750 - - ---- 1/1/1/1/0 0/0 {1wt.eu} {} \"GET /index.html HTTP/1.1\""
396 | library(rex)
397 | re <- rex(
398 |
399 | capture(name = "process_name", alpha),
400 | "[",
401 | capture(name = "pid", digits),
402 | "]:",
403 | spaces,
404 | capture(name = "client_ip", any_of(digit, ".")),
405 | ":",
406 | capture(name = "client_port", digits),
407 | spaces,
408 | "[",
409 | capture(name = "accept_date", except_some_of("]")),
410 | "]",
411 | spaces,
412 | capture(name = "frontend_name", non_spaces),
413 | spaces,
414 | capture(name = "backend_name", except_some_of("/")),
415 | "/",
416 | capture(name = "server_name", non_spaces),
417 | spaces,
418 | capture(name = "Tq", some_of("-", digit)),
419 | "/",
420 | capture(name = "Tw", some_of("-", digit)),
421 | "/",
422 | capture(name = "Tc", some_of("-", digit)),
423 | "/",
424 | capture(name = "Tr", some_of("-", digit)),
425 | "/",
426 | capture(name = "Tt", some_of("+", digit)),
427 | spaces,
428 | capture(name = "status_code", digits),
429 | spaces,
430 | capture(name = "bytes_read", some_of("+", digit)),
431 | spaces,
432 | capture(name = "captured_request_cookie", non_spaces),
433 | spaces,
434 | capture(name = "captured_response_cookie", non_spaces),
435 | spaces,
436 | capture(name = "termination_state", non_spaces),
437 | spaces,
438 | capture(name = "actconn", digits),
439 | "/",
440 | capture(name = "feconn", digits),
441 | "/",
442 | capture(name = "beconn", digits),
443 | "/",
444 | capture(name = "srv_conn", digits),
445 | "/",
446 | capture(name = "retries", some_of("+", digit)),
447 | spaces,
448 | capture(name = "srv_queue", digits),
449 | "/",
450 | capture(name = "backend_queue", digits),
451 | spaces,
452 | "{",
453 | capture(name = "captured_request_headers", except_any_of("}")),
454 | "}",
455 | spaces,
456 | "{",
457 | capture(name = "captured_response_headers", except_any_of("}")),
458 | "}",
459 | spaces,
460 | double_quote,
461 | capture(name = "http_request", non_quotes),
462 | double_quote)
463 |
464 | re_matches(x, re)
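
#' To parse a whole log file, read it in with `readLines()` and pass the
#' resulting character vector to `re_matches()`, which returns one row per
#' log line (a sketch only; "haproxy.log" is a hypothetical file name).
#+ eval=FALSE
log_lines <- readLines("haproxy.log")
log_fields <- re_matches(log_lines, re)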
465 |
466 | #' ###
467 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
468 |
469 | my.data <- read.table(text = "
470 | my.string state
471 | ......... A
472 | 1........ B
473 | 112...... C
474 | 11111.... D
475 | 1111113.. E
476 | 111111111 F
477 | 111111111 G
478 | ", header = TRUE, stringsAsFactors = FALSE)
479 |
480 | library(rex)
481 |
482 | re_matches(my.data$my.string,
483 | rex(capture(except(".")), "."))$"1"
484 |
485 | #' ###
486 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
487 | string <- "Shakira - Wolf - 02.Hips don't lie.mp3"
488 |
489 | library(rex)
490 | re_matches(string,
491 | rex(capture(zero_or_more(any, type="lazy")), spaces, "-"))$"1"
492 |
493 | #' ###
494 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler. The lookahead below deletes a space only when it precedes a stranded single character, that is, one non-space followed by another space and a word of at least two characters, or one non-space at the end of the string.
495 |
496 | string <- "I t is tim e to g o"
497 | library(rex)
498 | re_substitutes(string, rex(
499 | space %if_next_is%
500 | list(
501 | list(non_space, space, at_least(non_space, 2)) %or%
502 | list(non_space, end)
503 | )
504 | ), "", global = TRUE)
505 |
506 | #' ###
507 | #' Using [rex](http://cran.r-project.org/web/packages/rex/index.html) may make this type of task a little simpler.
508 |
509 | string <- "01:04:43.064 [12439] <2> xyz
510 | 01:04:43.067 [12439] <2> a lmn
511 | 01:04:43.068 [12439] <4> j klm
512 | x_times_wait to <3000>
513 | 01:04:43.068 [12439] <4> j klm
514 | enter_object <5000> main k"
515 |
516 | library(rex)
517 |
518 | timestamp <- rex(n(digit, 2), ":", n(digit, 2), ":", n(digit, 2), ".", n(digit, 3))
519 |
520 | re <- rex(timestamp, space,
521 | "[", digits, "]", space,
522 | "<", digits, ">", space,
523 | capture(anything))
524 |
525 | re_matches(string, re, global = TRUE)
526 |
--------------------------------------------------------------------------------
/vignettes/url_parsing.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "URL Validation"
3 | author: "Jim Hester"
4 | date: "`r Sys.Date()`"
5 | output: rmarkdown::html_vignette
6 | vignette: >
7 | %\VignetteIndexEntry{URL Validation}
8 | %\VignetteEngine{knitr::rmarkdown}
9 | \usepackage[utf8]{inputenc}
10 | ---
11 |
12 | Consider the task of correctly [validating a URL](https://mathiasbynens.be/demo/url-regex).
13 | From that page, two conclusions can be drawn.
14 |
15 | 1. Validating URLs requires complex regular expressions.
16 | 2. Creating a correct regular expression is hard! (Only 1 of the 13 regexes tested was valid for all cases.)
17 |
18 | Because of this, you may be tempted to simply copy the best regex you can find ([gist](https://gist.github.com/dperini/729294)).
19 |
20 | The problem is that while you can copy it now, what happens later when you find a case it does not handle correctly? Can you correctly interpret and modify this regex?
21 | ```{r url_parsing_stock, eval=F}
22 | "^(?:(?:http(?:s)?|ftp)://)(?:\\S+(?::(?:\\S)*)?@)?(?:(?:[a-z0-9\u00a1-\uffff](?:-)*)*(?:[a-z0-9\u00a1-\uffff])+)(?:\\.(?:[a-z0-9\u00a1-\uffff](?:-)*)*(?:[a-z0-9\u00a1-\uffff])+)*(?:\\.(?:[a-z0-9\u00a1-\uffff]){2,})(?::(?:\\d){2,5})?(?:/(?:\\S)*)?$"
23 | ```
24 |
25 | However, if you re-create the regex with `rex`, it is much easier to understand and modify later if needed.
26 | ```{r url_parsing_url}
27 | library(rex)
28 | library(magrittr)
29 |
30 | valid_chars <- rex(except_some_of(".", "/", " ", "-"))
31 |
32 | re <- rex(
33 | start,
34 |
35 | # protocol identifier (optional) + //
36 | group(list("http", maybe("s")) %or% "ftp", "://"),
37 |
38 | # user:pass authentication (optional)
39 | maybe(non_spaces,
40 | maybe(":", zero_or_more(non_space)),
41 | "@"),
42 |
43 | # host name
44 | group(zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)),
45 |
46 | # domain name
47 | zero_or_more(".", zero_or_more(valid_chars, zero_or_more("-")), one_or_more(valid_chars)),
48 |
49 | # TLD identifier
50 | group(".", valid_chars %>% at_least(2)),
51 |
52 | # server port number (optional)
53 | maybe(":", digit %>% between(2, 5)),
54 |
55 | # resource path (optional)
56 | maybe("/", non_space %>% zero_or_more()),
57 |
58 | end
59 | )
60 | ```
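
For example, say you later needed to accept `sftp` URLs as well. With `rex` that is a local, readable edit (a hypothetical sketch, not run as part of this vignette, and not part of the validation below); only the protocol line inside `re` changes:

```{r url_parsing_modify, eval=FALSE}
# Hypothetical modification: extend the protocol alternation.
# Every other component of the regular expression stays as written above.
group(list("http", maybe("s")) %or% "ftp" %or% "sftp", "://"),
```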
61 |
62 | We can then validate that it correctly identifies both good and bad URLs. (_IP address validation removed_)
63 |
64 | ```{r url_parsing_validate}
65 | good <- c("http://foo.com/blah_blah",
66 | "http://foo.com/blah_blah/",
67 | "http://foo.com/blah_blah_(wikipedia)",
68 | "http://foo.com/blah_blah_(wikipedia)_(again)",
69 | "http://www.example.com/wpstyle/?p=364",
70 | "https://www.example.com/foo/?bar=baz&inga=42&quux",
71 | "http://✪df.ws/123",
72 | "http://userid:password@example.com:8080",
73 | "http://userid:password@example.com:8080/",
74 | "http://userid@example.com",
75 | "http://userid@example.com/",
76 | "http://userid@example.com:8080",
77 | "http://userid@example.com:8080/",
78 | "http://userid:password@example.com",
79 | "http://userid:password@example.com/",
80 | "http://➡.ws/䨹",
81 | "http://⌘.ws",
82 | "http://⌘.ws/",
83 | "http://foo.com/blah_(wikipedia)#cite-1",
84 | "http://foo.com/blah_(wikipedia)_blah#cite-1",
85 | "http://foo.com/unicode_(✪)_in_parens",
86 | "http://foo.com/(something)?after=parens",
87 | "http://☺.damowmow.com/",
88 | "http://code.google.com/events/#&product=browser",
89 | "http://j.mp",
90 | "ftp://foo.bar/baz",
91 | "http://foo.bar/?q=Test%20URL-encoded%20stuff",
92 | "http://مثال.إختبار",
93 | "http://例子.测试",
94 | "http://-.~_!$&'()*+,;=:%40:80%2f::::::@example.com",
95 | "http://1337.net",
96 | "http://a.b-c.de",
97 | "http://223.255.255.254")
98 |
99 | bad <- c(
100 | "http://",
101 | "http://.",
102 | "http://..",
103 | "http://../",
104 | "http://?",
105 | "http://??",
106 | "http://??/",
107 | "http://#",
108 | "http://##",
109 | "http://##/",
110 | "http://foo.bar?q=Spaces should be encoded",
111 | "//",
112 | "//a",
113 | "///a",
114 | "///",
115 | "http:///a",
116 | "foo.com",
117 | "rdar://1234",
118 | "h://test",
119 | "http:// shouldfail.com",
120 | ":// should fail",
121 | "http://foo.bar/foo(bar)baz quux",
122 | "ftps://foo.bar/",
123 | "http://-error-.invalid/",
124 | "http://-a.b.co",
125 | "http://a.b-.co",
126 | "http://0.0.0.0",
127 | "http://3628126748",
128 | "http://.www.foo.bar/",
129 | "http://www.foo.bar./",
130 | "http://.www.foo.bar./")
131 |
132 | all(grepl(re, good) == TRUE)
133 |
134 | all(grepl(re, bad) == FALSE)
135 | ```
136 |
137 | You can now see the power and expressiveness of building regular expressions with `rex`!
138 |
--------------------------------------------------------------------------------