├── Python Regular Expressions Made Easy # 14+15 - Look arounds .ipynb ├── Python Regular Expressions Made Easy - Part 16 - Negative Look Arounds.ipynb ├── Python Regular Expressions Made Easy - Part 17 - Variable Width Assertions with Look Behinds.ipynb ├── README.md ├── Regular Expressions Made Easy - part -10-Flags .ipynb ├── Regular Expressions Made Easy - part 11-Re.Split .ipynb ├── Regular Expressions Made Easy - part 12 - re.Sub.ipynb ├── Regular Expressions Made Easy -13 - Word Boundaries.ipynb ├── Regular Expressions made Easy - part 1 + part 2.ipynb ├── Regular Expressions made Easy - part 3.ipynb ├── Regular Expressions made Easy - part 4.ipynb ├── Regular Expressions made Easy - part 5 + 6.ipynb ├── Regular Expressions made Easy - part 7 .ipynb ├── Regular Expressions made Easy - part 8 .ipynb └── Regular Expressions made Easy - part 9 .ipynb /Python Regular Expressions Made Easy # 14+15 - Look arounds .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#Welcome to look arounds" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "#Allows us to confirm that some sort of subpattern is ahead or behind\n", 23 | "#main pattern\n", 24 | "\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": { 31 | "collapsed": true 32 | }, 33 | "outputs": [], 34 | "source": [ 35 | "import re" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "# 4 types of look arounds\n", 47 | "\n", 48 | "Positive look ahead ?=\n", 49 | "\n", 50 | "Negative look ahead ?!\n", 51 | "\n", 52 | "Positive look behind 
?<=\n", 53 | "\n", 54 | "Negative look behind ?" 203 | ] 204 | }, 205 | "execution_count": 8, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "re.search(pattern, string)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Difference between non-capture groups and look arounds\n" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": { 225 | "collapsed": true 226 | }, 227 | "outputs": [], 228 | "source": [ 229 | "#Look aheads don't consume, non-capture groups consume" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 11, 235 | "metadata": { 236 | "collapsed": true 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "string ='abababacb' #wherever a's are surrounded by b\n", 241 | " #so in our case we have two cases" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 12, 247 | "metadata": { 248 | "collapsed": true 249 | }, 250 | "outputs": [], 251 | "source": [ 252 | "pattern = re.compile('(?:b)(a)(?:b)') #non-capturing group" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 13, 258 | "metadata": { 259 | "collapsed": false 260 | }, 261 | "outputs": [ 262 | { 263 | "data": { 264 | "text/plain": [ 265 | "['a']" 266 | ] 267 | }, 268 | "execution_count": 13, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "re.findall(pattern, string)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "collapsed": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "string ='abababacb'" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 14, 291 | "metadata": { 292 | "collapsed": true 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "pattern = re.compile('(?<=b)(a)(?=b)') #look around" 297 | ] 298 | }, 299 | 
{ 300 | "cell_type": "code", 301 | "execution_count": 15, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "['a', 'a']" 310 | ] 311 | }, 312 | "execution_count": 15, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "re.findall(pattern, string)" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "#### Capture the entire look around " 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 17, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "string ='abababacb'" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 18, 342 | "metadata": { 343 | "collapsed": false 344 | }, 345 | "outputs": [], 346 | "source": [ 347 | "pattern = re.compile('(?=(bab))') " 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": 19, 353 | "metadata": { 354 | "collapsed": false 355 | }, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 360 | "['bab', 'bab']" 361 | ] 362 | }, 363 | "execution_count": 19, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "re.findall(pattern, string)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "metadata": { 376 | "collapsed": true 377 | }, 378 | "outputs": [], 379 | "source": [ 380 | "#Another example of positive look ahead" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 20, 386 | "metadata": { 387 | "collapsed": true 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "import re\n", 392 | "string = 'I love cherries, apples, and strawberries.'" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 21, 398 | "metadata": { 399 | "collapsed": true 400 | }, 401 | "outputs": [], 402 | "source": [ 403 | "pattern2 
= re.compile(r'(\\w+)(?=\\.|,)')" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 22, 409 | "metadata": { 410 | "collapsed": false 411 | }, 412 | "outputs": [ 413 | { 414 | "data": { 415 | "text/plain": [ 416 | "['cherries', 'apples', 'strawberries']" 417 | ] 418 | }, 419 | "execution_count": 22, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "re.findall(pattern2,string)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 23, 431 | "metadata": { 432 | "collapsed": true 433 | }, 434 | "outputs": [], 435 | "source": [ 436 | "pattern2 = re.compile(r'(\\w+)(?:\\.|,)')" 437 | ] 438 | }, 439 | { 440 | "cell_type": "code", 441 | "execution_count": 24, 442 | "metadata": { 443 | "collapsed": false 444 | }, 445 | "outputs": [ 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "['cherries', 'apples', 'strawberries']" 450 | ] 451 | }, 452 | "execution_count": 24, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "re.findall(pattern2,string)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "### consecutive look around fallacy" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 29, 471 | "metadata": { 472 | "collapsed": true 473 | }, 474 | "outputs": [], 475 | "source": [ 476 | "string = '''cherry 100 red\n", 477 | " apple 150 green\n", 478 | " grapes 200 \n", 479 | " '''\n" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 30, 485 | "metadata": { 486 | "collapsed": false 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "pattern = re.compile(r'[a-z]+\\s*(?= \\d+)(?=\\s*)(?=[a-z]+)') #zero-width assertion" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 31, 496 | "metadata": { 497 | "collapsed": false 498 | }, 499 | "outputs": [ 500 | { 501 | "data": { 502 | "text/plain": [ 503 | 
"[]" 504 | ] 505 | }, 506 | "execution_count": 31, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "re.findall(pattern,string) #consecutives are not really consecutives" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 34, 518 | "metadata": { 519 | "collapsed": true 520 | }, 521 | "outputs": [], 522 | "source": [ 523 | "pattern = re.compile(r'[a-z]+\\s*(?=\\d+\\s*[a-z]+)')" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 35, 529 | "metadata": { 530 | "collapsed": false 531 | }, 532 | "outputs": [ 533 | { 534 | "data": { 535 | "text/plain": [ 536 | "['cherry ', 'apple ']" 537 | ] 538 | }, 539 | "execution_count": 35, 540 | "metadata": {}, 541 | "output_type": "execute_result" 542 | } 543 | ], 544 | "source": [ 545 | "re.findall(pattern,string)" 546 | ] 547 | }, 548 | { 549 | "cell_type": "code", 550 | "execution_count": null, 551 | "metadata": { 552 | "collapsed": true 553 | }, 554 | "outputs": [], 555 | "source": [] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 36, 560 | "metadata": { 561 | "collapsed": true 562 | }, 563 | "outputs": [], 564 | "source": [ 565 | "#Password validation example #order doesn't matter" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 37, 571 | "metadata": { 572 | "collapsed": false 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "pattern = re.compile('(?=.*[a-z])(?=.*[A-Z])(?=.*[0-9])(?=.*[!?.])\\S+')" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 38, 582 | "metadata": { 583 | "collapsed": true 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "string = 'AZN#3232!abbb32..'\n", 588 | "string2 = 'AZN#3232abbb3232'" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 39, 594 | "metadata": { 595 | "collapsed": false 596 | }, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "<_sre.SRE_Match object; 
span=(0, 17), match='AZN#3232!abbb32..'>" 602 | ] 603 | }, 604 | "execution_count": 39, 605 | "metadata": {}, 606 | "output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "re.search(pattern, string)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": 42, 616 | "metadata": { 617 | "collapsed": false 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "re.search(pattern, string2)" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": null, 627 | "metadata": { 628 | "collapsed": true 629 | }, 630 | "outputs": [], 631 | "source": [] 632 | }, 633 | { 634 | "cell_type": "code", 635 | "execution_count": null, 636 | "metadata": { 637 | "collapsed": true 638 | }, 639 | "outputs": [], 640 | "source": [ 641 | "#If we didn't use look arounds we would need to use this complicated pattern\n", 642 | "\n", 643 | "(?:.*[a-z].*[A-Z].*[0-9].*[!?.])|(?:.*[A-Z].*[a-z].*[0-9].*[!?.])|(?:.*[0-9].*[a-z].*[A-Z].*[!?.])|(?:.*[!?.].*[a-z].*[A-Z].*[0-9])|(?:.*[A-Z].*[a-z].*[!?.].*[0-9])|..." 
644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": { 650 | "collapsed": true 651 | }, 652 | "outputs": [], 653 | "source": [] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": null, 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "outputs": [], 662 | "source": [] 663 | }, 664 | { 665 | "cell_type": "code", 666 | "execution_count": null, 667 | "metadata": { 668 | "collapsed": true 669 | }, 670 | "outputs": [], 671 | "source": [] 672 | } 673 | ], 674 | "metadata": { 675 | "kernelspec": { 676 | "display_name": "Python 3", 677 | "language": "python", 678 | "name": "python3" 679 | }, 680 | "language_info": { 681 | "codemirror_mode": { 682 | "name": "ipython", 683 | "version": 3 684 | }, 685 | "file_extension": ".py", 686 | "mimetype": "text/x-python", 687 | "name": "python", 688 | "nbconvert_exporter": "python", 689 | "pygments_lexer": "ipython3", 690 | "version": "3.6.0" 691 | } 692 | }, 693 | "nbformat": 4, 694 | "nbformat_minor": 1 695 | } 696 | -------------------------------------------------------------------------------- /Python Regular Expressions Made Easy - Part 16 - Negative Look Arounds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "Negative look ahead ?!\n", 12 | "\n", 13 | "Negative look behind ?, line 1)", 49 | "output_type": "error", 50 | "traceback": [ 51 | "\u001b[1;36m File \u001b[1;32m\"\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m ^ = matches at beginning of string\u001b[0m\n\u001b[1;37m ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "^ = matches at beginning of string\n", 57 | "$ = matches at end of string\n", 58 | "\n" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 
| "execution_count": 4, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "#Below examples exactly the same" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 5, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "re.search('^North Korea\\.?', string)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 6, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "re.match('North Korea\\.?', string)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 7, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "<_sre.SRE_Match object; span=(557, 569), match='North Korea.'>" 105 | ] 106 | }, 107 | "execution_count": 7, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "re.search('North Korea\\.?$', string)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 8, 119 | "metadata": { 120 | "collapsed": true 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "#Only benefit to re.search is the MULTILINE flag" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "## re.MULTILINE/ re.M" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 9, 137 | "metadata": { 138 | "collapsed": false 139 | }, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "<_sre.SRE_Match object; span=(331, 343), match='North Korea.'>" 145 | ] 146 | }, 147 | "execution_count": 9, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "re.search('^North Korea\\.?', string, flags = re.MULTILINE ) " 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 10, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": 
[], 163 | "source": [ 164 | "re.match('^North Korea\\.?', string, flags = re.MULTILINE ) #no match" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "source": [ 173 | "## re.IGNORECASE / re.I" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 11, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "data": { 185 | "text/plain": [ 186 | "['North Korea', 'North Korea', 'north korea', 'North Korea']" 187 | ] 188 | }, 189 | "execution_count": 11, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "re.findall('North Korea', string, flags = re.IGNORECASE) #re.I" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "collapsed": true 202 | }, 203 | "source": [ 204 | "## re.DOTALL / re.S" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 12, 210 | "metadata": { 211 | "collapsed": false 212 | }, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "'U.S. stock-index futures pointed'" 218 | ] 219 | }, 220 | "execution_count": 12, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "re.match('.*', string).group() #grabs the first line only\n", 227 | " ##only used with . character" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 13, 233 | "metadata": { 234 | "collapsed": false 235 | }, 236 | "outputs": [ 237 | { 238 | "data": { 239 | "text/plain": [ 240 | "'U.S. stock-index futures pointed\\nto a solidly higher open on \\nMonday, indicating that major \\nbenchmarks were poised to rebound \\nfrom last week’s sharp decline, \\nwhich represented their biggest weekly drops in months.\\nThat weakness was driven in part by \\nfears over North Korea, where tensions \\nwith the U.S. have been escalating. \\nNorth Korea. 
That issue overshadowed the state of \\nthe equity market, where earnings \\nhave been strong at a time of high \\nemployment and low inflation, \\nas well as valuations that a\\nppear elevated by many metrics, north korea North Korea.'" 241 | ] 242 | }, 243 | "execution_count": 13, 244 | "metadata": {}, 245 | "output_type": "execute_result" 246 | } 247 | ], 248 | "source": [ 249 | "re.match('.*', string, flags = re.DOTALL).group() #includes new lines" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": { 256 | "collapsed": true 257 | }, 258 | "outputs": [], 259 | "source": [ 260 | "## Other flags" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": { 267 | "collapsed": false 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "\n", 272 | "re.ASCII\n", 273 | "re.DEBUG\n", 274 | "re.LOCALE" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 14, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "Japanese_string ='【北京時事】中国商務省は14日、核'\n", 286 | "\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 16, 292 | "metadata": { 293 | "collapsed": false 294 | }, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/plain": [ 299 | "<_sre.SRE_Match object; span=(16, 17), match='核'>" 300 | ] 301 | }, 302 | "execution_count": 16, 303 | "metadata": {}, 304 | "output_type": "execute_result" 305 | } 306 | ], 307 | "source": [ 308 | "re.search('核', Japanese_string)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": { 315 | "collapsed": true 316 | }, 317 | "outputs": [], 318 | "source": [] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 
331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.6.0" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 1 342 | } 343 | -------------------------------------------------------------------------------- /Regular Expressions Made Easy - part 11-Re.Split .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## re.methods" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "### re.split" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 17, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "['Today is sunny', ' I want go to the park', ' I want to eat ice cream', '']" 39 | ] 40 | }, 41 | "execution_count": 17, 42 | "metadata": {}, 43 | "output_type": "execute_result" 44 | } 45 | ], 46 | "source": [ 47 | "#Example 1\n", 48 | "re.split('\\.','Today is sunny. I want go to the park. I want to eat ice cream.')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 18, 54 | "metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/plain": [ 61 | "['Today is sunny',\n", 62 | " '.',\n", 63 | " ' I want go to the park',\n", 64 | " '.',\n", 65 | " ' I want to eat ice cream',\n", 66 | " '.',\n", 67 | " '']" 68 | ] 69 | }, 70 | "execution_count": 18, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "#includes split point\n", 77 | "re.split('(\\.)','Today is sunny. I want go to the park. 
I want to eat ice cream.')" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 19, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "['Today is sunny.',\n", 91 | " ' I want go to the park.',\n", 92 | " ' I want to eat ice cream.',\n", 93 | " '.']" 94 | ] 95 | }, 96 | "execution_count": 19, 97 | "metadata": {}, 98 | "output_type": "execute_result" 99 | } 100 | ], 101 | "source": [ 102 | "split = '.'\n", 103 | "[i+split for i in re.split('\\.','Today is sunny. I want go to the park. I want to eat ice cream.')]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": { 110 | "collapsed": true 111 | }, 112 | "outputs": [], 113 | "source": [ 114 | "#Example 2:\n", 115 | "\n", 116 | "string = '

My mother has blue eyes.

'\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": false 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "re.split('<\\w+>', string) #doesn't work" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "re.split('<.+>', string) #captures entire string \n", 139 | " #because it's greedy" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "re.split(\"<[^<>]+>\", string) #empty string problem" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "#Another Example of split and empty strings problem" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "re.split(',', ',happy, birthday,') #It seems to split at empty strings" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Handling empty string problem" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "# list comprehensions" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": false 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "[i for i in re.split(\"<[^<>]+>\", string) if i != '']" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": null, 207 | "metadata": { 208 | "collapsed": true 209 | }, 210 | "outputs": [], 211 | "source": [ 212 | 
"#Alternatives to split --" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "string = '

My mother has blue eyes.

'" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": { 230 | "collapsed": false 231 | }, 232 | "outputs": [], 233 | "source": [ 234 | "re.findall('>([^<]+)<',string) #findall" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "collapsed": false 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "string = re.split(',', ',happy, birthday,')\n", 246 | "\n", 247 | "re.split(',', ',happy, birthday,')" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "string = ',happy, birthday,'" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "collapsed": false 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "list(filter(None, string.split(',')))" 270 | ] 271 | } 272 | ], 273 | "metadata": { 274 | "kernelspec": { 275 | "display_name": "Python 3", 276 | "language": "python", 277 | "name": "python3" 278 | }, 279 | "language_info": { 280 | "codemirror_mode": { 281 | "name": "ipython", 282 | "version": 3 283 | }, 284 | "file_extension": ".py", 285 | "mimetype": "text/x-python", 286 | "name": "python", 287 | "nbconvert_exporter": "python", 288 | "pygments_lexer": "ipython3", 289 | "version": "3.6.0" 290 | } 291 | }, 292 | "nbformat": 4, 293 | "nbformat_minor": 1 294 | } 295 | -------------------------------------------------------------------------------- /Regular Expressions Made Easy - part 12 - re.Sub.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import re" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### re.sub" 19 | ] 20 | 
}, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "string =\"\"\"U.S. stock-index futures pointed\n", 30 | "to a solidly higher open on Monday, \n", 31 | "indicating that major \n", 32 | "benchmarks were poised to USA reboundfrom last week’s sharp decline, \n", 33 | "\\nwhich represented their biggest weekly drops in months.\"\"\"" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": false 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "United States stock-index futures pointed\n", 48 | "to a solidly higher open on Monday, \n", 49 | "indicating that major \n", 50 | "benchmarks were poised to United States A reboundfrom last week’s sharp decline, \n", 51 | "\n", 52 | "which represented their biggest weekly drops in months.\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "print(re.sub('U.S.|US|USA', 'United States ', string ))" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "source": [ 66 | "## Using Functions with Sub" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": { 73 | "collapsed": true 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "#brief explanation of lambda" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": { 84 | "collapsed": false 85 | }, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "9" 91 | ] 92 | }, 93 | "execution_count": 4, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "def square(x):\n", 100 | " return (x ** 2)\n", 101 | "\n", 102 | "square(3)\n", 103 | "\n" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | 
{ 114 | "data": { 115 | "text/plain": [ 116 | "9" 117 | ] 118 | }, 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "square = lambda x: x**2 \n", 126 | "square(3)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 7, 132 | "metadata": { 133 | "collapsed": true 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "string = 'Dan has 3 snails. Mike has 4 cats. Alisa has 9 monkeys.'" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 8, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "'3'" 151 | ] 152 | }, 153 | "execution_count": 8, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "re.search('(\\d+)', string).group()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 9, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "['3', '4', '9']" 173 | ] 174 | }, 175 | "execution_count": 9, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "re.findall('(\\d+)', string)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 10, 187 | "metadata": { 188 | "collapsed": false 189 | }, 190 | "outputs": [ 191 | { 192 | "data": { 193 | "text/plain": [ 194 | "'Dan has 1 snails. Mike has 1 cats. 
Alisa has 1 monkeys.'" 195 | ] 196 | }, 197 | "execution_count": 10, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "re.sub('(\\d+)', '1', string) #find all instances like findall" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "#In this example we change the " 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 15, 220 | "metadata": { 221 | "collapsed": false 222 | }, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 227 | "'Dan has 9 snails. Mike has 16 cats. Alisa has 81 monkeys.'" 228 | ] 229 | }, 230 | "execution_count": 15, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "re.sub('(\\d+)', lambda x: str(square(int(x.group(0)))), string)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": { 243 | "collapsed": true 244 | }, 245 | "outputs": [], 246 | "source": [ 247 | "#re.sub('(\\d+)', lambda x: str(x), string)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "#step 1 lambda x: x.group x is match object\n", 259 | "#step 2 turn the result into int\n", 260 | "#step 3 Use Square function\n", 261 | "#step 4 turn back to string" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "# Another example with function" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 16, 278 | "metadata": { 279 | "collapsed": false 280 | }, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "eating laughing sleeping 
studying\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "#m = match object\n", 292 | "import re\n", 293 | "\n", 294 | "# The input string.\n", 295 | "input = \"eat laugh sleep study\"\n", 296 | "\n", 297 | "# Use lambda to add \"ing\" to all words.\n", 298 | "result = re.sub(\"\\w+\", lambda m: m.group() + \"ing\", input)\n", 299 | "\n", 300 | "# Display result.\n", 301 | "print(result)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": { 308 | "collapsed": false 309 | }, 310 | "outputs": [], 311 | "source": [ 312 | "import re\n", 313 | "\n", 314 | "# The input string.\n", 315 | "input = \"eat laugh sleep study\"\n", 316 | "\n", 317 | "# Use lambda to add \"ing\" to all words.\n", 318 | "result = re.sub(\"\\w+\", lambda m: m.group() + \"ing\", input)\n", 319 | "\n", 320 | "# Display result.\n", 321 | "print(result)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": { 327 | "collapsed": true 328 | }, 329 | "source": [ 330 | "### backreferencing with subs" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 18, 336 | "metadata": { 337 | "collapsed": true 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "string = 'Merry Merry Christmas'" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 19, 347 | "metadata": { 348 | "collapsed": false 349 | }, 350 | "outputs": [ 351 | { 352 | "data": { 353 | "text/plain": [ 354 | "('Merry ', 'Merry ')" 355 | ] 356 | }, 357 | "execution_count": 19, 358 | "metadata": {}, 359 | "output_type": "execute_result" 360 | } 361 | ], 362 | "source": [ 363 | "re.search(r'(\\w+ )(\\1)', string).groups()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 20, 369 | "metadata": { 370 | "collapsed": false 371 | }, 372 | "outputs": [ 373 | { 374 | "data": { 375 | "text/plain": [ 376 | "('Merry ', 'Merry ')" 377 | ] 378 | }, 379 | "execution_count": 20, 380 | "metadata": {}, 381 | 
"output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "re.search(r'(\\w+ )(\\1)', string).group(1,2)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "#backreferencing example with sub" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 21, 402 | "metadata": { 403 | "collapsed": false 404 | }, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "'Happy Merry Christmas'" 410 | ] 411 | }, 412 | "execution_count": 21, 413 | "metadata": {}, 414 | "output_type": "execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "re.sub(r'(\\w+) (\\1)',r'Happy \\1', string) # \\1 = Merry" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 22, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "'Merry Happy Christmas'" 432 | ] 433 | }, 434 | "execution_count": 22, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "re.sub(r'(\\w+) (\\1)',r'\\1 Happy', string) #Merry Happy" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 23, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "'Happy Merry Christmas'" 454 | ] 455 | }, 456 | "execution_count": 23, 457 | "metadata": {}, 458 | "output_type": "execute_result" 459 | } 460 | ], 461 | "source": [ 462 | "re.sub(r'(\\w+) (\\1)',r'Happy \\2', string)" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 
482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.6.0" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | -------------------------------------------------------------------------------- /Regular Expressions Made Easy -13 - Word Boundaries.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### \\b meta character" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | " \\b is called 'boundary' and allows you to isolate words. \n", 17 | "\n", 18 | "- is similar to ^ and $ (location and no consumption)\n" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "import re" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "string = 'cat catherine catholic wildcat copycat uncatchable'" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 2, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "pattern = re.compile('cat')" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 5, 57 | "metadata": { 58 | "collapsed": false 59 | }, 60 | "outputs": [ 61 | { 62 | "data": { 63 | "text/plain": [ 64 | "['cat', 'cat', 'cat', 'cat', 'cat', 'cat']" 65 | ] 66 | }, 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "output_type": "execute_result" 70 | } 71 | ], 72 | "source": [ 73 | "re.findall(pattern, string)" 74 | ] 75 | }, 76 | { 77 | 
"cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "#using space" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 6, 90 | "metadata": { 91 | "collapsed": true 92 | }, 93 | "outputs": [], 94 | "source": [ 95 | "string = 'cat catherine catholic wildcat copycat uncatchable'" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 9, 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "outputs": [], 105 | "source": [ 106 | "pattern = re.compile(' cat ')" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 10, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "[]" 120 | ] 121 | }, 122 | "execution_count": 10, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "re.findall(pattern, string)" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#only pull out cat with boundary" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "string = 'cat catherine catholic wildcat copycat uncatchable'" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 11, 156 | "metadata": { 157 | "collapsed": true 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "pattern = re.compile(r'\\bcat\\b')" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 12, 167 | "metadata": { 168 | "collapsed": false 169 | }, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "['cat']" 175 | ] 176 | }, 177 | "execution_count": 12, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 
182 | "source": [ 183 | "re.findall(pattern, string)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "### Word boundaries nuances\n" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "#be careful with periods(dot) and non-alphanumeric characters \n", 202 | "# \\w [A-Za-z0-9_] \\W +:@^%" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 16, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "string = '.cat catherine catholic wildcat copycat uncatchable'" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 17, 219 | "metadata": { 220 | "collapsed": true 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "pattern = re.compile(r'\\bcat\\b')" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 18, 230 | "metadata": { 231 | "collapsed": false 232 | }, 233 | "outputs": [ 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "['cat']" 238 | ] 239 | }, 240 | "execution_count": 18, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "re.findall(pattern, string)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": { 253 | "collapsed": true 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "# . 
= nonalpha numeric" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": null, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#One side has to have an alphanumeric character and the other side \n", 269 | "#is non alphanumeric character" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 19, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "string = '@cat cat catherine catholic wildcat copycat uncatchable'" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 21, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "pattern = re.compile(r'\\bcat\\b')" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 22, 306 | "metadata": { 307 | "collapsed": false 308 | }, 309 | "outputs": [ 310 | { 311 | "data": { 312 | "text/plain": [ 313 | "['cat', 'cat']" 314 | ] 315 | }, 316 | "execution_count": 22, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "re.findall(pattern, string)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "#Example 2 Twitter examples Twitter Handles" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 24, 339 | "metadata": { 340 | "collapsed": false 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "string = '@moondra2017.org'\n", 345 | "string2 = '@moondra'\n", 346 | "string3 = 'Python@moondra'\n", 347 | "string4 = '@moondra_python'" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": { 354 
| "collapsed": true 355 | }, 356 | "outputs": [], 357 | "source": [ 358 | "#we only want @moondra and '@moondra_python' -- string 2 and string 4" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 28, 364 | "metadata": { 365 | "collapsed": false 366 | }, 367 | "outputs": [], 368 | "source": [ 369 | "pattern = re.compile(r'\\b@[\\w]+\\b') #no good\n", 370 | "re.search(pattern, string)" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": { 377 | "collapsed": true 378 | }, 379 | "outputs": [], 380 | "source": [ 381 | "string = '@moondra2017.org'\n", 382 | "string2 = '@moondra'\n", 383 | "string3 = 'Python@moondra'\n", 384 | "string4 = '@moondra_python'" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 32, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "<_sre.SRE_Match object; span=(0, 12), match='@moondra2017'>" 398 | ] 399 | }, 400 | "execution_count": 32, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "pattern = re.compile(r'\\B@[\\w]+\\b') # _ is include in \\w\n", 407 | "re.search(pattern, string) # This works but not perfect" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 41, 413 | "metadata": { 414 | "collapsed": true 415 | }, 416 | "outputs": [], 417 | "source": [ 418 | "string = '@moondra2017.org'\n", 419 | "string2 = '@moondra @moondra @moondra'\n", 420 | "string3 = 'Python@moondra'\n", 421 | "string4 = '@moondra_python'" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 48, 427 | "metadata": { 428 | "collapsed": false 429 | }, 430 | "outputs": [ 431 | { 432 | "data": { 433 | "text/plain": [ 434 | "[]" 435 | ] 436 | }, 437 | "execution_count": 48, 438 | "metadata": {}, 439 | "output_type": "execute_result" 440 | } 441 | ], 442 | "source": [ 443 | "pattern = 
re.compile(r'\\B@[\\w]+\\b(?!\\.)')\n", 444 | "re.findall(pattern, string)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 33, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "pattern = re.compile(r'\\B@[\\w]+$') # #This is perfect\n", 456 | "re.search(pattern, string)" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 38, 462 | "metadata": { 463 | "collapsed": false 464 | }, 465 | "outputs": [ 466 | { 467 | "data": { 468 | "text/plain": [ 469 | "['@moondra']" 470 | ] 471 | }, 472 | "execution_count": 38, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | } 476 | ], 477 | "source": [ 478 | "pattern = re.compile(r'\\B@[\\w]+$') \n", 479 | "re.findall(pattern, string2)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 35, 485 | "metadata": { 486 | "collapsed": false 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "pattern = re.compile(r'\\B@[\\w]+$') \n", 491 | "re.search(pattern, string3)\n" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 36, 497 | "metadata": { 498 | "collapsed": false 499 | }, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/plain": [ 504 | "<_sre.SRE_Match object; span=(0, 15), match='@moondra_python'>" 505 | ] 506 | }, 507 | "execution_count": 36, 508 | "metadata": {}, 509 | "output_type": "execute_result" 510 | } 511 | ], 512 | "source": [ 513 | "pattern = re.compile(r'\\B@[\\w]+$')\n", 514 | "re.search(pattern, string4)" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": { 521 | "collapsed": true 522 | }, 523 | "outputs": [], 524 | "source": [] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": { 530 | "collapsed": true 531 | }, 532 | "outputs": [], 533 | "source": [] 534 | } 535 | ], 536 | "metadata": { 537 | "kernelspec": { 538 | "display_name": "Python 3", 539 | 
"language": "python", 540 | "name": "python3" 541 | }, 542 | "language_info": { 543 | "codemirror_mode": { 544 | "name": "ipython", 545 | "version": 3 546 | }, 547 | "file_extension": ".py", 548 | "mimetype": "text/x-python", 549 | "name": "python", 550 | "nbconvert_exporter": "python", 551 | "pygments_lexer": "ipython3", 552 | "version": "3.6.0" 553 | } 554 | }, 555 | "nbformat": 4, 556 | "nbformat_minor": 1 557 | } 558 | -------------------------------------------------------------------------------- /Regular Expressions made Easy - part 1 + part 2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#Regular Expressions are used to match string patterns.\n", 12 | "-They are very powerful\n", 13 | "\n", 14 | "-If you want to pull out a string pattern RE can do it\n", 15 | "\n", 16 | "-They may seem intimidating \n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "# Things to note" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": null, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "#The first thing I want to start off with is the backslash character\n", 35 | "#Very confusing to people" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "#Python uses backslash to indicate special characters \n", 47 | "\n", 48 | "\n", 49 | "'\\n' Backslash followed by n denotes a newline\n", 50 | "'\\t' denotes a tab\n", 51 | "\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "\n", 63 | "\n", 64 | " 'r' expression, that voids 
the Python's special characters\n", 65 | "\n", 66 | " r'\\n' means it's a raw string with two characters 'n' and '\\' as \n", 67 | "opposed to just one special character' " 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "\n", 79 | "#Let's see some examples of this dont mind the python syntax\n" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 1, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "import re\n", 91 | "re.search('n', '\\n') #first item is pattern, second item is string" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 2, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "<_sre.SRE_Match object; span=(1, 2), match='n'>" 105 | ] 106 | }, 107 | "execution_count": 2, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "#two ways to handle this one way is to use \\ for every backslash\n", 114 | "\n", 115 | "import re\n", 116 | "re.search('n', '\\\\n') " 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "re.search('n', '\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n') #not the best way if we\n", 128 | " #have too many \\s" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "metadata": { 135 | "collapsed": false 136 | }, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "<_sre.SRE_Match object; span=(1, 2), match='n'>" 142 | ] 143 | }, 144 | "execution_count": 5, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "re.search('n', r'\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n') #r converts to raw string" 
151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 6, 156 | "metadata": { 157 | "collapsed": false 158 | }, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/plain": [ 163 | "<_sre.SRE_Match object; span=(0, 1), match='\\n'>" 164 | ] 165 | }, 166 | "execution_count": 6, 167 | "metadata": {}, 168 | "output_type": "execute_result" 169 | } 170 | ], 171 | "source": [ 172 | "#there are some nuances that you should be aware of\n", 173 | "#regular expressions have their own special characters as well\n", 174 | "# regex with '\\n' and r'\\n' both look for newline\n", 175 | "\n", 176 | "re.search('\\n', '\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n') " 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 7, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "<_sre.SRE_Match object; span=(0, 1), match='\\n'>" 190 | ] 191 | }, 192 | "execution_count": 7, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "re.search(r'\\n', '\\n\\n') #this works as well because r'\\n' also looks\n", 199 | " #for new line" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 8, 205 | "metadata": { 206 | "collapsed": false 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "#doesn't work because string doesn't use newline and r'\\n' looks for newline\n", 211 | "\n", 212 | "re.search(r'\\n', r'\\n\\n') #r\n", 213 | "\n" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "# MATCH and SEARCH EXAMPLES" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": null, 226 | "metadata": { 227 | "collapsed": true 228 | }, 229 | "outputs": [], 230 | "source": [ 231 | "REs common methods - Match and Search" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 
| }, 240 | "outputs": [], 241 | "source": [ 242 | "re.search(pattern, string, flags) # searches anywhere in the sentence\n", 243 | " #flags special options" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": { 250 | "collapsed": true 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "re.match(pattern, string, flags) # only beginning of the string" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 11, 260 | "metadata": { 261 | "collapsed": true 262 | }, 263 | "outputs": [], 264 | "source": [ 265 | "re.match(\"c\", \"abcdef\") #returns none because only looks at the start of string" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 10, 271 | "metadata": { 272 | "collapsed": false 273 | }, 274 | "outputs": [ 275 | { 276 | "data": { 277 | "text/plain": [ 278 | "<_sre.SRE_Match object; span=(2, 3), match='c'>" 279 | ] 280 | }, 281 | "execution_count": 10, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "re.search(\"c\", \"abcdef\") #searches anywhere" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "collapsed": true 295 | }, 296 | "outputs": [], 297 | "source": [] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 12, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "False" 310 | ] 311 | }, 312 | "execution_count": 12, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 318 | "bool(re.match(\"c\", \"abcdef\")) # no match returns boolean false" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 13, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "True" 332 | ] 333 | }, 334 | "execution_count": 13, 335 | "metadata": 
{}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "bool(re.match(\"a\", \"abcdef\")) #match returns true" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 14, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "<_sre.SRE_Match object; span=(2, 3), match='c'>" 354 | ] 355 | }, 356 | "execution_count": 14, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "re.search(\"c\", \"abcdef\") #tells you where it matched first and only first" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 15, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [ 372 | { 373 | "data": { 374 | "text/plain": [ 375 | "<_sre.SRE_Match object; span=(2, 3), match='c'>" 376 | ] 377 | }, 378 | "execution_count": 15, 379 | "metadata": {}, 380 | "output_type": "execute_result" 381 | } 382 | ], 383 | "source": [ 384 | "re.search(\"c\", \"abcdefc\") #multiple 'c's first instance only" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 16, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "<_sre.SRE_Match object; span=(6, 7), match='c'>" 398 | ] 399 | }, 400 | "execution_count": 16, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "re.search(\"c\", \"abdef\\nc\") #multiline works with search" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 17, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": [], 416 | "source": [ 417 | "re.match(\"c\", \"abcdef\\nc\") #match doesn't work with newline" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "metadata": {}, 423 | "source": [ 424 | "## Printing the output of match and search" 425 | ] 426 | }, 427 | { 428 | "cell_type": 
"code", 429 | "execution_count": 23, 430 | "metadata": { 431 | "collapsed": false 432 | }, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "text/plain": [ 437 | "<_sre.SRE_Match object; span=(0, 1), match='a'>" 438 | ] 439 | }, 440 | "execution_count": 23, 441 | "metadata": {}, 442 | "output_type": "execute_result" 443 | } 444 | ], 445 | "source": [ 446 | "(re.match(\"a\", \"abcdef\")) #match objects" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": 19, 452 | "metadata": { 453 | "collapsed": false 454 | }, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "'a'" 460 | ] 461 | }, 462 | "execution_count": 19, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "re.match(\"a\", \"abcdef\").group() #string output #default value is 0" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": 20, 474 | "metadata": { 475 | "collapsed": false 476 | }, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "'a'" 482 | ] 483 | }, 484 | "execution_count": 20, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "re.match(\"a\", \"abcdef\").group(0) " 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 21, 496 | "metadata": { 497 | "collapsed": false 498 | }, 499 | "outputs": [ 500 | { 501 | "data": { 502 | "text/plain": [ 503 | "'n'" 504 | ] 505 | }, 506 | "execution_count": 21, 507 | "metadata": {}, 508 | "output_type": "execute_result" 509 | } 510 | ], 511 | "source": [ 512 | "re.search(\"n\", \"abcdefnc abcd\").group()" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 22, 518 | "metadata": { 519 | "collapsed": false 520 | }, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "'nc abcd'" 526 | ] 527 | }, 528 | "execution_count": 22, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": 
[ 534 | "re.search('n.+', \"abcdefnc abcd\").group() #pull out different types of strings \n", 535 | " #depending on the wildcards you use" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 24, 541 | "metadata": { 542 | "collapsed": false 543 | }, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/plain": [ 548 | "6" 549 | ] 550 | }, 551 | "execution_count": 24, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "re.search(\"c\", \"abdef\\nc\").start()" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": 25, 563 | "metadata": { 564 | "collapsed": false 565 | }, 566 | "outputs": [ 567 | { 568 | "data": { 569 | "text/plain": [ 570 | "7" 571 | ] 572 | }, 573 | "execution_count": 25, 574 | "metadata": {}, 575 | "output_type": "execute_result" 576 | } 577 | ], 578 | "source": [ 579 | "re.search(\"c\", \"abdef\\nc\").end()" 580 | ] 581 | }, 582 | { 583 | "cell_type": "markdown", 584 | "metadata": { 585 | "collapsed": true 586 | }, 587 | "source": [ 588 | "## Literal matching" 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": 26, 594 | "metadata": { 595 | "collapsed": true 596 | }, 597 | "outputs": [], 598 | "source": [ 599 | "re.search('na',\"abcdefnc abcd\" ) #doesn't work, because they are ordered" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": 27, 605 | "metadata": { 606 | "collapsed": false 607 | }, 608 | "outputs": [ 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "<_sre.SRE_Match object; span=(0, 1), match='a'>" 613 | ] 614 | }, 615 | "execution_count": 27, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "re.search('n|a',\"abcdefnc abcda\" ) #n or a" 622 | ] 623 | }, 624 | { 625 | "cell_type": "code", 626 | "execution_count": 28, 627 | "metadata": { 628 | "collapsed": false 629 | }, 630 | "outputs": [ 631 | { 632 | "data": { 633 | "text/plain": [ 
634 | "<_sre.SRE_Match object; span=(5, 6), match='n'>" 635 | ] 636 | }, 637 | "execution_count": 28, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | " re.search('n|a',\"bcdefnc abcda\" ) #replaced the a with b, first match is an n" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 29, 649 | "metadata": { 650 | "collapsed": false 651 | }, 652 | "outputs": [ 653 | { 654 | "data": { 655 | "text/plain": [ 656 | "<_sre.SRE_Match object; span=(0, 1), match='b'>" 657 | ] 658 | }, 659 | "execution_count": 29, 660 | "metadata": {}, 661 | "output_type": "execute_result" 662 | } 663 | ], 664 | "source": [ 665 | "re.search('n|a|b',\"bcdefnc abcda\" ) # as many OR expressions" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "## re.findall" 673 | ] 674 | }, 675 | { 676 | "cell_type": "code", 677 | "execution_count": 30, 678 | "metadata": { 679 | "collapsed": false 680 | }, 681 | "outputs": [ 682 | { 683 | "data": { 684 | "text/plain": [ 685 | "['n', 'a', 'a']" 686 | ] 687 | }, 688 | "execution_count": 30, 689 | "metadata": {}, 690 | "output_type": "execute_result" 691 | } 692 | ], 693 | "source": [ 694 | "re.findall('n|a',\"bcdefnc abcda\" ) #find all pulls out all instances" 695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 31, 700 | "metadata": { 701 | "collapsed": false 702 | }, 703 | "outputs": [ 704 | { 705 | "data": { 706 | "text/plain": [ 707 | "<_sre.SRE_Match object; span=(0, 4), match='abcd'>" 708 | ] 709 | }, 710 | "execution_count": 31, 711 | "metadata": {}, 712 | "output_type": "execute_result" 713 | } 714 | ], 715 | "source": [ 716 | "re.search('abcd',\"abcdefnc abcd\" ) #multiple characters - literal search" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": null, 722 | "metadata": { 723 | "collapsed": true 724 | }, 725 | "outputs": [], 726 | "source": [ 727 | 
"re.findall('abcd',\"abcdefnc abcd\" ) " 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": { 734 | "collapsed": true 735 | }, 736 | "outputs": [], 737 | "source": [] 738 | } 739 | ], 740 | "metadata": { 741 | "kernelspec": { 742 | "display_name": "Python 3", 743 | "language": "python", 744 | "name": "python3" 745 | }, 746 | "language_info": { 747 | "codemirror_mode": { 748 | "name": "ipython", 749 | "version": 3 750 | }, 751 | "file_extension": ".py", 752 | "mimetype": "text/x-python", 753 | "name": "python", 754 | "nbconvert_exporter": "python", 755 | "pygments_lexer": "ipython3", 756 | "version": "3.6.0" 757 | } 758 | }, 759 | "nbformat": 4, 760 | "nbformat_minor": 1 761 | } 762 | -------------------------------------------------------------------------------- /Regular Expressions made Easy - part 3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## CHARACTER SETS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "#Character sets can match a set of characters" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 33, 24 | "metadata": { 25 | "collapsed": false 26 | }, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "<_sre.SRE_Match object; span=(0, 4), match='abcd'>" 32 | ] 33 | }, 34 | "execution_count": 33, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "re.search('abcd',\"abcdefnc abcd\" ) # earlier code" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 34, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "<_sre.SRE_Match object; span=(0, 4), match='abcd'>" 54 | ] 55 | }, 56 | 
"execution_count": 34, 57 | "metadata": {}, 58 | "output_type": "execute_result" 59 | } 60 | ], 61 | "source": [ 62 | "re.search(r'\\w\\w\\w\\w',\"abcdefnc abcd\" ) #matches characters and numbers\n", 63 | " #alpha numeric characters " 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "source": [ 72 | "\\w matches alpha numeric characters [a-zA-Z0-9_] " 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 35, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "data": { 84 | "text/plain": [ 85 | "<_sre.SRE_Match object; span=(0, 4), match='ab_c'>" 86 | ] 87 | }, 88 | "execution_count": 35, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "re.search(r'\\w\\w\\w\\w',\"ab_cdefnc abcd\" ) #matches _ character" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 36, 100 | "metadata": { 101 | "collapsed": true 102 | }, 103 | "outputs": [], 104 | "source": [ 105 | "re.search(r'\\w\\w\\w', \"a3.!-!\") #doesn't match symbols only numbers and \n", 106 | " # characters" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 37, 112 | "metadata": { 113 | "collapsed": false 114 | }, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "'a33'" 120 | ] 121 | }, 122 | "execution_count": 37, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "re.search(r'\\w\\w\\w', \"a33-_!\") .group()" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [ 139 | "#\\W opposite of \\w ; so nothing included in [a-zA-Z0-9_]" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 38, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 
| "<_sre.SRE_Match object; span=(0, 3), match='a3.'>" 153 | ] 154 | }, 155 | "execution_count": 38, 156 | "metadata": {}, 157 | "output_type": "execute_result" 158 | } 159 | ], 160 | "source": [ 161 | "\n", 162 | "re.search(r'\\w\\w\\W', \"a3.-_!\") # \\W matches non-word characters (anything outside [a-zA-Z0-9_])" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 39, 168 | "metadata": { 169 | "collapsed": false 170 | }, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "<_sre.SRE_Match object; span=(0, 3), match='a3 '>" 176 | ] 177 | }, 178 | "execution_count": 39, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "re.search(r'\\w\\w\\W', \"a3 .-_!\") #matches empty space as well" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "#We will go over other character sets later on" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "# Let's go over quantifiers" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "collapsed": true 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "#quantifiers\n", 214 | "#\n", 215 | "'+' = 1 or more\n", 216 | "'?' 
= 0 or 1\n", 217 | "'*' = 0 or more\n", 218 | "'{n,m}' = n to m repetitions {,3}, {3,}\n", 219 | "\n" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 40, 225 | "metadata": { 226 | "collapsed": false 227 | }, 228 | "outputs": [ 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "<_sre.SRE_Match object; span=(0, 2), match='ab'>" 233 | ] 234 | }, 235 | "execution_count": 40, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "re.search(r'\\w\\w',\"abcdefnc abcd\" )" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 41, 247 | "metadata": { 248 | "collapsed": false 249 | }, 250 | "outputs": [ 251 | { 252 | "data": { 253 | "text/plain": [ 254 | "'abcdefnc'" 255 | ] 256 | }, 257 | "execution_count": 41, 258 | "metadata": {}, 259 | "output_type": "execute_result" 260 | } 261 | ], 262 | "source": [ 263 | "re.search(r'\\w+',\"abcdefnc abcd\" ).group() #don't know the numbers of letters" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "\\w\\w\\w\\w\\w\\w\\w\\w\\w" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 42, 280 | "metadata": { 281 | "collapsed": false 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "'abcdefnc abcd'" 288 | ] 289 | }, 290 | "execution_count": 42, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "re.search(r'\\w+\\W+\\w+',\"abcdefnc abcd\").group()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 69, 302 | "metadata": { 303 | "collapsed": false 304 | }, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "'abcdefnc abcd'" 310 | ] 311 | }, 312 | "execution_count": 69, 313 | "metadata": {}, 314 | "output_type": "execute_result" 315 | } 316 | ], 317 | "source": [ 
318 | "re.search('\\w+\\W+\\w+',\"abcdefnc abcd\").group() #added spaces" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 44, 324 | "metadata": { 325 | "collapsed": false 326 | }, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "'abcdefnabcd'" 332 | ] 333 | }, 334 | "execution_count": 44, 335 | "metadata": {}, 336 | "output_type": "execute_result" 337 | } 338 | ], 339 | "source": [ 340 | "re.search(r'\\w+\\W?\\w+',\"abcdefnabcd\").group() # ? = 0 or 1 instances" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": 45, 346 | "metadata": { 347 | "collapsed": false 348 | }, 349 | "outputs": [ 350 | { 351 | "data": { 352 | "text/plain": [ 353 | "'abcde fnabcd'" 354 | ] 355 | }, 356 | "execution_count": 45, 357 | "metadata": {}, 358 | "output_type": "execute_result" 359 | } 360 | ], 361 | "source": [ 362 | "re.search(r'\\w+\\W?\\w+',\"abcde fnabcd\").group()" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 47, 368 | "metadata": { 369 | "collapsed": false 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "re.search(r'\\w+\\W+\\w+', \"abcdefnabcd\")" 374 | ] 375 | }, 376 | { 377 | "cell_type": "code", 378 | "execution_count": null, 379 | "metadata": { 380 | "collapsed": true 381 | }, 382 | "outputs": [], 383 | "source": [ 384 | "#Pulling out specific amounts" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 48, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "<_sre.SRE_Match object; span=(0, 3), match='aaa'>" 398 | ] 399 | }, 400 | "execution_count": 48, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "re.search(r'\\w{3}', 'aaaaaaaaaaa') #only 3 \\w characters" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 49, 412 | "metadata": { 413 | "collapsed": false 414 | }, 415 | "outputs": 
[ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "'aaaa'" 420 | ] 421 | }, 422 | "execution_count": 49, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "re.search(r'\\w{1,4}', 'aaaaaaaaaaa').group() #1 is min, 4 is max" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 50, 434 | "metadata": { 435 | "collapsed": false 436 | }, 437 | "outputs": [ 438 | { 439 | "data": { 440 | "text/plain": [ 441 | "'abcdefnc abcd'" 442 | ] 443 | }, 444 | "execution_count": 50, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | " \n", 451 | "re.search(r'\\w{1,10}\\W{0,4}\\w+',\"abcdefnc abcd\").group()#1-10 \\w characters,\n", 452 | " #0-4 \\W characters\n", 453 | " # 1+ \\w characters" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 51, 459 | "metadata": { 460 | "collapsed": false 461 | }, 462 | "outputs": [ 463 | { 464 | "data": { 465 | "text/plain": [ 466 | "'abcdefnc abcd'" 467 | ] 468 | }, 469 | "execution_count": 51, 470 | "metadata": {}, 471 | "output_type": "execute_result" 472 | } 473 | ], 474 | "source": [ 475 | "re.search(r'\\w{1,}\\W{0,}\\w+',\"abcdefnc abcd\").group() #at least 1\n", 476 | " #at least 0" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": { 483 | "collapsed": true 484 | }, 485 | "outputs": [], 486 | "source": [] 487 | } 488 | ], 489 | "metadata": { 490 | "kernelspec": { 491 | "display_name": "Python 3", 492 | "language": "python", 493 | "name": "python3" 494 | }, 495 | "language_info": { 496 | "codemirror_mode": { 497 | "name": "ipython", 498 | "version": 3 499 | }, 500 | "file_extension": ".py", 501 | "mimetype": "text/x-python", 502 | "name": "python", 503 | "nbconvert_exporter": "python", 504 | "pygments_lexer": "ipython3", 505 | "version": "3.6.0" 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 1 510 | } 511 | 
-------------------------------------------------------------------------------- /Regular Expressions made Easy - part 4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Other types of character sets" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "\n", 19 | "'\\d' = matches digits [0-9]\n", 20 | "'\\D' = matches any non-digit character; ~\\d" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "string = '23abced++'\n", 32 | "re.search('\\d+', string).group()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": { 39 | "collapsed": true 40 | }, 41 | "outputs": [], 42 | "source": [ 43 | "'\\s' = matches any whitespace character #new lines, tabs, spaces etc\n", 44 | "'\\S' = matches any non-whitespace character #~\\s" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 53, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/plain": [ 57 | "'23abced++'" 58 | ] 59 | }, 60 | "execution_count": 53, 61 | "metadata": {}, 62 | "output_type": "execute_result" 63 | } 64 | ], 65 | "source": [ 66 | "string = '23abced++'\n", 67 | "re.search('\\S+', string).group() #no spaces" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 54, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "string = '''Robots are branching out. A new prototype soft robot takes inspiration from plants by growing to explore its environment.\n", 79 | "\n", 80 | "Vines and some fungi extend from their tips to explore their surroundings. 
\n", 81 | "Elliot Hawkes of the University of California in Santa Barbara \n", 82 | "and his colleagues designed a bot that works \n", 83 | "on similar principles. Its mechanical body \n", 84 | "sits inside a plastic tube reel that extends \n", 85 | "through pressurized inflation, a method that some \n", 86 | "invertebrates like peanut worms (Sipunculus nudus)\n", 87 | "also use to extend their appendages. The plastic \n", 88 | "tubing has two compartments, and inflating one \n", 89 | "side or the other changes the extension direction. \n", 90 | "A camera sensor at the tip alerts the bot when it’s \n", 91 | "about to run into something.\n", 92 | "\n", 93 | "In the lab, Hawkes and his colleagues \n", 94 | "programmed the robot to form 3-D structures such \n", 95 | "as a radio antenna, turn off a valve, navigate a maze, \n", 96 | "swim through glue, act as a fire extinguisher, squeeze \n", 97 | "through tight gaps, shimmy through fly paper and slither \n", 98 | "across a bed of nails. The soft bot can extend up to \n", 99 | "72 meters, and unlike plants, it can grow at a speed of \n", 100 | "10 meters per second, the team reports July 19 in Science Robotics. \n", 101 | "The design could serve as a model for building robots \n", 102 | "that can traverse constrained environments\n", 103 | "\n", 104 | "This isn’t the first robot to take \n", 105 | "inspiration from plants. 
One plantlike \n", 106 | "predecessor was a robot modeled on roots.'''" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "(re.findall('\\S+', string) )" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": { 124 | "collapsed": true 125 | }, 126 | "outputs": [], 127 | "source": [ 128 | "' '.join(re.findall('\\S+', string))" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "metadata": { 135 | "collapsed": true 136 | }, 137 | "outputs": [], 138 | "source": [] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [ 148 | ". the dot matches any character except the newline." 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [ 159 | "string = '''Robots are branching out. A new prototype soft robot takes inspiration from plants by growing to explore its environment.\n", 160 | "\n", 161 | "Vines and some fungi extend from their tips to explore their surroundings. Elliot Hawkes of the University of California in Santa Barbara and his colleagues designed a bot that works on similar principles. Its mechanical body sits inside a plastic tube reel that extends through pressurized inflation, a method that some invertebrates like peanut worms (Sipunculus nudus) also use to extend their appendages. The plastic tubing has two compartments, and inflating one side or the other changes the extension direction. 
A camera sensor at the tip alerts the bot when it’s about to run into something.\n", 162 | "\n", 163 | "In the lab, Hawkes and his colleagues programmed the robot to form 3-D structures such as a radio antenna, turn off a valve, navigate a maze, swim through glue, act as a fire extinguisher, squeeze through tight gaps, shimmy through fly paper and slither across a bed of nails. The soft bot can extend up to 72 meters, and unlike plants, it can grow at a speed of 10 meters per second, the team reports July 19 in Science Robotics. The design could serve as a model for building robots that can traverse constrained environments\n", 164 | "\n", 165 | "This isn’t the first robot to take inspiration from plants. One plantlike predecessor was a robot modeled on roots.'''" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 60, 171 | "metadata": { 172 | "collapsed": false 173 | }, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "'Robots are branching out. 
A new prototype soft robot takes inspiration from plants by growing to explore its environment.'" 179 | ] 180 | }, 181 | "execution_count": 60, 182 | "metadata": {}, 183 | "output_type": "execute_result" 184 | } 185 | ], 186 | "source": [ 187 | "re.search('.+', string).group() #no new line" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": null, 193 | "metadata": { 194 | "collapsed": true 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "re.search('.+', string, flags = re.DOTALL).group()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": { 205 | "collapsed": true 206 | }, 207 | "outputs": [], 208 | "source": [] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Creating your own character sets" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "\n", 226 | "[A-Z] '-' is a metacharacter when used in [] (custom character sets)" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 63, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "string = 'Hello, There, How, Are, You'" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 64, 243 | "metadata": { 244 | "collapsed": false 245 | }, 246 | "outputs": [ 247 | { 248 | "data": { 249 | "text/plain": [ 250 | "['H', 'T', 'H', 'A', 'Y']" 251 | ] 252 | }, 253 | "execution_count": 64, 254 | "metadata": {}, 255 | "output_type": "execute_result" 256 | } 257 | ], 258 | "source": [ 259 | "re.findall('[A-Z]', string) #pulls out all capital letters" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 65, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "['H', ',', 'T', ',', 'H', ',', 
'A', ',', 'Y']" 273 | ] 274 | }, 275 | "execution_count": 65, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "re.findall('[A-Z,]', string) #here we search for any capital letters\n", 282 | " #or a comma" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 67, 288 | "metadata": { 289 | "collapsed": false 290 | }, 291 | "outputs": [ 292 | { 293 | "data": { 294 | "text/plain": [ 295 | "['H', ',', 'T', ',', 'H', ',', 'A', ',', 'Y', '.', '.', '.']" 296 | ] 297 | }, 298 | "execution_count": 67, 299 | "metadata": {}, 300 | "output_type": "execute_result" 301 | } 302 | ], 303 | "source": [ 304 | "string = 'Hello, There, How, Are, You...'\n", 305 | "re.findall('[A-Z,.]', string)" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "collapsed": true 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "string = 'Hello, There, How, Are, You...'\n", 317 | "re.findall('[A-Za-z,\\s.]', string)" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "collapsed": true 325 | }, 326 | "outputs": [], 327 | "source": [] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": null, 332 | "metadata": { 333 | "collapsed": true 334 | }, 335 | "outputs": [], 336 | "source": [] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": null, 341 | "metadata": { 342 | "collapsed": true 343 | }, 344 | "outputs": [], 345 | "source": [] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": { 360 | "collapsed": true 361 | }, 362 | "outputs": [], 363 | "source": [] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": { 369 | "collapsed": true 
370 | }, 371 | "outputs": [], 372 | "source": [] 373 | } 374 | ], 375 | "metadata": { 376 | "kernelspec": { 377 | "display_name": "Python 3", 378 | "language": "python", 379 | "name": "python3" 380 | }, 381 | "language_info": { 382 | "codemirror_mode": { 383 | "name": "ipython", 384 | "version": 3 385 | }, 386 | "file_extension": ".py", 387 | "mimetype": "text/x-python", 388 | "name": "python", 389 | "nbconvert_exporter": "python", 390 | "pygments_lexer": "ipython3", 391 | "version": "3.6.0" 392 | } 393 | }, 394 | "nbformat": 4, 395 | "nbformat_minor": 1 396 | } 397 | -------------------------------------------------------------------------------- /Regular Expressions made Easy - part 5 + 6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "## Quantifiers with custom sets" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "import re" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "+\n", 32 | "?\n", 33 | "*\n", 34 | "{}" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "string = 'HELLO, There, How, Are, You...'" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "re.search('[A-Z]+', string)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "re.findall('[A-Z]+', string)" 68 | ] 69 | }, 70 | { 71 | "cell_type": 
"code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "re.findall('[A-Z]{2,}', string) # 2 or more" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "metadata": { 85 | "collapsed": false 86 | }, 87 | "outputs": [], 88 | "source": [ 89 | "string" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": { 96 | "collapsed": false 97 | }, 98 | "outputs": [], 99 | "source": [ 100 | "re.search('[A-Za-z\\s,]+', string).group() # one or more\n", 101 | " # of 4 types of characters" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "string" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "re.findall('[A-Z]?[a-z\\s,]+', string)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": false 131 | }, 132 | "outputs": [], 133 | "source": [ 134 | "re.search('[^A-Za-z\\s,]+', string).group() # ^ is a metacharacter within\n", 135 | " #brackets" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "collapsed": false 143 | }, 144 | "outputs": [], 145 | "source": [ 146 | "re.findall('[^A-Z]+', string)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## GROUPS" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": { 160 | "collapsed": true 161 | }, 162 | "outputs": [], 163 | "source": [ 164 | "#groups allow us to pull out sections of a match and store them" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 2, 170 | "metadata": { 171 
| "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [ 175 | "#contrived example\n", 176 | "import re\n", 177 | "string = 'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 3, 183 | "metadata": { 184 | "collapsed": false 185 | }, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "['John has 6 cats', 'Susan has 3 dogs', 'Mike has 8 fishes']" 191 | ] 192 | }, 193 | "execution_count": 3, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "re.findall('[A-Za-z]+ \\w+ \\d+ \\w+', string)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "#the use of brackets denotes a group\n", 211 | "() = metacharacter" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 4, 217 | "metadata": { 218 | "collapsed": false 219 | }, 220 | "outputs": [ 221 | { 222 | "data": { 223 | "text/plain": [ 224 | "['John', 'Susan', 'Mike']" 225 | ] 226 | }, 227 | "execution_count": 4, 228 | "metadata": {}, 229 | "output_type": "execute_result" 230 | } 231 | ], 232 | "source": [ 233 | "re.findall('([A-Za-z]+) \\w+ \\d+ \\w+', string) #to pull out just the names" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 5, 239 | "metadata": { 240 | "collapsed": false 241 | }, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "['cats', 'dogs', 'fishes']" 247 | ] 248 | }, 249 | "execution_count": 5, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "re.findall('[A-Za-z]+ \\w+ \\d+ (\\w+)', string) #pull out animals" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 6, 261 | "metadata": { 262 | "collapsed": false 263 | }, 264 | "outputs": [ 265 | { 
266 | "data": { 267 | "text/plain": [ 268 | "[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]" 269 | ] 270 | }, 271 | "execution_count": 6, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | } 275 | ], 276 | "source": [ 277 | "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string) #\n", 278 | "\n", 279 | "#use original string to make sure matching is correct, \n", 280 | "#then use groups to pull out the info you want" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": { 287 | "collapsed": true 288 | }, 289 | "outputs": [], 290 | "source": [] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 7, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "#organize the data by data-types\n", 301 | "info = re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)\n", 302 | "\n" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 8, 308 | "metadata": { 309 | "collapsed": false 310 | }, 311 | "outputs": [ 312 | { 313 | "data": { 314 | "text/plain": [ 315 | "[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]" 316 | ] 317 | }, 318 | "execution_count": 8, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "info" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 9, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/plain": [ 337 | "[('John', 'Susan', 'Mike'), ('6', '3', '8'), ('cats', 'dogs', 'fishes')]" 338 | ] 339 | }, 340 | "execution_count": 9, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "list(zip(*info)) #organize your data by categories" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 10, 352 | "metadata": { 353 | "collapsed": false 354 | }, 355 | "outputs": [], 356 | 
"source": [ 357 | "match =re.search('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string) #pulls out three groups" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 11, 363 | "metadata": { 364 | "collapsed": false 365 | }, 366 | "outputs": [ 367 | { 368 | "data": { 369 | "text/plain": [ 370 | "<_sre.SRE_Match object; span=(0, 15), match='John has 6 cats'>" 371 | ] 372 | }, 373 | "execution_count": 11, 374 | "metadata": {}, 375 | "output_type": "execute_result" 376 | } 377 | ], 378 | "source": [ 379 | "match" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 19, 385 | "metadata": { 386 | "collapsed": false 387 | }, 388 | "outputs": [ 389 | { 390 | "data": { 391 | "text/plain": [ 392 | "'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'" 393 | ] 394 | }, 395 | "execution_count": 19, 396 | "metadata": {}, 397 | "output_type": "execute_result" 398 | } 399 | ], 400 | "source": [ 401 | "string" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": 12, 407 | "metadata": { 408 | "collapsed": false 409 | }, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "'John has 6 cats'" 415 | ] 416 | }, 417 | "execution_count": 12, 418 | "metadata": {}, 419 | "output_type": "execute_result" 420 | } 421 | ], 422 | "source": [ 423 | "match.group(0)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 13, 429 | "metadata": { 430 | "collapsed": false 431 | }, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "('John', '6', 'cats')" 437 | ] 438 | }, 439 | "execution_count": 13, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "match.groups()" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 14, 451 | "metadata": { 452 | "collapsed": false 453 | }, 454 | "outputs": [ 455 | { 456 | "data": { 457 | "text/plain": [ 458 | "'John'" 459 | ] 460 | }, 461 | 
"execution_count": 14, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "match.group(1)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": 15, 473 | "metadata": { 474 | "collapsed": false 475 | }, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/plain": [ 480 | "'6'" 481 | ] 482 | }, 483 | "execution_count": 15, 484 | "metadata": {}, 485 | "output_type": "execute_result" 486 | } 487 | ], 488 | "source": [ 489 | "match.group(2)" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": 16, 495 | "metadata": { 496 | "collapsed": false 497 | }, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/plain": [ 502 | "('John', 'cats')" 503 | ] 504 | }, 505 | "execution_count": 16, 506 | "metadata": {}, 507 | "output_type": "execute_result" 508 | } 509 | ], 510 | "source": [ 511 | "match.group(1,3) #multiple groups" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 17, 517 | "metadata": { 518 | "collapsed": false 519 | }, 520 | "outputs": [ 521 | { 522 | "data": { 523 | "text/plain": [ 524 | "('cats', '6', 'John', 'John')" 525 | ] 526 | }, 527 | "execution_count": 17, 528 | "metadata": {}, 529 | "output_type": "execute_result" 530 | } 531 | ], 532 | "source": [ 533 | "match.group(3,2,1,1) #change the order" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 18, 539 | "metadata": { 540 | "collapsed": false 541 | }, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/plain": [ 546 | "(0, 15)" 547 | ] 548 | }, 549 | "execution_count": 18, 550 | "metadata": {}, 551 | "output_type": "execute_result" 552 | } 553 | ], 554 | "source": [ 555 | "match.span()" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 20, 561 | "metadata": { 562 | "collapsed": false 563 | }, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "(9, 10)" 569 | ] 570 | }, 571 | "execution_count": 20, 572 | 
"metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "match.span(2)" 578 | ] 579 | }, 580 | { 581 | "cell_type": "code", 582 | "execution_count": 21, 583 | "metadata": { 584 | "collapsed": false 585 | }, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "(11, 15)" 591 | ] 592 | }, 593 | "execution_count": 21, 594 | "metadata": {}, 595 | "output_type": "execute_result" 596 | } 597 | ], 598 | "source": [ 599 | "match.span(3)" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "metadata": { 606 | "collapsed": true 607 | }, 608 | "outputs": [], 609 | "source": [ 610 | "match.start(3)" 611 | ] 612 | }, 613 | { 614 | "cell_type": "code", 615 | "execution_count": null, 616 | "metadata": { 617 | "collapsed": true 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "#find all has no group function\n", 622 | "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string).group(1)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 25, 628 | "metadata": { 629 | "collapsed": false 630 | }, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/plain": [ 635 | "('John', '6', 'cats')" 636 | ] 637 | }, 638 | "execution_count": 25, 639 | "metadata": {}, 640 | "output_type": "execute_result" 641 | } 642 | ], 643 | "source": [ 644 | "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)[0]" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": null, 650 | "metadata": { 651 | "collapsed": true 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)[0].group(1)" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": 29, 661 | "metadata": { 662 | "collapsed": false 663 | }, 664 | "outputs": [ 665 | { 666 | "data": { 667 | "text/plain": [ 668 | "[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]" 669 | ] 670 | }, 671 | "execution_count": 29, 672 | 
"metadata": {}, 673 | "output_type": "execute_result" 674 | } 675 | ], 676 | "source": [ 677 | "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 30, 683 | "metadata": { 684 | "collapsed": true 685 | }, 686 | "outputs": [], 687 | "source": [ 688 | "data =re.findall('(([A-Za-z]+) \\w+ (\\d+) (\\w+))', string)" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 31, 694 | "metadata": { 695 | "collapsed": false 696 | }, 697 | "outputs": [ 698 | { 699 | "data": { 700 | "text/plain": [ 701 | "[('John has 6 cats', 'John', '6', 'cats'),\n", 702 | " ('Susan has 3 dogs', 'Susan', '3', 'dogs'),\n", 703 | " ('Mike has 8 fishes', 'Mike', '8', 'fishes')]" 704 | ] 705 | }, 706 | "execution_count": 31, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "data" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 33, 718 | "metadata": { 719 | "collapsed": false 720 | }, 721 | "outputs": [ 722 | { 723 | "name": "stdout", 724 | "output_type": "stream", 725 | "text": [ 726 | "cats\n", 727 | "dogs\n", 728 | "fishes\n" 729 | ] 730 | } 731 | ], 732 | "source": [ 733 | "for i in data:\n", 734 | " print(i[3])" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": { 741 | "collapsed": true 742 | }, 743 | "outputs": [], 744 | "source": [ 745 | "#we can use iteration" 746 | ] 747 | }, 748 | { 749 | "cell_type": "code", 750 | "execution_count": 50, 751 | "metadata": { 752 | "collapsed": false 753 | }, 754 | "outputs": [], 755 | "source": [ 756 | "it = re.finditer('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)" 757 | ] 758 | }, 759 | { 760 | "cell_type": "code", 761 | "execution_count": 42, 762 | "metadata": { 763 | "collapsed": false 764 | }, 765 | "outputs": [ 766 | { 767 | "data": { 768 | "text/plain": [ 769 | "('Mike', '8', 'fishes')" 770 | ] 771 | }, 772 | 
"execution_count": 42, 773 | "metadata": {}, 774 | "output_type": "execute_result" 775 | } 776 | ], 777 | "source": [ 778 | "next(it).groups()" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": null, 784 | "metadata": { 785 | "collapsed": true 786 | }, 787 | "outputs": [], 788 | "source": [ 789 | "for element in it:\n", 790 | " print (element.group(1,3, 2)) # don't forget iterators exhaust" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": 46, 796 | "metadata": { 797 | "collapsed": false 798 | }, 799 | "outputs": [ 800 | { 801 | "name": "stdout", 802 | "output_type": "stream", 803 | "text": [ 804 | "John has 6 cats\n", 805 | "Susan has 3 dogs\n", 806 | "Mike has 8 fishes\n" 807 | ] 808 | } 809 | ], 810 | "source": [ 811 | "for element in it:\n", 812 | " print(element.group())" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [ 823 | "for element in it:\n", 824 | " print(element.groups())" 825 | ] 826 | } 827 | ], 828 | "metadata": { 829 | "kernelspec": { 830 | "display_name": "Python 3", 831 | "language": "python", 832 | "name": "python3" 833 | }, 834 | "language_info": { 835 | "codemirror_mode": { 836 | "name": "ipython", 837 | "version": 3 838 | }, 839 | "file_extension": ".py", 840 | "mimetype": "text/x-python", 841 | "name": "python", 842 | "nbconvert_exporter": "python", 843 | "pygments_lexer": "ipython3", 844 | "version": "3.6.0" 845 | } 846 | }, 847 | "nbformat": 4, 848 | "nbformat_minor": 1 849 | } 850 | -------------------------------------------------------------------------------- /Regular Expressions made Easy - part 7 .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Naming Groups" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | 
"execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import re" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "string = 'New York, New York 11369'" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": false 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "([A-Za-z\\s]+)\n", 41 | "([A-Za-z\\s]+)\n", 42 | "(\\d+)" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "match =re.search('([A-Za-z\\s]+),([A-Za-z\\s]+)(\\d+)', string)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 4, 59 | "metadata": { 60 | "collapsed": false 61 | }, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "('New York', ' New York ', '11369', 'New York, New York 11369')" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "match.group(1), match.group(2), match.group(3), match.group(0)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": { 82 | "collapsed": true 83 | }, 84 | "outputs": [], 85 | "source": [ 86 | "?P< > #to name a group-- group name inside the <>, followed by RE for group\n", 87 | "\n", 88 | "(?P<City>) (?P<State>) (?P<ZipCode>)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "collapsed": true 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "pattern = re.compile('(?P<City>[A-Za-z\\s]+),(?P<State>[A-Za-z\\s]+)(?P<ZipCode>\\d+)')" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "match = re.search(pattern, string)" 111 | ] 
112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "metadata": { 117 | "collapsed": false 118 | }, 119 | "outputs": [ 120 | { 121 | "data": { 122 | "text/plain": [ 123 | "('New York', ' New York ', '11369')" 124 | ] 125 | }, 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "output_type": "execute_result" 129 | } 130 | ], 131 | "source": [ 132 | "match.group('City'), match.group('State'), match.group('ZipCode')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 8, 138 | "metadata": { 139 | "collapsed": false 140 | }, 141 | "outputs": [ 142 | { 143 | "data": { 144 | "text/plain": [ 145 | "'New York'" 146 | ] 147 | }, 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "match.group(1)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": { 161 | "collapsed": false 162 | }, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "('New York', ' New York ', '11369')" 168 | ] 169 | }, 170 | "execution_count": 11, 171 | "metadata": {}, 172 | "output_type": "execute_result" 173 | } 174 | ], 175 | "source": [ 176 | "match.groups()" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 10, 182 | "metadata": { 183 | "collapsed": false 184 | }, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "{'City': 'New York', 'State': ' New York ', 'ZipCode': '11369'}" 190 | ] 191 | }, 192 | "execution_count": 10, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "#Just incase you forget the names of the groups you used\n", 199 | "\n", 200 | "match.groupdict()" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | 
"execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python 3", 225 | "language": "python", 226 | "name": "python3" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.6.0" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 1 243 | } 244 | -------------------------------------------------------------------------------- /Regular Expressions made Easy - part 8 .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "### Quantifiers on groups" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "#Using quantifiers on groups has some nuances, but very useful\n" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "import re" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [ 41 | { 42 | "data": { 43 | "text/plain": [ 44 | "<_sre.SRE_Match object; span=(0, 12), match='abababababab'>" 45 | ] 46 | }, 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "\n", 54 | "string = 'abababababab' #ab repeated many times\n", 55 | "\n", 56 | "re.search('(ab)+', string) #(ab)+ is many instances of one group repeated" 57 | ] 58 | }, 59 | { 60 | 
"cell_type": "code", 61 | "execution_count": 3, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "<_sre.SRE_Match object; span=(0, 12), match='abababababab'>" 70 | ] 71 | }, 72 | "execution_count": 3, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "string = 'abababababab' #ab repeated many times\n", 79 | "\n", 80 | "re.search('[ab]+', string) #this is different" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "#difference explained below" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "metadata": { 98 | "collapsed": false 99 | }, 100 | "outputs": [ 101 | { 102 | "data": { 103 | "text/plain": [ 104 | "<_sre.SRE_Match object; span=(0, 6), match='ababab'>" 105 | ] 106 | }, 107 | "execution_count": 4, 108 | "metadata": {}, 109 | "output_type": "execute_result" 110 | } 111 | ], 112 | "source": [ 113 | "string = 'abababbbbbbb' #only partial fit to our new string\n", 114 | "re.search('(ab)+', string)" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "metadata": { 121 | "collapsed": false 122 | }, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "<_sre.SRE_Match object; span=(0, 12), match='abababbbbbbb'>" 128 | ] 129 | }, 130 | "execution_count": 5, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "string = 'abababbbbbbb' #but this pattern fits perfectly\n", 137 | "re.search('[ab]+', string)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 7, 143 | "metadata": { 144 | "collapsed": false 145 | }, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/plain": [ 150 | "<_sre.SRE_Match object; span=(0, 12), match='abababbbbbbb'>" 151 | ] 152 | }, 153 | "execution_count": 
7, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "string = 'abababbbbbbb' #allows flexibility\n", 160 | "re.search('(ab)+\\w+', string)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 8, 166 | "metadata": { 167 | "collapsed": false 168 | }, 169 | "outputs": [ 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "<_sre.SRE_Match object; span=(0, 11), match='abababsssss'>" 174 | ] 175 | }, 176 | "execution_count": 8, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "string = 'abababsssss' #allows flexibility\n", 183 | "re.search('(ab)+\\w+', string)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "source": [ 192 | "### Nuances to be wary of" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "collapsed": true 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "#only one group not multiple groups" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 9, 209 | "metadata": { 210 | "collapsed": false 211 | }, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "'ab'" 217 | ] 218 | }, 219 | "execution_count": 9, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "string = 'abababababab' #original string\n", 226 | "match =re.search('(ab)+', string) \n", 227 | "\n", 228 | "match.group(1)# capturing only one group; value is overwritten each time" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "match.group(2) #no value" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 11, 245 | "metadata": { 246 | "collapsed": false 247 | }, 248 | "outputs": [ 249 | { 250 | "data": { 
251 | "text/plain": [ 252 | "('ab',)" 253 | ] 254 | }, 255 | "execution_count": 11, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "match.groups() #only one group, group just overwritten" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "metadata": { 268 | "collapsed": false 269 | }, 270 | "outputs": [ 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "'abababababab'" 275 | ] 276 | }, 277 | "execution_count": 12, 278 | "metadata": {}, 279 | "output_type": "execute_result" 280 | } 281 | ], 282 | "source": [ 283 | "match.group(0) # the full match, not related to groups" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": null, 289 | "metadata": { 290 | "collapsed": true 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "#Another simple example with two groups using quantifiers" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 16, 300 | "metadata": { 301 | "collapsed": true 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "string = 'ababababab'" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 17, 311 | "metadata": { 312 | "collapsed": false 313 | }, 314 | "outputs": [ 315 | { 316 | "data": { 317 | "text/plain": [ 318 | "<_sre.SRE_Match object; span=(0, 10), match='ababababab'>" 319 | ] 320 | }, 321 | "execution_count": 17, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "match =re.search ('(ab)+(ab)+', string)\n", 328 | "match" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 14, 334 | "metadata": { 335 | "collapsed": false 336 | }, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "('ab', 'ab')" 342 | ] 343 | }, 344 | "execution_count": 14, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "match.groups()" 351 | ] 352 | }, 353 | { 354 
| "cell_type": "code", 355 | "execution_count": 18, 356 | "metadata": { 357 | "collapsed": false 358 | }, 359 | "outputs": [ 360 | { 361 | "data": { 362 | "text/plain": [ 363 | "(8, 10)" 364 | ] 365 | }, 366 | "execution_count": 18, 367 | "metadata": {}, 368 | "output_type": "execute_result" 369 | } 370 | ], 371 | "source": [ 372 | "match.span(2) # the first group is greedy" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": { 379 | "collapsed": true 380 | }, 381 | "outputs": [], 382 | "source": [ 383 | "#Only one group captured " 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 19, 389 | "metadata": { 390 | "collapsed": false 391 | }, 392 | "outputs": [], 393 | "source": [ 394 | "string = '123456789'\n", 395 | "\n", 396 | "match =re.search('(\\d)+', string)" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 20, 402 | "metadata": { 403 | "collapsed": false 404 | }, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "<_sre.SRE_Match object; span=(0, 9), match='123456789'>" 410 | ] 411 | }, 412 | "execution_count": 20, 413 | "metadata": {}, 414 | "output_type": "execute_result" 415 | } 416 | ], 417 | "source": [ 418 | "match" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": 21, 424 | "metadata": { 425 | "collapsed": false 426 | }, 427 | "outputs": [ 428 | { 429 | "data": { 430 | "text/plain": [ 431 | "('9',)" 432 | ] 433 | }, 434 | "execution_count": 21, 435 | "metadata": {}, 436 | "output_type": "execute_result" 437 | } 438 | ], 439 | "source": [ 440 | "(match.groups()) # only one group, and it uses the last value" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": { 447 | "collapsed": false 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "match #full pattern still retained" 452 | ] 453 | }, 454 | { 455 | "cell_type": "markdown", 456 | "metadata": { 457 | 
"collapsed": true 458 | }, 459 | "source": [ 460 | "### Quantifiers with groups within findall" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 22, 466 | "metadata": { 467 | "collapsed": false 468 | }, 469 | "outputs": [ 470 | { 471 | "data": { 472 | "text/plain": [ 473 | "['9']" 474 | ] 475 | }, 476 | "execution_count": 22, 477 | "metadata": {}, 478 | "output_type": "execute_result" 479 | } 480 | ], 481 | "source": [ 482 | "string = '123456789'\n", 483 | "\n", 484 | "re.findall('(\\d)+', string) #only pulls out group and last instance" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 23, 490 | "metadata": { 491 | "collapsed": false 492 | }, 493 | "outputs": [ 494 | { 495 | "data": { 496 | "text/plain": [ 497 | "['4', '9']" 498 | ] 499 | }, 500 | "execution_count": 23, 501 | "metadata": {}, 502 | "output_type": "execute_result" 503 | } 504 | ], 505 | "source": [ 506 | "string = '1234 56789'\n", 507 | "\n", 508 | "re.findall('(\\d)+', string) #Here we have two matches" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 26, 514 | "metadata": { 515 | "collapsed": false 516 | }, 517 | "outputs": [ 518 | { 519 | "data": { 520 | "text/plain": [ 521 | "'56789'" 522 | ] 523 | }, 524 | "execution_count": 26, 525 | "metadata": {}, 526 | "output_type": "execute_result" 527 | } 528 | ], 529 | "source": [ 530 | "re.findall('((\\d)+)', string)[1][0] \n", 531 | "#to find full match create a main group engulfing the smaller groups" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 28, 537 | "metadata": { 538 | "collapsed": false 539 | }, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/plain": [ 544 | "['ab', 'ab']" 545 | ] 546 | }, 547 | "execution_count": 28, 548 | "metadata": {}, 549 | "output_type": "execute_result" 550 | } 551 | ], 552 | "source": [ 553 | "#another example\n", 554 | "string = 'abbbbb ababababab'\n", 555 | "re.findall('(ab)+', string) #two 
instances" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 29, 561 | "metadata": { 562 | "collapsed": false 563 | }, 564 | "outputs": [ 565 | { 566 | "data": { 567 | "text/plain": [ 568 | "[('ab', 'ab'), ('ababababab', 'ab')]" 569 | ] 570 | }, 571 | "execution_count": 29, 572 | "metadata": {}, 573 | "output_type": "execute_result" 574 | } 575 | ], 576 | "source": [ 577 | "string = 'abbbbb ababababab'\n", 578 | "re.findall('((ab)+)', string) #full match" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "metadata": { 584 | "collapsed": true 585 | }, 586 | "source": [ 587 | "### Groups for word completion" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 30, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/plain": [ 600 | "<_sre.SRE_Match object; span=(0, 14), match='Happy Birthday'>" 601 | ] 602 | }, 603 | "execution_count": 30, 604 | "metadata": {}, 605 | "output_type": "execute_result" 606 | } 607 | ], 608 | "source": [ 609 | "re.search('Happy (Valentines|Birthday|Anniversary)', 'Happy Birthday')" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 31, 615 | "metadata": { 616 | "collapsed": false 617 | }, 618 | "outputs": [ 619 | { 620 | "data": { 621 | "text/plain": [ 622 | "<_sre.SRE_Match object; span=(0, 16), match='Happy Valentines'>" 623 | ] 624 | }, 625 | "execution_count": 31, 626 | "metadata": {}, 627 | "output_type": "execute_result" 628 | } 629 | ], 630 | "source": [ 631 | "re.search('Happy (Valentines|Birthday|Anniversary)', 'Happy Valentines')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": null, 637 | "metadata": { 638 | "collapsed": true 639 | }, 640 | "outputs": [], 641 | "source": [ 642 | "re.search('Happy Valentines| Happy Birthday | Happy Anniversary')" 643 | ] 644 | } 645 | ], 646 | "metadata": { 647 | "kernelspec": { 648 | "display_name": "Python 3", 649 | 
"language": "python", 650 | "name": "python3" 651 | }, 652 | "language_info": { 653 | "codemirror_mode": { 654 | "name": "ipython", 655 | "version": 3 656 | }, 657 | "file_extension": ".py", 658 | "mimetype": "text/x-python", 659 | "name": "python", 660 | "nbconvert_exporter": "python", 661 | "pygments_lexer": "ipython3", 662 | "version": "3.6.0" 663 | } 664 | }, 665 | "nbformat": 4, 666 | "nbformat_minor": 1 667 | } 668 | -------------------------------------------------------------------------------- /Regular Expressions made Easy - part 9 .ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "## Non-capture Groups" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": { 18 | "collapsed": true 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "import re" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": { 29 | "collapsed": false 30 | }, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "['4', '9']" 36 | ] 37 | }, 38 | "execution_count": 1, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "#Here is one such example:\n", 45 | " \n", 46 | "\n", 47 | "import re\n", 48 | "string = '1234 56789'\n", 49 | "\n", 50 | "re.findall('(\\d)+', string)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 2, 56 | "metadata": { 57 | "collapsed": false 58 | }, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "('4',)" 64 | ] 65 | }, 66 | "execution_count": 2, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "re.search('(\\d)+', string).groups() #using search" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 
| "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "# non-capture groups syntax\n", 84 | "\n", 85 | "\n", 86 | "?: \n", 87 | " \n", 88 | "The symbol above represents non-capture groups and looks slightly\n", 89 | "similar to the syntax for naming groups\n", 90 | "\n", 91 | "?P<name> #don't confuse the two please. \n", 92 | "\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "#comparison" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "['4', '9']" 117 | ] 118 | }, 119 | "execution_count": 3, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "re.findall('(\\d)+', string)" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "metadata": { 132 | "collapsed": false 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "['1234', '56789']" 139 | ] 140 | }, 141 | "execution_count": 6, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "re.findall('(?:\\d)+', string) #with non capture group" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": { 154 | "collapsed": true 155 | }, 156 | "outputs": [], 157 | "source": [ 158 | "#So the group is part of the pattern, but we don't output the groups'\n", 159 | "#results" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "['1234', '56789']" 173 | ] 174 | }, 175 | "execution_count": 7, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | 
"re.findall('\\d+', string) # when RE has no groups in findall, \n", 182 | " #we output entire match" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "#Another example" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 19, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "string = '123123 = Alex, 123123123 = Danny, 123123123123 = Mike, 456456 = rick, 121212 = John, 132132 = Luis,' \n", 214 | " \n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "metadata": { 221 | "collapsed": true 222 | }, 223 | "outputs": [], 224 | "source": [ 225 | "#We want to pull out all names whose ID has 123 within it" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 20, 231 | "metadata": { 232 | "collapsed": false 233 | }, 234 | "outputs": [ 235 | { 236 | "data": { 237 | "text/plain": [ 238 | "['Alex', 'Danny', 'Mike']" 239 | ] 240 | }, 241 | "execution_count": 20, 242 | "metadata": {}, 243 | "output_type": "execute_result" 244 | } 245 | ], 246 | "source": [ 247 | "re.findall('(?:123)+ = (\\w+),', string) #three instances" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 11, 253 | "metadata": { 254 | "collapsed": true 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "#Another example\n", 259 | "string = '1*1*1*1*22222 1*1*3333 2*1*2*1*222 1*2*2*2*333 3*3*3*444'" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 12, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/plain": [ 272 | "['1*1*1*1*22222', '1*1*3333']" 273 | ] 274 | }, 275 | 
"execution_count": 12, 276 | "metadata": {}, 277 | "output_type": "execute_result" 278 | } 279 | ], 280 | "source": [ 281 | "re.findall( r'(?:1\\*){2,}\\d+', string)" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": { 288 | "collapsed": true 289 | }, 290 | "outputs": [], 291 | "source": [] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": null, 296 | "metadata": { 297 | "collapsed": false 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "#Now, non-capture groups don't just affect the findall method\n", 302 | "#they also affect the search and match methods" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "### BE CAREFUL WITH SYNTAX " 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "metadata": { 316 | "collapsed": true 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "?: correct!\n", 321 | ":? incorrect!" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 13, 327 | "metadata": { 328 | "collapsed": false 329 | }, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "()\n" 336 | ] 337 | } 338 | ], 339 | "source": [ 340 | "string = '1234 56789'\n", 341 | "\n", 342 | "match =re.search('(?:\\d)+', string)#correct syntax\n", 343 | "print(match.groups())" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 14, 349 | "metadata": { 350 | "collapsed": false 351 | }, 352 | "outputs": [ 353 | { 354 | "name": "stdout", 355 | "output_type": "stream", 356 | "text": [ 357 | "('4',)\n" 358 | ] 359 | } 360 | ], 361 | "source": [ 362 | "string = '1234 56789'\n", 363 | "\n", 364 | "match =re.search('(:?\\d)+', string)# :? incorrect syntax!!!! 
\n", 365 | "print(match.groups())" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "metadata": { 372 | "collapsed": true 373 | }, 374 | "outputs": [], 375 | "source": [ 376 | "Summary: \n", 377 | "\n", 378 | "#when we capture groups we are either storing the value or \n", 379 | "#outputting them\n" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "## Backreferences - Using captured groups inside other operations" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "#backreferencing is making a reference to the captured group\n", 398 | "#within the same regular expression\n", 399 | "\n" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": { 406 | "collapsed": true 407 | }, 408 | "outputs": [], 409 | "source": [ 410 | "#syntax and example" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 15, 416 | "metadata": { 417 | "collapsed": false 418 | }, 419 | "outputs": [ 420 | { 421 | "data": { 422 | "text/plain": [ 423 | "<_sre.SRE_Match object; span=(0, 11), match='Merry Merry'>" 424 | ] 425 | }, 426 | "execution_count": 15, 427 | "metadata": {}, 428 | "output_type": "execute_result" 429 | } 430 | ], 431 | "source": [ 432 | "re.search(r'(\\w+) \\1','Merry Merry Christmas') #Looking for repeated words" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 16, 438 | "metadata": { 439 | "collapsed": false 440 | }, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/plain": [ 445 | "('Merry',)" 446 | ] 447 | }, 448 | "execution_count": 16, 449 | "metadata": {}, 450 | "output_type": "execute_result" 451 | } 452 | ], 453 | "source": [ 454 | "re.search(r'(\\w+) \\1','Merry Merry Christmas').groups()" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | 
"execution_count": null, 460 | "metadata": { 461 | "collapsed": true 462 | }, 463 | "outputs": [], 464 | "source": [ 465 | "\\1 is just referencing the first group \n", 466 | "within the regular expression \n", 467 | "\n", 468 | "r'(\\w+) \\1'" 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "collapsed": true 476 | }, 477 | "outputs": [], 478 | "source": [ 479 | "#Another example" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 17, 485 | "metadata": { 486 | "collapsed": false 487 | }, 488 | "outputs": [ 489 | { 490 | "data": { 491 | "text/plain": [ 492 | "['Happy', 'Christmas']" 493 | ] 494 | }, 495 | "execution_count": 17, 496 | "metadata": {}, 497 | "output_type": "execute_result" 498 | } 499 | ], 500 | "source": [ 501 | "re.findall(r'(\\w+) \\1','Happy Happy Holidays. Merry Christmas Christmas') #Want to look for repeated words" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": { 508 | "collapsed": true 509 | }, 510 | "outputs": [], 511 | "source": [ 512 | "#another example" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 18, 518 | "metadata": { 519 | "collapsed": false 520 | }, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "['Merry', 'Christmas', 'Merry']" 526 | ] 527 | }, 528 | "execution_count": 18, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "re.findall(r'(\\w+) \\1','Merry Merry Christmas Christmas Merry Merry Christmas')" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": { 541 | "collapsed": true 542 | }, 543 | "outputs": [], 544 | "source": [] 545 | }, 546 | { 547 | "cell_type": "code", 548 | "execution_count": null, 549 | "metadata": { 550 | "collapsed": true 551 | }, 552 | "outputs": [], 553 | "source": [] 554 | } 555 | ], 556 | "metadata": { 557 | 
"kernelspec": { 558 | "display_name": "Python 3", 559 | "language": "python", 560 | "name": "python3" 561 | }, 562 | "language_info": { 563 | "codemirror_mode": { 564 | "name": "ipython", 565 | "version": 3 566 | }, 567 | "file_extension": ".py", 568 | "mimetype": "text/x-python", 569 | "name": "python", 570 | "nbconvert_exporter": "python", 571 | "pygments_lexer": "ipython3", 572 | "version": "3.6.0" 573 | } 574 | }, 575 | "nbformat": 4, 576 | "nbformat_minor": 1 577 | } 578 | --------------------------------------------------------------------------------