├── Python Regular Expressions Made Easy # 14+15 -  Look arounds .ipynb
├── Python Regular Expressions Made Easy -  Part 16 - Negative Look Arounds.ipynb
├── Python Regular Expressions Made Easy -  Part 17 - Variable Width Assertions with Look Behinds.ipynb
├── README.md
├── Regular Expressions Made Easy - part -10-Flags .ipynb
├── Regular Expressions Made Easy - part 11-Re.Split .ipynb
├── Regular Expressions Made Easy - part 12 - re.Sub.ipynb
├── Regular Expressions Made Easy -13 -  Word Boundaries.ipynb
├── Regular Expressions made Easy - part 1 + part 2.ipynb
├── Regular Expressions made Easy - part 3.ipynb
├── Regular Expressions made Easy - part 4.ipynb
├── Regular Expressions made Easy - part 5 + 6.ipynb
├── Regular Expressions made Easy - part 7 .ipynb
├── Regular Expressions made Easy - part 8 .ipynb
└── Regular Expressions made Easy - part 9 .ipynb


/Python Regular Expressions Made Easy # 14+15 -  Look arounds .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "#Welcome to look arounds"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {
 18 |     "collapsed": true
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "#Look arounds let us confirm that some subpattern is ahead of or behind\n",
 23 |     "#the main pattern\n",
 24 |     "\n"
 25 |    ]
 26 |   },
 27 |   {
 28 |    "cell_type": "code",
 29 |    "execution_count": 1,
 30 |    "metadata": {
 31 |     "collapsed": true
 32 |    },
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "import re"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {
 42 |     "collapsed": true
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "# 4 types of look arounds\n",
 47 |     "\n",
 48 |     "Positive look ahead   ?=\n",
 49 |     "\n",
 50 |     "Negative look ahead   ?!\n",
 51 |     "\n",
 52 |     "Positive look behind  ?<=\n",
 53 |     "\n",
 54 |     "Negative look behind ?<!\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "#similar syntax\n",
 58 |     "\n",
 59 |     "?:   #non-capturing groups\n",
 60 |     "    \n",
 61 |     "?P   #naming groups\n",
 62 |     "\n"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "markdown",
 67 |    "metadata": {
 68 |     "collapsed": true
 69 |    },
 70 |    "source": [
 71 |     "#### Simple example of positive look ahead"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": null,
 77 |    "metadata": {
 78 |     "collapsed": true
 79 |    },
 80 |    "outputs": [],
 81 |    "source": [
 82 |     "#In the string below we want to capture the second column value\n",
 83 |     "#only if the first column starts with ABC and the last column\n",
 84 |     "#has the value 'active'\n",
 85 |     "\n",
 86 |     "#So only the first and last rows satisfy this condition, which\n",
 87 |     "#yields the values '1.1.1.1' and 'x.x.x.x'"
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 46,
 93 |    "metadata": {
 94 |     "collapsed": true
 95 |    },
 96 |    "outputs": [],
 97 |    "source": [
 98 |     "string ='''ABC1    1.1.1.1    20151118    active\n",
 99 |     "           ABC2    2.2.2.2    20151118    inactive\n",
100 |     "           ABC3    x.x.x.x    xxxxxxxx    active'''\n",
101 |     "\n"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": 47,
107 |    "metadata": {
108 |     "collapsed": false
109 |    },
110 |    "outputs": [],
111 |    "source": [
112 |     "pattern =re.compile('ABC\\w\\s+(\\S+)\\s+\\S+\\s+(?=active)')#positive look-ahead"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": 48,
118 |    "metadata": {
119 |     "collapsed": false
120 |    },
121 |    "outputs": [
122 |     {
123 |      "data": {
124 |       "text/plain": [
125 |        "['1.1.1.1', 'x.x.x.x']"
126 |       ]
127 |      },
128 |      "execution_count": 48,
129 |      "metadata": {},
130 |      "output_type": "execute_result"
131 |     }
132 |    ],
133 |    "source": [
134 |     "re.findall(pattern, string)"
135 |    ]
136 |   },
137 |   {
138 |    "cell_type": "code",
139 |    "execution_count": 45,
140 |    "metadata": {
141 |     "collapsed": false
142 |    },
143 |    "outputs": [],
144 |    "source": [
145 |     "re.search(pattern, string)"
146 |    ]
147 |   },
148 |   {
149 |    "cell_type": "code",
150 |    "execution_count": null,
151 |    "metadata": {
152 |     "collapsed": true
153 |    },
154 |    "outputs": [],
155 |    "source": [
156 |     "# However, we can also use non-capturing group"
157 |    ]
158 |   },
159 |   {
160 |    "cell_type": "code",
161 |    "execution_count": 51,
162 |    "metadata": {
163 |     "collapsed": true
164 |    },
165 |    "outputs": [],
166 |    "source": [
167 |     "pattern =re.compile('ABC\\w\\s+(\\S+)\\s+\\S+\\s+(?:active)')#non-capturing group"
168 |    ]
169 |   },
170 |   {
171 |    "cell_type": "code",
172 |    "execution_count": 52,
173 |    "metadata": {
174 |     "collapsed": false
175 |    },
176 |    "outputs": [
177 |     {
178 |      "data": {
179 |       "text/plain": [
180 |        "['1.1.1.1', 'x.x.x.x']"
181 |       ]
182 |      },
183 |      "execution_count": 52,
184 |      "metadata": {},
185 |      "output_type": "execute_result"
186 |     }
187 |    ],
188 |    "source": [
189 |     "re.findall(pattern, string)"
190 |    ]
191 |   },
192 |   {
193 |    "cell_type": "code",
194 |    "execution_count": 8,
195 |    "metadata": {
196 |     "collapsed": false
197 |    },
198 |    "outputs": [
199 |     {
200 |      "data": {
201 |       "text/plain": [
202 |        "<_sre.SRE_Match object; span=(0, 37), match='ABC1    1.1.1.1    20151118    active'>"
203 |       ]
204 |      },
205 |      "execution_count": 8,
206 |      "metadata": {},
207 |      "output_type": "execute_result"
208 |     }
209 |    ],
210 |    "source": [
211 |     "re.search(pattern, string)"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "markdown",
216 |    "metadata": {},
217 |    "source": [
218 |     "### Difference between non-capturing groups and look arounds\n"
219 |    ]
220 |   },
221 |   {
222 |    "cell_type": "code",
223 |    "execution_count": null,
224 |    "metadata": {
225 |     "collapsed": true
226 |    },
227 |    "outputs": [],
228 |    "source": [
229 |     "#Look arounds don't consume characters, non-capturing groups do"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 11,
235 |    "metadata": {
236 |     "collapsed": true
237 |    },
238 |    "outputs": [],
239 |    "source": [
240 |     "string ='abababacb'                   #find every a that has a b on both sides\n",
241 |     "                                      #this string contains two such a's"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 12,
247 |    "metadata": {
248 |     "collapsed": true
249 |    },
250 |    "outputs": [],
251 |    "source": [
252 |     "pattern = re.compile('(?:b)(a)(?:b)')          #non-capturing group"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": 13,
258 |    "metadata": {
259 |     "collapsed": false
260 |    },
261 |    "outputs": [
262 |     {
263 |      "data": {
264 |       "text/plain": [
265 |        "['a']"
266 |       ]
267 |      },
268 |      "execution_count": 13,
269 |      "metadata": {},
270 |      "output_type": "execute_result"
271 |     }
272 |    ],
273 |    "source": [
274 |     "re.findall(pattern, string)"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": null,
280 |    "metadata": {
281 |     "collapsed": true
282 |    },
283 |    "outputs": [],
284 |    "source": [
285 |     "string ='abababacb'"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 14,
291 |    "metadata": {
292 |     "collapsed": true
293 |    },
294 |    "outputs": [],
295 |    "source": [
296 |     "pattern = re.compile('(?<=b)(a)(?=b)')  #look around"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": 15,
302 |    "metadata": {
303 |     "collapsed": false
304 |    },
305 |    "outputs": [
306 |     {
307 |      "data": {
308 |       "text/plain": [
309 |        "['a', 'a']"
310 |       ]
311 |      },
312 |      "execution_count": 15,
313 |      "metadata": {},
314 |      "output_type": "execute_result"
315 |     }
316 |    ],
317 |    "source": [
318 |     "re.findall(pattern, string)"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "markdown",
323 |    "metadata": {},
324 |    "source": [
325 |     "#### Capture the entire look around "
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 17,
331 |    "metadata": {
332 |     "collapsed": true
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "string ='abababacb'"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 18,
342 |    "metadata": {
343 |     "collapsed": false
344 |    },
345 |    "outputs": [],
346 |    "source": [
347 |     "pattern = re.compile('(?=(bab))') "
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": 19,
353 |    "metadata": {
354 |     "collapsed": false
355 |    },
356 |    "outputs": [
357 |     {
358 |      "data": {
359 |       "text/plain": [
360 |        "['bab', 'bab']"
361 |       ]
362 |      },
363 |      "execution_count": 19,
364 |      "metadata": {},
365 |      "output_type": "execute_result"
366 |     }
367 |    ],
368 |    "source": [
369 |     "re.findall(pattern, string)"
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": null,
375 |    "metadata": {
376 |     "collapsed": true
377 |    },
378 |    "outputs": [],
379 |    "source": [
380 |     "#Another example of positive look ahead"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": 20,
386 |    "metadata": {
387 |     "collapsed": true
388 |    },
389 |    "outputs": [],
390 |    "source": [
391 |     "import re\n",
392 |     "string = 'I love cherries, apples, and strawberries.'"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 21,
398 |    "metadata": {
399 |     "collapsed": true
400 |    },
401 |    "outputs": [],
402 |    "source": [
403 |     "pattern2 = re.compile(r'(\\w+)(?=\\.|,)')"
404 |    ]
405 |   },
406 |   {
407 |    "cell_type": "code",
408 |    "execution_count": 22,
409 |    "metadata": {
410 |     "collapsed": false
411 |    },
412 |    "outputs": [
413 |     {
414 |      "data": {
415 |       "text/plain": [
416 |        "['cherries', 'apples', 'strawberries']"
417 |       ]
418 |      },
419 |      "execution_count": 22,
420 |      "metadata": {},
421 |      "output_type": "execute_result"
422 |     }
423 |    ],
424 |    "source": [
425 |     "re.findall(pattern2,string)"
426 |    ]
427 |   },
428 |   {
429 |    "cell_type": "code",
430 |    "execution_count": 23,
431 |    "metadata": {
432 |     "collapsed": true
433 |    },
434 |    "outputs": [],
435 |    "source": [
436 |     "pattern2 = re.compile(r'(\\w+)(?:\\.|,)')"
437 |    ]
438 |   },
439 |   {
440 |    "cell_type": "code",
441 |    "execution_count": 24,
442 |    "metadata": {
443 |     "collapsed": false
444 |    },
445 |    "outputs": [
446 |     {
447 |      "data": {
448 |       "text/plain": [
449 |        "['cherries', 'apples', 'strawberries']"
450 |       ]
451 |      },
452 |      "execution_count": 24,
453 |      "metadata": {},
454 |      "output_type": "execute_result"
455 |     }
456 |    ],
457 |    "source": [
458 |     "re.findall(pattern2,string)"
459 |    ]
460 |   },
461 |   {
462 |    "cell_type": "markdown",
463 |    "metadata": {},
464 |    "source": [
465 |     "### consecutive look around fallacy"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": 29,
471 |    "metadata": {
472 |     "collapsed": true
473 |    },
474 |    "outputs": [],
475 |    "source": [
476 |     "string = '''cherry 100 red\n",
477 |     "            apple  150 green\n",
478 |     "            grapes 200 \n",
479 |     "            '''\n"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": 30,
485 |    "metadata": {
486 |     "collapsed": false
487 |    },
488 |    "outputs": [],
489 |    "source": [
490 |     "pattern = re.compile(r'[a-z]+\\s*(?= \\d+)(?=\\s*)(?=[a-z]+)')     #zero-width assertion"
491 |    ]
492 |   },
493 |   {
494 |    "cell_type": "code",
495 |    "execution_count": 31,
496 |    "metadata": {
497 |     "collapsed": false
498 |    },
499 |    "outputs": [
500 |     {
501 |      "data": {
502 |       "text/plain": [
503 |        "[]"
504 |       ]
505 |      },
506 |      "execution_count": 31,
507 |      "metadata": {},
508 |      "output_type": "execute_result"
509 |     }
510 |    ],
511 |    "source": [
512 |     "re.findall(pattern,string)                           #consecutive look-aheads all test the same position, not one after another"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 34,
518 |    "metadata": {
519 |     "collapsed": true
520 |    },
521 |    "outputs": [],
522 |    "source": [
523 |     "pattern = re.compile(r'[a-z]+\\s*(?=\\d+\\s*[a-z]+)')"
524 |    ]
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": 35,
529 |    "metadata": {
530 |     "collapsed": false
531 |    },
532 |    "outputs": [
533 |     {
534 |      "data": {
535 |       "text/plain": [
536 |        "['cherry ', 'apple  ']"
537 |       ]
538 |      },
539 |      "execution_count": 35,
540 |      "metadata": {},
541 |      "output_type": "execute_result"
542 |     }
543 |    ],
544 |    "source": [
545 |     "re.findall(pattern,string)"
546 |    ]
547 |   },
548 |   {
549 |    "cell_type": "code",
550 |    "execution_count": null,
551 |    "metadata": {
552 |     "collapsed": true
553 |    },
554 |    "outputs": [],
555 |    "source": []
556 |   },
557 |   {
558 |    "cell_type": "code",
559 |    "execution_count": 36,
560 |    "metadata": {
561 |     "collapsed": true
562 |    },
563 |    "outputs": [],
564 |    "source": [
565 |     "#Password validation example  #order doesn't matter"
566 |    ]
567 |   },
568 |   {
569 |    "cell_type": "code",
570 |    "execution_count": 37,
571 |    "metadata": {
572 |     "collapsed": false
573 |    },
574 |    "outputs": [],
575 |    "source": [
576 |     "pattern = re.compile('(?=.*[a-z])(?=.*[A-Z])(?=.*[0-9])(?=.*[!?.])\\S+')"
577 |    ]
578 |   },
579 |   {
580 |    "cell_type": "code",
581 |    "execution_count": 38,
582 |    "metadata": {
583 |     "collapsed": true
584 |    },
585 |    "outputs": [],
586 |    "source": [
587 |     "string = 'AZN#3232!abbb32..'\n",
588 |     "string2 = 'AZN#3232abbb3232'"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 39,
594 |    "metadata": {
595 |     "collapsed": false
596 |    },
597 |    "outputs": [
598 |     {
599 |      "data": {
600 |       "text/plain": [
601 |        "<_sre.SRE_Match object; span=(0, 17), match='AZN#3232!abbb32..'>"
602 |       ]
603 |      },
604 |      "execution_count": 39,
605 |      "metadata": {},
606 |      "output_type": "execute_result"
607 |     }
608 |    ],
609 |    "source": [
610 |     "re.search(pattern, string)"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": 42,
616 |    "metadata": {
617 |     "collapsed": false
618 |    },
619 |    "outputs": [],
620 |    "source": [
621 |     "re.search(pattern, string2)"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": null,
627 |    "metadata": {
628 |     "collapsed": true
629 |    },
630 |    "outputs": [],
631 |    "source": []
632 |   },
633 |   {
634 |    "cell_type": "code",
635 |    "execution_count": null,
636 |    "metadata": {
637 |     "collapsed": true
638 |    },
639 |    "outputs": [],
640 |    "source": [
641 |     "#If we didn't use look arounds we would need to use this complicated pattern\n",
642 |     "\n",
643 |     "(?:.*[a-z].*[A-Z].*[0-9].*[!?.])|(?:.*[A-Z].*[a-z].*[0-9].*[!?.])|(?:.*[0-9].*[a-z].*[A-Z].*[!?.])|(?:.*[!?.].*[a-z].*[A-Z].*[0-9])|(?:.*[A-Z].*[a-z].*[!?.].*[0-9])|..."
644 |    ]
645 |   },
646 |   {
647 |    "cell_type": "code",
648 |    "execution_count": null,
649 |    "metadata": {
650 |     "collapsed": true
651 |    },
652 |    "outputs": [],
653 |    "source": []
654 |   },
655 |   {
656 |    "cell_type": "code",
657 |    "execution_count": null,
658 |    "metadata": {
659 |     "collapsed": true
660 |    },
661 |    "outputs": [],
662 |    "source": []
663 |   },
664 |   {
665 |    "cell_type": "code",
666 |    "execution_count": null,
667 |    "metadata": {
668 |     "collapsed": true
669 |    },
670 |    "outputs": [],
671 |    "source": []
672 |   }
673 |  ],
674 |  "metadata": {
675 |   "kernelspec": {
676 |    "display_name": "Python 3",
677 |    "language": "python",
678 |    "name": "python3"
679 |   },
680 |   "language_info": {
681 |    "codemirror_mode": {
682 |     "name": "ipython",
683 |     "version": 3
684 |    },
685 |    "file_extension": ".py",
686 |    "mimetype": "text/x-python",
687 |    "name": "python",
688 |    "nbconvert_exporter": "python",
689 |    "pygments_lexer": "ipython3",
690 |    "version": "3.6.0"
691 |   }
692 |  },
693 |  "nbformat": 4,
694 |  "nbformat_minor": 1
695 | }
696 | 


--------------------------------------------------------------------------------
/Python Regular Expressions Made Easy -  Part 16 - Negative Look Arounds.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": false
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "Negative look ahead   ?!\n",
 12 |     "\n",
 13 |     "Negative look behind ?<!\n",
 14 |     "\n"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": null,
 20 |    "metadata": {
 21 |     "collapsed": true
 22 |    },
 23 |    "outputs": [],
 24 |    "source": [
 25 |     "#Negative Look Ahead"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 4,
 31 |    "metadata": {
 32 |     "collapsed": true
 33 |    },
 34 |    "outputs": [],
 35 |    "source": [
 36 |     "import re"
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 5,
 42 |    "metadata": {
 43 |     "collapsed": true
 44 |    },
 45 |    "outputs": [],
 46 |    "source": [
 47 |     "string = '''\n",
 48 |     "Remaining party applicants:\n",
 49 |     "\n",
 50 |     "Occupation: Party Planner\n",
 51 |     "Occupation: Baking\n",
 52 |     "Occupation: Cook\n",
 53 |     "Occupation: Publicist\n",
 54 |     "Occupation: Entertainer\n",
 55 |     "Occupation: Baker\n",
 56 |     "Occupation: baker\n",
 57 |     "Occupation: pierrot'''\n"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "code",
 62 |    "execution_count": 6,
 63 |    "metadata": {
 64 |     "collapsed": false
 65 |    },
 66 |    "outputs": [],
 67 |    "source": [
 68 |     "pattern = re.compile('Occupation: (?!Baker|Baking|Cook).+', flags = re.IGNORECASE)"
 69 |    ]
 70 |   },
 71 |   {
 72 |    "cell_type": "code",
 73 |    "execution_count": 7,
 74 |    "metadata": {
 75 |     "collapsed": false
 76 |    },
 77 |    "outputs": [
 78 |     {
 79 |      "data": {
 80 |       "text/plain": [
 81 |        "['Occupation: Party Planner',\n",
 82 |        " 'Occupation: Publicist',\n",
 83 |        " 'Occupation: Entertainer',\n",
 84 |        " 'Occupation: pierrot']"
 85 |       ]
 86 |      },
 87 |      "execution_count": 7,
 88 |      "metadata": {},
 89 |      "output_type": "execute_result"
 90 |     }
 91 |    ],
 92 |    "source": [
 93 |     "re.findall(pattern, string)"
 94 |    ]
 95 |   },
 96 |   {
 97 |    "cell_type": "code",
 98 |    "execution_count": null,
 99 |    "metadata": {
100 |     "collapsed": true
101 |    },
102 |    "outputs": [],
103 |    "source": []
104 |   },
105 |   {
106 |    "cell_type": "code",
107 |    "execution_count": null,
108 |    "metadata": {
109 |     "collapsed": true
110 |    },
111 |    "outputs": [],
112 |    "source": []
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {
118 |     "collapsed": true
119 |    },
120 |    "outputs": [],
121 |    "source": [
122 |     "#Negative Look Behind"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "code",
127 |    "execution_count": 21,
128 |    "metadata": {
129 |     "collapsed": false
130 |    },
131 |    "outputs": [],
132 |    "source": [
133 |     "string = '''\n",
134 |     "Full invitation list:\n",
135 |     "\n",
136 |     "Guest: Ashley Jackson\n",
137 |     "Guest: Maria Jackson\n",
138 |     "Guest: Bob Sanders\n",
139 |     "Guest: Bill Smith\n",
140 |     "Entertainer: Michael Johnson\n",
141 |     "Baker: Chris Jackson\n",
142 |     "Party Planner: Seema Patel\n",
143 |     "Publist: Seema Patel\n",
144 |     "Baker: Ashley Sanders'''"
145 |    ]
146 |   },
147 |   {
148 |    "cell_type": "code",
149 |    "execution_count": 9,
150 |    "metadata": {
151 |     "collapsed": false
152 |    },
153 |    "outputs": [],
154 |    "source": [
155 |     "pattern = re.compile(r'(?<!Baker: )\\w+\\s\\w+', flags = re.IGNORECASE|re.M)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 16,
161 |    "metadata": {
162 |     "collapsed": true
163 |    },
164 |    "outputs": [],
165 |    "source": [
166 |     "pattern = re.compile(r'(?<!Baker: )\\b\\w+\\s\\w+$', flags = re.IGNORECASE|re.M)"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 10,
172 |    "metadata": {
173 |     "collapsed": false
174 |    },
175 |    "outputs": [
176 |     {
177 |      "data": {
178 |       "text/plain": [
179 |        "['Full invitation',\n",
180 |        " 'Ashley Jackson',\n",
181 |        " 'Maria Jackson',\n",
182 |        " 'Bob Sanders',\n",
183 |        " 'Bill Smith',\n",
184 |        " 'Michael Johnson',\n",
185 |        " 'hris Jackson',\n",
186 |        " 'Party Planner',\n",
187 |        " 'Seema Patel',\n",
188 |        " 'Seema Patel',\n",
189 |        " 'shley Sanders']"
190 |       ]
191 |      },
192 |      "execution_count": 10,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "re.findall(pattern, string)"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {
205 |     "collapsed": true
206 |    },
207 |    "outputs": [],
208 |    "source": []
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 30,
213 |    "metadata": {
214 |     "collapsed": true
215 |    },
216 |    "outputs": [],
217 |    "source": [
218 |     "import regex\n",
219 |     "#Testing\n",
220 |     "pattern = regex.compile(r'(?<!Baker: ) ', flags = re.IGNORECASE|re.M)"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": 31,
226 |    "metadata": {
227 |     "collapsed": false
228 |    },
229 |    "outputs": [
230 |     {
231 |      "data": {
232 |       "text/plain": [
233 |        "['Full',\n",
234 |        " 'invitation',\n",
235 |        " 'list',\n",
236 |        " 'Guest',\n",
237 |        " 'Ashley',\n",
238 |        " 'Jackson',\n",
239 |        " 'Guest',\n",
240 |        " 'Maria',\n",
241 |        " 'Jackson',\n",
242 |        " 'Guest',\n",
243 |        " 'Bob',\n",
244 |        " 'Sanders',\n",
245 |        " 'Guest',\n",
246 |        " 'Bill',\n",
247 |        " 'Smith',\n",
248 |        " 'Entertainer',\n",
249 |        " 'Michael',\n",
250 |        " 'Johnson',\n",
251 |        " 'Baker',\n",
252 |        " 'hris',\n",
253 |        " 'Jackson',\n",
254 |        " 'Party',\n",
255 |        " 'Planner',\n",
256 |        " 'Seema',\n",
257 |        " 'Patel',\n",
258 |        " 'Publist',\n",
259 |        " 'Seema',\n",
260 |        " 'Patel',\n",
261 |        " 'Baker',\n",
262 |        " 'shley',\n",
263 |        " 'Sanders']"
264 |       ]
265 |      },
266 |      "execution_count": 31,
267 |      "metadata": {},
268 |      "output_type": "execute_result"
269 |     }
270 |    ],
271 |    "source": [
272 |     "regex.findall(pattern, string)"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": null,
278 |    "metadata": {
279 |     "collapsed": true
280 |    },
281 |    "outputs": [],
282 |    "source": [
283 |     "#If a name has three words, e.g. Maria Maria Jackson"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": 20,
289 |    "metadata": {
290 |     "collapsed": true
291 |    },
292 |    "outputs": [],
293 |    "source": [
294 |     "string = '''\n",
295 |     "Full invitation list:\n",
296 |     "\n",
297 |     "Guest: Ashley Jackson\n",
298 |     "Guest: Maria Maria Jackson\n",
299 |     "Guest: Bob Sanders\n",
300 |     "Guest: Bill Smith\n",
301 |     "Entertainer: Michael Johnson\n",
302 |     "Baker: Chris Jackson\n",
303 |     "Party Planner: Seema Patel\n",
304 |     "Publist: Seema Patel\n",
305 |     "Baker: Ashley Sanders'''"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 21,
311 |    "metadata": {
312 |     "collapsed": false
313 |    },
314 |    "outputs": [],
315 |    "source": [
316 |     "pattern = re.compile(r'(?<!Baker: )(\\b\\w+\\s\\w+$|\\b\\w+\\s\\w+\\s\\w+$)', flags = re.IGNORECASE|re.M)"
317 |    ]
318 |   },
319 |   {
320 |    "cell_type": "code",
321 |    "execution_count": 22,
322 |    "metadata": {
323 |     "collapsed": false
324 |    },
325 |    "outputs": [
326 |     {
327 |      "data": {
328 |       "text/plain": [
329 |        "['Ashley Jackson',\n",
330 |        " 'Maria Maria Jackson',\n",
331 |        " 'Bob Sanders',\n",
332 |        " 'Bill Smith',\n",
333 |        " 'Michael Johnson',\n",
334 |        " 'Seema Patel',\n",
335 |        " 'Seema Patel']"
336 |       ]
337 |      },
338 |      "execution_count": 22,
339 |      "metadata": {},
340 |      "output_type": "execute_result"
341 |     }
342 |    ],
343 |    "source": [
344 |     "re.findall(pattern, string)"
345 |    ]
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "metadata": {
351 |     "collapsed": true
352 |    },
353 |    "outputs": [],
354 |    "source": [
355 |     "#Pull out entire phrase"
356 |    ]
357 |   },
358 |   {
359 |    "cell_type": "code",
360 |    "execution_count": 23,
361 |    "metadata": {
362 |     "collapsed": false
363 |    },
364 |    "outputs": [],
365 |    "source": [
366 |     "pattern = re.compile(r'^(?!Baker: ).+\\w+$', flags = re.IGNORECASE|re.M)"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 27,
372 |    "metadata": {
373 |     "collapsed": true
374 |    },
375 |    "outputs": [],
376 |    "source": [
377 |     "pattern = re.compile(r'^(?!Baker: ).+\\w+$', flags = re.IGNORECASE|re.M)"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": 28,
383 |    "metadata": {
384 |     "collapsed": false
385 |    },
386 |    "outputs": [
387 |     {
388 |      "data": {
389 |       "text/plain": [
390 |        "['Guest: Ashley Jackson',\n",
391 |        " 'Guest: Maria Maria Jackson',\n",
392 |        " 'Guest: Bob Sanders',\n",
393 |        " 'Guest: Bill Smith',\n",
394 |        " 'Entertainer: Michael Johnson',\n",
395 |        " 'Party Planner: Seema Patel',\n",
396 |        " 'Publist: Seema Patel']"
397 |       ]
398 |      },
399 |      "execution_count": 28,
400 |      "metadata": {},
401 |      "output_type": "execute_result"
402 |     }
403 |    ],
404 |    "source": [
405 |     "re.findall(pattern, string)"
406 |    ]
407 |   },
408 |   {
409 |    "cell_type": "code",
410 |    "execution_count": null,
411 |    "metadata": {
412 |     "collapsed": true
413 |    },
414 |    "outputs": [],
415 |    "source": [
416 |     "`"
417 |    ]
418 |   },
419 |   {
420 |    "cell_type": "code",
421 |    "execution_count": null,
422 |    "metadata": {
423 |     "collapsed": true
424 |    },
425 |    "outputs": [],
426 |    "source": []
427 |   }
428 |  ],
429 |  "metadata": {
430 |   "kernelspec": {
431 |    "display_name": "Python 3",
432 |    "language": "python",
433 |    "name": "python3"
434 |   },
435 |   "language_info": {
436 |    "codemirror_mode": {
437 |     "name": "ipython",
438 |     "version": 3
439 |    },
440 |    "file_extension": ".py",
441 |    "mimetype": "text/x-python",
442 |    "name": "python",
443 |    "nbconvert_exporter": "python",
444 |    "pygments_lexer": "ipython3",
445 |    "version": "3.6.0"
446 |   }
447 |  },
448 |  "nbformat": 4,
449 |  "nbformat_minor": 1
450 | }
451 | 


--------------------------------------------------------------------------------
/Python Regular Expressions Made Easy -  Part 17 - Variable Width Assertions with Look Behinds.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import re"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {
 18 |     "collapsed": true
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "#Revisit earlier examples"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 6,
 28 |    "metadata": {
 29 |     "collapsed": true
 30 |    },
 31 |    "outputs": [],
 32 |    "source": [
 33 |     "string ='''1111    ABCC    77777777    active\n",
 34 |     "           2222    ABC     88888888    inactive\n",
 35 |     "           3333    XYZ     xxxxxxxx    active\n",
 36 |     "           4444    1234    20351118    inactive  \n",
 37 |     "           5555    1445    20153318    inactive  \n",
 38 |     "\n",
 39 |     "           '''\n",
 40 |     "\n"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": null,
 46 |    "metadata": {
 47 |     "collapsed": true
 48 |    },
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "pattern =re.compile('(?<=[A-Z]+)[A-Z]+\\s+(\\S+)') #look behind -- raises error: look-behind requires fixed-width pattern\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 3,
 57 |    "metadata": {
 58 |     "collapsed": false
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "import regex"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 4,
 68 |    "metadata": {
 69 |     "collapsed": true
 70 |    },
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "pattern =regex.compile('(?<=[A-Z]+)[A-Z]+\\s+(\\S+)')"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 7,
 79 |    "metadata": {
 80 |     "collapsed": false
 81 |    },
 82 |    "outputs": [
 83 |     {
 84 |      "data": {
 85 |       "text/plain": [
 86 |        "['77777777', '88888888', 'xxxxxxxx']"
 87 |       ]
 88 |      },
 89 |      "execution_count": 7,
 90 |      "metadata": {},
 91 |      "output_type": "execute_result"
 92 |     }
 93 |    ],
 94 |    "source": [
 95 |     "regex.findall(pattern, string)"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": null,
101 |    "metadata": {
102 |     "collapsed": true
103 |    },
104 |    "outputs": [],
105 |    "source": []
106 |   },
107 |   {
108 |    "cell_type": "code",
109 |    "execution_count": 9,
110 |    "metadata": {
111 |     "collapsed": true
112 |    },
113 |    "outputs": [],
114 |    "source": [
115 |     "s= 'C:\\Tools\\System32\\calc.exe'\n",
116 |     "s2= 'C:\\Windows\\System32\\calc.exe'\n",
117 |     "s3= 'C:\\Windows\\System32\\De-de\\calc.exe'\n",
118 |     "s4= 'C:\\Tools\\calc.exe'\n"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": null,
124 |    "metadata": {
125 |     "collapsed": true
126 |    },
127 |    "outputs": [],
128 |    "source": [
129 |     "pattern =re.compile(r'(?<!System32.*)calc.exe') #error: re only supports fixed-width look-behind patterns"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 12,
135 |    "metadata": {
136 |     "collapsed": false
137 |    },
138 |    "outputs": [],
139 |    "source": [
140 |     "pattern =regex.compile(r'(?<!System32.*)calc.exe')"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 16,
146 |    "metadata": {
147 |     "collapsed": false
148 |    },
149 |    "outputs": [
150 |     {
151 |      "data": {
152 |       "text/plain": [
153 |        "['calc.exe']"
154 |       ]
155 |      },
156 |      "execution_count": 16,
157 |      "metadata": {},
158 |      "output_type": "execute_result"
159 |     }
160 |    ],
161 |    "source": [
162 |     "regex.findall(pattern, s4)"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": null,
168 |    "metadata": {
169 |     "collapsed": true
170 |    },
171 |    "outputs": [],
172 |    "source": [
173 |     "s= 'C:\\Tools\\System32\\calc.exe'"
174 |    ]
175 |   }
176 |  ],
177 |  "metadata": {
178 |   "kernelspec": {
179 |    "display_name": "Python 3",
180 |    "language": "python",
181 |    "name": "python3"
182 |   },
183 |   "language_info": {
184 |    "codemirror_mode": {
185 |     "name": "ipython",
186 |     "version": 3
187 |    },
188 |    "file_extension": ".py",
189 |    "mimetype": "text/x-python",
190 |    "name": "python",
191 |    "nbconvert_exporter": "python",
192 |    "pygments_lexer": "ipython3",
193 |    "version": "3.6.0"
194 |   }
195 |  },
196 |  "nbformat": 4,
197 |  "nbformat_minor": 1
198 | }
199 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python-Regular-Expressions
2 | Code for all the videos about Python Regular Expressions
3 | 


--------------------------------------------------------------------------------
/Regular Expressions Made Easy - part -10-Flags .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import re"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": 2,
 17 |    "metadata": {
 18 |     "collapsed": false
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "string =\"\"\"U.S. stock-index futures pointed\n",
 23 |     "to a solidly higher open on \n",
 24 |     "Monday, indicating that major \n",
 25 |     "benchmarks were poised to rebound \n",
 26 |     "from last week’s sharp decline, \n",
 27 |     "which represented their biggest weekly drops in months.\n",
 28 |     "That weakness was driven in part by \n",
 29 |     "fears over North Korea, where tensions \n",
 30 |     "with the U.S. have been escalating. \n",
 31 |     "North Korea. That issue overshadowed the state of \n",
 32 |     "the equity market, where earnings \n",
 33 |     "have been strong at a time of high \n",
 34 |     "employment and low inflation, \n",
 35 |     "as well as valuations that a\n",
 36 |     "ppear elevated by many metrics, north korea North Korea.\"\"\""
 37 |    ]
 38 |   },
 39 |   {
 40 |    "cell_type": "code",
 41 |    "execution_count": 3,
 42 |    "metadata": {
 43 |     "collapsed": false
 44 |    },
 45 |    "outputs": [
 46 |     {
 47 |      "ename": "SyntaxError",
 48 |      "evalue": "invalid syntax (<ipython-input-3-c2d9ab112965>, line 1)",
 49 |      "output_type": "error",
 50 |      "traceback": [
 51 |       "\u001b[1;36m  File \u001b[1;32m\"<ipython-input-3-c2d9ab112965>\"\u001b[1;36m, line \u001b[1;32m1\u001b[0m\n\u001b[1;33m    ^ = matches at beginning of string\u001b[0m\n\u001b[1;37m    ^\u001b[0m\n\u001b[1;31mSyntaxError\u001b[0m\u001b[1;31m:\u001b[0m invalid syntax\n"
 52 |      ]
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "^ = matches at beginning of string\n",
 57 |     "$ = matches at end of string\n",
 58 |     "\n"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 4,
 64 |    "metadata": {
 65 |     "collapsed": true
 66 |    },
 67 |    "outputs": [],
 68 |    "source": [
 69 |     "#The two examples below behave exactly the same"
 70 |    ]
 71 |   },
 72 |   {
 73 |    "cell_type": "code",
 74 |    "execution_count": 5,
 75 |    "metadata": {
 76 |     "collapsed": true
 77 |    },
 78 |    "outputs": [],
 79 |    "source": [
 80 |     "re.search('^North Korea\\.?', string)"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 6,
 86 |    "metadata": {
 87 |     "collapsed": true
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "re.match('North Korea\\.?', string)"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 7,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [
101 |     {
102 |      "data": {
103 |       "text/plain": [
104 |        "<_sre.SRE_Match object; span=(557, 569), match='North Korea.'>"
105 |       ]
106 |      },
107 |      "execution_count": 7,
108 |      "metadata": {},
109 |      "output_type": "execute_result"
110 |     }
111 |    ],
112 |    "source": [
113 |     "re.search('North Korea\\.?$', string)"
114 |    ]
115 |   },
116 |   {
117 |    "cell_type": "code",
118 |    "execution_count": 8,
119 |    "metadata": {
120 |     "collapsed": true
121 |    },
122 |    "outputs": [],
123 |    "source": [
124 |     "#Only benefit of re.search with ^ over re.match is the MULTILINE flag (^ then matches at the start of each line)"
125 |    ]
126 |   },
127 |   {
128 |    "cell_type": "markdown",
129 |    "metadata": {},
130 |    "source": [
131 |     "## re.MULTILINE/ re.M"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "code",
136 |    "execution_count": 9,
137 |    "metadata": {
138 |     "collapsed": false
139 |    },
140 |    "outputs": [
141 |     {
142 |      "data": {
143 |       "text/plain": [
144 |        "<_sre.SRE_Match object; span=(331, 343), match='North Korea.'>"
145 |       ]
146 |      },
147 |      "execution_count": 9,
148 |      "metadata": {},
149 |      "output_type": "execute_result"
150 |     }
151 |    ],
152 |    "source": [
153 |     "re.search('^North Korea\\.?', string, flags = re.MULTILINE ) "
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": 10,
159 |    "metadata": {
160 |     "collapsed": true
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "re.match('^North Korea\\.?', string, flags = re.MULTILINE ) #no match"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "markdown",
169 |    "metadata": {
170 |     "collapsed": true
171 |    },
172 |    "source": [
173 |     "## re.IGNORECASE / re.I"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 11,
179 |    "metadata": {
180 |     "collapsed": false
181 |    },
182 |    "outputs": [
183 |     {
184 |      "data": {
185 |       "text/plain": [
186 |        "['North Korea', 'North Korea', 'north korea', 'North Korea']"
187 |       ]
188 |      },
189 |      "execution_count": 11,
190 |      "metadata": {},
191 |      "output_type": "execute_result"
192 |     }
193 |    ],
194 |    "source": [
195 |     "re.findall('North Korea', string, flags = re.IGNORECASE) #re.I"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {
201 |     "collapsed": true
202 |    },
203 |    "source": [
204 |     "## re.DOTALL / re.S"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": 12,
210 |    "metadata": {
211 |     "collapsed": false
212 |    },
213 |    "outputs": [
214 |     {
215 |      "data": {
216 |       "text/plain": [
217 |        "'U.S. stock-index futures pointed'"
218 |       ]
219 |      },
220 |      "execution_count": 12,
221 |      "metadata": {},
222 |      "output_type": "execute_result"
223 |     }
224 |    ],
225 |    "source": [
226 |     "re.match('.*', string).group()  #grabs the first line only\n",
227 |     "                                ##DOTALL only affects the . character"
228 |    ]
229 |   },
230 |   {
231 |    "cell_type": "code",
232 |    "execution_count": 13,
233 |    "metadata": {
234 |     "collapsed": false
235 |    },
236 |    "outputs": [
237 |     {
238 |      "data": {
239 |       "text/plain": [
240 |        "'U.S. stock-index futures pointed\\nto a solidly higher open on \\nMonday, indicating that major \\nbenchmarks were poised to rebound \\nfrom last week’s sharp decline, \\nwhich represented their biggest weekly drops in months.\\nThat weakness was driven in part by \\nfears over North Korea, where tensions \\nwith the U.S. have been escalating. \\nNorth Korea. That issue overshadowed the state of \\nthe equity market, where earnings \\nhave been strong at a time of high \\nemployment and low inflation, \\nas well as valuations that a\\nppear elevated by many metrics, north korea North Korea.'"
241 |       ]
242 |      },
243 |      "execution_count": 13,
244 |      "metadata": {},
245 |      "output_type": "execute_result"
246 |     }
247 |    ],
248 |    "source": [
249 |     "re.match('.*', string, flags = re.DOTALL).group() #includes new lines"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": null,
255 |    "metadata": {
256 |     "collapsed": true
257 |    },
258 |    "outputs": [],
259 |    "source": [
260 |     "## Other flags"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": null,
266 |    "metadata": {
267 |     "collapsed": false
268 |    },
269 |    "outputs": [],
270 |    "source": [
271 |     "\n",
272 |     "re.ASCII\n",
273 |     "re.DEBUG\n",
274 |     "re.LOCALE"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 14,
280 |    "metadata": {
281 |     "collapsed": false
282 |    },
283 |    "outputs": [],
284 |    "source": [
285 |     "Japanese_string ='【北京時事】中国商務省は14日、核'\n",
286 |     "\n"
287 |    ]
288 |   },
289 |   {
290 |    "cell_type": "code",
291 |    "execution_count": 16,
292 |    "metadata": {
293 |     "collapsed": false
294 |    },
295 |    "outputs": [
296 |     {
297 |      "data": {
298 |       "text/plain": [
299 |        "<_sre.SRE_Match object; span=(16, 17), match='核'>"
300 |       ]
301 |      },
302 |      "execution_count": 16,
303 |      "metadata": {},
304 |      "output_type": "execute_result"
305 |     }
306 |    ],
307 |    "source": [
308 |     "re.search('核', Japanese_string)"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": null,
314 |    "metadata": {
315 |     "collapsed": true
316 |    },
317 |    "outputs": [],
318 |    "source": []
319 |   }
320 |  ],
321 |  "metadata": {
322 |   "kernelspec": {
323 |    "display_name": "Python 3",
324 |    "language": "python",
325 |    "name": "python3"
326 |   },
327 |   "language_info": {
328 |    "codemirror_mode": {
329 |     "name": "ipython",
330 |     "version": 3
331 |    },
332 |    "file_extension": ".py",
333 |    "mimetype": "text/x-python",
334 |    "name": "python",
335 |    "nbconvert_exporter": "python",
336 |    "pygments_lexer": "ipython3",
337 |    "version": "3.6.0"
338 |   }
339 |  },
340 |  "nbformat": 4,
341 |  "nbformat_minor": 1
342 | }
343 | 


--------------------------------------------------------------------------------
/Regular Expressions Made Easy - part 11-Re.Split .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import re"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "## re.methods"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "markdown",
 23 |    "metadata": {},
 24 |    "source": [
 25 |     "### re.split"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 17,
 31 |    "metadata": {
 32 |     "collapsed": false
 33 |    },
 34 |    "outputs": [
 35 |     {
 36 |      "data": {
 37 |       "text/plain": [
 38 |        "['Today is sunny', ' I want go to the park', ' I want to eat ice cream', '']"
 39 |       ]
 40 |      },
 41 |      "execution_count": 17,
 42 |      "metadata": {},
 43 |      "output_type": "execute_result"
 44 |     }
 45 |    ],
 46 |    "source": [
 47 |     "#Example 1\n",
 48 |     "re.split('\\.','Today is sunny. I want go to the park. I want to eat ice cream.')"
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 18,
 54 |    "metadata": {
 55 |     "collapsed": false
 56 |    },
 57 |    "outputs": [
 58 |     {
 59 |      "data": {
 60 |       "text/plain": [
 61 |        "['Today is sunny',\n",
 62 |        " '.',\n",
 63 |        " ' I want go to the park',\n",
 64 |        " '.',\n",
 65 |        " ' I want to eat ice cream',\n",
 66 |        " '.',\n",
 67 |        " '']"
 68 |       ]
 69 |      },
 70 |      "execution_count": 18,
 71 |      "metadata": {},
 72 |      "output_type": "execute_result"
 73 |     }
 74 |    ],
 75 |    "source": [
 76 |     "#includes split point\n",
 77 |     "re.split('(\\.)','Today is sunny. I want go to the park. I want to eat ice cream.')"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 19,
 83 |    "metadata": {
 84 |     "collapsed": false
 85 |    },
 86 |    "outputs": [
 87 |     {
 88 |      "data": {
 89 |       "text/plain": [
 90 |        "['Today is sunny.',\n",
 91 |        " ' I want go to the park.',\n",
 92 |        " ' I want to eat ice cream.',\n",
 93 |        " '.']"
 94 |       ]
 95 |      },
 96 |      "execution_count": 19,
 97 |      "metadata": {},
 98 |      "output_type": "execute_result"
 99 |     }
100 |    ],
101 |    "source": [
102 |     "split = '.'\n",
103 |     "[i+split for i in re.split('\\.','Today is sunny. I want go to the park. I want to eat ice cream.')]"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {
110 |     "collapsed": true
111 |    },
112 |    "outputs": [],
113 |    "source": [
114 |     "#Example 2:\n",
115 |     "\n",
116 |     "string = '<p>My mother has <span style=\"color:blue\">blue</span> eyes.</p>'\n"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {
123 |     "collapsed": false
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "re.split('<\\w+>', string)   #doesn't work"
128 |    ]
129 |   },
130 |   {
131 |    "cell_type": "code",
132 |    "execution_count": null,
133 |    "metadata": {
134 |     "collapsed": false
135 |    },
136 |    "outputs": [],
137 |    "source": [
138 |     "re.split('<.+>', string)  #captures entire string \n",
139 |     "                                #because it's greedy"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {
146 |     "collapsed": false
147 |    },
148 |    "outputs": [],
149 |    "source": [
150 |     "re.split(\"<[^<>]+>\", string) #empty string problem"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": null,
156 |    "metadata": {
157 |     "collapsed": true
158 |    },
159 |    "outputs": [],
160 |    "source": [
161 |     "#Another example of the split and empty-strings problem"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": null,
167 |    "metadata": {
168 |     "collapsed": false
169 |    },
170 |    "outputs": [],
171 |    "source": [
172 |     "re.split(',', ',happy, birthday,') #leading and trailing delimiters produce empty strings in the result"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "markdown",
177 |    "metadata": {},
178 |    "source": [
179 |     "## Handling empty string problem"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": null,
185 |    "metadata": {
186 |     "collapsed": true
187 |    },
188 |    "outputs": [],
189 |    "source": [
190 |     "# list comprehensions"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "collapsed": false
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "[i for i in re.split(\"<[^<>]+>\", string) if i != '']"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": null,
207 |    "metadata": {
208 |     "collapsed": true
209 |    },
210 |    "outputs": [],
211 |    "source": [
212 |     "#Alternatives to split --"
213 |    ]
214 |   },
215 |   {
216 |    "cell_type": "code",
217 |    "execution_count": null,
218 |    "metadata": {
219 |     "collapsed": true
220 |    },
221 |    "outputs": [],
222 |    "source": [
223 |     "string = '<p>My mother has <span style=\"color:blue\">blue</span> eyes.</p>'"
224 |    ]
225 |   },
226 |   {
227 |    "cell_type": "code",
228 |    "execution_count": null,
229 |    "metadata": {
230 |     "collapsed": false
231 |    },
232 |    "outputs": [],
233 |    "source": [
234 |     "re.findall('>([^<]+)<',string)  #findall"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": null,
240 |    "metadata": {
241 |     "collapsed": false
242 |    },
243 |    "outputs": [],
244 |    "source": [
245 |     "string = re.split(',', ',happy, birthday,')\n",
246 |     "\n",
247 |     "re.split(',', ',happy, birthday,')"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {
254 |     "collapsed": false
255 |    },
256 |    "outputs": [],
257 |    "source": [
258 |     "string = ',happy, birthday,'"
259 |    ]
260 |   },
261 |   {
262 |    "cell_type": "code",
263 |    "execution_count": null,
264 |    "metadata": {
265 |     "collapsed": false
266 |    },
267 |    "outputs": [],
268 |    "source": [
269 |     "list(filter(None, string.split(',')))"
270 |    ]
271 |   }
272 |  ],
273 |  "metadata": {
274 |   "kernelspec": {
275 |    "display_name": "Python 3",
276 |    "language": "python",
277 |    "name": "python3"
278 |   },
279 |   "language_info": {
280 |    "codemirror_mode": {
281 |     "name": "ipython",
282 |     "version": 3
283 |    },
284 |    "file_extension": ".py",
285 |    "mimetype": "text/x-python",
286 |    "name": "python",
287 |    "nbconvert_exporter": "python",
288 |    "pygments_lexer": "ipython3",
289 |    "version": "3.6.0"
290 |   }
291 |  },
292 |  "nbformat": 4,
293 |  "nbformat_minor": 1
294 | }
295 | 


--------------------------------------------------------------------------------
/Regular Expressions Made Easy - part 12 - re.Sub.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "import re"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "markdown",
 16 |    "metadata": {},
 17 |    "source": [
 18 |     "### re.sub"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {
 25 |     "collapsed": true
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "string =\"\"\"U.S. stock-index futures pointed\n",
 30 |     "to a solidly higher open on Monday, \n",
 31 |     "indicating that major \n",
 32 |     "benchmarks were poised to USA reboundfrom last week’s sharp decline, \n",
 33 |     "\\nwhich represented their biggest weekly drops in months.\"\"\""
 34 |    ]
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 3,
 39 |    "metadata": {
 40 |     "collapsed": false
 41 |    },
 42 |    "outputs": [
 43 |     {
 44 |      "name": "stdout",
 45 |      "output_type": "stream",
 46 |      "text": [
 47 |       "United States  stock-index futures pointed\n",
 48 |       "to a solidly higher open on Monday, \n",
 49 |       "indicating that major \n",
 50 |       "benchmarks were poised to United States A reboundfrom last week’s sharp decline, \n",
 51 |       "\n",
 52 |       "which represented their biggest weekly drops in months.\n"
 53 |      ]
 54 |     }
 55 |    ],
 56 |    "source": [
 57 |     "print(re.sub('U.S.|US|USA', 'United States ', string ))"
 58 |    ]
 59 |   },
 60 |   {
 61 |    "cell_type": "markdown",
 62 |    "metadata": {
 63 |     "collapsed": true
 64 |    },
 65 |    "source": [
 66 |     "## Using Functions with Sub"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": null,
 72 |    "metadata": {
 73 |     "collapsed": true
 74 |    },
 75 |    "outputs": [],
 76 |    "source": [
 77 |     "#brief explanation of lambda"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 4,
 83 |    "metadata": {
 84 |     "collapsed": false
 85 |    },
 86 |    "outputs": [
 87 |     {
 88 |      "data": {
 89 |       "text/plain": [
 90 |        "9"
 91 |       ]
 92 |      },
 93 |      "execution_count": 4,
 94 |      "metadata": {},
 95 |      "output_type": "execute_result"
 96 |     }
 97 |    ],
 98 |    "source": [
 99 |     "def square(x):\n",
100 |     "    return (x ** 2)\n",
101 |     "\n",
102 |     "square(3)\n",
103 |     "\n"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 5,
109 |    "metadata": {
110 |     "collapsed": false
111 |    },
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "9"
117 |       ]
118 |      },
119 |      "execution_count": 5,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "square = lambda x: x**2     \n",
126 |     "square(3)"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 7,
132 |    "metadata": {
133 |     "collapsed": true
134 |    },
135 |    "outputs": [],
136 |    "source": [
137 |     "string = 'Dan has 3 snails. Mike has 4 cats. Alisa has 9 monkeys.'"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 8,
143 |    "metadata": {
144 |     "collapsed": false
145 |    },
146 |    "outputs": [
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "'3'"
151 |       ]
152 |      },
153 |      "execution_count": 8,
154 |      "metadata": {},
155 |      "output_type": "execute_result"
156 |     }
157 |    ],
158 |    "source": [
159 |     "re.search('(\\d+)', string).group()"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 9,
165 |    "metadata": {
166 |     "collapsed": false
167 |    },
168 |    "outputs": [
169 |     {
170 |      "data": {
171 |       "text/plain": [
172 |        "['3', '4', '9']"
173 |       ]
174 |      },
175 |      "execution_count": 9,
176 |      "metadata": {},
177 |      "output_type": "execute_result"
178 |     }
179 |    ],
180 |    "source": [
181 |     "re.findall('(\\d+)', string)"
182 |    ]
183 |   },
184 |   {
185 |    "cell_type": "code",
186 |    "execution_count": 10,
187 |    "metadata": {
188 |     "collapsed": false
189 |    },
190 |    "outputs": [
191 |     {
192 |      "data": {
193 |       "text/plain": [
194 |        "'Dan has 1 snails. Mike has 1 cats. Alisa has 1 monkeys.'"
195 |       ]
196 |      },
197 |      "execution_count": 10,
198 |      "metadata": {},
199 |      "output_type": "execute_result"
200 |     }
201 |    ],
202 |    "source": [
203 |     "re.sub('(\\d+)', '1', string)  #find all instances like findall"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": null,
209 |    "metadata": {
210 |     "collapsed": true
211 |    },
212 |    "outputs": [],
213 |    "source": [
214 |     "#In this example we replace each number with its square"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 15,
220 |    "metadata": {
221 |     "collapsed": false
222 |    },
223 |    "outputs": [
224 |     {
225 |      "data": {
226 |       "text/plain": [
227 |        "'Dan has 9 snails. Mike has 16 cats. Alisa has 81 monkeys.'"
228 |       ]
229 |      },
230 |      "execution_count": 15,
231 |      "metadata": {},
232 |      "output_type": "execute_result"
233 |     }
234 |    ],
235 |    "source": [
236 |     "re.sub('(\\d+)', lambda x: str(square(int(x.group(0)))), string)"
237 |    ]
238 |   },
239 |   {
240 |    "cell_type": "code",
241 |    "execution_count": null,
242 |    "metadata": {
243 |     "collapsed": true
244 |    },
245 |    "outputs": [],
246 |    "source": [
247 |     "#re.sub('(\\d+)', lambda x: str(x), string)"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": null,
253 |    "metadata": {
254 |     "collapsed": false
255 |    },
256 |    "outputs": [],
257 |    "source": [
258 |     "#step 1   lambda x: x.group   x is match object\n",
259 |     "#step 2   turn the result into int\n",
260 |     "#step 3   Use Square function\n",
261 |     "#step 4   turn back to string"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": null,
267 |    "metadata": {
268 |     "collapsed": true
269 |    },
270 |    "outputs": [],
271 |    "source": [
272 |     "# Another example with function"
273 |    ]
274 |   },
275 |   {
276 |    "cell_type": "code",
277 |    "execution_count": 16,
278 |    "metadata": {
279 |     "collapsed": false
280 |    },
281 |    "outputs": [
282 |     {
283 |      "name": "stdout",
284 |      "output_type": "stream",
285 |      "text": [
286 |       "eating laughing sleeping studying\n"
287 |      ]
288 |     }
289 |    ],
290 |    "source": [
291 |     "#m = match object\n",
292 |     "import re\n",
293 |     "\n",
294 |     "# The input string.\n",
295 |     "input = \"eat laugh sleep study\"\n",
296 |     "\n",
297 |     "# Use lambda to add \"ing\" to all words.\n",
298 |     "result = re.sub(\"\\w+\", lambda m: m.group() + \"ing\", input)\n",
299 |     "\n",
300 |     "# Display result.\n",
301 |     "print(result)"
302 |    ]
303 |   },
304 |   {
305 |    "cell_type": "code",
306 |    "execution_count": null,
307 |    "metadata": {
308 |     "collapsed": false
309 |    },
310 |    "outputs": [],
311 |    "source": [
312 |     "import re\n",
313 |     "\n",
314 |     "# The input string.\n",
315 |     "input = \"eat laugh sleep study\"\n",
316 |     "\n",
317 |     "# Use lambda to add \"ing\" to all words.\n",
318 |     "result = re.sub(\"\\w+\", lambda m: m.group() + \"ing\", input)\n",
319 |     "\n",
320 |     "# Display result.\n",
321 |     "print(result)"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "markdown",
326 |    "metadata": {
327 |     "collapsed": true
328 |    },
329 |    "source": [
330 |     "### backreferencing with subs"
331 |    ]
332 |   },
333 |   {
334 |    "cell_type": "code",
335 |    "execution_count": 18,
336 |    "metadata": {
337 |     "collapsed": true
338 |    },
339 |    "outputs": [],
340 |    "source": [
341 |     "string = 'Merry Merry Christmas'"
342 |    ]
343 |   },
344 |   {
345 |    "cell_type": "code",
346 |    "execution_count": 19,
347 |    "metadata": {
348 |     "collapsed": false
349 |    },
350 |    "outputs": [
351 |     {
352 |      "data": {
353 |       "text/plain": [
354 |        "('Merry ', 'Merry ')"
355 |       ]
356 |      },
357 |      "execution_count": 19,
358 |      "metadata": {},
359 |      "output_type": "execute_result"
360 |     }
361 |    ],
362 |    "source": [
363 |     "re.search(r'(\\w+ )(\\1)', string).groups()"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 20,
369 |    "metadata": {
370 |     "collapsed": false
371 |    },
372 |    "outputs": [
373 |     {
374 |      "data": {
375 |       "text/plain": [
376 |        "('Merry ', 'Merry ')"
377 |       ]
378 |      },
379 |      "execution_count": 20,
380 |      "metadata": {},
381 |      "output_type": "execute_result"
382 |     }
383 |    ],
384 |    "source": [
385 |     "re.search(r'(\\w+ )(\\1)', string).group(1,2)"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": null,
391 |    "metadata": {
392 |     "collapsed": true
393 |    },
394 |    "outputs": [],
395 |    "source": [
396 |     "#backreferencing example with sub"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 21,
402 |    "metadata": {
403 |     "collapsed": false
404 |    },
405 |    "outputs": [
406 |     {
407 |      "data": {
408 |       "text/plain": [
409 |        "'Happy Merry Christmas'"
410 |       ]
411 |      },
412 |      "execution_count": 21,
413 |      "metadata": {},
414 |      "output_type": "execute_result"
415 |     }
416 |    ],
417 |    "source": [
418 |     "re.sub(r'(\\w+) (\\1)',r'Happy \\1', string)   # \\1 = Merry"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": 22,
424 |    "metadata": {
425 |     "collapsed": false
426 |    },
427 |    "outputs": [
428 |     {
429 |      "data": {
430 |       "text/plain": [
431 |        "'Merry Happy Christmas'"
432 |       ]
433 |      },
434 |      "execution_count": 22,
435 |      "metadata": {},
436 |      "output_type": "execute_result"
437 |     }
438 |    ],
439 |    "source": [
440 |     "re.sub(r'(\\w+) (\\1)',r'\\1 Happy', string)    #Merry Happy"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": 23,
446 |    "metadata": {
447 |     "collapsed": false
448 |    },
449 |    "outputs": [
450 |     {
451 |      "data": {
452 |       "text/plain": [
453 |        "'Happy Merry Christmas'"
454 |       ]
455 |      },
456 |      "execution_count": 23,
457 |      "metadata": {},
458 |      "output_type": "execute_result"
459 |     }
460 |    ],
461 |    "source": [
462 |     "re.sub(r'(\\w+) (\\1)',r'Happy \\2', string)"
463 |    ]
464 |   },
465 |   {
466 |    "cell_type": "code",
467 |    "execution_count": null,
468 |    "metadata": {
469 |     "collapsed": true
470 |    },
471 |    "outputs": [],
472 |    "source": []
473 |   }
474 |  ],
475 |  "metadata": {
476 |   "kernelspec": {
477 |    "display_name": "Python 3",
478 |    "language": "python",
479 |    "name": "python3"
480 |   },
481 |   "language_info": {
482 |    "codemirror_mode": {
483 |     "name": "ipython",
484 |     "version": 3
485 |    },
486 |    "file_extension": ".py",
487 |    "mimetype": "text/x-python",
488 |    "name": "python",
489 |    "nbconvert_exporter": "python",
490 |    "pygments_lexer": "ipython3",
491 |    "version": "3.6.0"
492 |   }
493 |  },
494 |  "nbformat": 4,
495 |  "nbformat_minor": 1
496 | }
497 | 


--------------------------------------------------------------------------------
/Regular Expressions Made Easy -13 -  Word Boundaries.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "### \\b meta character"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {
 13 |     "collapsed": true
 14 |    },
 15 |    "source": [
 16 |     " \\b is called 'boundary' and allows you to isolate words. \n",
 17 |     "\n",
 18 |     "- is similar to ^ and $ (it matches a location and consumes no characters)\n"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 1,
 24 |    "metadata": {
 25 |     "collapsed": true
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "import re"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 4,
 35 |    "metadata": {
 36 |     "collapsed": true
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "string = 'cat catherine catholic  wildcat copycat uncatchable'"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 2,
 46 |    "metadata": {
 47 |     "collapsed": false
 48 |    },
 49 |    "outputs": [],
 50 |    "source": [
 51 |     "pattern = re.compile('cat')"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": 5,
 57 |    "metadata": {
 58 |     "collapsed": false
 59 |    },
 60 |    "outputs": [
 61 |     {
 62 |      "data": {
 63 |       "text/plain": [
 64 |        "['cat', 'cat', 'cat', 'cat', 'cat', 'cat']"
 65 |       ]
 66 |      },
 67 |      "execution_count": 5,
 68 |      "metadata": {},
 69 |      "output_type": "execute_result"
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "re.findall(pattern, string)"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": null,
 79 |    "metadata": {
 80 |     "collapsed": true
 81 |    },
 82 |    "outputs": [],
 83 |    "source": [
 84 |     "#using space"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": 6,
 90 |    "metadata": {
 91 |     "collapsed": true
 92 |    },
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "string = 'cat catherine catholic  wildcat copycat uncatchable'"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 9,
101 |    "metadata": {
102 |     "collapsed": true
103 |    },
104 |    "outputs": [],
105 |    "source": [
106 |     "pattern = re.compile(' cat ')"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 10,
112 |    "metadata": {
113 |     "collapsed": false
114 |    },
115 |    "outputs": [
116 |     {
117 |      "data": {
118 |       "text/plain": [
119 |        "[]"
120 |       ]
121 |      },
122 |      "execution_count": 10,
123 |      "metadata": {},
124 |      "output_type": "execute_result"
125 |     }
126 |    ],
127 |    "source": [
128 |     "re.findall(pattern, string)"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "collapsed": true
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "#only pull out cat with boundary"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": null,
145 |    "metadata": {
146 |     "collapsed": true
147 |    },
148 |    "outputs": [],
149 |    "source": [
150 |     "string = 'cat catherine catholic  wildcat copycat uncatchable'"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 11,
156 |    "metadata": {
157 |     "collapsed": true
158 |    },
159 |    "outputs": [],
160 |    "source": [
161 |     "pattern = re.compile(r'\\bcat\\b')"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 12,
167 |    "metadata": {
168 |     "collapsed": false
169 |    },
170 |    "outputs": [
171 |     {
172 |      "data": {
173 |       "text/plain": [
174 |        "['cat']"
175 |       ]
176 |      },
177 |      "execution_count": 12,
178 |      "metadata": {},
179 |      "output_type": "execute_result"
180 |     }
181 |    ],
182 |    "source": [
183 |     "re.findall(pattern, string)"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "markdown",
188 |    "metadata": {},
189 |    "source": [
190 |     "### Word boundaries nuances\n"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "collapsed": true
198 |    },
199 |    "outputs": [],
200 |    "source": [
201 |     "#be careful with periods(dot) and non-alphanumeric characters \n",
202 |     "#   \\w  [A-Za-z0-9_]   \\W  +:@^%"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 16,
208 |    "metadata": {
209 |     "collapsed": true
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "string = '.cat catherine catholic  wildcat copycat uncatchable'"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 17,
219 |    "metadata": {
220 |     "collapsed": true
221 |    },
222 |    "outputs": [],
223 |    "source": [
224 |     "pattern = re.compile(r'\\bcat\\b')"
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": 18,
230 |    "metadata": {
231 |     "collapsed": false
232 |    },
233 |    "outputs": [
234 |     {
235 |      "data": {
236 |       "text/plain": [
237 |        "['cat']"
238 |       ]
239 |      },
240 |      "execution_count": 18,
241 |      "metadata": {},
242 |      "output_type": "execute_result"
243 |     }
244 |    ],
245 |    "source": [
246 |     "re.findall(pattern, string)"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": null,
252 |    "metadata": {
253 |     "collapsed": true
254 |    },
255 |    "outputs": [],
256 |    "source": [
257 |     "# . = non-alphanumeric"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": null,
263 |    "metadata": {
264 |     "collapsed": true
265 |    },
266 |    "outputs": [],
267 |    "source": [
268 |     "#One side of the boundary has to be an alphanumeric character and the other side \n",
269 |     "#has to be a non-alphanumeric character (or the start/end of the string)"
270 |    ]
271 |   },
272 |   {
273 |    "cell_type": "code",
274 |    "execution_count": null,
275 |    "metadata": {
276 |     "collapsed": true
277 |    },
278 |    "outputs": [],
279 |    "source": []
280 |   },
281 |   {
282 |    "cell_type": "code",
283 |    "execution_count": 19,
284 |    "metadata": {
285 |     "collapsed": true
286 |    },
287 |    "outputs": [],
288 |    "source": [
289 |     "string = '@cat cat catherine catholic  wildcat copycat uncatchable'"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 21,
295 |    "metadata": {
296 |     "collapsed": false
297 |    },
298 |    "outputs": [],
299 |    "source": [
300 |     "pattern = re.compile(r'\\bcat\\b')"
301 |    ]
302 |   },
303 |   {
304 |    "cell_type": "code",
305 |    "execution_count": 22,
306 |    "metadata": {
307 |     "collapsed": false
308 |    },
309 |    "outputs": [
310 |     {
311 |      "data": {
312 |       "text/plain": [
313 |        "['cat', 'cat']"
314 |       ]
315 |      },
316 |      "execution_count": 22,
317 |      "metadata": {},
318 |      "output_type": "execute_result"
319 |     }
320 |    ],
321 |    "source": [
322 |     "re.findall(pattern, string)"
323 |    ]
324 |   },
325 |   {
326 |    "cell_type": "code",
327 |    "execution_count": null,
328 |    "metadata": {
329 |     "collapsed": true
330 |    },
331 |    "outputs": [],
332 |    "source": [
333 |     "#Example 2  Twitter examples   Twitter Handles"
334 |    ]
335 |   },
336 |   {
337 |    "cell_type": "code",
338 |    "execution_count": 24,
339 |    "metadata": {
340 |     "collapsed": false
341 |    },
342 |    "outputs": [],
343 |    "source": [
344 |     "string = '@moondra2017.org'\n",
345 |     "string2 = '@moondra'\n",
346 |     "string3 = 'Python@moondra'\n",
347 |     "string4 = '@moondra_python'"
348 |    ]
349 |   },
350 |   {
351 |    "cell_type": "code",
352 |    "execution_count": null,
353 |    "metadata": {
354 |     "collapsed": true
355 |    },
356 |    "outputs": [],
357 |    "source": [
358 |     "#we only want @moondra and '@moondra_python' -- string 2 and string 4"
359 |    ]
360 |   },
361 |   {
362 |    "cell_type": "code",
363 |    "execution_count": 28,
364 |    "metadata": {
365 |     "collapsed": false
366 |    },
367 |    "outputs": [],
368 |    "source": [
369 |     "pattern = re.compile(r'\\b@[\\w]+\\b')    #no good\n",
370 |     "re.search(pattern, string)"
371 |    ]
372 |   },
373 |   {
374 |    "cell_type": "code",
375 |    "execution_count": null,
376 |    "metadata": {
377 |     "collapsed": true
378 |    },
379 |    "outputs": [],
380 |    "source": [
381 |     "string = '@moondra2017.org'\n",
382 |     "string2 = '@moondra'\n",
383 |     "string3 = 'Python@moondra'\n",
384 |     "string4 = '@moondra_python'"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 32,
390 |    "metadata": {
391 |     "collapsed": false
392 |    },
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "<_sre.SRE_Match object; span=(0, 12), match='@moondra2017'>"
398 |       ]
399 |      },
400 |      "execution_count": 32,
401 |      "metadata": {},
402 |      "output_type": "execute_result"
403 |     }
404 |    ],
405 |    "source": [
406 |     "pattern = re.compile(r'\\B@[\\w]+\\b')    # _  is included in \\w\n",
407 |     "re.search(pattern, string)            # This works but not perfect"
408 |    ]
409 |   },
410 |   {
411 |    "cell_type": "code",
412 |    "execution_count": 41,
413 |    "metadata": {
414 |     "collapsed": true
415 |    },
416 |    "outputs": [],
417 |    "source": [
418 |     "string = '@moondra2017.org'\n",
419 |     "string2 = '@moondra @moondra @moondra'\n",
420 |     "string3 = 'Python@moondra'\n",
421 |     "string4 = '@moondra_python'"
422 |    ]
423 |   },
424 |   {
425 |    "cell_type": "code",
426 |    "execution_count": 48,
427 |    "metadata": {
428 |     "collapsed": false
429 |    },
430 |    "outputs": [
431 |     {
432 |      "data": {
433 |       "text/plain": [
434 |        "[]"
435 |       ]
436 |      },
437 |      "execution_count": 48,
438 |      "metadata": {},
439 |      "output_type": "execute_result"
440 |     }
441 |    ],
442 |    "source": [
443 |     "pattern = re.compile(r'\\B@[\\w]+\\b(?!\\.)')\n",
444 |     "re.findall(pattern, string)"
445 |    ]
446 |   },
447 |   {
448 |    "cell_type": "code",
449 |    "execution_count": 33,
450 |    "metadata": {
451 |     "collapsed": true
452 |    },
453 |    "outputs": [],
454 |    "source": [
455 |     "pattern = re.compile(r'\\B@[\\w]+$')    #  #This is perfect\n",
456 |     "re.search(pattern, string)"
457 |    ]
458 |   },
459 |   {
460 |    "cell_type": "code",
461 |    "execution_count": 38,
462 |    "metadata": {
463 |     "collapsed": false
464 |    },
465 |    "outputs": [
466 |     {
467 |      "data": {
468 |       "text/plain": [
469 |        "['@moondra']"
470 |       ]
471 |      },
472 |      "execution_count": 38,
473 |      "metadata": {},
474 |      "output_type": "execute_result"
475 |     }
476 |    ],
477 |    "source": [
478 |     "pattern = re.compile(r'\\B@[\\w]+$') \n",
479 |     "re.findall(pattern, string2)"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": 35,
485 |    "metadata": {
486 |     "collapsed": false
487 |    },
488 |    "outputs": [],
489 |    "source": [
490 |     "pattern = re.compile(r'\\B@[\\w]+$') \n",
491 |     "re.search(pattern, string3)\n"
492 |    ]
493 |   },
494 |   {
495 |    "cell_type": "code",
496 |    "execution_count": 36,
497 |    "metadata": {
498 |     "collapsed": false
499 |    },
500 |    "outputs": [
501 |     {
502 |      "data": {
503 |       "text/plain": [
504 |        "<_sre.SRE_Match object; span=(0, 15), match='@moondra_python'>"
505 |       ]
506 |      },
507 |      "execution_count": 36,
508 |      "metadata": {},
509 |      "output_type": "execute_result"
510 |     }
511 |    ],
512 |    "source": [
513 |     "pattern = re.compile(r'\\B@[\\w]+$')\n",
514 |     "re.search(pattern, string4)"
515 |    ]
516 |   },
517 |   {
518 |    "cell_type": "code",
519 |    "execution_count": null,
520 |    "metadata": {
521 |     "collapsed": true
522 |    },
523 |    "outputs": [],
524 |    "source": []
525 |   },
526 |   {
527 |    "cell_type": "code",
528 |    "execution_count": null,
529 |    "metadata": {
530 |     "collapsed": true
531 |    },
532 |    "outputs": [],
533 |    "source": []
534 |   }
535 |  ],
536 |  "metadata": {
537 |   "kernelspec": {
538 |    "display_name": "Python 3",
539 |    "language": "python",
540 |    "name": "python3"
541 |   },
542 |   "language_info": {
543 |    "codemirror_mode": {
544 |     "name": "ipython",
545 |     "version": 3
546 |    },
547 |    "file_extension": ".py",
548 |    "mimetype": "text/x-python",
549 |    "name": "python",
550 |    "nbconvert_exporter": "python",
551 |    "pygments_lexer": "ipython3",
552 |    "version": "3.6.0"
553 |   }
554 |  },
555 |  "nbformat": 4,
556 |  "nbformat_minor": 1
557 | }
558 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 1 + part 2.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "#Regular Expressions are used to match string patterns.\n",
 12 |     "-They are very powerful\n",
 13 |     "\n",
 14 |     "-If you want to pull out a string pattern RE can do it\n",
 15 |     "\n",
 16 |     "-They may seem intimidating  \n"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "markdown",
 21 |    "metadata": {},
 22 |    "source": [
 23 |     "# Things to note"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": null,
 29 |    "metadata": {
 30 |     "collapsed": true
 31 |    },
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "#The first thing I want to start off with is the backslash character\n",
 35 |     "#Very confusing to people"
 36 |    ]
 37 |   },
 38 |   {
 39 |    "cell_type": "code",
 40 |    "execution_count": null,
 41 |    "metadata": {
 42 |     "collapsed": true
 43 |    },
 44 |    "outputs": [],
 45 |    "source": [
 46 |     "#Python uses back slash to indicate special characters \n",
 47 |     "\n",
 48 |     "\n",
 49 |     "'\\n'  Backslash followed by n denotes a newline\n",
 50 |     "'\\t'  denotes a tab\n",
 51 |     "\n"
 52 |    ]
 53 |   },
 54 |   {
 55 |    "cell_type": "code",
 56 |    "execution_count": null,
 57 |    "metadata": {
 58 |     "collapsed": true
 59 |    },
 60 |    "outputs": [],
 61 |    "source": [
 62 |     "\n",
 63 |     "\n",
 64 |     " 'r' expression, that voids the Python's special characters\n",
 65 |     "\n",
 66 |     " r'\\n' means it's a raw string with two characters, '\\' and 'n', as \n",
 67 |     "opposed to just one special character  "
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {
 74 |     "collapsed": true
 75 |    },
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "\n",
 79 |     "#Let's see some examples of this; don't mind the Python syntax\n"
 80 |    ]
 81 |   },
 82 |   {
 83 |    "cell_type": "code",
 84 |    "execution_count": 1,
 85 |    "metadata": {
 86 |     "collapsed": true
 87 |    },
 88 |    "outputs": [],
 89 |    "source": [
 90 |     "import re\n",
 91 |     "re.search('n', '\\n')  #first item is pattern, second item is string"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 2,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [
101 |     {
102 |      "data": {
103 |       "text/plain": [
104 |        "<_sre.SRE_Match object; span=(1, 2), match='n'>"
105 |       ]
106 |      },
107 |      "execution_count": 2,
108 |      "metadata": {},
109 |      "output_type": "execute_result"
110 |     }
111 |    ],
112 |    "source": [
113 |     "#two ways to handle this: one way is to put an extra \\ before every backslash\n",
114 |     "\n",
115 |     "import re\n",
116 |     "re.search('n', '\\\\n')   "
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": 4,
122 |    "metadata": {
123 |     "collapsed": true
124 |    },
125 |    "outputs": [],
126 |    "source": [
127 |     "re.search('n',  '\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n')  #not the best way if we\n",
128 |     "                                                   #have too many \\s"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": 5,
134 |    "metadata": {
135 |     "collapsed": false
136 |    },
137 |    "outputs": [
138 |     {
139 |      "data": {
140 |       "text/plain": [
141 |        "<_sre.SRE_Match object; span=(1, 2), match='n'>"
142 |       ]
143 |      },
144 |      "execution_count": 5,
145 |      "metadata": {},
146 |      "output_type": "execute_result"
147 |     }
148 |    ],
149 |    "source": [
150 |     "re.search('n',  r'\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n')    #r converts to raw string"
151 |    ]
152 |   },
153 |   {
154 |    "cell_type": "code",
155 |    "execution_count": 6,
156 |    "metadata": {
157 |     "collapsed": false
158 |    },
159 |    "outputs": [
160 |     {
161 |      "data": {
162 |       "text/plain": [
163 |        "<_sre.SRE_Match object; span=(0, 1), match='\\n'>"
164 |       ]
165 |      },
166 |      "execution_count": 6,
167 |      "metadata": {},
168 |      "output_type": "execute_result"
169 |     }
170 |    ],
171 |    "source": [
172 |     "#there are some nuances that you should be aware of\n",
173 |     "#regular expressions has its own special characters as well\n",
174 |     "# regex with '\\n' and r'\\n' both look for newline\n",
175 |     "\n",
176 |     "re.search('\\n',  '\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n')   "
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 7,
182 |    "metadata": {
183 |     "collapsed": false
184 |    },
185 |    "outputs": [
186 |     {
187 |      "data": {
188 |       "text/plain": [
189 |        "<_sre.SRE_Match object; span=(0, 1), match='\\n'>"
190 |       ]
191 |      },
192 |      "execution_count": 7,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "re.search(r'\\n',  '\\n\\n')     #this works as well because r'\\n' also looks\n",
199 |     "                                #for new line"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 8,
205 |    "metadata": {
206 |     "collapsed": false
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "#doesn't work because the string doesn't contain a newline and r'\\n' looks for a newline\n",
211 |     "\n",
212 |     "re.search(r'\\n',  r'\\n\\n')    #r\n",
213 |     "\n"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "# MATCH and SEARCH EXAMPLES"
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "code",
225 |    "execution_count": null,
226 |    "metadata": {
227 |     "collapsed": true
228 |    },
229 |    "outputs": [],
230 |    "source": [
231 |     "REs common methods - Match and Search"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": null,
237 |    "metadata": {
238 |     "collapsed": true
239 |    },
240 |    "outputs": [],
241 |    "source": [
242 |     "re.search(pattern, string, flags)  # searches anywhere in the sentence\n",
243 |     "                                    #flags special options"
244 |    ]
245 |   },
246 |   {
247 |    "cell_type": "code",
248 |    "execution_count": null,
249 |    "metadata": {
250 |     "collapsed": true
251 |    },
252 |    "outputs": [],
253 |    "source": [
254 |     "re.match(pattern, string, flags) # only beginning of the string"
255 |    ]
256 |   },
257 |   {
258 |    "cell_type": "code",
259 |    "execution_count": 11,
260 |    "metadata": {
261 |     "collapsed": true
262 |    },
263 |    "outputs": [],
264 |    "source": [
265 |     "re.match(\"c\", \"abcdef\")  #returns none because only looks at the start of string"
266 |    ]
267 |   },
268 |   {
269 |    "cell_type": "code",
270 |    "execution_count": 10,
271 |    "metadata": {
272 |     "collapsed": false
273 |    },
274 |    "outputs": [
275 |     {
276 |      "data": {
277 |       "text/plain": [
278 |        "<_sre.SRE_Match object; span=(2, 3), match='c'>"
279 |       ]
280 |      },
281 |      "execution_count": 10,
282 |      "metadata": {},
283 |      "output_type": "execute_result"
284 |     }
285 |    ],
286 |    "source": [
287 |     "re.search(\"c\", \"abcdef\")   #searches anywhere"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": null,
293 |    "metadata": {
294 |     "collapsed": true
295 |    },
296 |    "outputs": [],
297 |    "source": []
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": 12,
302 |    "metadata": {
303 |     "collapsed": false
304 |    },
305 |    "outputs": [
306 |     {
307 |      "data": {
308 |       "text/plain": [
309 |        "False"
310 |       ]
311 |      },
312 |      "execution_count": 12,
313 |      "metadata": {},
314 |      "output_type": "execute_result"
315 |     }
316 |    ],
317 |    "source": [
318 |     "bool(re.match(\"c\", \"abcdef\"))  # no match returns boolean false"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 13,
324 |    "metadata": {
325 |     "collapsed": false
326 |    },
327 |    "outputs": [
328 |     {
329 |      "data": {
330 |       "text/plain": [
331 |        "True"
332 |       ]
333 |      },
334 |      "execution_count": 13,
335 |      "metadata": {},
336 |      "output_type": "execute_result"
337 |     }
338 |    ],
339 |    "source": [
340 |     "bool(re.match(\"a\", \"abcdef\"))  #match returns true"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 14,
346 |    "metadata": {
347 |     "collapsed": false
348 |    },
349 |    "outputs": [
350 |     {
351 |      "data": {
352 |       "text/plain": [
353 |        "<_sre.SRE_Match object; span=(2, 3), match='c'>"
354 |       ]
355 |      },
356 |      "execution_count": 14,
357 |      "metadata": {},
358 |      "output_type": "execute_result"
359 |     }
360 |    ],
361 |    "source": [
362 |     "re.search(\"c\", \"abcdef\")  #tells you where it matched first and only first"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 15,
368 |    "metadata": {
369 |     "collapsed": false
370 |    },
371 |    "outputs": [
372 |     {
373 |      "data": {
374 |       "text/plain": [
375 |        "<_sre.SRE_Match object; span=(2, 3), match='c'>"
376 |       ]
377 |      },
378 |      "execution_count": 15,
379 |      "metadata": {},
380 |      "output_type": "execute_result"
381 |     }
382 |    ],
383 |    "source": [
384 |     "re.search(\"c\", \"abcdefc\")  #multiple 'c's first instance only"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 16,
390 |    "metadata": {
391 |     "collapsed": false
392 |    },
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "<_sre.SRE_Match object; span=(6, 7), match='c'>"
398 |       ]
399 |      },
400 |      "execution_count": 16,
401 |      "metadata": {},
402 |      "output_type": "execute_result"
403 |     }
404 |    ],
405 |    "source": [
406 |     "re.search(\"c\", \"abdef\\nc\") #multiline works with search"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 17,
412 |    "metadata": {
413 |     "collapsed": false
414 |    },
415 |    "outputs": [],
416 |    "source": [
417 |     "re.match(\"c\", \"abcdef\\nc\")   #match only checks the start of the string, not the start of each line"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "markdown",
422 |    "metadata": {},
423 |    "source": [
424 |     "## Printing the output of match and search"
425 |    ]
426 |   },
427 |   {
428 |    "cell_type": "code",
429 |    "execution_count": 23,
430 |    "metadata": {
431 |     "collapsed": false
432 |    },
433 |    "outputs": [
434 |     {
435 |      "data": {
436 |       "text/plain": [
437 |        "<_sre.SRE_Match object; span=(0, 1), match='a'>"
438 |       ]
439 |      },
440 |      "execution_count": 23,
441 |      "metadata": {},
442 |      "output_type": "execute_result"
443 |     }
444 |    ],
445 |    "source": [
446 |     "(re.match(\"a\", \"abcdef\"))   #match objects"
447 |    ]
448 |   },
449 |   {
450 |    "cell_type": "code",
451 |    "execution_count": 19,
452 |    "metadata": {
453 |     "collapsed": false
454 |    },
455 |    "outputs": [
456 |     {
457 |      "data": {
458 |       "text/plain": [
459 |        "'a'"
460 |       ]
461 |      },
462 |      "execution_count": 19,
463 |      "metadata": {},
464 |      "output_type": "execute_result"
465 |     }
466 |    ],
467 |    "source": [
468 |     "re.match(\"a\", \"abcdef\").group()  #string output #default value is 0"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "code",
473 |    "execution_count": 20,
474 |    "metadata": {
475 |     "collapsed": false
476 |    },
477 |    "outputs": [
478 |     {
479 |      "data": {
480 |       "text/plain": [
481 |        "'a'"
482 |       ]
483 |      },
484 |      "execution_count": 20,
485 |      "metadata": {},
486 |      "output_type": "execute_result"
487 |     }
488 |    ],
489 |    "source": [
490 |     "re.match(\"a\", \"abcdef\").group(0)  "
491 |    ]
492 |   },
493 |   {
494 |    "cell_type": "code",
495 |    "execution_count": 21,
496 |    "metadata": {
497 |     "collapsed": false
498 |    },
499 |    "outputs": [
500 |     {
501 |      "data": {
502 |       "text/plain": [
503 |        "'n'"
504 |       ]
505 |      },
506 |      "execution_count": 21,
507 |      "metadata": {},
508 |      "output_type": "execute_result"
509 |     }
510 |    ],
511 |    "source": [
512 |     "re.search(\"n\", \"abcdefnc abcd\").group()"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 22,
518 |    "metadata": {
519 |     "collapsed": false
520 |    },
521 |    "outputs": [
522 |     {
523 |      "data": {
524 |       "text/plain": [
525 |        "'nc abcd'"
526 |       ]
527 |      },
528 |      "execution_count": 22,
529 |      "metadata": {},
530 |      "output_type": "execute_result"
531 |     }
532 |    ],
533 |    "source": [
534 |     "re.search('n.+', \"abcdefnc abcd\").group()  #pull out different types of strings \n",
535 |     "                                            #depending on the wildcards you use"
536 |    ]
537 |   },
538 |   {
539 |    "cell_type": "code",
540 |    "execution_count": 24,
541 |    "metadata": {
542 |     "collapsed": false
543 |    },
544 |    "outputs": [
545 |     {
546 |      "data": {
547 |       "text/plain": [
548 |        "6"
549 |       ]
550 |      },
551 |      "execution_count": 24,
552 |      "metadata": {},
553 |      "output_type": "execute_result"
554 |     }
555 |    ],
556 |    "source": [
557 |     "re.search(\"c\", \"abdef\\nc\").start()"
558 |    ]
559 |   },
560 |   {
561 |    "cell_type": "code",
562 |    "execution_count": 25,
563 |    "metadata": {
564 |     "collapsed": false
565 |    },
566 |    "outputs": [
567 |     {
568 |      "data": {
569 |       "text/plain": [
570 |        "7"
571 |       ]
572 |      },
573 |      "execution_count": 25,
574 |      "metadata": {},
575 |      "output_type": "execute_result"
576 |     }
577 |    ],
578 |    "source": [
579 |     "re.search(\"c\", \"abdef\\nc\").end()"
580 |    ]
581 |   },
582 |   {
583 |    "cell_type": "markdown",
584 |    "metadata": {
585 |     "collapsed": true
586 |    },
587 |    "source": [
588 |     "##  Literal matching"
589 |    ]
590 |   },
591 |   {
592 |    "cell_type": "code",
593 |    "execution_count": 26,
594 |    "metadata": {
595 |     "collapsed": true
596 |    },
597 |    "outputs": [],
598 |    "source": [
599 |     "re.search('na',\"abcdefnc abcd\" )  #doesn't match: the literal 'na' needs an n immediately followed by an a"
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": 27,
605 |    "metadata": {
606 |     "collapsed": false
607 |    },
608 |    "outputs": [
609 |     {
610 |      "data": {
611 |       "text/plain": [
612 |        "<_sre.SRE_Match object; span=(0, 1), match='a'>"
613 |       ]
614 |      },
615 |      "execution_count": 27,
616 |      "metadata": {},
617 |      "output_type": "execute_result"
618 |     }
619 |    ],
620 |    "source": [
621 |     "re.search('n|a',\"abcdefnc abcda\" )  #n or a"
622 |    ]
623 |   },
624 |   {
625 |    "cell_type": "code",
626 |    "execution_count": 28,
627 |    "metadata": {
628 |     "collapsed": false
629 |    },
630 |    "outputs": [
631 |     {
632 |      "data": {
633 |       "text/plain": [
634 |        "<_sre.SRE_Match object; span=(5, 6), match='n'>"
635 |       ]
636 |      },
637 |      "execution_count": 28,
638 |      "metadata": {},
639 |      "output_type": "execute_result"
640 |     }
641 |    ],
642 |    "source": [
643 |     " re.search('n|a',\"bcdefnc abcda\" )  #removed the leading a, so the first match is an n"
644 |    ]
645 |   },
646 |   {
647 |    "cell_type": "code",
648 |    "execution_count": 29,
649 |    "metadata": {
650 |     "collapsed": false
651 |    },
652 |    "outputs": [
653 |     {
654 |      "data": {
655 |       "text/plain": [
656 |        "<_sre.SRE_Match object; span=(0, 1), match='b'>"
657 |       ]
658 |      },
659 |      "execution_count": 29,
660 |      "metadata": {},
661 |      "output_type": "execute_result"
662 |     }
663 |    ],
664 |    "source": [
665 |     "re.search('n|a|b',\"bcdefnc abcda\" ) # as many OR expressions"
666 |    ]
667 |   },
668 |   {
669 |    "cell_type": "markdown",
670 |    "metadata": {},
671 |    "source": [
672 |     "##  re.findall"
673 |    ]
674 |   },
675 |   {
676 |    "cell_type": "code",
677 |    "execution_count": 30,
678 |    "metadata": {
679 |     "collapsed": false
680 |    },
681 |    "outputs": [
682 |     {
683 |      "data": {
684 |       "text/plain": [
685 |        "['n', 'a', 'a']"
686 |       ]
687 |      },
688 |      "execution_count": 30,
689 |      "metadata": {},
690 |      "output_type": "execute_result"
691 |     }
692 |    ],
693 |    "source": [
694 |     "re.findall('n|a',\"bcdefnc abcda\" ) #find all pulls out all instances"
695 |    ]
696 |   },
697 |   {
698 |    "cell_type": "code",
699 |    "execution_count": 31,
700 |    "metadata": {
701 |     "collapsed": false
702 |    },
703 |    "outputs": [
704 |     {
705 |      "data": {
706 |       "text/plain": [
707 |        "<_sre.SRE_Match object; span=(0, 4), match='abcd'>"
708 |       ]
709 |      },
710 |      "execution_count": 31,
711 |      "metadata": {},
712 |      "output_type": "execute_result"
713 |     }
714 |    ],
715 |    "source": [
716 |     "re.search('abcd',\"abcdefnc abcd\" ) #multiple characters - literal search"
717 |    ]
718 |   },
719 |   {
720 |    "cell_type": "code",
721 |    "execution_count": null,
722 |    "metadata": {
723 |     "collapsed": true
724 |    },
725 |    "outputs": [],
726 |    "source": [
727 |     "re.findall('abcd',\"abcdefnc abcd\" ) "
728 |    ]
729 |   },
730 |   {
731 |    "cell_type": "code",
732 |    "execution_count": null,
733 |    "metadata": {
734 |     "collapsed": true
735 |    },
736 |    "outputs": [],
737 |    "source": []
738 |   }
739 |  ],
740 |  "metadata": {
741 |   "kernelspec": {
742 |    "display_name": "Python 3",
743 |    "language": "python",
744 |    "name": "python3"
745 |   },
746 |   "language_info": {
747 |    "codemirror_mode": {
748 |     "name": "ipython",
749 |     "version": 3
750 |    },
751 |    "file_extension": ".py",
752 |    "mimetype": "text/x-python",
753 |    "name": "python",
754 |    "nbconvert_exporter": "python",
755 |    "pygments_lexer": "ipython3",
756 |    "version": "3.6.0"
757 |   }
758 |  },
759 |  "nbformat": 4,
760 |  "nbformat_minor": 1
761 | }
762 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 3.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## CHARACTER SETS"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "#Character sets can match a set of characters"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 33,
 24 |    "metadata": {
 25 |     "collapsed": false
 26 |    },
 27 |    "outputs": [
 28 |     {
 29 |      "data": {
 30 |       "text/plain": [
 31 |        "<_sre.SRE_Match object; span=(0, 4), match='abcd'>"
 32 |       ]
 33 |      },
 34 |      "execution_count": 33,
 35 |      "metadata": {},
 36 |      "output_type": "execute_result"
 37 |     }
 38 |    ],
 39 |    "source": [
 40 |     "re.search('abcd',\"abcdefnc abcd\" ) # earlier code"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 34,
 46 |    "metadata": {
 47 |     "collapsed": false
 48 |    },
 49 |    "outputs": [
 50 |     {
 51 |      "data": {
 52 |       "text/plain": [
 53 |        "<_sre.SRE_Match object; span=(0, 4), match='abcd'>"
 54 |       ]
 55 |      },
 56 |      "execution_count": 34,
 57 |      "metadata": {},
 58 |      "output_type": "execute_result"
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "re.search(r'\\w\\w\\w\\w',\"abcdefnc abcd\" )      #matches characters and numbers\n",
 63 |     "                                        #alpha numeric characters "
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {
 69 |     "collapsed": true
 70 |    },
 71 |    "source": [
 72 |     "\\w matches alphanumeric characters and the underscore: [a-zA-Z0-9_] "
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": 35,
 78 |    "metadata": {
 79 |     "collapsed": false
 80 |    },
 81 |    "outputs": [
 82 |     {
 83 |      "data": {
 84 |       "text/plain": [
 85 |        "<_sre.SRE_Match object; span=(0, 4), match='ab_c'>"
 86 |       ]
 87 |      },
 88 |      "execution_count": 35,
 89 |      "metadata": {},
 90 |      "output_type": "execute_result"
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "re.search(r'\\w\\w\\w\\w',\"ab_cdefnc abcd\" ) #matches _ character"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": 36,
100 |    "metadata": {
101 |     "collapsed": true
102 |    },
103 |    "outputs": [],
104 |    "source": [
105 |     "re.search(r'\\w\\w\\w', \"a3.!-!\")  #doesn't match symbols only numbers and \n",
106 |     "                                #    characters"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": 37,
112 |    "metadata": {
113 |     "collapsed": false
114 |    },
115 |    "outputs": [
116 |     {
117 |      "data": {
118 |       "text/plain": [
119 |        "'a33'"
120 |       ]
121 |      },
122 |      "execution_count": 37,
123 |      "metadata": {},
124 |      "output_type": "execute_result"
125 |     }
126 |    ],
127 |    "source": [
128 |     "re.search(r'\\w\\w\\w', \"a33-_!\") .group()"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "collapsed": true
136 |    },
137 |    "outputs": [],
138 |    "source": [
139 |     "#\\W  opposite of \\w ; so nothing included in   [a-zA-Z0-9_]"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 38,
145 |    "metadata": {
146 |     "collapsed": false
147 |    },
148 |    "outputs": [
149 |     {
150 |      "data": {
151 |       "text/plain": [
152 |        "<_sre.SRE_Match object; span=(0, 3), match='a3.'>"
153 |       ]
154 |      },
155 |      "execution_count": 38,
156 |      "metadata": {},
157 |      "output_type": "execute_result"
158 |     }
159 |    ],
160 |    "source": [
161 |     "\n",
162 |     "re.search(r'\\w\\w\\W', \"a3.-_!\") # \\W matches anything that is not a letter, digit, or underscore"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 39,
168 |    "metadata": {
169 |     "collapsed": false
170 |    },
171 |    "outputs": [
172 |     {
173 |      "data": {
174 |       "text/plain": [
175 |        "<_sre.SRE_Match object; span=(0, 3), match='a3 '>"
176 |       ]
177 |      },
178 |      "execution_count": 39,
179 |      "metadata": {},
180 |      "output_type": "execute_result"
181 |     }
182 |    ],
183 |    "source": [
184 |     "re.search(r'\\w\\w\\W', \"a3 .-_!\")   #matches empty space as well"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {
191 |     "collapsed": true
192 |    },
193 |    "outputs": [],
194 |    "source": [
195 |     "#We will go over other character sets later on"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "markdown",
200 |    "metadata": {},
201 |    "source": [
202 |     "# Let's go over quantifiers"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": null,
208 |    "metadata": {
209 |     "collapsed": true
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "#quantifiers\n",
214 |     "#\n",
215 |     "'+'   = 1 or more\n",
216 |     "'?' =  0 or 1\n",
217 |     "'*' =  0 or more\n",
218 |     "'{n,m}'  = n to m repetitions {,3}, {3,}\n",
219 |     "\n"
220 |    ]
221 |   },
222 |   {
223 |    "cell_type": "code",
224 |    "execution_count": 40,
225 |    "metadata": {
226 |     "collapsed": false
227 |    },
228 |    "outputs": [
229 |     {
230 |      "data": {
231 |       "text/plain": [
232 |        "<_sre.SRE_Match object; span=(0, 2), match='ab'>"
233 |       ]
234 |      },
235 |      "execution_count": 40,
236 |      "metadata": {},
237 |      "output_type": "execute_result"
238 |     }
239 |    ],
240 |    "source": [
241 |     "re.search(r'\\w\\w',\"abcdefnc abcd\" )"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 41,
247 |    "metadata": {
248 |     "collapsed": false
249 |    },
250 |    "outputs": [
251 |     {
252 |      "data": {
253 |       "text/plain": [
254 |        "'abcdefnc'"
255 |       ]
256 |      },
257 |      "execution_count": 41,
258 |      "metadata": {},
259 |      "output_type": "execute_result"
260 |     }
261 |    ],
262 |    "source": [
263 |     "re.search(r'\\w+',\"abcdefnc abcd\" ).group()  #don't know the number of letters"
264 |    ]
265 |   },
266 |   {
267 |    "cell_type": "code",
268 |    "execution_count": null,
269 |    "metadata": {
270 |     "collapsed": true
271 |    },
272 |    "outputs": [],
273 |    "source": [
274 |     "\\w\\w\\w\\w\\w\\w\\w\\w\\w"
275 |    ]
276 |   },
277 |   {
278 |    "cell_type": "code",
279 |    "execution_count": 42,
280 |    "metadata": {
281 |     "collapsed": false
282 |    },
283 |    "outputs": [
284 |     {
285 |      "data": {
286 |       "text/plain": [
287 |        "'abcdefnc abcd'"
288 |       ]
289 |      },
290 |      "execution_count": 42,
291 |      "metadata": {},
292 |      "output_type": "execute_result"
293 |     }
294 |    ],
295 |    "source": [
296 |     "re.search(r'\\w+\\W+\\w+',\"abcdefnc abcd\").group()"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "code",
301 |    "execution_count": 69,
302 |    "metadata": {
303 |     "collapsed": false
304 |    },
305 |    "outputs": [
306 |     {
307 |      "data": {
308 |       "text/plain": [
309 |        "'abcdefnc       abcd'"
310 |       ]
311 |      },
312 |      "execution_count": 69,
313 |      "metadata": {},
314 |      "output_type": "execute_result"
315 |     }
316 |    ],
317 |    "source": [
318 |     "re.search('\\w+\\W+\\w+',\"abcdefnc       abcd\").group()  #added spaces"
319 |    ]
320 |   },
321 |   {
322 |    "cell_type": "code",
323 |    "execution_count": 44,
324 |    "metadata": {
325 |     "collapsed": false
326 |    },
327 |    "outputs": [
328 |     {
329 |      "data": {
330 |       "text/plain": [
331 |        "'abcdefnabcd'"
332 |       ]
333 |      },
334 |      "execution_count": 44,
335 |      "metadata": {},
336 |      "output_type": "execute_result"
337 |     }
338 |    ],
339 |    "source": [
340 |     "re.search(r'\\w+\\W?\\w+',\"abcdefnabcd\").group()  # ? = 0 or 1 instances"
341 |    ]
342 |   },
343 |   {
344 |    "cell_type": "code",
345 |    "execution_count": 45,
346 |    "metadata": {
347 |     "collapsed": false
348 |    },
349 |    "outputs": [
350 |     {
351 |      "data": {
352 |       "text/plain": [
353 |        "'abcde fnabcd'"
354 |       ]
355 |      },
356 |      "execution_count": 45,
357 |      "metadata": {},
358 |      "output_type": "execute_result"
359 |     }
360 |    ],
361 |    "source": [
362 |     "re.search(r'\\w+\\W?\\w+',\"abcde fnabcd\").group()"
363 |    ]
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": 47,
368 |    "metadata": {
369 |     "collapsed": false
370 |    },
371 |    "outputs": [],
372 |    "source": [
373 |     "re.search(r'\\w+\\W+\\w+', \"abcdefnabcd\")"
374 |    ]
375 |   },
376 |   {
377 |    "cell_type": "code",
378 |    "execution_count": null,
379 |    "metadata": {
380 |     "collapsed": true
381 |    },
382 |    "outputs": [],
383 |    "source": [
384 |     "#Pulling out specific amounts"
385 |    ]
386 |   },
387 |   {
388 |    "cell_type": "code",
389 |    "execution_count": 48,
390 |    "metadata": {
391 |     "collapsed": false
392 |    },
393 |    "outputs": [
394 |     {
395 |      "data": {
396 |       "text/plain": [
397 |        "<_sre.SRE_Match object; span=(0, 3), match='aaa'>"
398 |       ]
399 |      },
400 |      "execution_count": 48,
401 |      "metadata": {},
402 |      "output_type": "execute_result"
403 |     }
404 |    ],
405 |    "source": [
406 |     "re.search(r'\\w{3}', 'aaaaaaaaaaa')   #only 3 \\w characters"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 49,
412 |    "metadata": {
413 |     "collapsed": false
414 |    },
415 |    "outputs": [
416 |     {
417 |      "data": {
418 |       "text/plain": [
419 |        "'aaaa'"
420 |       ]
421 |      },
422 |      "execution_count": 49,
423 |      "metadata": {},
424 |      "output_type": "execute_result"
425 |     }
426 |    ],
427 |    "source": [
428 |     "re.search(r'\\w{1,4}', 'aaaaaaaaaaa').group()   #1 is min, 4 is max"
429 |    ]
430 |   },
431 |   {
432 |    "cell_type": "code",
433 |    "execution_count": 50,
434 |    "metadata": {
435 |     "collapsed": false
436 |    },
437 |    "outputs": [
438 |     {
439 |      "data": {
440 |       "text/plain": [
441 |        "'abcdefnc abcd'"
442 |       ]
443 |      },
444 |      "execution_count": 50,
445 |      "metadata": {},
446 |      "output_type": "execute_result"
447 |     }
448 |    ],
449 |    "source": [
450 |     " \n",
451 |     "re.search(r'\\w{1,10}\\W{0,4}\\w+',\"abcdefnc abcd\").group()#1-10 \\w characters,\n",
452 |     "                                                        #0-4  \\W characters\n",
453 |     "                                                        # 1+ \\w characters"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 51,
459 |    "metadata": {
460 |     "collapsed": false
461 |    },
462 |    "outputs": [
463 |     {
464 |      "data": {
465 |       "text/plain": [
466 |        "'abcdefnc abcd'"
467 |       ]
468 |      },
469 |      "execution_count": 51,
470 |      "metadata": {},
471 |      "output_type": "execute_result"
472 |     }
473 |    ],
474 |    "source": [
475 |     "re.search(r'\\w{1,}\\W{0,}\\w+',\"abcdefnc abcd\").group() #at least 1\n",
476 |     "                                                                #at least 0"
477 |    ]
478 |   },
479 |   {
480 |    "cell_type": "code",
481 |    "execution_count": null,
482 |    "metadata": {
483 |     "collapsed": true
484 |    },
485 |    "outputs": [],
486 |    "source": []
487 |   }
488 |  ],
489 |  "metadata": {
490 |   "kernelspec": {
491 |    "display_name": "Python 3",
492 |    "language": "python",
493 |    "name": "python3"
494 |   },
495 |   "language_info": {
496 |    "codemirror_mode": {
497 |     "name": "ipython",
498 |     "version": 3
499 |    },
500 |    "file_extension": ".py",
501 |    "mimetype": "text/x-python",
502 |    "name": "python",
503 |    "nbconvert_exporter": "python",
504 |    "pygments_lexer": "ipython3",
505 |    "version": "3.6.0"
506 |   }
507 |  },
508 |  "nbformat": 4,
509 |  "nbformat_minor": 1
510 | }
511 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 4.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Other types of character sets"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": null,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "\n",
 19 |     "'\\d'   =  matches digits [0-9]\n",
 20 |     "'\\D'   = matches any non-digit character; ~\\d"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "string = '23abced++'\n",
 32 |     "re.search('\\d+', string).group()"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": null,
 38 |    "metadata": {
 39 |     "collapsed": true
 40 |    },
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "'\\s'  = matches any whitespace character   #new lines, tabs, spaces etc\n",
 44 |     "'\\S' = matches any non-whitespace character #~\\s"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "cell_type": "code",
 49 |    "execution_count": 53,
 50 |    "metadata": {
 51 |     "collapsed": false
 52 |    },
 53 |    "outputs": [
 54 |     {
 55 |      "data": {
 56 |       "text/plain": [
 57 |        "'23abced++'"
 58 |       ]
 59 |      },
 60 |      "execution_count": 53,
 61 |      "metadata": {},
 62 |      "output_type": "execute_result"
 63 |     }
 64 |    ],
 65 |    "source": [
 66 |     "string = '23abced++'\n",
 67 |     "re.search('\\S+', string).group()  #no spaces"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 54,
 73 |    "metadata": {
 74 |     "collapsed": true
 75 |    },
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "string = '''Robots are branching out. A new prototype soft robot takes inspiration from plants by growing to explore its environment.\n",
 79 |     "\n",
 80 |     "Vines and some fungi extend from their tips to explore their surroundings. \n",
 81 |     "Elliot Hawkes of the University of California in Santa Barbara \n",
 82 |     "and his colleagues designed a bot that works \n",
 83 |     "on similar principles. Its mechanical body \n",
 84 |     "sits inside a plastic tube reel that extends \n",
 85 |     "through pressurized inflation, a method that some \n",
 86 |     "invertebrates like peanut worms (Sipunculus nudus)\n",
 87 |     "also use to extend their appendages. The plastic \n",
 88 |     "tubing has two compartments, and inflating one \n",
 89 |     "side or the other changes the extension direction. \n",
 90 |     "A camera sensor at the tip alerts the bot when it’s \n",
 91 |     "about to run into something.\n",
 92 |     "\n",
 93 |     "In the lab, Hawkes and his colleagues \n",
 94 |     "programmed the robot to form 3-D structures such \n",
 95 |     "as a radio antenna, turn off a valve, navigate a maze, \n",
 96 |     "swim through glue, act as a fire extinguisher, squeeze \n",
 97 |     "through tight gaps, shimmy through fly paper and slither \n",
 98 |     "across a bed of nails. The soft bot can extend up to \n",
 99 |     "72 meters, and unlike plants, it can grow at a speed of \n",
100 |     "10 meters per second, the team reports July 19 in Science Robotics. \n",
101 |     "The design could serve as a model for building robots \n",
102 |     "that can traverse constrained environments\n",
103 |     "\n",
104 |     "This isn’t the first robot to take \n",
105 |     "inspiration from plants. One plantlike \n",
106 |     "predecessor was a robot modeled on roots.'''"
107 |    ]
108 |   },
109 |   {
110 |    "cell_type": "code",
111 |    "execution_count": null,
112 |    "metadata": {
113 |     "collapsed": true
114 |    },
115 |    "outputs": [],
116 |    "source": [
117 |     "(re.findall('\\S+', string) )"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": null,
123 |    "metadata": {
124 |     "collapsed": true
125 |    },
126 |    "outputs": [],
127 |    "source": [
128 |     "' '.join(re.findall('\\S+', string))"
129 |    ]
130 |   },
131 |   {
132 |    "cell_type": "code",
133 |    "execution_count": null,
134 |    "metadata": {
135 |     "collapsed": true
136 |    },
137 |    "outputs": [],
138 |    "source": []
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": null,
143 |    "metadata": {
144 |     "collapsed": true
145 |    },
146 |    "outputs": [],
147 |    "source": [
148 |     ". the dot matches any character except the newline."
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": null,
154 |    "metadata": {
155 |     "collapsed": true
156 |    },
157 |    "outputs": [],
158 |    "source": [
159 |     "string = '''Robots are branching out. A new prototype soft robot takes inspiration from plants by growing to explore its environment.\n",
160 |     "\n",
161 |     "Vines and some fungi extend from their tips to explore their surroundings. Elliot Hawkes of the University of California in Santa Barbara and his colleagues designed a bot that works on similar principles. Its mechanical body sits inside a plastic tube reel that extends through pressurized inflation, a method that some invertebrates like peanut worms (Sipunculus nudus) also use to extend their appendages. The plastic tubing has two compartments, and inflating one side or the other changes the extension direction. A camera sensor at the tip alerts the bot when it’s about to run into something.\n",
162 |     "\n",
163 |     "In the lab, Hawkes and his colleagues programmed the robot to form 3-D structures such as a radio antenna, turn off a valve, navigate a maze, swim through glue, act as a fire extinguisher, squeeze through tight gaps, shimmy through fly paper and slither across a bed of nails. The soft bot can extend up to 72 meters, and unlike plants, it can grow at a speed of 10 meters per second, the team reports July 19 in Science Robotics. The design could serve as a model for building robots that can traverse constrained environments\n",
164 |     "\n",
165 |     "This isn’t the first robot to take inspiration from plants. One plantlike predecessor was a robot modeled on roots.'''"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": 60,
171 |    "metadata": {
172 |     "collapsed": false
173 |    },
174 |    "outputs": [
175 |     {
176 |      "data": {
177 |       "text/plain": [
178 |        "'Robots are branching out. A new prototype soft robot takes inspiration from plants by growing to explore its environment.'"
179 |       ]
180 |      },
181 |      "execution_count": 60,
182 |      "metadata": {},
183 |      "output_type": "execute_result"
184 |     }
185 |    ],
186 |    "source": [
187 |     "re.search('.+', string).group()  #no new line"
188 |    ]
189 |   },
190 |   {
191 |    "cell_type": "code",
192 |    "execution_count": null,
193 |    "metadata": {
194 |     "collapsed": true
195 |    },
196 |    "outputs": [],
197 |    "source": [
198 |     "re.search('.+', string, flags = re.DOTALL).group()"
199 |    ]
200 |   },
201 |   {
202 |    "cell_type": "code",
203 |    "execution_count": null,
204 |    "metadata": {
205 |     "collapsed": true
206 |    },
207 |    "outputs": [],
208 |    "source": []
209 |   },
210 |   {
211 |    "cell_type": "markdown",
212 |    "metadata": {},
213 |    "source": [
214 |     "## Creating your own character sets"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {
221 |     "collapsed": true
222 |    },
223 |    "outputs": [],
224 |    "source": [
225 |     "\n",
226 |     "[A-Z]    '-'  is a metacharacter when used in [] (custom character sets)"
227 |    ]
228 |   },
229 |   {
230 |    "cell_type": "code",
231 |    "execution_count": 63,
232 |    "metadata": {
233 |     "collapsed": true
234 |    },
235 |    "outputs": [],
236 |    "source": [
237 |     "string = 'Hello, There, How, Are, You'"
238 |    ]
239 |   },
240 |   {
241 |    "cell_type": "code",
242 |    "execution_count": 64,
243 |    "metadata": {
244 |     "collapsed": false
245 |    },
246 |    "outputs": [
247 |     {
248 |      "data": {
249 |       "text/plain": [
250 |        "['H', 'T', 'H', 'A', 'Y']"
251 |       ]
252 |      },
253 |      "execution_count": 64,
254 |      "metadata": {},
255 |      "output_type": "execute_result"
256 |     }
257 |    ],
258 |    "source": [
259 |     "re.findall('[A-Z]', string)  #pulls out all capital letters"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 65,
265 |    "metadata": {
266 |     "collapsed": false
267 |    },
268 |    "outputs": [
269 |     {
270 |      "data": {
271 |       "text/plain": [
272 |        "['H', ',', 'T', ',', 'H', ',', 'A', ',', 'Y']"
273 |       ]
274 |      },
275 |      "execution_count": 65,
276 |      "metadata": {},
277 |      "output_type": "execute_result"
278 |     }
279 |    ],
280 |    "source": [
281 |     "re.findall('[A-Z,]', string)  #here we search for any capital letters\n",
282 |     "                                #or a comma"
283 |    ]
284 |   },
285 |   {
286 |    "cell_type": "code",
287 |    "execution_count": 67,
288 |    "metadata": {
289 |     "collapsed": false
290 |    },
291 |    "outputs": [
292 |     {
293 |      "data": {
294 |       "text/plain": [
295 |        "['H', ',', 'T', ',', 'H', ',', 'A', ',', 'Y', '.', '.', '.']"
296 |       ]
297 |      },
298 |      "execution_count": 67,
299 |      "metadata": {},
300 |      "output_type": "execute_result"
301 |     }
302 |    ],
303 |    "source": [
304 |     "string = 'Hello, There, How, Are, You...'\n",
305 |     "re.findall('[A-Z,.]', string)"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": null,
311 |    "metadata": {
312 |     "collapsed": true
313 |    },
314 |    "outputs": [],
315 |    "source": [
316 |     "string = 'Hello, There, How, Are, You...'\n",
317 |     "re.findall('[A-Za-z,\\s.]', string)"
318 |    ]
319 |   },
320 |   {
321 |    "cell_type": "code",
322 |    "execution_count": null,
323 |    "metadata": {
324 |     "collapsed": true
325 |    },
326 |    "outputs": [],
327 |    "source": []
328 |   },
329 |   {
330 |    "cell_type": "code",
331 |    "execution_count": null,
332 |    "metadata": {
333 |     "collapsed": true
334 |    },
335 |    "outputs": [],
336 |    "source": []
337 |   },
338 |   {
339 |    "cell_type": "code",
340 |    "execution_count": null,
341 |    "metadata": {
342 |     "collapsed": true
343 |    },
344 |    "outputs": [],
345 |    "source": []
346 |   },
347 |   {
348 |    "cell_type": "code",
349 |    "execution_count": null,
350 |    "metadata": {
351 |     "collapsed": true
352 |    },
353 |    "outputs": [],
354 |    "source": []
355 |   },
356 |   {
357 |    "cell_type": "code",
358 |    "execution_count": null,
359 |    "metadata": {
360 |     "collapsed": true
361 |    },
362 |    "outputs": [],
363 |    "source": []
364 |   },
365 |   {
366 |    "cell_type": "code",
367 |    "execution_count": null,
368 |    "metadata": {
369 |     "collapsed": true
370 |    },
371 |    "outputs": [],
372 |    "source": []
373 |   }
374 |  ],
375 |  "metadata": {
376 |   "kernelspec": {
377 |    "display_name": "Python 3",
378 |    "language": "python",
379 |    "name": "python3"
380 |   },
381 |   "language_info": {
382 |    "codemirror_mode": {
383 |     "name": "ipython",
384 |     "version": 3
385 |    },
386 |    "file_extension": ".py",
387 |    "mimetype": "text/x-python",
388 |    "name": "python",
389 |    "nbconvert_exporter": "python",
390 |    "pygments_lexer": "ipython3",
391 |    "version": "3.6.0"
392 |   }
393 |  },
394 |  "nbformat": 4,
395 |  "nbformat_minor": 1
396 | }
397 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 5 + 6.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "## Quantifiers with custom sets"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {
 16 |     "collapsed": true
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "import re"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": null,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "+\n",
 32 |     "?\n",
 33 |     "*\n",
 34 |     "{}"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": null,
 40 |    "metadata": {
 41 |     "collapsed": false
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "string = 'HELLO, There, How, Are, You...'"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {
 52 |     "collapsed": false
 53 |    },
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "re.search('[A-Z]+', string)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": null,
 62 |    "metadata": {
 63 |     "collapsed": false
 64 |    },
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "re.findall('[A-Z]+', string)"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": null,
 73 |    "metadata": {
 74 |     "collapsed": false
 75 |    },
 76 |    "outputs": [],
 77 |    "source": [
 78 |     "re.findall('[A-Z]{2,}', string)   # 2 or more"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": null,
 84 |    "metadata": {
 85 |     "collapsed": false
 86 |    },
 87 |    "outputs": [],
 88 |    "source": [
 89 |     "string"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": null,
 95 |    "metadata": {
 96 |     "collapsed": false
 97 |    },
 98 |    "outputs": [],
 99 |    "source": [
100 |     "re.search('[A-Za-z\\s,]+', string).group() # one or more\n",
101 |     "                                          # of  4 types of characters"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "code",
106 |    "execution_count": null,
107 |    "metadata": {
108 |     "collapsed": false
109 |    },
110 |    "outputs": [],
111 |    "source": [
112 |     "string"
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {
119 |     "collapsed": false
120 |    },
121 |    "outputs": [],
122 |    "source": [
123 |     "re.findall('[A-Z]?[a-z\\s,]+', string)"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": null,
129 |    "metadata": {
130 |     "collapsed": false
131 |    },
132 |    "outputs": [],
133 |    "source": [
134 |     "re.search('[^A-Za-z\\s,]+', string).group()  # ^ is a metacharacter within\n",
135 |     "                                           #brackets"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": null,
141 |    "metadata": {
142 |     "collapsed": false
143 |    },
144 |    "outputs": [],
145 |    "source": [
146 |     "re.findall('[^A-Z]+', string)"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "## GROUPS"
154 |    ]
155 |   },
156 |   {
157 |    "cell_type": "code",
158 |    "execution_count": null,
159 |    "metadata": {
160 |     "collapsed": true
161 |    },
162 |    "outputs": [],
163 |    "source": [
164 |     "#groups allow us to pull out sections of a match and store them"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 2,
170 |    "metadata": {
171 |     "collapsed": true
172 |    },
173 |    "outputs": [],
174 |    "source": [
175 |     "#contrived example\n",
176 |     "import re\n",
177 |     "string = 'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'\n"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 3,
183 |    "metadata": {
184 |     "collapsed": false
185 |    },
186 |    "outputs": [
187 |     {
188 |      "data": {
189 |       "text/plain": [
190 |        "['John has 6 cats', 'Susan has 3 dogs', 'Mike has 8 fishes']"
191 |       ]
192 |      },
193 |      "execution_count": 3,
194 |      "metadata": {},
195 |      "output_type": "execute_result"
196 |     }
197 |    ],
198 |    "source": [
199 |     "re.findall('[A-Za-z]+ \\w+ \\d+ \\w+', string)"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": null,
205 |    "metadata": {
206 |     "collapsed": true
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "#the use of parentheses denotes a group\n",
211 |     "()  = metacharacter"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 4,
217 |    "metadata": {
218 |     "collapsed": false
219 |    },
220 |    "outputs": [
221 |     {
222 |      "data": {
223 |       "text/plain": [
224 |        "['John', 'Susan', 'Mike']"
225 |       ]
226 |      },
227 |      "execution_count": 4,
228 |      "metadata": {},
229 |      "output_type": "execute_result"
230 |     }
231 |    ],
232 |    "source": [
233 |     "re.findall('([A-Za-z]+) \\w+ \\d+ \\w+', string) #to pull out just the names"
234 |    ]
235 |   },
236 |   {
237 |    "cell_type": "code",
238 |    "execution_count": 5,
239 |    "metadata": {
240 |     "collapsed": false
241 |    },
242 |    "outputs": [
243 |     {
244 |      "data": {
245 |       "text/plain": [
246 |        "['cats', 'dogs', 'fishes']"
247 |       ]
248 |      },
249 |      "execution_count": 5,
250 |      "metadata": {},
251 |      "output_type": "execute_result"
252 |     }
253 |    ],
254 |    "source": [
255 |     "re.findall('[A-Za-z]+ \\w+ \\d+ (\\w+)', string) #pull out animals"
256 |    ]
257 |   },
258 |   {
259 |    "cell_type": "code",
260 |    "execution_count": 6,
261 |    "metadata": {
262 |     "collapsed": false
263 |    },
264 |    "outputs": [
265 |     {
266 |      "data": {
267 |       "text/plain": [
268 |        "[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]"
269 |       ]
270 |      },
271 |      "execution_count": 6,
272 |      "metadata": {},
273 |      "output_type": "execute_result"
274 |     }
275 |    ],
276 |    "source": [
277 |     "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)  #\n",
278 |     "\n",
279 |     "#use original string to make sure matching is correct, \n",
280 |     "#then use groups to pull out the info you want"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": null,
286 |    "metadata": {
287 |     "collapsed": true
288 |    },
289 |    "outputs": [],
290 |    "source": []
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": 7,
295 |    "metadata": {
296 |     "collapsed": true
297 |    },
298 |    "outputs": [],
299 |    "source": [
300 |     "#organize the data by data-types\n",
301 |     "info = re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)\n",
302 |     "\n"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "code",
307 |    "execution_count": 8,
308 |    "metadata": {
309 |     "collapsed": false
310 |    },
311 |    "outputs": [
312 |     {
313 |      "data": {
314 |       "text/plain": [
315 |        "[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]"
316 |       ]
317 |      },
318 |      "execution_count": 8,
319 |      "metadata": {},
320 |      "output_type": "execute_result"
321 |     }
322 |    ],
323 |    "source": [
324 |     "info"
325 |    ]
326 |   },
327 |   {
328 |    "cell_type": "code",
329 |    "execution_count": 9,
330 |    "metadata": {
331 |     "collapsed": false
332 |    },
333 |    "outputs": [
334 |     {
335 |      "data": {
336 |       "text/plain": [
337 |        "[('John', 'Susan', 'Mike'), ('6', '3', '8'), ('cats', 'dogs', 'fishes')]"
338 |       ]
339 |      },
340 |      "execution_count": 9,
341 |      "metadata": {},
342 |      "output_type": "execute_result"
343 |     }
344 |    ],
345 |    "source": [
346 |     "list(zip(*info))   #organize your data by categories"
347 |    ]
348 |   },
349 |   {
350 |    "cell_type": "code",
351 |    "execution_count": 10,
352 |    "metadata": {
353 |     "collapsed": false
354 |    },
355 |    "outputs": [],
356 |    "source": [
357 |     "match =re.search('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string) #pulls out three groups"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 11,
363 |    "metadata": {
364 |     "collapsed": false
365 |    },
366 |    "outputs": [
367 |     {
368 |      "data": {
369 |       "text/plain": [
370 |        "<_sre.SRE_Match object; span=(0, 15), match='John has 6 cats'>"
371 |       ]
372 |      },
373 |      "execution_count": 11,
374 |      "metadata": {},
375 |      "output_type": "execute_result"
376 |     }
377 |    ],
378 |    "source": [
379 |     "match"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "code",
384 |    "execution_count": 19,
385 |    "metadata": {
386 |     "collapsed": false
387 |    },
388 |    "outputs": [
389 |     {
390 |      "data": {
391 |       "text/plain": [
392 |        "'John has 6 cats but I think my friend Susan has 3 dogs and Mike has 8 fishes'"
393 |       ]
394 |      },
395 |      "execution_count": 19,
396 |      "metadata": {},
397 |      "output_type": "execute_result"
398 |     }
399 |    ],
400 |    "source": [
401 |     "string"
402 |    ]
403 |   },
404 |   {
405 |    "cell_type": "code",
406 |    "execution_count": 12,
407 |    "metadata": {
408 |     "collapsed": false
409 |    },
410 |    "outputs": [
411 |     {
412 |      "data": {
413 |       "text/plain": [
414 |        "'John has 6 cats'"
415 |       ]
416 |      },
417 |      "execution_count": 12,
418 |      "metadata": {},
419 |      "output_type": "execute_result"
420 |     }
421 |    ],
422 |    "source": [
423 |     "match.group(0)"
424 |    ]
425 |   },
426 |   {
427 |    "cell_type": "code",
428 |    "execution_count": 13,
429 |    "metadata": {
430 |     "collapsed": false
431 |    },
432 |    "outputs": [
433 |     {
434 |      "data": {
435 |       "text/plain": [
436 |        "('John', '6', 'cats')"
437 |       ]
438 |      },
439 |      "execution_count": 13,
440 |      "metadata": {},
441 |      "output_type": "execute_result"
442 |     }
443 |    ],
444 |    "source": [
445 |     "match.groups()"
446 |    ]
447 |   },
448 |   {
449 |    "cell_type": "code",
450 |    "execution_count": 14,
451 |    "metadata": {
452 |     "collapsed": false
453 |    },
454 |    "outputs": [
455 |     {
456 |      "data": {
457 |       "text/plain": [
458 |        "'John'"
459 |       ]
460 |      },
461 |      "execution_count": 14,
462 |      "metadata": {},
463 |      "output_type": "execute_result"
464 |     }
465 |    ],
466 |    "source": [
467 |     "match.group(1)"
468 |    ]
469 |   },
470 |   {
471 |    "cell_type": "code",
472 |    "execution_count": 15,
473 |    "metadata": {
474 |     "collapsed": false
475 |    },
476 |    "outputs": [
477 |     {
478 |      "data": {
479 |       "text/plain": [
480 |        "'6'"
481 |       ]
482 |      },
483 |      "execution_count": 15,
484 |      "metadata": {},
485 |      "output_type": "execute_result"
486 |     }
487 |    ],
488 |    "source": [
489 |     "match.group(2)"
490 |    ]
491 |   },
492 |   {
493 |    "cell_type": "code",
494 |    "execution_count": 16,
495 |    "metadata": {
496 |     "collapsed": false
497 |    },
498 |    "outputs": [
499 |     {
500 |      "data": {
501 |       "text/plain": [
502 |        "('John', 'cats')"
503 |       ]
504 |      },
505 |      "execution_count": 16,
506 |      "metadata": {},
507 |      "output_type": "execute_result"
508 |     }
509 |    ],
510 |    "source": [
511 |     "match.group(1,3)  #multiple groups"
512 |    ]
513 |   },
514 |   {
515 |    "cell_type": "code",
516 |    "execution_count": 17,
517 |    "metadata": {
518 |     "collapsed": false
519 |    },
520 |    "outputs": [
521 |     {
522 |      "data": {
523 |       "text/plain": [
524 |        "('cats', '6', 'John', 'John')"
525 |       ]
526 |      },
527 |      "execution_count": 17,
528 |      "metadata": {},
529 |      "output_type": "execute_result"
530 |     }
531 |    ],
532 |    "source": [
533 |     "match.group(3,2,1,1)  #change the order"
534 |    ]
535 |   },
536 |   {
537 |    "cell_type": "code",
538 |    "execution_count": 18,
539 |    "metadata": {
540 |     "collapsed": false
541 |    },
542 |    "outputs": [
543 |     {
544 |      "data": {
545 |       "text/plain": [
546 |        "(0, 15)"
547 |       ]
548 |      },
549 |      "execution_count": 18,
550 |      "metadata": {},
551 |      "output_type": "execute_result"
552 |     }
553 |    ],
554 |    "source": [
555 |     "match.span()"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": 20,
561 |    "metadata": {
562 |     "collapsed": false
563 |    },
564 |    "outputs": [
565 |     {
566 |      "data": {
567 |       "text/plain": [
568 |        "(9, 10)"
569 |       ]
570 |      },
571 |      "execution_count": 20,
572 |      "metadata": {},
573 |      "output_type": "execute_result"
574 |     }
575 |    ],
576 |    "source": [
577 |     "match.span(2)"
578 |    ]
579 |   },
580 |   {
581 |    "cell_type": "code",
582 |    "execution_count": 21,
583 |    "metadata": {
584 |     "collapsed": false
585 |    },
586 |    "outputs": [
587 |     {
588 |      "data": {
589 |       "text/plain": [
590 |        "(11, 15)"
591 |       ]
592 |      },
593 |      "execution_count": 21,
594 |      "metadata": {},
595 |      "output_type": "execute_result"
596 |     }
597 |    ],
598 |    "source": [
599 |     "match.span(3)"
600 |    ]
601 |   },
602 |   {
603 |    "cell_type": "code",
604 |    "execution_count": null,
605 |    "metadata": {
606 |     "collapsed": true
607 |    },
608 |    "outputs": [],
609 |    "source": [
610 |     "match.start(3)"
611 |    ]
612 |   },
613 |   {
614 |    "cell_type": "code",
615 |    "execution_count": null,
616 |    "metadata": {
617 |     "collapsed": true
618 |    },
619 |    "outputs": [],
620 |    "source": [
621 |     "#findall returns a list, which has no group method\n",
622 |     "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string).group(1)"
623 |    ]
624 |   },
625 |   {
626 |    "cell_type": "code",
627 |    "execution_count": 25,
628 |    "metadata": {
629 |     "collapsed": false
630 |    },
631 |    "outputs": [
632 |     {
633 |      "data": {
634 |       "text/plain": [
635 |        "('John', '6', 'cats')"
636 |       ]
637 |      },
638 |      "execution_count": 25,
639 |      "metadata": {},
640 |      "output_type": "execute_result"
641 |     }
642 |    ],
643 |    "source": [
644 |     "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)[0]"
645 |    ]
646 |   },
647 |   {
648 |    "cell_type": "code",
649 |    "execution_count": null,
650 |    "metadata": {
651 |     "collapsed": true
652 |    },
653 |    "outputs": [],
654 |    "source": [
655 |     "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)[0].group(1)"
656 |    ]
657 |   },
658 |   {
659 |    "cell_type": "code",
660 |    "execution_count": 29,
661 |    "metadata": {
662 |     "collapsed": false
663 |    },
664 |    "outputs": [
665 |     {
666 |      "data": {
667 |       "text/plain": [
668 |        "[('John', '6', 'cats'), ('Susan', '3', 'dogs'), ('Mike', '8', 'fishes')]"
669 |       ]
670 |      },
671 |      "execution_count": 29,
672 |      "metadata": {},
673 |      "output_type": "execute_result"
674 |     }
675 |    ],
676 |    "source": [
677 |     "re.findall('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)"
678 |    ]
679 |   },
680 |   {
681 |    "cell_type": "code",
682 |    "execution_count": 30,
683 |    "metadata": {
684 |     "collapsed": true
685 |    },
686 |    "outputs": [],
687 |    "source": [
688 |     "data =re.findall('(([A-Za-z]+) \\w+ (\\d+) (\\w+))', string)"
689 |    ]
690 |   },
691 |   {
692 |    "cell_type": "code",
693 |    "execution_count": 31,
694 |    "metadata": {
695 |     "collapsed": false
696 |    },
697 |    "outputs": [
698 |     {
699 |      "data": {
700 |       "text/plain": [
701 |        "[('John has 6 cats', 'John', '6', 'cats'),\n",
702 |        " ('Susan has 3 dogs', 'Susan', '3', 'dogs'),\n",
703 |        " ('Mike has 8 fishes', 'Mike', '8', 'fishes')]"
704 |       ]
705 |      },
706 |      "execution_count": 31,
707 |      "metadata": {},
708 |      "output_type": "execute_result"
709 |     }
710 |    ],
711 |    "source": [
712 |     "data"
713 |    ]
714 |   },
715 |   {
716 |    "cell_type": "code",
717 |    "execution_count": 33,
718 |    "metadata": {
719 |     "collapsed": false
720 |    },
721 |    "outputs": [
722 |     {
723 |      "name": "stdout",
724 |      "output_type": "stream",
725 |      "text": [
726 |       "cats\n",
727 |       "dogs\n",
728 |       "fishes\n"
729 |      ]
730 |     }
731 |    ],
732 |    "source": [
733 |     "for i in data:\n",
734 |     "    print(i[3])"
735 |    ]
736 |   },
737 |   {
738 |    "cell_type": "code",
739 |    "execution_count": null,
740 |    "metadata": {
741 |     "collapsed": true
742 |    },
743 |    "outputs": [],
744 |    "source": [
745 |     "#we can use iteration"
746 |    ]
747 |   },
748 |   {
749 |    "cell_type": "code",
750 |    "execution_count": 50,
751 |    "metadata": {
752 |     "collapsed": false
753 |    },
754 |    "outputs": [],
755 |    "source": [
756 |     "it = re.finditer('([A-Za-z]+) \\w+ (\\d+) (\\w+)', string)"
757 |    ]
758 |   },
759 |   {
760 |    "cell_type": "code",
761 |    "execution_count": 42,
762 |    "metadata": {
763 |     "collapsed": false
764 |    },
765 |    "outputs": [
766 |     {
767 |      "data": {
768 |       "text/plain": [
769 |        "('Mike', '8', 'fishes')"
770 |       ]
771 |      },
772 |      "execution_count": 42,
773 |      "metadata": {},
774 |      "output_type": "execute_result"
775 |     }
776 |    ],
777 |    "source": [
778 |     "next(it).groups()"
779 |    ]
780 |   },
781 |   {
782 |    "cell_type": "code",
783 |    "execution_count": null,
784 |    "metadata": {
785 |     "collapsed": true
786 |    },
787 |    "outputs": [],
788 |    "source": [
789 |     "for element in it:\n",
790 |     "    print (element.group(1,3, 2))   # don't forget iterators exhaust"
791 |    ]
792 |   },
793 |   {
794 |    "cell_type": "code",
795 |    "execution_count": 46,
796 |    "metadata": {
797 |     "collapsed": false
798 |    },
799 |    "outputs": [
800 |     {
801 |      "name": "stdout",
802 |      "output_type": "stream",
803 |      "text": [
804 |       "John has 6 cats\n",
805 |       "Susan has 3 dogs\n",
806 |       "Mike has 8 fishes\n"
807 |      ]
808 |     }
809 |    ],
810 |    "source": [
811 |     "for element in it:\n",
812 |     "    print(element.group())"
813 |    ]
814 |   },
815 |   {
816 |    "cell_type": "code",
817 |    "execution_count": null,
818 |    "metadata": {
819 |     "collapsed": true
820 |    },
821 |    "outputs": [],
822 |    "source": [
823 |     "for element in it:\n",
824 |     "    print(element.groups())"
825 |    ]
826 |   }
827 |  ],
828 |  "metadata": {
829 |   "kernelspec": {
830 |    "display_name": "Python 3",
831 |    "language": "python",
832 |    "name": "python3"
833 |   },
834 |   "language_info": {
835 |    "codemirror_mode": {
836 |     "name": "ipython",
837 |     "version": 3
838 |    },
839 |    "file_extension": ".py",
840 |    "mimetype": "text/x-python",
841 |    "name": "python",
842 |    "nbconvert_exporter": "python",
843 |    "pygments_lexer": "ipython3",
844 |    "version": "3.6.0"
845 |   }
846 |  },
847 |  "nbformat": 4,
848 |  "nbformat_minor": 1
849 | }
850 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 7 .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "## Naming Groups"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "import re"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 2,
 24 |    "metadata": {
 25 |     "collapsed": true
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "string = 'New York, New York 11369'"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {
 36 |     "collapsed": false
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "#([A-Za-z\\s]+)\n",
 41 |     "#([A-Za-z\\s]+)\n",
 42 |     "#(\\d+)"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "code",
 47 |    "execution_count": 3,
 48 |    "metadata": {
 49 |     "collapsed": true
 50 |    },
 51 |    "outputs": [],
 52 |    "source": [
 53 |     "match =re.search('([A-Za-z\\s]+),([A-Za-z\\s]+)(\\d+)', string)"
 54 |    ]
 55 |   },
 56 |   {
 57 |    "cell_type": "code",
 58 |    "execution_count": 4,
 59 |    "metadata": {
 60 |     "collapsed": false
 61 |    },
 62 |    "outputs": [
 63 |     {
 64 |      "data": {
 65 |       "text/plain": [
 66 |        "('New York', ' New York ', '11369', 'New York, New York 11369')"
 67 |       ]
 68 |      },
 69 |      "execution_count": 4,
 70 |      "metadata": {},
 71 |      "output_type": "execute_result"
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "match.group(1), match.group(2), match.group(3), match.group(0)"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": null,
 81 |    "metadata": {
 82 |     "collapsed": true
 83 |    },
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "#(?P<name>regex)  names a group: the group name goes inside the <>, followed by the RE for the group\n",
 87 |     "\n",
 88 |     "#(?P<City>)      (?P<State>)    (?P<ZipCode>)"
 89 |    ]
 90 |   },
 91 |   {
 92 |    "cell_type": "code",
 93 |    "execution_count": 5,
 94 |    "metadata": {
 95 |     "collapsed": true
 96 |    },
 97 |    "outputs": [],
 98 |    "source": [
 99 |     "pattern = re.compile('(?P<City>[A-Za-z\\s]+),(?P<State>[A-Za-z\\s]+)(?P<ZipCode>\\d+)')"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 6,
105 |    "metadata": {
106 |     "collapsed": false
107 |    },
108 |    "outputs": [],
109 |    "source": [
110 |     "match = re.search(pattern, string)"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 7,
116 |    "metadata": {
117 |     "collapsed": false
118 |    },
119 |    "outputs": [
120 |     {
121 |      "data": {
122 |       "text/plain": [
123 |        "('New York', ' New York ', '11369')"
124 |       ]
125 |      },
126 |      "execution_count": 7,
127 |      "metadata": {},
128 |      "output_type": "execute_result"
129 |     }
130 |    ],
131 |    "source": [
132 |     "match.group('City'), match.group('State'), match.group('ZipCode')"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 8,
138 |    "metadata": {
139 |     "collapsed": false
140 |    },
141 |    "outputs": [
142 |     {
143 |      "data": {
144 |       "text/plain": [
145 |        "'New York'"
146 |       ]
147 |      },
148 |      "execution_count": 8,
149 |      "metadata": {},
150 |      "output_type": "execute_result"
151 |     }
152 |    ],
153 |    "source": [
154 |     "match.group(1)"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "code",
159 |    "execution_count": 11,
160 |    "metadata": {
161 |     "collapsed": false
162 |    },
163 |    "outputs": [
164 |     {
165 |      "data": {
166 |       "text/plain": [
167 |        "('New York', ' New York ', '11369')"
168 |       ]
169 |      },
170 |      "execution_count": 11,
171 |      "metadata": {},
172 |      "output_type": "execute_result"
173 |     }
174 |    ],
175 |    "source": [
176 |     "match.groups()"
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "code",
181 |    "execution_count": 10,
182 |    "metadata": {
183 |     "collapsed": false
184 |    },
185 |    "outputs": [
186 |     {
187 |      "data": {
188 |       "text/plain": [
189 |        "{'City': 'New York', 'State': ' New York ', 'ZipCode': '11369'}"
190 |       ]
191 |      },
192 |      "execution_count": 10,
193 |      "metadata": {},
194 |      "output_type": "execute_result"
195 |     }
196 |    ],
197 |    "source": [
198 |     "#Just in case you forget the names of the groups you used\n",
199 |     "\n",
200 |     "match.groupdict()"
201 |    ]
202 |   },
203 |   {
204 |    "cell_type": "code",
205 |    "execution_count": null,
206 |    "metadata": {
207 |     "collapsed": true
208 |    },
209 |    "outputs": [],
210 |    "source": []
211 |   },
212 |   {
213 |    "cell_type": "code",
214 |    "execution_count": null,
215 |    "metadata": {
216 |     "collapsed": true
217 |    },
218 |    "outputs": [],
219 |    "source": []
220 |   }
221 |  ],
222 |  "metadata": {
223 |   "kernelspec": {
224 |    "display_name": "Python 3",
225 |    "language": "python",
226 |    "name": "python3"
227 |   },
228 |   "language_info": {
229 |    "codemirror_mode": {
230 |     "name": "ipython",
231 |     "version": 3
232 |    },
233 |    "file_extension": ".py",
234 |    "mimetype": "text/x-python",
235 |    "name": "python",
236 |    "nbconvert_exporter": "python",
237 |    "pygments_lexer": "ipython3",
238 |    "version": "3.6.0"
239 |   }
240 |  },
241 |  "nbformat": 4,
242 |  "nbformat_minor": 1
243 | }
244 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 8 .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {
  6 |     "collapsed": true
  7 |    },
  8 |    "source": [
  9 |     "### Quantifiers on groups"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": null,
 15 |    "metadata": {
 16 |     "collapsed": true
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "#Using quantifiers on groups has some nuances, but it is very useful\n"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 1,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import re"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 2,
 37 |    "metadata": {
 38 |     "collapsed": false
 39 |    },
 40 |    "outputs": [
 41 |     {
 42 |      "data": {
 43 |       "text/plain": [
 44 |        "<_sre.SRE_Match object; span=(0, 12), match='abababababab'>"
 45 |       ]
 46 |      },
 47 |      "execution_count": 2,
 48 |      "metadata": {},
 49 |      "output_type": "execute_result"
 50 |     }
 51 |    ],
 52 |    "source": [
 53 |     "\n",
 54 |     "string = 'abababababab'  #ab repeated many times\n",
 55 |     "\n",
 56 |     "re.search('(ab)+', string)  #(ab)+   is many instances of one group repeated"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 3,
 62 |    "metadata": {
 63 |     "collapsed": false
 64 |    },
 65 |    "outputs": [
 66 |     {
 67 |      "data": {
 68 |       "text/plain": [
 69 |        "<_sre.SRE_Match object; span=(0, 12), match='abababababab'>"
 70 |       ]
 71 |      },
 72 |      "execution_count": 3,
 73 |      "metadata": {},
 74 |      "output_type": "execute_result"
 75 |     }
 76 |    ],
 77 |    "source": [
 78 |     "string = 'abababababab'  #ab repeated many times\n",
 79 |     "\n",
 80 |     "re.search('[ab]+', string)  #this is different"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": null,
 86 |    "metadata": {
 87 |     "collapsed": true
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "#difference explained below"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 4,
 97 |    "metadata": {
 98 |     "collapsed": false
 99 |    },
100 |    "outputs": [
101 |     {
102 |      "data": {
103 |       "text/plain": [
104 |        "<_sre.SRE_Match object; span=(0, 6), match='ababab'>"
105 |       ]
106 |      },
107 |      "execution_count": 4,
108 |      "metadata": {},
109 |      "output_type": "execute_result"
110 |     }
111 |    ],
112 |    "source": [
113 |     "string = 'abababbbbbbb'   #only partial fit to our new string\n",
114 |     "re.search('(ab)+', string)"
115 |    ]
116 |   },
117 |   {
118 |    "cell_type": "code",
119 |    "execution_count": 5,
120 |    "metadata": {
121 |     "collapsed": false
122 |    },
123 |    "outputs": [
124 |     {
125 |      "data": {
126 |       "text/plain": [
127 |        "<_sre.SRE_Match object; span=(0, 12), match='abababbbbbbb'>"
128 |       ]
129 |      },
130 |      "execution_count": 5,
131 |      "metadata": {},
132 |      "output_type": "execute_result"
133 |     }
134 |    ],
135 |    "source": [
136 |     "string = 'abababbbbbbb'   #but this pattern fits perfectly\n",
137 |     "re.search('[ab]+', string)"
138 |    ]
139 |   },
140 |   {
141 |    "cell_type": "code",
142 |    "execution_count": 7,
143 |    "metadata": {
144 |     "collapsed": false
145 |    },
146 |    "outputs": [
147 |     {
148 |      "data": {
149 |       "text/plain": [
150 |        "<_sre.SRE_Match object; span=(0, 12), match='abababbbbbbb'>"
151 |       ]
152 |      },
153 |      "execution_count": 7,
154 |      "metadata": {},
155 |      "output_type": "execute_result"
156 |     }
157 |    ],
158 |    "source": [
159 |     "string = 'abababbbbbbb'   #allows flexibility\n",
160 |     "re.search('(ab)+\\w+', string)"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 8,
166 |    "metadata": {
167 |     "collapsed": false
168 |    },
169 |    "outputs": [
170 |     {
171 |      "data": {
172 |       "text/plain": [
173 |        "<_sre.SRE_Match object; span=(0, 11), match='abababsssss'>"
174 |       ]
175 |      },
176 |      "execution_count": 8,
177 |      "metadata": {},
178 |      "output_type": "execute_result"
179 |     }
180 |    ],
181 |    "source": [
182 |     "string = 'abababsssss'   #allows flexibility\n",
183 |     "re.search('(ab)+\\w+', string)"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "markdown",
188 |    "metadata": {
189 |     "collapsed": true
190 |    },
191 |    "source": [
192 |     "### Nuances to be wary of"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": null,
198 |    "metadata": {
199 |     "collapsed": true
200 |    },
201 |    "outputs": [],
202 |    "source": [
203 |     "#only one group not multiple groups"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 9,
209 |    "metadata": {
210 |     "collapsed": false
211 |    },
212 |    "outputs": [
213 |     {
214 |      "data": {
215 |       "text/plain": [
216 |        "'ab'"
217 |       ]
218 |      },
219 |      "execution_count": 9,
220 |      "metadata": {},
221 |      "output_type": "execute_result"
222 |     }
223 |    ],
224 |    "source": [
225 |     "string = 'abababababab' #original string\n",
226 |     "match =re.search('(ab)+', string) \n",
227 |     "\n",
228 |     "match.group(1)# capturing only one group; value is overwritten each time"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {
235 |     "collapsed": true
236 |    },
237 |    "outputs": [],
238 |    "source": [
239 |     "match.group(2) #raises IndexError: no such group (the pattern defines only one group)"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 11,
245 |    "metadata": {
246 |     "collapsed": false
247 |    },
248 |    "outputs": [
249 |     {
250 |      "data": {
251 |       "text/plain": [
252 |        "('ab',)"
253 |       ]
254 |      },
255 |      "execution_count": 11,
256 |      "metadata": {},
257 |      "output_type": "execute_result"
258 |     }
259 |    ],
260 |    "source": [
261 |     "match.groups()  #only one group, group just overwritten"
262 |    ]
263 |   },
264 |   {
265 |    "cell_type": "code",
266 |    "execution_count": 12,
267 |    "metadata": {
268 |     "collapsed": false
269 |    },
270 |    "outputs": [
271 |     {
272 |      "data": {
273 |       "text/plain": [
274 |        "'abababababab'"
275 |       ]
276 |      },
277 |      "execution_count": 12,
278 |      "metadata": {},
279 |      "output_type": "execute_result"
280 |     }
281 |    ],
282 |    "source": [
283 |     "match.group(0) # the full match, not related to groups"
284 |    ]
285 |   },
286 |   {
287 |    "cell_type": "code",
288 |    "execution_count": null,
289 |    "metadata": {
290 |     "collapsed": true
291 |    },
292 |    "outputs": [],
293 |    "source": [
294 |     "#Another simple example with two groups using quantifiers"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 16,
300 |    "metadata": {
301 |     "collapsed": true
302 |    },
303 |    "outputs": [],
304 |    "source": [
305 |     "string = 'ababababab'"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "code",
310 |    "execution_count": 17,
311 |    "metadata": {
312 |     "collapsed": false
313 |    },
314 |    "outputs": [
315 |     {
316 |      "data": {
317 |       "text/plain": [
318 |        "<_sre.SRE_Match object; span=(0, 10), match='ababababab'>"
319 |       ]
320 |      },
321 |      "execution_count": 17,
322 |      "metadata": {},
323 |      "output_type": "execute_result"
324 |     }
325 |    ],
326 |    "source": [
327 |     "match =re.search ('(ab)+(ab)+', string)\n",
328 |     "match"
329 |    ]
330 |   },
331 |   {
332 |    "cell_type": "code",
333 |    "execution_count": 14,
334 |    "metadata": {
335 |     "collapsed": false
336 |    },
337 |    "outputs": [
338 |     {
339 |      "data": {
340 |       "text/plain": [
341 |        "('ab', 'ab')"
342 |       ]
343 |      },
344 |      "execution_count": 14,
345 |      "metadata": {},
346 |      "output_type": "execute_result"
347 |     }
348 |    ],
349 |    "source": [
350 |     "match.groups()"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 18,
356 |    "metadata": {
357 |     "collapsed": false
358 |    },
359 |    "outputs": [
360 |     {
361 |      "data": {
362 |       "text/plain": [
363 |        "(8, 10)"
364 |       ]
365 |      },
366 |      "execution_count": 18,
367 |      "metadata": {},
368 |      "output_type": "execute_result"
369 |     }
370 |    ],
371 |    "source": [
372 |     "match.span(2) # the first group is greedy"
373 |    ]
374 |   },
375 |   {
376 |    "cell_type": "code",
377 |    "execution_count": null,
378 |    "metadata": {
379 |     "collapsed": true
380 |    },
381 |    "outputs": [],
382 |    "source": [
383 |     "#Only one group captured "
384 |    ]
385 |   },
386 |   {
387 |    "cell_type": "code",
388 |    "execution_count": 19,
389 |    "metadata": {
390 |     "collapsed": false
391 |    },
392 |    "outputs": [],
393 |    "source": [
394 |     "string = '123456789'\n",
395 |     "\n",
396 |     "match =re.search('(\\d)+', string)"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 20,
402 |    "metadata": {
403 |     "collapsed": false
404 |    },
405 |    "outputs": [
406 |     {
407 |      "data": {
408 |       "text/plain": [
409 |        "<_sre.SRE_Match object; span=(0, 9), match='123456789'>"
410 |       ]
411 |      },
412 |      "execution_count": 20,
413 |      "metadata": {},
414 |      "output_type": "execute_result"
415 |     }
416 |    ],
417 |    "source": [
418 |     "match"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "code",
423 |    "execution_count": 21,
424 |    "metadata": {
425 |     "collapsed": false
426 |    },
427 |    "outputs": [
428 |     {
429 |      "data": {
430 |       "text/plain": [
431 |        "('9',)"
432 |       ]
433 |      },
434 |      "execution_count": 21,
435 |      "metadata": {},
436 |      "output_type": "execute_result"
437 |     }
438 |    ],
439 |    "source": [
440 |     "(match.groups())   # only one group, and it uses the last value"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": null,
446 |    "metadata": {
447 |     "collapsed": false
448 |    },
449 |    "outputs": [],
450 |    "source": [
451 |     "match   #full pattern still retained"
452 |    ]
453 |   },
454 |   {
455 |    "cell_type": "markdown",
456 |    "metadata": {
457 |     "collapsed": true
458 |    },
459 |    "source": [
460 |     "### Quantifiers with groups within findall"
461 |    ]
462 |   },
463 |   {
464 |    "cell_type": "code",
465 |    "execution_count": 22,
466 |    "metadata": {
467 |     "collapsed": false
468 |    },
469 |    "outputs": [
470 |     {
471 |      "data": {
472 |       "text/plain": [
473 |        "['9']"
474 |       ]
475 |      },
476 |      "execution_count": 22,
477 |      "metadata": {},
478 |      "output_type": "execute_result"
479 |     }
480 |    ],
481 |    "source": [
482 |     "string = '123456789'\n",
483 |     "\n",
484 |     "re.findall('(\\d)+', string)  #findall returns only the group, which holds its last repetition"
485 |    ]
486 |   },
487 |   {
488 |    "cell_type": "code",
489 |    "execution_count": 23,
490 |    "metadata": {
491 |     "collapsed": false
492 |    },
493 |    "outputs": [
494 |     {
495 |      "data": {
496 |       "text/plain": [
497 |        "['4', '9']"
498 |       ]
499 |      },
500 |      "execution_count": 23,
501 |      "metadata": {},
502 |      "output_type": "execute_result"
503 |     }
504 |    ],
505 |    "source": [
506 |     "string = '1234 56789'\n",
507 |     "\n",
508 |     "re.findall('(\\d)+', string)  #Here we have two matches"
509 |    ]
510 |   },
511 |   {
512 |    "cell_type": "code",
513 |    "execution_count": 26,
514 |    "metadata": {
515 |     "collapsed": false
516 |    },
517 |    "outputs": [
518 |     {
519 |      "data": {
520 |       "text/plain": [
521 |        "'56789'"
522 |       ]
523 |      },
524 |      "execution_count": 26,
525 |      "metadata": {},
526 |      "output_type": "execute_result"
527 |     }
528 |    ],
529 |    "source": [
530 |     "re.findall('((\\d)+)', string)[1][0] \n",
531 |     "#to find full match create a main group engulfing the smaller groups"
532 |    ]
533 |   },
534 |   {
535 |    "cell_type": "code",
536 |    "execution_count": 28,
537 |    "metadata": {
538 |     "collapsed": false
539 |    },
540 |    "outputs": [
541 |     {
542 |      "data": {
543 |       "text/plain": [
544 |        "['ab', 'ab']"
545 |       ]
546 |      },
547 |      "execution_count": 28,
548 |      "metadata": {},
549 |      "output_type": "execute_result"
550 |     }
551 |    ],
552 |    "source": [
553 |     "#another example\n",
554 |     "string  = 'abbbbb ababababab'\n",
555 |     "re.findall('(ab)+', string)   #two instances"
556 |    ]
557 |   },
558 |   {
559 |    "cell_type": "code",
560 |    "execution_count": 29,
561 |    "metadata": {
562 |     "collapsed": false
563 |    },
564 |    "outputs": [
565 |     {
566 |      "data": {
567 |       "text/plain": [
568 |        "[('ab', 'ab'), ('ababababab', 'ab')]"
569 |       ]
570 |      },
571 |      "execution_count": 29,
572 |      "metadata": {},
573 |      "output_type": "execute_result"
574 |     }
575 |    ],
576 |    "source": [
577 |     "string  = 'abbbbb ababababab'\n",
578 |     "re.findall('((ab)+)', string)   #full match"
579 |    ]
580 |   },
581 |   {
582 |    "cell_type": "markdown",
583 |    "metadata": {
584 |     "collapsed": true
585 |    },
586 |    "source": [
587 |     "### Groups for word completion"
588 |    ]
589 |   },
590 |   {
591 |    "cell_type": "code",
592 |    "execution_count": 30,
593 |    "metadata": {
594 |     "collapsed": false
595 |    },
596 |    "outputs": [
597 |     {
598 |      "data": {
599 |       "text/plain": [
600 |        "<_sre.SRE_Match object; span=(0, 14), match='Happy Birthday'>"
601 |       ]
602 |      },
603 |      "execution_count": 30,
604 |      "metadata": {},
605 |      "output_type": "execute_result"
606 |     }
607 |    ],
608 |    "source": [
609 |     "re.search('Happy (Valentines|Birthday|Anniversary)', 'Happy Birthday')"
610 |    ]
611 |   },
612 |   {
613 |    "cell_type": "code",
614 |    "execution_count": 31,
615 |    "metadata": {
616 |     "collapsed": false
617 |    },
618 |    "outputs": [
619 |     {
620 |      "data": {
621 |       "text/plain": [
622 |        "<_sre.SRE_Match object; span=(0, 16), match='Happy Valentines'>"
623 |       ]
624 |      },
625 |      "execution_count": 31,
626 |      "metadata": {},
627 |      "output_type": "execute_result"
628 |     }
629 |    ],
630 |    "source": [
631 |     "re.search('Happy (Valentines|Birthday|Anniversary)', 'Happy Valentines')"
632 |    ]
633 |   },
634 |   {
635 |    "cell_type": "code",
636 |    "execution_count": null,
637 |    "metadata": {
638 |     "collapsed": true
639 |    },
640 |    "outputs": [],
641 |    "source": [
642 |     "re.search('Happy Valentines| Happy Birthday | Happy Anniversary')"
643 |    ]
644 |   }
645 |  ],
646 |  "metadata": {
647 |   "kernelspec": {
648 |    "display_name": "Python 3",
649 |    "language": "python",
650 |    "name": "python3"
651 |   },
652 |   "language_info": {
653 |    "codemirror_mode": {
654 |     "name": "ipython",
655 |     "version": 3
656 |    },
657 |    "file_extension": ".py",
658 |    "mimetype": "text/x-python",
659 |    "name": "python",
660 |    "nbconvert_exporter": "python",
661 |    "pygments_lexer": "ipython3",
662 |    "version": "3.6.0"
663 |   }
664 |  },
665 |  "nbformat": 4,
666 |  "nbformat_minor": 1
667 | }
668 | 


--------------------------------------------------------------------------------
/Regular Expressions made Easy - part 9 .ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {
  7 |     "collapsed": true
  8 |    },
  9 |    "outputs": [],
 10 |    "source": [
 11 |     "## Non-capture Groups"
 12 |    ]
 13 |   },
 14 |   {
 15 |    "cell_type": "code",
 16 |    "execution_count": null,
 17 |    "metadata": {
 18 |     "collapsed": true
 19 |    },
 20 |    "outputs": [],
 21 |    "source": [
 22 |     "import re"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 1,
 28 |    "metadata": {
 29 |     "collapsed": false
 30 |    },
 31 |    "outputs": [
 32 |     {
 33 |      "data": {
 34 |       "text/plain": [
 35 |        "['4', '9']"
 36 |       ]
 37 |      },
 38 |      "execution_count": 1,
 39 |      "metadata": {},
 40 |      "output_type": "execute_result"
 41 |     }
 42 |    ],
 43 |    "source": [
 44 |     "#Here is one such example:\n",
 45 |     "    \n",
 46 |     "\n",
 47 |     "import re\n",
 48 |     "string = '1234 56789'\n",
 49 |     "\n",
 50 |     "re.findall('(\\d)+', string)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 2,
 56 |    "metadata": {
 57 |     "collapsed": false
 58 |    },
 59 |    "outputs": [
 60 |     {
 61 |      "data": {
 62 |       "text/plain": [
 63 |        "('4',)"
 64 |       ]
 65 |      },
 66 |      "execution_count": 2,
 67 |      "metadata": {},
 68 |      "output_type": "execute_result"
 69 |     }
 70 |    ],
 71 |    "source": [
 72 |     "re.search('(\\d)+', string).groups()  #using search"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {
 79 |     "collapsed": true
 80 |    },
 81 |    "outputs": [],
 82 |    "source": [
 83 |     "#  non-capture groups syntax\n",
 84 |     "\n",
 85 |     "\n",
 86 |     "?:      \n",
 87 |     "    \n",
 88 |     "The symbol above represents non-capture groups and looks slightly\n",
 89 |     "similar to the syntax for naming groups\n",
 90 |     "\n",
 91 |     "?P  #don't confuse the two please. \n",
 92 |     "\n"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "code",
 97 |    "execution_count": null,
 98 |    "metadata": {
 99 |     "collapsed": true
100 |    },
101 |    "outputs": [],
102 |    "source": [
103 |     "#comparison"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": 3,
109 |    "metadata": {
110 |     "collapsed": false
111 |    },
112 |    "outputs": [
113 |     {
114 |      "data": {
115 |       "text/plain": [
116 |        "['4', '9']"
117 |       ]
118 |      },
119 |      "execution_count": 3,
120 |      "metadata": {},
121 |      "output_type": "execute_result"
122 |     }
123 |    ],
124 |    "source": [
125 |     "re.findall('(\\d)+', string)"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 6,
131 |    "metadata": {
132 |     "collapsed": false
133 |    },
134 |    "outputs": [
135 |     {
136 |      "data": {
137 |       "text/plain": [
138 |        "['1234', '56789']"
139 |       ]
140 |      },
141 |      "execution_count": 6,
142 |      "metadata": {},
143 |      "output_type": "execute_result"
144 |     }
145 |    ],
146 |    "source": [
147 |     "re.findall('(?:\\d)+', string) #with non capture group"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {
154 |     "collapsed": true
155 |    },
156 |    "outputs": [],
157 |    "source": [
158 |     "#So the group is part of the pattern, but we don't output the groups'\n",
159 |     "#results"
160 |    ]
161 |   },
162 |   {
163 |    "cell_type": "code",
164 |    "execution_count": 7,
165 |    "metadata": {
166 |     "collapsed": false
167 |    },
168 |    "outputs": [
169 |     {
170 |      "data": {
171 |       "text/plain": [
172 |        "['1234', '56789']"
173 |       ]
174 |      },
175 |      "execution_count": 7,
176 |      "metadata": {},
177 |      "output_type": "execute_result"
178 |     }
179 |    ],
180 |    "source": [
181 |     "re.findall('\\d+', string)  # when RE has no groups in findall, \n",
182 |     "                            #we output entire match"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": null,
188 |    "metadata": {
189 |     "collapsed": true
190 |    },
191 |    "outputs": [],
192 |    "source": []
193 |   },
194 |   {
195 |    "cell_type": "code",
196 |    "execution_count": null,
197 |    "metadata": {
198 |     "collapsed": true
199 |    },
200 |    "outputs": [],
201 |    "source": [
202 |     "#Another example"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 19,
208 |    "metadata": {
209 |     "collapsed": false
210 |    },
211 |    "outputs": [],
212 |    "source": [
213 |     "string  = '123123 = Alex, 123123123 = Danny, 123123123123 = Mike, 456456 = rick, 121212 = John, 132132 = Luis,' \n",
214 |     " \n"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": null,
220 |    "metadata": {
221 |     "collapsed": true
222 |    },
223 |    "outputs": [],
224 |    "source": [
225 |     "#We want to pull out all names whose ID has 123 within it"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 20,
231 |    "metadata": {
232 |     "collapsed": false
233 |    },
234 |    "outputs": [
235 |     {
236 |      "data": {
237 |       "text/plain": [
238 |        "['Alex', 'Danny', 'Mike']"
239 |       ]
240 |      },
241 |      "execution_count": 20,
242 |      "metadata": {},
243 |      "output_type": "execute_result"
244 |     }
245 |    ],
246 |    "source": [
247 |     "re.findall('(?:123)+ = (\\w+),', string)   #three instances"
248 |    ]
249 |   },
250 |   {
251 |    "cell_type": "code",
252 |    "execution_count": 11,
253 |    "metadata": {
254 |     "collapsed": true
255 |    },
256 |    "outputs": [],
257 |    "source": [
258 |     "#Another example\n",
259 |     "string = '1*1*1*1*22222  1*1*3333  2*1*2*1*222  1*2*2*2*333 3*3*3*444'"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "code",
264 |    "execution_count": 12,
265 |    "metadata": {
266 |     "collapsed": false
267 |    },
268 |    "outputs": [
269 |     {
270 |      "data": {
271 |       "text/plain": [
272 |        "['1*1*1*1*22222', '1*1*3333']"
273 |       ]
274 |      },
275 |      "execution_count": 12,
276 |      "metadata": {},
277 |      "output_type": "execute_result"
278 |     }
279 |    ],
280 |    "source": [
281 |     "re.findall( r'(?:1\\*){2,}\\d+', string)"
282 |    ]
283 |   },
284 |   {
285 |    "cell_type": "code",
286 |    "execution_count": null,
287 |    "metadata": {
288 |     "collapsed": true
289 |    },
290 |    "outputs": [],
291 |    "source": []
292 |   },
293 |   {
294 |    "cell_type": "code",
295 |    "execution_count": null,
296 |    "metadata": {
297 |     "collapsed": false
298 |    },
299 |    "outputs": [],
300 |    "source": [
301 |     "#Now, non-capture groups don't just affect the findall method\n",
302 |     "#they also affect the search and match methods"
303 |    ]
304 |   },
305 |   {
306 |    "cell_type": "markdown",
307 |    "metadata": {},
308 |    "source": [
309 |     "### BE CAREFUL WITH SYNTAX "
310 |    ]
311 |   },
312 |   {
313 |    "cell_type": "code",
314 |    "execution_count": null,
315 |    "metadata": {
316 |     "collapsed": true
317 |    },
318 |    "outputs": [],
319 |    "source": [
320 |     "?:   correct!\n",
321 |     ":?   incorrect!"
322 |    ]
323 |   },
324 |   {
325 |    "cell_type": "code",
326 |    "execution_count": 13,
327 |    "metadata": {
328 |     "collapsed": false
329 |    },
330 |    "outputs": [
331 |     {
332 |      "name": "stdout",
333 |      "output_type": "stream",
334 |      "text": [
335 |       "()\n"
336 |      ]
337 |     }
338 |    ],
339 |    "source": [
340 |     "string = '1234 56789'\n",
341 |     "\n",
342 |     "match =re.search('(?:\\d)+', string)#correct syntax\n",
343 |     "print(match.groups())"
344 |    ]
345 |   },
346 |   {
347 |    "cell_type": "code",
348 |    "execution_count": 14,
349 |    "metadata": {
350 |     "collapsed": false
351 |    },
352 |    "outputs": [
353 |     {
354 |      "name": "stdout",
355 |      "output_type": "stream",
356 |      "text": [
357 |       "('4',)\n"
358 |      ]
359 |     }
360 |    ],
361 |    "source": [
362 |     "string = '1234 56789'\n",
363 |     "\n",
364 |     "match =re.search('(:?\\d)+', string)# :? is incorrect: this is a capture group with an optional ':'\n",
365 |     "print(match.groups())"
366 |    ]
367 |   },
368 |   {
369 |    "cell_type": "code",
370 |    "execution_count": null,
371 |    "metadata": {
372 |     "collapsed": true
373 |    },
374 |    "outputs": [],
375 |    "source": [
376 |     "Summary: \n",
377 |     "\n",
378 |     "#when we capture groups we are either storing their values or \n",
379 |     "#outputting them\n"
380 |    ]
381 |   },
382 |   {
383 |    "cell_type": "markdown",
384 |    "metadata": {},
385 |    "source": [
386 |     "## Backreferences  - Using captured groups inside other operations"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": null,
392 |    "metadata": {
393 |     "collapsed": true
394 |    },
395 |    "outputs": [],
396 |    "source": [
397 |     "#backreferencing is making a reference to the captured group\n",
398 |     "#within the same regular expression\n",
399 |     "\n"
400 |    ]
401 |   },
402 |   {
403 |    "cell_type": "code",
404 |    "execution_count": null,
405 |    "metadata": {
406 |     "collapsed": true
407 |    },
408 |    "outputs": [],
409 |    "source": [
410 |     "#syntax and example"
411 |    ]
412 |   },
413 |   {
414 |    "cell_type": "code",
415 |    "execution_count": 15,
416 |    "metadata": {
417 |     "collapsed": false
418 |    },
419 |    "outputs": [
420 |     {
421 |      "data": {
422 |       "text/plain": [
423 |        "<_sre.SRE_Match object; span=(0, 11), match='Merry Merry'>"
424 |       ]
425 |      },
426 |      "execution_count": 15,
427 |      "metadata": {},
428 |      "output_type": "execute_result"
429 |     }
430 |    ],
431 |    "source": [
432 |     "re.search(r'(\\w+) \\1','Merry Merry Christmas')  #Looking for repeated words"
433 |    ]
434 |   },
435 |   {
436 |    "cell_type": "code",
437 |    "execution_count": 16,
438 |    "metadata": {
439 |     "collapsed": false
440 |    },
441 |    "outputs": [
442 |     {
443 |      "data": {
444 |       "text/plain": [
445 |        "('Merry',)"
446 |       ]
447 |      },
448 |      "execution_count": 16,
449 |      "metadata": {},
450 |      "output_type": "execute_result"
451 |     }
452 |    ],
453 |    "source": [
454 |     "re.search(r'(\\w+) \\1','Merry Merry Christmas').groups()"
455 |    ]
456 |   },
457 |   {
458 |    "cell_type": "code",
459 |    "execution_count": null,
460 |    "metadata": {
461 |     "collapsed": true
462 |    },
463 |    "outputs": [],
464 |    "source": [
465 |     "\\1 is just referencing the first group \n",
466 |     "within the regular expression \n",
467 |     "\n",
468 |     "r'(\\w+) \\1'"
469 |    ]
470 |   },
471 |   {
472 |    "cell_type": "code",
473 |    "execution_count": null,
474 |    "metadata": {
475 |     "collapsed": true
476 |    },
477 |    "outputs": [],
478 |    "source": [
479 |     "#Another example"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": 17,
485 |    "metadata": {
486 |     "collapsed": false
487 |    },
488 |    "outputs": [
489 |     {
490 |      "data": {
491 |       "text/plain": [
492 |        "['Happy', 'Christmas']"
493 |       ]
494 |      },
495 |      "execution_count": 17,
496 |      "metadata": {},
497 |      "output_type": "execute_result"
498 |     }
499 |    ],
500 |    "source": [
501 |     "re.findall(r'(\\w+) \\1','Happy Happy Holidays. Merry Christmas Christmas')   #Want to look for repeated words"
502 |    ]
503 |   },
504 |   {
505 |    "cell_type": "code",
506 |    "execution_count": null,
507 |    "metadata": {
508 |     "collapsed": true
509 |    },
510 |    "outputs": [],
511 |    "source": [
512 |     "#another example"
513 |    ]
514 |   },
515 |   {
516 |    "cell_type": "code",
517 |    "execution_count": 18,
518 |    "metadata": {
519 |     "collapsed": false
520 |    },
521 |    "outputs": [
522 |     {
523 |      "data": {
524 |       "text/plain": [
525 |        "['Merry', 'Christmas', 'Merry']"
526 |       ]
527 |      },
528 |      "execution_count": 18,
529 |      "metadata": {},
530 |      "output_type": "execute_result"
531 |     }
532 |    ],
533 |    "source": [
534 |     "re.findall(r'(\\w+) \\1','Merry Merry Christmas Christmas Merry Merry Christmas')"
535 |    ]
536 |   },
537 |   {
538 |    "cell_type": "code",
539 |    "execution_count": null,
540 |    "metadata": {
541 |     "collapsed": true
542 |    },
543 |    "outputs": [],
544 |    "source": []
545 |   },
546 |   {
547 |    "cell_type": "code",
548 |    "execution_count": null,
549 |    "metadata": {
550 |     "collapsed": true
551 |    },
552 |    "outputs": [],
553 |    "source": []
554 |   }
555 |  ],
556 |  "metadata": {
557 |   "kernelspec": {
558 |    "display_name": "Python 3",
559 |    "language": "python",
560 |    "name": "python3"
561 |   },
562 |   "language_info": {
563 |    "codemirror_mode": {
564 |     "name": "ipython",
565 |     "version": 3
566 |    },
567 |    "file_extension": ".py",
568 |    "mimetype": "text/x-python",
569 |    "name": "python",
570 |    "nbconvert_exporter": "python",
571 |    "pygments_lexer": "ipython3",
572 |    "version": "3.6.0"
573 |   }
574 |  },
575 |  "nbformat": 4,
576 |  "nbformat_minor": 1
577 | }
578 | 


--------------------------------------------------------------------------------