\n",
399 | " Once upon a time there were three little sisters; and their names were\n",
400 | " \n",
401 | " Elsie \n",
402 | " \n",
403 | " Lacie \n",
404 | " and\n",
405 | " Tillie \n",
406 | " and they lived at the bottom of a well.\n",
407 | "
\n",
408 | " ...
\n",
409 | "\"\"\"\n",
410 | "from bs4 import BeautifulSoup\n",
411 | "soup = BeautifulSoup(html, 'lxml')\n",
412 | "print(soup.p.children)\n",
413 | "for i, child in enumerate(soup.p.children):\n",
414 | " print(i, child)"
415 | ]
416 | },
417 | {
418 | "cell_type": "code",
419 | "execution_count": 50,
420 | "metadata": {
421 | "collapsed": false,
422 | "scrolled": true
423 | },
424 | "outputs": [
425 | {
426 | "name": "stdout",
427 | "output_type": "stream",
428 | "text": [
429 | "\n",
430 | "0 \n",
431 | " Once upon a time there were three little sisters; and their names were\n",
432 | " \n",
433 | "1 \n",
434 | "Elsie \n",
435 | " \n",
436 | "2 \n",
437 | "\n",
438 | "3 Elsie \n",
439 | "4 Elsie\n",
440 | "5 \n",
441 | "\n",
442 | "6 \n",
443 | "\n",
444 | "7 Lacie \n",
445 | "8 Lacie\n",
446 | "9 \n",
447 | " and\n",
448 | " \n",
449 | "10 Tillie \n",
450 | "11 Tillie\n",
451 | "12 \n",
452 | " and they lived at the bottom of a well.\n",
453 | " \n"
454 | ]
455 | }
456 | ],
457 | "source": [
458 | "html = \"\"\"\n",
459 | "\n",
460 | " \n",
461 | " The Dormouse's story \n",
462 | " \n",
463 | " \n",
464 | " \n",
465 | " Once upon a time there were three little sisters; and their names were\n",
466 | " \n",
467 | " Elsie \n",
468 | " \n",
469 | " Lacie \n",
470 | " and\n",
471 | " Tillie \n",
472 | " and they lived at the bottom of a well.\n",
473 | "
\n",
474 | " ...
\n",
475 | "\"\"\"\n",
476 | "from bs4 import BeautifulSoup\n",
477 | "soup = BeautifulSoup(html, 'lxml')\n",
478 | "print(soup.p.descendants)\n",
479 | "for i, child in enumerate(soup.p.descendants):\n",
480 | " print(i, child)"
481 | ]
482 | },
483 | {
484 | "cell_type": "markdown",
485 | "metadata": {},
486 | "source": [
487 | "### 父节点和祖先节点"
488 | ]
489 | },
490 | {
491 | "cell_type": "code",
492 | "execution_count": 63,
493 | "metadata": {
494 | "collapsed": false
495 | },
496 | "outputs": [
497 | {
498 | "name": "stdout",
499 | "output_type": "stream",
500 | "text": [
501 | "\n",
502 | " Once upon a time there were three little sisters; and their names were\n",
503 | " \n",
504 | "Elsie \n",
505 | " \n",
506 | "Lacie \n",
507 | " and\n",
508 | " Tillie \n",
509 | " and they lived at the bottom of a well.\n",
510 | "
\n"
511 | ]
512 | }
513 | ],
514 | "source": [
515 | "html = \"\"\"\n",
516 | "\n",
517 | " \n",
518 | " The Dormouse's story \n",
519 | " \n",
520 | " \n",
521 | " \n",
522 | " Once upon a time there were three little sisters; and their names were\n",
523 | " \n",
524 | " Elsie \n",
525 | " \n",
526 | " Lacie \n",
527 | " and\n",
528 | " Tillie \n",
529 | " and they lived at the bottom of a well.\n",
530 | "
\n",
531 | " ...
\n",
532 | "\"\"\"\n",
533 | "from bs4 import BeautifulSoup\n",
534 | "soup = BeautifulSoup(html, 'lxml')\n",
535 | "print(soup.a.parent)"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": 73,
541 | "metadata": {
542 | "collapsed": false
543 | },
544 | "outputs": [
545 | {
546 | "name": "stdout",
547 | "output_type": "stream",
548 | "text": [
549 | "[(0, \n",
550 | " Once upon a time there were three little sisters; and their names were\n",
551 | " \n",
552 | "Elsie \n",
553 | " \n",
554 | "Lacie \n",
555 | " and\n",
556 | " Tillie \n",
557 | " and they lived at the bottom of a well.\n",
558 | "
), (1, \n",
559 | "\n",
560 | " Once upon a time there were three little sisters; and their names were\n",
561 | " \n",
562 | "Elsie \n",
563 | " \n",
564 | "Lacie \n",
565 | " and\n",
566 | " Tillie \n",
567 | " and they lived at the bottom of a well.\n",
568 | "
\n",
569 | "...
\n",
570 | "), (2, \n",
571 | "\n",
572 | "The Dormouse's story \n",
573 | "\n",
574 | "\n",
575 | "\n",
576 | " Once upon a time there were three little sisters; and their names were\n",
577 | " \n",
578 | "Elsie \n",
579 | " \n",
580 | "Lacie \n",
581 | " and\n",
582 | " Tillie \n",
583 | " and they lived at the bottom of a well.\n",
584 | "
\n",
585 | "...
\n",
586 | "), (3, \n",
587 | "\n",
588 | "The Dormouse's story \n",
589 | "\n",
590 | "\n",
591 | "\n",
592 | " Once upon a time there were three little sisters; and their names were\n",
593 | " \n",
594 | "Elsie \n",
595 | " \n",
596 | "Lacie \n",
597 | " and\n",
598 | " Tillie \n",
599 | " and they lived at the bottom of a well.\n",
600 | "
\n",
601 | "...
\n",
602 | ")]\n"
603 | ]
604 | }
605 | ],
606 | "source": [
607 | "html = \"\"\"\n",
608 | "\n",
609 | " \n",
610 | " The Dormouse's story \n",
611 | " \n",
612 | " \n",
613 | " \n",
614 | " Once upon a time there were three little sisters; and their names were\n",
615 | " \n",
616 | " Elsie \n",
617 | " \n",
618 | " Lacie \n",
619 | " and\n",
620 | " Tillie \n",
621 | " and they lived at the bottom of a well.\n",
622 | "
\n",
623 | " ...
\n",
624 | "\"\"\"\n",
625 | "from bs4 import BeautifulSoup\n",
626 | "soup = BeautifulSoup(html, 'lxml')\n",
627 | "print(list(enumerate(soup.a.parents)))"
628 | ]
629 | },
630 | {
631 | "cell_type": "markdown",
632 | "metadata": {},
633 | "source": [
634 | "### 兄弟节点"
635 | ]
636 | },
637 | {
638 | "cell_type": "code",
639 | "execution_count": 74,
640 | "metadata": {
641 | "collapsed": false
642 | },
643 | "outputs": [
644 | {
645 | "name": "stdout",
646 | "output_type": "stream",
647 | "text": [
648 | "[(0, '\\n'), (1, Lacie ), (2, ' \\n and\\n '), (3, Tillie ), (4, '\\n and they lived at the bottom of a well.\\n ')]\n",
649 | "[(0, '\\n Once upon a time there were three little sisters; and their names were\\n ')]\n"
650 | ]
651 | }
652 | ],
653 | "source": [
654 | "html = \"\"\"\n",
655 | "\n",
656 | " \n",
657 | " The Dormouse's story \n",
658 | " \n",
659 | " \n",
660 | " \n",
661 | " Once upon a time there were three little sisters; and their names were\n",
662 | " \n",
663 | " Elsie \n",
664 | " \n",
665 | " Lacie \n",
666 | " and\n",
667 | " Tillie \n",
668 | " and they lived at the bottom of a well.\n",
669 | "
\n",
670 | " ...
\n",
671 | "\"\"\"\n",
672 | "from bs4 import BeautifulSoup\n",
673 | "soup = BeautifulSoup(html, 'lxml')\n",
674 | "print(list(enumerate(soup.a.next_siblings)))\n",
675 | "print(list(enumerate(soup.a.previous_siblings)))"
676 | ]
677 | },
678 | {
679 | "cell_type": "markdown",
680 | "metadata": {},
681 | "source": [
682 | "## 标准选择器"
683 | ]
684 | },
685 | {
686 | "cell_type": "markdown",
687 | "metadata": {},
688 | "source": [
689 | "### find_all( name , attrs , recursive , text , **kwargs )"
690 | ]
691 | },
692 | {
693 | "cell_type": "markdown",
694 | "metadata": {},
695 | "source": [
696 | "可根据标签名、属性、内容查找文档"
697 | ]
698 | },
699 | {
700 | "cell_type": "markdown",
701 | "metadata": {},
702 | "source": [
703 | "#### name"
704 | ]
705 | },
706 | {
707 | "cell_type": "code",
708 | "execution_count": 88,
709 | "metadata": {
710 | "collapsed": false
711 | },
712 | "outputs": [
713 | {
714 | "name": "stdout",
715 | "output_type": "stream",
716 | "text": [
717 | "[\n",
718 | "Foo \n",
719 | "Bar \n",
720 | "Jay \n",
721 | " , \n",
722 | "Foo \n",
723 | "Bar \n",
724 | " ]\n",
725 | "\n"
726 | ]
727 | }
728 | ],
729 | "source": [
730 | "html='''\n",
731 | "\n",
732 | "
\n",
733 | "
Hello \n",
734 | " \n",
735 | "
\n",
736 | "
\n",
737 | " Foo \n",
738 | " Bar \n",
739 | " Jay \n",
740 | " \n",
741 | "
\n",
742 | " Foo \n",
743 | " Bar \n",
744 | " \n",
745 | "
\n",
746 | "
\n",
747 | "'''\n",
748 | "from bs4 import BeautifulSoup\n",
749 | "soup = BeautifulSoup(html, 'lxml')\n",
750 | "print(soup.find_all('ul'))\n",
751 | "print(type(soup.find_all('ul')[0]))"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 119,
757 | "metadata": {
758 | "collapsed": false
759 | },
760 | "outputs": [
761 | {
762 | "name": "stdout",
763 | "output_type": "stream",
764 | "text": [
765 | "[Foo , Bar , Jay ]\n",
766 | "[Foo , Bar ]\n"
767 | ]
768 | }
769 | ],
770 | "source": [
771 | "html='''\n",
772 | "\n",
773 | "
\n",
774 | "
Hello \n",
775 | " \n",
776 | "
\n",
777 | "
\n",
778 | " Foo \n",
779 | " Bar \n",
780 | " Jay \n",
781 | " \n",
782 | "
\n",
783 | " Foo \n",
784 | " Bar \n",
785 | " \n",
786 | "
\n",
787 | "
\n",
788 | "'''\n",
789 | "from bs4 import BeautifulSoup\n",
790 | "soup = BeautifulSoup(html, 'lxml')\n",
791 | "for ul in soup.find_all('ul'):\n",
792 | " print(ul.find_all('li'))"
793 | ]
794 | },
795 | {
796 | "cell_type": "markdown",
797 | "metadata": {},
798 | "source": [
799 | "#### attrs"
800 | ]
801 | },
802 | {
803 | "cell_type": "code",
804 | "execution_count": 135,
805 | "metadata": {
806 | "collapsed": false
807 | },
808 | "outputs": [
809 | {
810 | "name": "stdout",
811 | "output_type": "stream",
812 | "text": [
813 | "[\n",
814 | "Foo \n",
815 | "Bar \n",
816 | "Jay \n",
817 | " ]\n",
818 | "[\n",
819 | "Foo \n",
820 | "Bar \n",
821 | "Jay \n",
822 | " ]\n"
823 | ]
824 | }
825 | ],
826 | "source": [
827 | "html='''\n",
828 | "\n",
829 | "
\n",
830 | "
Hello \n",
831 | " \n",
832 | "
\n",
833 | "
\n",
834 | " Foo \n",
835 | " Bar \n",
836 | " Jay \n",
837 | " \n",
838 | "
\n",
839 | " Foo \n",
840 | " Bar \n",
841 | " \n",
842 | "
\n",
843 | "
\n",
844 | "'''\n",
845 | "from bs4 import BeautifulSoup\n",
846 | "soup = BeautifulSoup(html, 'lxml')\n",
847 | "print(soup.find_all(attrs={'id': 'list-1'}))\n",
848 | "print(soup.find_all(attrs={'name': 'elements'}))"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": 136,
854 | "metadata": {
855 | "collapsed": false
856 | },
857 | "outputs": [
858 | {
859 | "name": "stdout",
860 | "output_type": "stream",
861 | "text": [
862 | "[\n",
863 | "Foo \n",
864 | "Bar \n",
865 | "Jay \n",
866 | " ]\n",
867 | "[Foo , Bar , Jay , Foo , Bar ]\n"
868 | ]
869 | }
870 | ],
871 | "source": [
872 | "html='''\n",
873 | "\n",
874 | "
\n",
875 | "
Hello \n",
876 | " \n",
877 | "
\n",
878 | "
\n",
879 | " Foo \n",
880 | " Bar \n",
881 | " Jay \n",
882 | " \n",
883 | "
\n",
884 | " Foo \n",
885 | " Bar \n",
886 | " \n",
887 | "
\n",
888 | "
\n",
889 | "'''\n",
890 | "from bs4 import BeautifulSoup\n",
891 | "soup = BeautifulSoup(html, 'lxml')\n",
892 | "print(soup.find_all(id='list-1'))\n",
893 | "print(soup.find_all(class_='element'))"
894 | ]
895 | },
896 | {
897 | "cell_type": "markdown",
898 | "metadata": {},
899 | "source": [
900 | "#### text"
901 | ]
902 | },
903 | {
904 | "cell_type": "code",
905 | "execution_count": 94,
906 | "metadata": {
907 | "collapsed": false
908 | },
909 | "outputs": [
910 | {
911 | "name": "stdout",
912 | "output_type": "stream",
913 | "text": [
914 | "['Foo', 'Foo']\n"
915 | ]
916 | }
917 | ],
918 | "source": [
919 | "html='''\n",
920 | "\n",
921 | "
\n",
922 | "
Hello \n",
923 | " \n",
924 | "
\n",
925 | "
\n",
926 | " Foo \n",
927 | " Bar \n",
928 | " Jay \n",
929 | " \n",
930 | "
\n",
931 | " Foo \n",
932 | " Bar \n",
933 | " \n",
934 | "
\n",
935 | "
\n",
936 | "'''\n",
937 | "from bs4 import BeautifulSoup\n",
938 | "soup = BeautifulSoup(html, 'lxml')\n",
939 | "print(soup.find_all(text='Foo'))"
940 | ]
941 | },
942 | {
943 | "cell_type": "markdown",
944 | "metadata": {},
945 | "source": [
946 | "### find( name , attrs , recursive , text , **kwargs )"
947 | ]
948 | },
949 | {
950 | "cell_type": "markdown",
951 | "metadata": {},
952 | "source": [
953 | "find返回单个元素,find_all返回所有元素"
954 | ]
955 | },
956 | {
957 | "cell_type": "code",
958 | "execution_count": 104,
959 | "metadata": {
960 | "collapsed": false
961 | },
962 | "outputs": [
963 | {
964 | "name": "stdout",
965 | "output_type": "stream",
966 | "text": [
967 | "\n",
968 | "Foo \n",
969 | "Bar \n",
970 | "Jay \n",
971 | " \n",
972 | "\n",
973 | "None\n"
974 | ]
975 | }
976 | ],
977 | "source": [
978 | "html='''\n",
979 | "\n",
980 | "
\n",
981 | "
Hello \n",
982 | " \n",
983 | "
\n",
984 | "
\n",
985 | " Foo \n",
986 | " Bar \n",
987 | " Jay \n",
988 | " \n",
989 | "
\n",
990 | " Foo \n",
991 | " Bar \n",
992 | " \n",
993 | "
\n",
994 | "
\n",
995 | "'''\n",
996 | "from bs4 import BeautifulSoup\n",
997 | "soup = BeautifulSoup(html, 'lxml')\n",
998 | "print(soup.find('ul'))\n",
999 | "print(type(soup.find('ul')))\n",
1000 | "print(soup.find('page'))"
1001 | ]
1002 | },
1003 | {
1004 | "cell_type": "markdown",
1005 | "metadata": {},
1006 | "source": [
1007 | "### find_parents() find_parent()"
1008 | ]
1009 | },
1010 | {
1011 | "cell_type": "markdown",
1012 | "metadata": {},
1013 | "source": [
1014 | "find_parents()返回所有祖先节点,find_parent()返回直接父节点。"
1015 | ]
1016 | },
1017 | {
1018 | "cell_type": "markdown",
1019 | "metadata": {},
1020 | "source": [
1021 | "### find_next_siblings() find_next_sibling()"
1022 | ]
1023 | },
1024 | {
1025 | "cell_type": "markdown",
1026 | "metadata": {},
1027 | "source": [
1028 | "find_next_siblings()返回后面所有兄弟节点,find_next_sibling()返回后面第一个兄弟节点。"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "markdown",
1033 | "metadata": {},
1034 | "source": [
1035 | "### find_previous_siblings() find_previous_sibling()"
1036 | ]
1037 | },
1038 | {
1039 | "cell_type": "markdown",
1040 | "metadata": {},
1041 | "source": [
1042 | "find_previous_siblings()返回前面所有兄弟节点,find_previous_sibling()返回前面第一个兄弟节点。"
1043 | ]
1044 | },
1045 | {
1046 | "cell_type": "markdown",
1047 | "metadata": {},
1048 | "source": [
1049 | "### find_all_next() find_next()"
1050 | ]
1051 | },
1052 | {
1053 | "cell_type": "markdown",
1054 | "metadata": {},
1055 | "source": [
1056 | "find_all_next()返回节点后所有符合条件的节点,find_next()返回节点后第一个符合条件的节点。"
1057 | ]
1058 | },
1059 | {
1060 | "cell_type": "markdown",
1061 | "metadata": {},
1062 | "source": [
1063 | "### find_all_previous() find_previous()"
1064 | ]
1065 | },
1066 | {
1067 | "cell_type": "markdown",
1068 | "metadata": {},
1069 | "source": [
1070 | "find_all_previous()返回节点前所有符合条件的节点,find_previous()返回节点前第一个符合条件的节点。"
1071 | ]
1072 | },
1073 | {
1074 | "cell_type": "markdown",
1075 | "metadata": {},
1076 | "source": [
1077 | "## CSS选择器"
1078 | ]
1079 | },
1080 | {
1081 | "cell_type": "markdown",
1082 | "metadata": {},
1083 | "source": [
1084 | "通过select()直接传入CSS选择器即可完成选择"
1085 | ]
1086 | },
1087 | {
1088 | "cell_type": "code",
1089 | "execution_count": 122,
1090 | "metadata": {
1091 | "collapsed": false
1092 | },
1093 | "outputs": [
1094 | {
1095 | "name": "stdout",
1096 | "output_type": "stream",
1097 | "text": [
1098 | "[\n",
1099 | "
Hello \n",
1100 | "]\n",
1101 | "[Foo , Bar , Jay , Foo , Bar ]\n",
1102 | "[Foo , Bar ]\n",
1103 | "\n"
1104 | ]
1105 | }
1106 | ],
1107 | "source": [
1108 | "html='''\n",
1109 | "\n",
1110 | "
\n",
1111 | "
Hello \n",
1112 | " \n",
1113 | "
\n",
1114 | "
\n",
1115 | " Foo \n",
1116 | " Bar \n",
1117 | " Jay \n",
1118 | " \n",
1119 | "
\n",
1120 | " Foo \n",
1121 | " Bar \n",
1122 | " \n",
1123 | "
\n",
1124 | "
\n",
1125 | "'''\n",
1126 | "from bs4 import BeautifulSoup\n",
1127 | "soup = BeautifulSoup(html, 'lxml')\n",
1128 | "print(soup.select('.panel .panel-heading'))\n",
1129 | "print(soup.select('ul li'))\n",
1130 | "print(soup.select('#list-2 .element'))\n",
1131 | "print(type(soup.select('ul')[0]))"
1132 | ]
1133 | },
1134 | {
1135 | "cell_type": "code",
1136 | "execution_count": 123,
1137 | "metadata": {
1138 | "collapsed": false
1139 | },
1140 | "outputs": [
1141 | {
1142 | "name": "stdout",
1143 | "output_type": "stream",
1144 | "text": [
1145 | "[Foo , Bar , Jay ]\n",
1146 | "[Foo , Bar ]\n"
1147 | ]
1148 | }
1149 | ],
1150 | "source": [
1151 | "html='''\n",
1152 | "\n",
1153 | "
\n",
1154 | "
Hello \n",
1155 | " \n",
1156 | "
\n",
1157 | "
\n",
1158 | " Foo \n",
1159 | " Bar \n",
1160 | " Jay \n",
1161 | " \n",
1162 | "
\n",
1163 | " Foo \n",
1164 | " Bar \n",
1165 | " \n",
1166 | "
\n",
1167 | "
\n",
1168 | "'''\n",
1169 | "from bs4 import BeautifulSoup\n",
1170 | "soup = BeautifulSoup(html, 'lxml')\n",
1171 | "for ul in soup.select('ul'):\n",
1172 | " print(ul.select('li'))"
1173 | ]
1174 | },
1175 | {
1176 | "cell_type": "markdown",
1177 | "metadata": {},
1178 | "source": [
1179 | "### 获取属性"
1180 | ]
1181 | },
1182 | {
1183 | "cell_type": "code",
1184 | "execution_count": 129,
1185 | "metadata": {
1186 | "collapsed": false
1187 | },
1188 | "outputs": [
1189 | {
1190 | "name": "stdout",
1191 | "output_type": "stream",
1192 | "text": [
1193 | "list-1\n",
1194 | "list-1\n",
1195 | "list-2\n",
1196 | "list-2\n"
1197 | ]
1198 | }
1199 | ],
1200 | "source": [
1201 | "html='''\n",
1202 | "\n",
1203 | "
\n",
1204 | "
Hello \n",
1205 | " \n",
1206 | "
\n",
1207 | "
\n",
1208 | " Foo \n",
1209 | " Bar \n",
1210 | " Jay \n",
1211 | " \n",
1212 | "
\n",
1213 | " Foo \n",
1214 | " Bar \n",
1215 | " \n",
1216 | "
\n",
1217 | "
\n",
1218 | "'''\n",
1219 | "from bs4 import BeautifulSoup\n",
1220 | "soup = BeautifulSoup(html, 'lxml')\n",
1221 | "for ul in soup.select('ul'):\n",
1222 | " print(ul['id'])\n",
1223 | " print(ul.attrs['id'])"
1224 | ]
1225 | },
1226 | {
1227 | "cell_type": "markdown",
1228 | "metadata": {},
1229 | "source": [
1230 | "### 获取内容"
1231 | ]
1232 | },
1233 | {
1234 | "cell_type": "code",
1235 | "execution_count": 132,
1236 | "metadata": {
1237 | "collapsed": false,
1238 | "scrolled": true
1239 | },
1240 | "outputs": [
1241 | {
1242 | "name": "stdout",
1243 | "output_type": "stream",
1244 | "text": [
1245 | "Foo\n",
1246 | "Bar\n",
1247 | "Jay\n",
1248 | "Foo\n",
1249 | "Bar\n"
1250 | ]
1251 | }
1252 | ],
1253 | "source": [
1254 | "html='''\n",
1255 | "\n",
1256 | "
\n",
1257 | "
Hello \n",
1258 | " \n",
1259 | "
\n",
1260 | "
\n",
1261 | " Foo \n",
1262 | " Bar \n",
1263 | " Jay \n",
1264 | " \n",
1265 | "
\n",
1266 | " Foo \n",
1267 | " Bar \n",
1268 | " \n",
1269 | "
\n",
1270 | "
\n",
1271 | "'''\n",
1272 | "from bs4 import BeautifulSoup\n",
1273 | "soup = BeautifulSoup(html, 'lxml')\n",
1274 | "for ul in soup.select('li'):\n",
1275 | " print(ul.get_text())"
1276 | ]
1277 | },
1278 | {
1279 | "cell_type": "markdown",
1280 | "metadata": {},
1281 | "source": [
1282 | "## 总结"
1283 | ]
1284 | },
1285 | {
1286 | "cell_type": "markdown",
1287 | "metadata": {},
1288 | "source": [
1289 | "* 推荐使用lxml解析库,必要时使用html.parser\n",
1290 | "* 标签选择筛选功能弱但是速度快\n",
1291 | "* 建议使用find()、find_all() 查询匹配单个结果或者多个结果\n",
1292 | "* 如果对CSS选择器熟悉建议使用select()\n",
1293 | "* 记住常用的获取属性和文本值的方法"
1294 | ]
1295 | }
1296 | ],
1297 | "metadata": {
1298 | "kernelspec": {
1299 | "display_name": "Python 3",
1300 | "language": "python",
1301 | "name": "python3"
1302 | },
1303 | "language_info": {
1304 | "codemirror_mode": {
1305 | "name": "ipython",
1306 | "version": 3
1307 | },
1308 | "file_extension": ".py",
1309 | "mimetype": "text/x-python",
1310 | "name": "python",
1311 | "nbconvert_exporter": "python",
1312 | "pygments_lexer": "ipython3",
1313 | "version": "3.5.1"
1314 | }
1315 | },
1316 | "nbformat": 4,
1317 | "nbformat_minor": 0
1318 | }
1319 |
--------------------------------------------------------------------------------
/demo.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/pyquery.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# pyquery"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## 初始化"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "### 字符串初始化"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 4,
27 | "metadata": {
28 | "collapsed": false
29 | },
30 | "outputs": [
31 | {
32 | "name": "stdout",
33 | "output_type": "stream",
34 | "text": [
35 | "first item \n",
36 | " second item \n",
37 | " third item \n",
38 | " fourth item \n",
39 | " fifth item \n",
40 | " \n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "html = '''\n",
46 | "\n",
55 | "'''\n",
56 | "from pyquery import PyQuery as pq\n",
57 | "doc = pq(html)\n",
58 | "print(doc('li'))"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "### URL初始化"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 8,
71 | "metadata": {
72 | "collapsed": false
73 | },
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "ç¾åº¦ä¸ä¸ï¼ä½ å°±ç¥é \n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "from pyquery import PyQuery as pq\n",
85 | "doc = pq(url='http://www.baidu.com')\n",
86 | "print(doc('head'))"
87 | ]
88 | },
89 | {
90 | "cell_type": "markdown",
91 | "metadata": {},
92 | "source": [
93 | "### 文件初始化"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 15,
99 | "metadata": {
100 | "collapsed": false
101 | },
102 | "outputs": [
103 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "first item \n",
108 | " second item \n",
109 | " third item \n",
110 | " fourth item \n",
111 | " fifth item \n",
112 | " \n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "from pyquery import PyQuery as pq\n",
118 | "doc = pq(filename='demo.html')\n",
119 | "print(doc('li'))"
120 | ]
121 | },
122 | {
123 | "cell_type": "markdown",
124 | "metadata": {},
125 | "source": [
126 | "## 基本CSS选择器"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 17,
132 | "metadata": {
133 | "collapsed": false
134 | },
135 | "outputs": [
136 | {
137 | "name": "stdout",
138 | "output_type": "stream",
139 | "text": [
140 | "first item \n",
141 | " second item \n",
142 | " third item \n",
143 | " fourth item \n",
144 | " fifth item \n",
145 | " \n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "html = '''\n",
151 | "\n",
152 | "
\n",
159 | "
\n",
160 | "'''\n",
161 | "from pyquery import PyQuery as pq\n",
162 | "doc = pq(html)\n",
163 | "print(doc('#container .list li'))"
164 | ]
165 | },
166 | {
167 | "cell_type": "markdown",
168 | "metadata": {},
169 | "source": [
170 | "## 查找元素"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "### 子元素"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 30,
183 | "metadata": {
184 | "collapsed": false
185 | },
186 | "outputs": [
187 | {
188 | "name": "stdout",
189 | "output_type": "stream",
190 | "text": [
191 | "\n",
192 | "\n",
199 | " \n",
200 | "\n",
201 | "first item \n",
202 | " second item \n",
203 | " third item \n",
204 | " fourth item \n",
205 | " fifth item \n",
206 | " \n"
207 | ]
208 | }
209 | ],
210 | "source": [
211 | "html = '''\n",
212 | "\n",
213 | "
\n",
220 | "
\n",
221 | "'''\n",
222 | "from pyquery import PyQuery as pq\n",
223 | "doc = pq(html)\n",
224 | "items = doc('.list')\n",
225 | "print(type(items))\n",
226 | "print(items)\n",
227 | "lis = items.find('li')\n",
228 | "print(type(lis))\n",
229 | "print(lis)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 28,
235 | "metadata": {
236 | "collapsed": false
237 | },
238 | "outputs": [
239 | {
240 | "name": "stdout",
241 | "output_type": "stream",
242 | "text": [
243 | "\n",
244 | "first item \n",
245 | " second item \n",
246 | " third item \n",
247 | " fourth item \n",
248 | " fifth item \n",
249 | " \n"
250 | ]
251 | }
252 | ],
253 | "source": [
254 | "lis = items.children()\n",
255 | "print(type(lis))\n",
256 | "print(lis)"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 36,
262 | "metadata": {
263 | "collapsed": false
264 | },
265 | "outputs": [
266 | {
267 | "name": "stdout",
268 | "output_type": "stream",
269 | "text": [
270 | "third item \n",
271 | " fourth item \n",
272 | " \n"
273 | ]
274 | }
275 | ],
276 | "source": [
277 | "lis = items.children('.active')\n",
278 | "print(lis)"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "metadata": {},
284 | "source": [
285 | "### 父元素"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 31,
291 | "metadata": {
292 | "collapsed": false
293 | },
294 | "outputs": [
295 | {
296 | "name": "stdout",
297 | "output_type": "stream",
298 | "text": [
299 | "\n",
300 | "\n",
301 | "
\n",
308 | "
\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "html = '''\n",
314 | "\n",
315 | "
\n",
322 | "
\n",
323 | "'''\n",
324 | "from pyquery import PyQuery as pq\n",
325 | "doc = pq(html)\n",
326 | "items = doc('.list')\n",
327 | "container = items.parent()\n",
328 | "print(type(container))\n",
329 | "print(container)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 32,
335 | "metadata": {
336 | "collapsed": false,
337 | "scrolled": true
338 | },
339 | "outputs": [
340 | {
341 | "name": "stdout",
342 | "output_type": "stream",
343 | "text": [
344 | "\n",
345 | "\n",
346 | "
\n",
347 | "
\n",
354 | "
\n",
355 | "
\n",
356 | "
\n",
363 | "
\n",
364 | " \n"
365 | ]
366 | }
367 | ],
368 | "source": [
369 | "html = '''\n",
370 | "\n",
371 | "
\n",
372 | "
\n",
379 | "
\n",
380 | "
\n",
381 | "'''\n",
382 | "from pyquery import PyQuery as pq\n",
383 | "doc = pq(html)\n",
384 | "items = doc('.list')\n",
385 | "parents = items.parents()\n",
386 | "print(type(parents))\n",
387 | "print(parents)"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 33,
393 | "metadata": {
394 | "collapsed": false
395 | },
396 | "outputs": [
397 | {
398 | "name": "stdout",
399 | "output_type": "stream",
400 | "text": [
401 | "\n",
402 | "
\n",
403 | "
\n",
410 | "
\n",
411 | "
\n"
412 | ]
413 | }
414 | ],
415 | "source": [
416 | "parent = items.parents('.wrap')\n",
417 | "print(parent)"
418 | ]
419 | },
420 | {
421 | "cell_type": "markdown",
422 | "metadata": {},
423 | "source": [
424 | "### 兄弟元素"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 45,
430 | "metadata": {
431 | "collapsed": false
432 | },
433 | "outputs": [
434 | {
435 | "name": "stdout",
436 | "output_type": "stream",
437 | "text": [
438 | "second item \n",
439 | " first item \n",
440 | " fourth item \n",
441 | " fifth item \n",
442 | " \n"
443 | ]
444 | }
445 | ],
446 | "source": [
447 | "html = '''\n",
448 | "\n",
449 | "
\n",
450 | "
\n",
457 | "
\n",
458 | "
\n",
459 | "'''\n",
460 | "from pyquery import PyQuery as pq\n",
461 | "doc = pq(html)\n",
462 | "li = doc('.list .item-0.active')\n",
463 | "print(li.siblings())"
464 | ]
465 | },
466 | {
467 | "cell_type": "code",
468 | "execution_count": 46,
469 | "metadata": {
470 | "collapsed": false
471 | },
472 | "outputs": [
473 | {
474 | "name": "stdout",
475 | "output_type": "stream",
476 | "text": [
477 | "fourth item \n",
478 | " \n"
479 | ]
480 | }
481 | ],
482 | "source": [
483 | "html = '''\n",
484 | "\n",
485 | "
\n",
486 | "
\n",
493 | "
\n",
494 | "
\n",
495 | "'''\n",
496 | "from pyquery import PyQuery as pq\n",
497 | "doc = pq(html)\n",
498 | "li = doc('.list .item-0.active')\n",
499 | "print(li.siblings('.active'))"
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {},
505 | "source": [
506 | "## 遍历"
507 | ]
508 | },
509 | {
510 | "cell_type": "markdown",
511 | "metadata": {},
512 | "source": [
513 | "### 单个元素"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 49,
519 | "metadata": {
520 | "collapsed": false
521 | },
522 | "outputs": [
523 | {
524 | "name": "stdout",
525 | "output_type": "stream",
526 | "text": [
527 | "third item \n",
528 | " \n"
529 | ]
530 | }
531 | ],
532 | "source": [
533 | "html = '''\n",
534 | "\n",
535 | "
\n",
536 | "
\n",
543 | "
\n",
544 | "
\n",
545 | "'''\n",
546 | "from pyquery import PyQuery as pq\n",
547 | "doc = pq(html)\n",
548 | "li = doc('.item-0.active')\n",
549 | "print(li)"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 51,
555 | "metadata": {
556 | "collapsed": false
557 | },
558 | "outputs": [
559 | {
560 | "name": "stdout",
561 | "output_type": "stream",
562 | "text": [
563 | "\n",
564 | "first item \n",
565 | " \n",
566 | "second item \n",
567 | " \n",
568 | "third item \n",
569 | " \n",
570 | "fourth item \n",
571 | " \n",
572 | "fifth item \n",
573 | " \n"
574 | ]
575 | }
576 | ],
577 | "source": [
578 | "html = '''\n",
579 | "\n",
580 | "
\n",
581 | "
\n",
588 | "
\n",
589 | "
\n",
590 | "'''\n",
591 | "from pyquery import PyQuery as pq\n",
592 | "doc = pq(html)\n",
593 | "lis = doc('li').items()\n",
594 | "print(type(lis))\n",
595 | "for li in lis:\n",
596 | " print(li)"
597 | ]
598 | },
599 | {
600 | "cell_type": "markdown",
601 | "metadata": {},
602 | "source": [
603 | "## 获取信息"
604 | ]
605 | },
606 | {
607 | "cell_type": "markdown",
608 | "metadata": {},
609 | "source": [
610 | "### 获取属性"
611 | ]
612 | },
613 | {
614 | "cell_type": "code",
615 | "execution_count": 57,
616 | "metadata": {
617 | "collapsed": false
618 | },
619 | "outputs": [
620 | {
621 | "name": "stdout",
622 | "output_type": "stream",
623 | "text": [
624 | "third item \n",
625 | "link3.html\n",
626 | "link3.html\n"
627 | ]
628 | }
629 | ],
630 | "source": [
631 | "html = '''\n",
632 | "\n",
633 | "
\n",
634 | "
\n",
641 | "
\n",
642 | "
\n",
643 | "'''\n",
644 | "from pyquery import PyQuery as pq\n",
645 | "doc = pq(html)\n",
646 | "a = doc('.item-0.active a')\n",
647 | "print(a)\n",
648 | "print(a.attr('href'))\n",
649 | "print(a.attr.href)"
650 | ]
651 | },
652 | {
653 | "cell_type": "markdown",
654 | "metadata": {},
655 | "source": [
656 | "### 获取文本"
657 | ]
658 | },
659 | {
660 | "cell_type": "code",
661 | "execution_count": 62,
662 | "metadata": {
663 | "collapsed": false
664 | },
665 | "outputs": [
666 | {
667 | "name": "stdout",
668 | "output_type": "stream",
669 | "text": [
670 | "third item \n",
671 | "third item\n"
672 | ]
673 | }
674 | ],
675 | "source": [
676 | "html = '''\n",
677 | "\n",
678 | "
\n",
679 | "
\n",
686 | "
\n",
687 | "
\n",
688 | "'''\n",
689 | "from pyquery import PyQuery as pq\n",
690 | "doc = pq(html)\n",
691 | "a = doc('.item-0.active a')\n",
692 | "print(a)\n",
693 | "print(a.text())"
694 | ]
695 | },
696 | {
697 | "cell_type": "markdown",
698 | "metadata": {},
699 | "source": [
700 | "### 获取HTML"
701 | ]
702 | },
703 | {
704 | "cell_type": "code",
705 | "execution_count": 63,
706 | "metadata": {
707 | "collapsed": false
708 | },
709 | "outputs": [
710 | {
711 | "name": "stdout",
712 | "output_type": "stream",
713 | "text": [
714 | "third item \n",
715 | " \n",
716 | "third item \n"
717 | ]
718 | }
719 | ],
720 | "source": [
721 | "html = '''\n",
722 | "\n",
723 | "
\n",
724 | "
\n",
731 | "
\n",
732 | "
\n",
733 | "'''\n",
734 | "from pyquery import PyQuery as pq\n",
735 | "doc = pq(html)\n",
736 | "a = doc('.item-0.active')\n",
737 | "print(a)\n",
738 | "print(a.html())"
739 | ]
740 | },
741 | {
742 | "cell_type": "markdown",
743 | "metadata": {},
744 | "source": [
745 | "## DOM操作"
746 | ]
747 | },
748 | {
749 | "cell_type": "markdown",
750 | "metadata": {},
751 | "source": [
752 | "### addClass、removeClass"
753 | ]
754 | },
755 | {
756 | "cell_type": "code",
757 | "execution_count": 66,
758 | "metadata": {
759 | "collapsed": false
760 | },
761 | "outputs": [
762 | {
763 | "name": "stdout",
764 | "output_type": "stream",
765 | "text": [
766 | "third item \n",
767 | " \n",
768 | "third item \n",
769 | " \n",
770 | "third item \n",
771 | " \n"
772 | ]
773 | }
774 | ],
775 | "source": [
776 | "html = '''\n",
777 | "\n",
778 | "
\n",
779 | "
\n",
786 | "
\n",
787 | "
\n",
788 | "'''\n",
789 | "from pyquery import PyQuery as pq\n",
790 | "doc = pq(html)\n",
791 | "li = doc('.item-0.active')\n",
792 | "print(li)\n",
793 | "li.removeClass('active')\n",
794 | "print(li)\n",
795 | "li.addClass('active')\n",
796 | "print(li)"
797 | ]
798 | },
799 | {
800 | "cell_type": "markdown",
801 | "metadata": {},
802 | "source": [
803 | "### attr、css"
804 | ]
805 | },
806 | {
807 | "cell_type": "code",
808 | "execution_count": 68,
809 | "metadata": {
810 | "collapsed": false
811 | },
812 | "outputs": [
813 | {
814 | "name": "stdout",
815 | "output_type": "stream",
816 | "text": [
817 | "third item \n",
818 | " \n",
819 | "third item \n",
820 | " \n",
821 | "third item \n",
822 | " \n"
823 | ]
824 | }
825 | ],
826 | "source": [
827 | "html = '''\n",
828 | "\n",
829 | "
\n",
830 | "
\n",
837 | "
\n",
838 | "
\n",
839 | "'''\n",
840 | "from pyquery import PyQuery as pq\n",
841 | "doc = pq(html)\n",
842 | "li = doc('.item-0.active')\n",
843 | "print(li)\n",
844 | "li.attr('name', 'link')\n",
845 | "print(li)\n",
846 | "li.css('font-size', '14px')\n",
847 | "print(li)"
848 | ]
849 | },
850 | {
851 | "cell_type": "markdown",
852 | "metadata": {},
853 | "source": [
854 | "### remove"
855 | ]
856 | },
857 | {
858 | "cell_type": "code",
859 | "execution_count": 72,
860 | "metadata": {
861 | "collapsed": false
862 | },
863 | "outputs": [
864 | {
865 | "name": "stdout",
866 | "output_type": "stream",
867 | "text": [
868 | "Hello, World This is a paragraph.\n",
869 | "Hello, World\n"
870 | ]
871 | }
872 | ],
873 | "source": [
874 | "html = '''\n",
875 | "\n",
876 | " Hello, World\n",
877 | "
This is a paragraph.
\n",
878 | "
\n",
879 | "'''\n",
880 | "from pyquery import PyQuery as pq\n",
881 | "doc = pq(html)\n",
882 | "wrap = doc('.wrap')\n",
883 | "print(wrap.text())\n",
884 | "wrap.find('p').remove()\n",
885 | "print(wrap.text())"
886 | ]
887 | },
888 | {
889 | "cell_type": "markdown",
890 | "metadata": {},
891 | "source": [
892 | "### 其他DOM方法"
893 | ]
894 | },
895 | {
896 | "cell_type": "markdown",
897 | "metadata": {},
898 | "source": [
899 | "http://pyquery.readthedocs.io/en/latest/api.html"
900 | ]
901 | },
902 | {
903 | "cell_type": "markdown",
904 | "metadata": {},
905 | "source": [
906 | "## 伪类选择器"
907 | ]
908 | },
909 | {
910 | "cell_type": "code",
911 | "execution_count": 85,
912 | "metadata": {
913 | "collapsed": false
914 | },
915 | "outputs": [
916 | {
917 | "name": "stdout",
918 | "output_type": "stream",
919 | "text": [
920 | "first item \n",
921 | " \n",
922 | "fifth item \n",
923 | " \n",
924 | "second item \n",
925 | " \n",
926 | "fourth item \n",
927 | " fifth item \n",
928 | " \n",
929 | "second item \n",
930 | " fourth item \n",
931 | " \n",
932 | "second item \n",
933 | " \n"
934 | ]
935 | }
936 | ],
937 | "source": [
938 | "html = '''\n",
939 | "\n",
940 | "
\n",
941 | "
\n",
948 | "
\n",
949 | "
\n",
950 | "'''\n",
951 | "from pyquery import PyQuery as pq\n",
952 | "doc = pq(html)\n",
953 | "li = doc('li:first-child')\n",
954 | "print(li)\n",
955 | "li = doc('li:last-child')\n",
956 | "print(li)\n",
957 | "li = doc('li:nth-child(2)')\n",
958 | "print(li)\n",
959 | "li = doc('li:gt(2)')\n",
960 | "print(li)\n",
961 | "li = doc('li:nth-child(2n)')\n",
962 | "print(li)\n",
963 | "li = doc('li:contains(second)')\n",
964 | "print(li)"
965 | ]
966 | },
967 | {
968 | "cell_type": "markdown",
969 | "metadata": {},
970 | "source": [
971 | "更多CSS选择器可以查看\n",
972 | "http://www.w3school.com.cn/css/index.asp"
973 | ]
974 | },
975 | {
976 | "cell_type": "markdown",
977 | "metadata": {},
978 | "source": [
979 | "## 官方文档"
980 | ]
981 | },
982 | {
983 | "cell_type": "markdown",
984 | "metadata": {},
985 | "source": [
986 | "http://pyquery.readthedocs.io/"
987 | ]
988 | }
989 | ],
990 | "metadata": {
991 | "kernelspec": {
992 | "display_name": "Python 3",
993 | "language": "python",
994 | "name": "python3"
995 | },
996 | "language_info": {
997 | "codemirror_mode": {
998 | "name": "ipython",
999 | "version": 3
1000 | },
1001 | "file_extension": ".py",
1002 | "mimetype": "text/x-python",
1003 | "name": "python",
1004 | "nbconvert_exporter": "python",
1005 | "pygments_lexer": "ipython3",
1006 | "version": "3.5.1"
1007 | }
1008 | },
1009 | "nbformat": 4,
1010 | "nbformat_minor": 0
1011 | }
1012 |
--------------------------------------------------------------------------------
/regex.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 正则表达式"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## 常见匹配模式"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "| 模式| 描述|\n",
22 | "|----|----|\n",
23 | "| \\w\t| 匹配字母数字及下划线 |\n",
24 | "| \\W\t| 匹配非字母数字下划线 |\n",
25 | "| \\s\t| 匹配任意空白字符,等价于 [ \\t\\n\\r\\f\\v]. |\n",
26 | "| \\S\t| 匹配任意非空字符 |\n",
27 | "| \\d\t| 匹配任意数字,等价于 [0-9] |\n",
28 | "| \\D\t| 匹配任意非数字 |\n",
29 | "| \\A\t| 匹配字符串开始 |\n",
30 | "| \\Z\t| 匹配字符串结束,如果是存在换行,只匹配到换行前的结束字符串 |\n",
31 | "| \\z\t| 匹配字符串结束 |\n",
32 | "| \\G\t| 匹配最后匹配完成的位置 |\n",
33 | "| \\n | 匹配一个换行符 |\n",
34 | "| \\t | 匹配一个制表符 |\n",
35 | "| ^\t| 匹配字符串的开头 |\n",
36 | "| $\t| 匹配字符串的末尾。|\n",
37 | "| .\t| 匹配任意字符,除了换行符,当re.DOTALL标记被指定时,则可以匹配包括换行符的任意字符。|\n",
38 | "| [...]\t| 用来表示一组字符,单独列出:[amk] 匹配 'a','m'或'k' |\n",
39 | "| [^...]\t| 不在[]中的字符:[^abc] 匹配除了a,b,c之外的字符。| \n",
40 | "| *\t| 匹配0个或多个的表达式。|\n",
41 | "| +\t| 匹配1个或多个的表达式。|\n",
42 | "| ?\t| 匹配0个或1个由前面的正则表达式定义的片段,非贪婪方式| \n",
43 | "| {n}\t| 精确匹配n个前面表达式。|\n",
44 | "| {n,m} | 匹配 n 到 m 次由前面的正则表达式定义的片段,贪婪方式| \n",
45 | "| a\\|b | 匹配a或b |\n",
46 | "| ( )\t| 匹配括号内的表达式,也表示一个组 |"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "## re.match\n",
54 | "re.match 尝试从字符串的起始位置匹配一个模式,如果不是起始位置匹配成功的话,match()就返回None。"
55 | ]
56 | },
57 | {
58 | "cell_type": "raw",
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "source": [
63 | "re.match(pattern, string, flags=0)"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "### 最常规的匹配"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 17,
76 | "metadata": {
77 | "collapsed": false
78 | },
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "41\n",
85 | "<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>\n",
86 | "Hello 123 4567 World_This is a Regex Demo\n",
87 | "(0, 41)\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "import re\n",
93 | "\n",
94 | "content = 'Hello 123 4567 World_This is a Regex Demo'\n",
95 | "print(len(content))\n",
96 | "result = re.match('^Hello\\s\\d\\d\\d\\s\\d{4}\\s\\w{10}.*Demo$', content)\n",
97 | "print(result)\n",
98 | "print(result.group())\n",
99 | "print(result.span())"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "### 泛匹配"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 18,
112 | "metadata": {
113 | "collapsed": false
114 | },
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "<_sre.SRE_Match object; span=(0, 41), match='Hello 123 4567 World_This is a Regex Demo'>\n",
121 | "Hello 123 4567 World_This is a Regex Demo\n",
122 | "(0, 41)\n"
123 | ]
124 | }
125 | ],
126 | "source": [
127 | "import re\n",
128 | "\n",
129 | "content = 'Hello 123 4567 World_This is a Regex Demo'\n",
130 | "result = re.match('^Hello.*Demo$', content)\n",
131 | "print(result)\n",
132 | "print(result.group())\n",
133 | "print(result.span())"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "### 匹配目标"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 20,
146 | "metadata": {
147 | "collapsed": false,
148 | "scrolled": true
149 | },
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>\n",
156 | "1234567\n",
157 | "(0, 40)\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "import re\n",
163 | "\n",
164 | "content = 'Hello 1234567 World_This is a Regex Demo'\n",
165 | "result = re.match('^Hello\\s(\\d+)\\sWorld.*Demo$', content)\n",
166 | "print(result)\n",
167 | "print(result.group(1))\n",
168 | "print(result.span())"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {},
174 | "source": [
175 | "### 贪婪匹配"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 21,
181 | "metadata": {
182 | "collapsed": false
183 | },
184 | "outputs": [
185 | {
186 | "name": "stdout",
187 | "output_type": "stream",
188 | "text": [
189 | "<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>\n",
190 | "7\n"
191 | ]
192 | }
193 | ],
194 | "source": [
195 | "import re\n",
196 | "\n",
197 | "content = 'Hello 1234567 World_This is a Regex Demo'\n",
198 | "result = re.match('^He.*(\\d+).*Demo$', content)\n",
199 | "print(result)\n",
200 | "print(result.group(1))"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {},
206 | "source": [
207 | "### 非贪婪匹配"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 22,
213 | "metadata": {
214 | "collapsed": false
215 | },
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | "<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This is a Regex Demo'>\n",
222 | "1234567\n"
223 | ]
224 | }
225 | ],
226 | "source": [
227 | "import re\n",
228 | "\n",
229 | "content = 'Hello 1234567 World_This is a Regex Demo'\n",
230 | "result = re.match('^He.*?(\\d+).*Demo$', content)\n",
231 | "print(result)\n",
232 | "print(result.group(1))"
233 | ]
234 | },
235 | {
236 | "cell_type": "markdown",
237 | "metadata": {},
238 | "source": [
239 | "### 匹配模式"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 26,
245 | "metadata": {
246 | "collapsed": false
247 | },
248 | "outputs": [
249 | {
250 | "name": "stdout",
251 | "output_type": "stream",
252 | "text": [
253 | "1234567\n"
254 | ]
255 | }
256 | ],
257 | "source": [
258 | "import re\n",
259 | "\n",
260 | "content = '''Hello 1234567 World_This\n",
261 | "is a Regex Demo\n",
262 | "'''\n",
263 | "result = re.match('^He.*?(\\d+).*?Demo$', content, re.S)\n",
264 | "print(result.group(1))"
265 | ]
266 | },
267 | {
268 | "cell_type": "markdown",
269 | "metadata": {},
270 | "source": [
271 | "### 转义"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 28,
277 | "metadata": {
278 | "collapsed": false
279 | },
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "None\n"
286 | ]
287 | }
288 | ],
289 | "source": [
290 | "import re\n",
291 | "\n",
292 | "content = 'price is $5.00'\n",
293 | "result = re.match('price is $5.00', content)\n",
294 | "print(result)"
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 31,
300 | "metadata": {
301 | "collapsed": false
302 | },
303 | "outputs": [
304 | {
305 | "name": "stdout",
306 | "output_type": "stream",
307 | "text": [
308 | "<_sre.SRE_Match object; span=(0, 14), match='price is $5.00'>\n"
309 | ]
310 | }
311 | ],
312 | "source": [
313 | "import re\n",
314 | "\n",
315 | "content = 'price is $5.00'\n",
316 | "result = re.match('price is \\$5\\.00', content)\n",
317 | "print(result)"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "metadata": {},
323 | "source": [
324 | "总结:尽量使用泛匹配、使用括号得到匹配目标、尽量使用非贪婪模式、有换行符就用re.S"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {},
330 | "source": [
331 | "## re.search\n",
332 | "re.search 扫描整个字符串并返回第一个成功的匹配。"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 32,
338 | "metadata": {
339 | "collapsed": false
340 | },
341 | "outputs": [
342 | {
343 | "name": "stdout",
344 | "output_type": "stream",
345 | "text": [
346 | "None\n"
347 | ]
348 | }
349 | ],
350 | "source": [
351 | "import re\n",
352 | "\n",
353 | "content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'\n",
354 | "result = re.match('Hello.*?(\\d+).*?Demo', content)\n",
355 | "print(result)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 35,
361 | "metadata": {
362 | "collapsed": false
363 | },
364 | "outputs": [
365 | {
366 | "name": "stdout",
367 | "output_type": "stream",
368 | "text": [
369 | "<_sre.SRE_Match object; span=(13, 53), match='Hello 1234567 World_This is a Regex Demo'>\n",
370 | "1234567\n"
371 | ]
372 | }
373 | ],
374 | "source": [
375 | "import re\n",
376 | "\n",
377 | "content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'\n",
378 | "result = re.search('Hello.*?(\\d+).*?Demo', content)\n",
379 | "print(result)\n",
380 | "print(result.group(1))"
381 | ]
382 | },
383 | {
384 | "cell_type": "markdown",
385 | "metadata": {
386 | "collapsed": false
387 | },
388 | "source": [
389 | "总结:为匹配方便,能用search就不用match"
390 | ]
391 | },
392 | {
393 | "cell_type": "markdown",
394 | "metadata": {},
395 | "source": [
396 | "### 匹配演练"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 36,
402 | "metadata": {
403 | "collapsed": false
404 | },
405 | "outputs": [
406 | {
407 | "name": "stdout",
408 | "output_type": "stream",
409 | "text": [
410 | "齐秦 往事随风\n"
411 | ]
412 | }
413 | ],
414 | "source": [
415 | "import re\n",
416 | "\n",
417 | "html = '''\n",
418 | "
经典老歌 \n",
419 | "
\n",
420 | " 经典老歌列表\n",
421 | "
\n",
422 | "
\n",
423 | " 一路上有你 \n",
424 | " \n",
425 | " 沧海一声笑 \n",
426 | " \n",
427 | " \n",
428 | " 往事随风 \n",
429 | " \n",
430 | " 光辉岁月 \n",
431 | " 记事本 \n",
432 | " \n",
433 | " 但愿人长久 \n",
434 | " \n",
435 | " \n",
436 | "
'''\n",
437 | "result = re.search('(.*?)', html, re.S)\n",
438 | "if result:\n",
439 | " print(result.group(1), result.group(2))"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 37,
445 | "metadata": {
446 | "collapsed": false
447 | },
448 | "outputs": [
449 | {
450 | "name": "stdout",
451 | "output_type": "stream",
452 | "text": [
453 | "任贤齐 沧海一声笑\n"
454 | ]
455 | }
456 | ],
457 | "source": [
458 | "import re\n",
459 | "\n",
460 | "html = '''\n",
461 | "
经典老歌 \n",
462 | "
\n",
463 | " 经典老歌列表\n",
464 | "
\n",
465 | "
\n",
466 | " 一路上有你 \n",
467 | " \n",
468 | " 沧海一声笑 \n",
469 | " \n",
470 | " \n",
471 | " 往事随风 \n",
472 | " \n",
473 | " 光辉岁月 \n",
474 | " 记事本 \n",
475 | " \n",
476 | " 但愿人长久 \n",
477 | " \n",
478 | " \n",
479 | "
'''\n",
480 | "result = re.search('(.*?)', html, re.S)\n",
481 | "if result:\n",
482 | " print(result.group(1), result.group(2))"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": 40,
488 | "metadata": {
489 | "collapsed": false
490 | },
491 | "outputs": [
492 | {
493 | "name": "stdout",
494 | "output_type": "stream",
495 | "text": [
496 | "beyond 光辉岁月\n"
497 | ]
498 | }
499 | ],
500 | "source": [
501 | "import re\n",
502 | "\n",
503 | "html = '''\n",
504 | "
经典老歌 \n",
505 | "
\n",
506 | " 经典老歌列表\n",
507 | "
\n",
508 | "
\n",
509 | " 一路上有你 \n",
510 | " \n",
511 | " 沧海一声笑 \n",
512 | " \n",
513 | " \n",
514 | " 往事随风 \n",
515 | " \n",
516 | " 光辉岁月 \n",
517 | " 记事本 \n",
518 | " \n",
519 | " 但愿人长久 \n",
520 | " \n",
521 | " \n",
522 | "
'''\n",
523 | "result = re.search('(.*?)', html)\n",
524 | "if result:\n",
525 | " print(result.group(1), result.group(2))"
526 | ]
527 | },
528 | {
529 | "cell_type": "markdown",
530 | "metadata": {},
531 | "source": [
532 | "## re.findall\n",
533 | "搜索字符串,以列表形式返回全部能匹配的子串。"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 44,
539 | "metadata": {
540 | "collapsed": false
541 | },
542 | "outputs": [
543 | {
544 | "name": "stdout",
545 | "output_type": "stream",
546 | "text": [
547 | "[('/2.mp3', '任贤齐', '沧海一声笑'), ('/3.mp3', '齐秦', '往事随风'), ('/4.mp3', 'beyond', '光辉岁月'), ('/5.mp3', '陈慧琳', '记事本'), ('/6.mp3', '邓丽君', '但愿人长久')]\n",
548 | "\n",
549 | "('/2.mp3', '任贤齐', '沧海一声笑')\n",
550 | "/2.mp3 任贤齐 沧海一声笑\n",
551 | "('/3.mp3', '齐秦', '往事随风')\n",
552 | "/3.mp3 齐秦 往事随风\n",
553 | "('/4.mp3', 'beyond', '光辉岁月')\n",
554 | "/4.mp3 beyond 光辉岁月\n",
555 | "('/5.mp3', '陈慧琳', '记事本')\n",
556 | "/5.mp3 陈慧琳 记事本\n",
557 | "('/6.mp3', '邓丽君', '但愿人长久')\n",
558 | "/6.mp3 邓丽君 但愿人长久\n"
559 | ]
560 | }
561 | ],
562 | "source": [
563 | "import re\n",
564 | "\n",
565 | "html = '''\n",
566 | "
经典老歌 \n",
567 | "
\n",
568 | " 经典老歌列表\n",
569 | "
\n",
570 | "
\n",
571 | " 一路上有你 \n",
572 | " \n",
573 | " 沧海一声笑 \n",
574 | " \n",
575 | " \n",
576 | " 往事随风 \n",
577 | " \n",
578 | " 光辉岁月 \n",
579 | " 记事本 \n",
580 | " \n",
581 | " 但愿人长久 \n",
582 | " \n",
583 | " \n",
584 | "
'''\n",
585 | "results = re.findall('(.*?)', html, re.S)\n",
586 | "print(results)\n",
587 | "print(type(results))\n",
588 | "for result in results:\n",
589 | " print(result)\n",
590 | " print(result[0], result[1], result[2])"
591 | ]
592 | },
593 | {
594 | "cell_type": "code",
595 | "execution_count": 46,
596 | "metadata": {
597 | "collapsed": false
598 | },
599 | "outputs": [
600 | {
601 | "name": "stdout",
602 | "output_type": "stream",
603 | "text": [
604 | "[('', '一路上有你', ''), ('', '沧海一声笑', ' '), ('', '往事随风', ' '), ('', '光辉岁月', ' '), ('', '记事本', ' '), ('', '但愿人长久', ' ')]\n",
605 | "一路上有你\n",
606 | "沧海一声笑\n",
607 | "往事随风\n",
608 | "光辉岁月\n",
609 | "记事本\n",
610 | "但愿人长久\n"
611 | ]
612 | }
613 | ],
614 | "source": [
615 | "import re\n",
616 | "\n",
617 | "html = '''\n",
618 | "
经典老歌 \n",
619 | "
\n",
620 | " 经典老歌列表\n",
621 | "
\n",
622 | "
\n",
623 | " 一路上有你 \n",
624 | " \n",
625 | " 沧海一声笑 \n",
626 | " \n",
627 | " \n",
628 | " 往事随风 \n",
629 | " \n",
630 | " 光辉岁月 \n",
631 | " 记事本 \n",
632 | " \n",
633 | " 但愿人长久 \n",
634 | " \n",
635 | " \n",
636 | "
'''\n",
637 | "results = re.findall('\\s*?()?(\\w+)()?\\s*?', html, re.S)\n",
638 | "print(results)\n",
639 | "for result in results:\n",
640 | " print(result[1])"
641 | ]
642 | },
643 | {
644 | "cell_type": "markdown",
645 | "metadata": {},
646 | "source": [
647 | "## re.sub\n",
648 | "替换字符串中每一个匹配的子串后返回替换后的字符串。"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 47,
654 | "metadata": {
655 | "collapsed": false
656 | },
657 | "outputs": [
658 | {
659 | "name": "stdout",
660 | "output_type": "stream",
661 | "text": [
662 | "Extra stings Hello World_This is a Regex Demo Extra stings\n"
663 | ]
664 | }
665 | ],
666 | "source": [
667 | "import re\n",
668 | "\n",
669 | "content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'\n",
670 | "content = re.sub('\\d+', '', content)\n",
671 | "print(content)"
672 | ]
673 | },
674 | {
675 | "cell_type": "code",
676 | "execution_count": 48,
677 | "metadata": {
678 | "collapsed": false
679 | },
680 | "outputs": [
681 | {
682 | "name": "stdout",
683 | "output_type": "stream",
684 | "text": [
685 | "Extra stings Hello Replacement World_This is a Regex Demo Extra stings\n"
686 | ]
687 | }
688 | ],
689 | "source": [
690 | "import re\n",
691 | "\n",
692 | "content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'\n",
693 | "content = re.sub('\\d+', 'Replacement', content)\n",
694 | "print(content)"
695 | ]
696 | },
697 | {
698 | "cell_type": "code",
699 | "execution_count": 51,
700 | "metadata": {
701 | "collapsed": false
702 | },
703 | "outputs": [
704 | {
705 | "name": "stdout",
706 | "output_type": "stream",
707 | "text": [
708 | "Extra stings Hello 1234567 8910 World_This is a Regex Demo Extra stings\n"
709 | ]
710 | }
711 | ],
712 | "source": [
713 | "import re\n",
714 | "\n",
715 | "content = 'Extra stings Hello 1234567 World_This is a Regex Demo Extra stings'\n",
716 | "content = re.sub('(\\d+)', r'\\1 8910', content)\n",
717 | "print(content)"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": null,
723 | "metadata": {
724 | "collapsed": false
725 | },
726 | "outputs": [],
727 | "source": [
728 | "import re\n",
729 | "\n",
730 | "html = '''\n",
731 | "
经典老歌 \n",
732 | "
\n",
733 | " 经典老歌列表\n",
734 | "
\n",
735 | "
\n",
736 | " 一路上有你 \n",
737 | " \n",
738 | " 沧海一声笑 \n",
739 | " \n",
740 | " \n",
741 | " 往事随风 \n",
742 | " \n",
743 | " 光辉岁月 \n",
744 | " 记事本 \n",
745 | " \n",
746 | " 但愿人长久 \n",
747 | " \n",
748 | " \n",
749 | "
'''\n"
750 | ]
751 | },
752 | {
753 | "cell_type": "code",
754 | "execution_count": 54,
755 | "metadata": {
756 | "collapsed": false
757 | },
758 | "outputs": [
759 | {
760 | "name": "stdout",
761 | "output_type": "stream",
762 | "text": [
763 | "\n",
764 | "
经典老歌 \n",
765 | "
\n",
766 | " 经典老歌列表\n",
767 | "
\n",
768 | "
\n",
769 | " 一路上有你 \n",
770 | " \n",
771 | " 沧海一声笑\n",
772 | " \n",
773 | " \n",
774 | " 往事随风\n",
775 | " \n",
776 | " 光辉岁月 \n",
777 | " 记事本 \n",
778 | " \n",
779 | " 但愿人长久\n",
780 | " \n",
781 | " \n",
782 | "
\n",
783 | "['一路上有你', '\\n 沧海一声笑\\n ', '\\n 往事随风\\n ', '光辉岁月', '记事本', '\\n 但愿人长久\\n ']\n",
784 | "一路上有你\n",
785 | "沧海一声笑\n",
786 | "往事随风\n",
787 | "光辉岁月\n",
788 | "记事本\n",
789 | "但愿人长久\n"
790 | ]
791 | }
792 | ],
793 | "source": [
794 | "import re\n",
795 | "\n",
796 | "html = '''\n",
797 | "
经典老歌 \n",
798 | "
\n",
799 | " 经典老歌列表\n",
800 | "
\n",
801 | "
\n",
802 | " 一路上有你 \n",
803 | " \n",
804 | " 沧海一声笑 \n",
805 | " \n",
806 | " \n",
807 | " 往事随风 \n",
808 | " \n",
809 | " 光辉岁月 \n",
810 | " 记事本 \n",
811 | " \n",
812 | " 但愿人长久 \n",
813 | " \n",
814 | " \n",
815 | "
'''\n",
816 | "html = re.sub('|', '', html)\n",
817 | "print(html)\n",
818 | "results = re.findall('(.*?)', html, re.S)\n",
819 | "print(results)\n",
820 | "for result in results:\n",
821 | " print(result.strip())"
822 | ]
823 | },
824 | {
825 | "cell_type": "markdown",
826 | "metadata": {},
827 | "source": [
828 | "## re.compile\n",
829 | "将正则字符串编译成正则表达式对象"
830 | ]
831 | },
832 | {
833 | "cell_type": "code",
834 | "execution_count": null,
835 | "metadata": {
836 | "collapsed": true
837 | },
838 | "outputs": [],
839 | "source": [
840 | "# 将一个正则表达式串编译成正则对象,以便于复用该匹配模式"
841 | ]
842 | },
843 | {
844 | "cell_type": "code",
845 | "execution_count": 57,
846 | "metadata": {
847 | "collapsed": false
848 | },
849 | "outputs": [
850 | {
851 | "name": "stdout",
852 | "output_type": "stream",
853 | "text": [
854 | "<_sre.SRE_Match object; span=(0, 40), match='Hello 1234567 World_This\\nis a Regex Demo'>\n"
855 | ]
856 | }
857 | ],
858 | "source": [
859 | "import re\n",
860 | "\n",
861 | "content = '''Hello 1234567 World_This\n",
862 | "is a Regex Demo'''\n",
863 | "pattern = re.compile('Hello.*Demo', re.S)\n",
864 | "result = re.match(pattern, content)\n",
865 | "#result = re.match('Hello.*Demo', content, re.S)\n",
866 | "print(result)"
867 | ]
868 | },
869 | {
870 | "cell_type": "markdown",
871 | "metadata": {
872 | "collapsed": true
873 | },
874 | "source": [
875 | "## 实战练习"
876 | ]
877 | },
878 | {
879 | "cell_type": "code",
880 | "execution_count": 62,
881 | "metadata": {
882 | "collapsed": false,
883 | "scrolled": false
884 | },
885 | "outputs": [
886 | {
887 | "name": "stdout",
888 | "output_type": "stream",
889 | "text": [
890 | "https://book.douban.com/subject/26925834/?icn=index-editionrecommend 别走出这一步 [英]S.J.沃森 2017-1\n",
891 | "https://book.douban.com/subject/26953532/?icn=index-editionrecommend 白先勇细说红楼梦 白先勇 2017-2-1\n",
892 | "https://book.douban.com/subject/26959159/?icn=index-editionrecommend 岁月凶猛 冯仑 2017-2\n",
893 | "https://book.douban.com/subject/26949210/?icn=index-editionrecommend 如果没有今天,明天会不会有昨天? [瑞士]伊夫·博萨尔特(YvesBossart) 2017-1\n",
894 | "https://book.douban.com/subject/27001447/?icn=index-editionrecommend 人类这100年 阿夏 2017-2\n",
895 | "https://book.douban.com/subject/26864566/?icn=index-latestbook-subject 眼泪的化学 [澳]彼得·凯里 2017-2\n",
896 | "https://book.douban.com/subject/26991064/?icn=index-latestbook-subject 青年斯大林 [英]西蒙·蒙蒂菲奥里 2017-3\n",
897 | "https://book.douban.com/subject/26938056/?icn=index-latestbook-subject 带艾伯特回家 [美]霍默·希卡姆 2017-3\n",
898 | "https://book.douban.com/subject/26954757/?icn=index-latestbook-subject 乳房 [美]弗洛伦斯·威廉姆斯 2017-2\n",
899 | "https://book.douban.com/subject/26956479/?icn=index-latestbook-subject 草原动物园 马伯庸 2017-3\n",
900 | "https://book.douban.com/subject/26956018/?icn=index-latestbook-subject 贩卖音乐 [美]大卫·伊斯曼 2017-3-1\n",
901 | "https://book.douban.com/subject/26703649/?icn=index-latestbook-subject 被占的宅子 [阿根廷]胡利奥·科塔萨尔 2017-3\n",
902 | "https://book.douban.com/subject/26578402/?icn=index-latestbook-subject 信仰与观看 [法]罗兰·雷希特(RolandRecht) 2017-2-17\n",
903 | "https://book.douban.com/subject/26939171/?icn=index-latestbook-subject 妹妹的坟墓 [美]罗伯特·杜格尼(RobertDugoni) 2017-3-1\n",
904 | "https://book.douban.com/subject/26972465/?icn=index-latestbook-subject 全栈市场人 Lydia 2017-2-1\n",
905 | "https://book.douban.com/subject/26986928/?icn=index-latestbook-subject 终极X战警2 [英]马克·米勒 / [美]亚当·库伯特 2017-3-15\n",
906 | "https://book.douban.com/subject/26948144/?icn=index-latestbook-subject 格调(修订第3版) [美]保罗·福塞尔(PaulFussell) 2017-2\n",
907 | "https://book.douban.com/subject/26945792/?icn=index-latestbook-subject 原谅石 [美]洛里·斯皮尔曼 2017-2\n",
908 | "https://book.douban.com/subject/26974207/?icn=index-latestbook-subject 庇护二世闻见录 [意]皮科洛米尼 2017-2\n",
909 | "https://book.douban.com/subject/26983143/?icn=index-latestbook-subject 遇见野兔的那一年 [芬]阿托·帕西林纳 2017-3-1\n",
910 | "https://book.douban.com/subject/26976429/?icn=index-latestbook-subject 鲍勃·迪伦:诗人之歌 [法]让-多米尼克·布里埃 2017-4\n",
911 | "https://book.douban.com/subject/26962860/?icn=index-latestbook-subject 牙医谋杀案 [英]阿加莎·克里斯蒂 2017-3\n",
912 | "https://book.douban.com/subject/26923022/?icn=index-latestbook-subject 石挥谈艺录:把生命交给舞台 石挥 2017-2\n",
913 | "https://book.douban.com/subject/26897190/?icn=index-latestbook-subject 理想 [美]安·兰德 2017-2\n",
914 | "https://book.douban.com/subject/26985981/?icn=index-latestbook-subject 青苔不会消失 袁凌 2017-4\n",
915 | "https://book.douban.com/subject/26984949/?icn=index-latestbook-subject 地下铁道 [美]科尔森·怀特黑德(ColsonWhitehead) 2017-3\n",
916 | "https://book.douban.com/subject/26944012/?icn=index-latestbook-subject 极简进步史 [英]罗纳德·赖特 2017-4-1\n",
917 | "https://book.douban.com/subject/26969002/?icn=index-latestbook-subject 驻马店伤心故事集 郑在欢 2017-2\n",
918 | "https://book.douban.com/subject/26854223/?icn=index-latestbook-subject 致薇拉 [美]弗拉基米尔·纳博科夫 2017-3\n",
919 | "https://book.douban.com/subject/26841616/?icn=index-latestbook-subject 北方档案 [法]玛格丽特·尤瑟纳尔 2017-2\n",
920 | "https://book.douban.com/subject/26980391/?icn=index-latestbook-subject 食帖15:便当灵感集 林江 2017-2\n",
921 | "https://book.douban.com/subject/26958882/?icn=index-latestbook-subject 生火 [法]克里斯多夫·夏布特(ChristopheChabouté)编绘 2017-3\n",
922 | "https://book.douban.com/subject/26989163/?icn=index-latestbook-subject 文明之光(第四册) 吴军 2017-3-1\n",
923 | "https://book.douban.com/subject/26878906/?icn=index-latestbook-subject 公牛山 [美]布赖恩·帕诺威奇 2017-2\n",
924 | "https://book.douban.com/subject/26989534/?icn=index-latestbook-subject 几乎消失的偷闲艺术 [加拿大]达尼·拉费里埃 2017-4\n",
925 | "https://book.douban.com/subject/26939973/?icn=index-latestbook-subject 散步去 [日]谷口治郎 2017-3\n",
926 | "https://book.douban.com/subject/26865333/?icn=index-latestbook-subject 中国1945 [美]理查德·伯恩斯坦(RichardBernstein) 2017-3-1\n",
927 | "https://book.douban.com/subject/26989242/?icn=index-latestbook-subject 有匪2:离恨楼 Priest 2017-3\n",
928 | "https://book.douban.com/subject/26985790/?icn=index-latestbook-subject 女人、火与危险事物 [美]乔治·莱考夫 2017-3\n",
929 | "https://book.douban.com/subject/26972277/?icn=index-latestbook-subject 寻找时间的人 [爱尔兰]凯特·汤普森 2017-3\n",
930 | "https://www.douban.com/note/610758170/ 白先勇细说红楼梦【全二册】 白先勇 2017-2-1\n",
931 | "https://read.douban.com/ebook/31540864/?dcs=book-hot&dcm=douban&dct=read-subject 奇爱博士 [英]彼得·乔治 2016-8-1\n",
932 | "https://read.douban.com/ebook/31433872/?dcs=book-hot&dcm=douban&dct=read-subject 在时光中盛开的女子 李筱懿 2017-3\n",
933 | "https://read.douban.com/ebook/31178635/?dcs=book-hot&dcm=douban&dct=read-subject 如何高效记忆(原书第2版) [美]肯尼思•希格比(KennethL.Higbee) 2017-3-5\n",
934 | "https://read.douban.com/ebook/31358183/?dcs=book-hot&dcm=douban&dct=read-subject 愿无岁月可回头 回忆专用小马甲 2016-9\n",
935 | "https://read.douban.com/ebook/31341636/?dcs=book-hot&dcm=douban&dct=read-subject 走神的艺术与科学 [新西兰]迈克尔·C.科尔巴里斯 2017-3-1\n",
936 | "https://read.douban.com/ebook/27621094/?dcs=book-hot&dcm=douban&dct=read-subject 神秘的量子生命 [英]吉姆•艾尔-哈利利/约翰乔•麦克法登 2016-8\n",
937 | "https://read.douban.com/ebook/31221966/?dcs=book-hot&dcm=douban&dct=read-subject 寻找时间的人 [爱尔兰]凯特·汤普森 2017-3\n",
938 | "https://read.douban.com/ebook/31481323/?dcs=book-hot&dcm=douban&dct=read-subject 山之四季 [日]高村光太郎 2017-1\n",
939 | "https://read.douban.com/ebook/31154855/?dcs=book-hot&dcm=douban&dct=read-subject 东北游记 [美]迈克尔·麦尔 2017-1\n"
940 | ]
941 | }
942 | ],
943 | "source": [
944 | "import requests\n",
945 | "import re\n",
946 | "content = requests.get('https://book.douban.com/').text\n",
947 | "pattern = re.compile('(.*?).*?year\">(.*?).*?', re.S)\n",
948 | "results = re.findall(pattern, content)\n",
949 | "for result in results:\n",
950 | " url, name, author, date = result\n",
951 | " author = re.sub('\\s', '', author)\n",
952 | " date = re.sub('\\s', '', date)\n",
953 | " print(url, name, author, date)"
954 | ]
955 | }
956 | ],
957 | "metadata": {
958 | "kernelspec": {
959 | "display_name": "Python 3",
960 | "language": "python",
961 | "name": "python3"
962 | },
963 | "language_info": {
964 | "codemirror_mode": {
965 | "name": "ipython",
966 | "version": 3
967 | },
968 | "file_extension": ".py",
969 | "mimetype": "text/x-python",
970 | "name": "python",
971 | "nbconvert_exporter": "python",
972 | "pygments_lexer": "ipython3",
973 | "version": "3.5.1"
974 | }
975 | },
976 | "nbformat": 4,
977 | "nbformat_minor": 0
978 | }
979 |
--------------------------------------------------------------------------------
/requests.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# requests"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "## 实例引入"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "collapsed": false
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import requests\n",
26 | "\n",
27 | "response = requests.get('https://www.baidu.com/')\n",
28 | "print(type(response))\n",
29 | "print(response.status_code)\n",
30 | "print(type(response.text))\n",
31 | "print(response.text)\n",
32 | "print(response.cookies)"
33 | ]
34 | },
35 | {
36 | "cell_type": "markdown",
37 | "metadata": {},
38 | "source": [
39 | "## 各种请求方式"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": null,
45 | "metadata": {
46 | "collapsed": true
47 | },
48 | "outputs": [],
49 | "source": [
50 | "import requests\n",
51 | "requests.post('http://httpbin.org/post')\n",
52 | "requests.put('http://httpbin.org/put')\n",
53 | "requests.delete('http://httpbin.org/delete')\n",
54 | "requests.head('http://httpbin.org/get')\n",
55 | "requests.options('http://httpbin.org/get')"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "# 请求"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## 基本GET请求"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "### 基本写法"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": null,
82 | "metadata": {
83 | "collapsed": false
84 | },
85 | "outputs": [],
86 | "source": [
87 | "import requests\n",
88 | "\n",
89 | "response = requests.get('http://httpbin.org/get')\n",
90 | "print(response.text)"
91 | ]
92 | },
93 | {
94 | "cell_type": "markdown",
95 | "metadata": {},
96 | "source": [
97 | "### 带参数GET请求"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": null,
103 | "metadata": {
104 | "collapsed": false
105 | },
106 | "outputs": [],
107 | "source": [
108 | "import requests\n",
109 | "response = requests.get(\"http://httpbin.org/get?name=germey&age=22\")\n",
110 | "print(response.text)"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": null,
116 | "metadata": {
117 | "collapsed": false,
118 | "scrolled": true
119 | },
120 | "outputs": [],
121 | "source": [
122 | "import requests\n",
123 | "\n",
124 | "data = {\n",
125 | " 'name': 'germey',\n",
126 | " 'age': 22\n",
127 | "}\n",
128 | "response = requests.get(\"http://httpbin.org/get\", params=data)\n",
129 | "print(response.text)"
130 | ]
131 | },
132 | {
133 | "cell_type": "markdown",
134 | "metadata": {},
135 | "source": [
136 | "### 解析json"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": null,
142 | "metadata": {
143 | "collapsed": false
144 | },
145 | "outputs": [],
146 | "source": [
147 | "import requests\n",
148 | "import json\n",
149 | "\n",
150 | "response = requests.get(\"http://httpbin.org/get\")\n",
151 | "print(type(response.text))\n",
152 | "print(response.json())\n",
153 | "print(json.loads(response.text))\n",
154 | "print(type(response.json()))"
155 | ]
156 | },
157 | {
158 | "cell_type": "markdown",
159 | "metadata": {},
160 | "source": [
161 | "### 获取二进制数据"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {
168 | "collapsed": false,
169 | "scrolled": true
170 | },
171 | "outputs": [],
172 | "source": [
173 | "import requests\n",
174 | "\n",
175 | "response = requests.get(\"https://github.com/favicon.ico\")\n",
176 | "print(type(response.text), type(response.content))\n",
177 | "print(response.text)\n",
178 | "print(response.content)"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": null,
184 | "metadata": {
185 | "collapsed": true
186 | },
187 | "outputs": [],
188 | "source": [
189 | "import requests\n",
190 | "\n",
191 | "response = requests.get(\"https://github.com/favicon.ico\")\n",
192 | "response.raise_for_status()  # fail loudly rather than saving an error page as the icon\n",
193 | "with open('favicon.ico', 'wb') as f:\n",
194 | "    f.write(response.content)  # the with-block closes the file; no explicit close() needed"
195 | ]
196 | },
197 | {
198 | "cell_type": "markdown",
199 | "metadata": {},
200 | "source": [
201 | "### 添加headers"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "metadata": {
208 | "collapsed": false
209 | },
210 | "outputs": [],
211 | "source": [
212 | "import requests\n",
213 | "\n",
214 | "response = requests.get(\"https://www.zhihu.com/explore\")\n",
215 | "print(response.text)"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": null,
221 | "metadata": {
222 | "collapsed": false,
223 | "scrolled": true
224 | },
225 | "outputs": [],
226 | "source": [
227 | "import requests\n",
228 | "\n",
229 | "headers = {\n",
230 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'\n",
231 | "}\n",
232 | "response = requests.get(\"https://www.zhihu.com/explore\", headers=headers)\n",
233 | "print(response.text)"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "## 基本POST请求"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": null,
246 | "metadata": {
247 | "collapsed": false
248 | },
249 | "outputs": [],
250 | "source": [
251 | "import requests\n",
252 | "\n",
253 | "data = {'name': 'germey', 'age': '22'}\n",
254 | "response = requests.post(\"http://httpbin.org/post\", data=data)\n",
255 | "print(response.text)"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": null,
261 | "metadata": {
262 | "collapsed": false
263 | },
264 | "outputs": [],
265 | "source": [
266 | "import requests\n",
267 | "\n",
268 | "data = {'name': 'germey', 'age': '22'}\n",
269 | "headers = {\n",
270 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'\n",
271 | "}\n",
272 | "response = requests.post(\"http://httpbin.org/post\", data=data, headers=headers)\n",
273 | "print(response.json())"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "# 响应"
281 | ]
282 | },
283 | {
284 | "cell_type": "markdown",
285 | "metadata": {},
286 | "source": [
287 | "## response属性"
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "execution_count": null,
293 | "metadata": {
294 | "collapsed": false
295 | },
296 | "outputs": [],
297 | "source": [
298 | "import requests\n",
299 | "\n",
300 | "response = requests.get('http://www.jianshu.com')\n",
301 | "print(type(response.status_code), response.status_code)\n",
302 | "print(type(response.headers), response.headers)\n",
303 | "print(type(response.cookies), response.cookies)\n",
304 | "print(type(response.url), response.url)\n",
305 | "print(type(response.history), response.history)"
306 | ]
307 | },
308 | {
309 | "cell_type": "markdown",
310 | "metadata": {},
311 | "source": [
312 | "## 状态码判断"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": null,
318 | "metadata": {
319 | "collapsed": false
320 | },
321 | "outputs": [],
322 | "source": [
323 | "import requests\n",
324 | "# Report the status instead of calling exit(), which would shut down the notebook kernel.\n",
325 | "response = requests.get('http://www.jianshu.com/hello.html')\n",
326 | "print('404 Not Found' if response.status_code == requests.codes.not_found else response.status_code)"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {
333 | "collapsed": false
334 | },
335 | "outputs": [],
336 | "source": [
337 | "import requests\n",
338 | "# Report the status instead of calling exit(), which would shut down the notebook kernel.\n",
339 | "response = requests.get('http://www.jianshu.com')\n",
340 | "print('Request Successfully' if response.status_code == 200 else response.status_code)"
341 | ]
342 | },
343 | {
344 | "cell_type": "code",
345 | "execution_count": null,
346 | "metadata": {
347 | "collapsed": true
348 | },
349 | "outputs": [],
350 | "source": [
351 | "# Status code -> alias names usable as requests.codes.<alias> (e.g. requests.codes.not_found is 404).\n",
352 | "status_code_aliases = {\n",
353 | "    100: ('continue',),\n",
354 | "    101: ('switching_protocols',),\n",
355 | "    102: ('processing',),\n",
356 | "    103: ('checkpoint',),\n",
357 | "    122: ('uri_too_long', 'request_uri_too_long'),\n",
358 | "    200: ('ok', 'okay', 'all_ok', 'all_okay', 'all_good', '\\\\o/', '✓'),\n",
359 | "    201: ('created',),\n",
360 | "    202: ('accepted',),\n",
361 | "    203: ('non_authoritative_info', 'non_authoritative_information'),\n",
362 | "    204: ('no_content',),\n",
363 | "    205: ('reset_content', 'reset'),\n",
364 | "    206: ('partial_content', 'partial'),\n",
365 | "    207: ('multi_status', 'multiple_status', 'multi_stati', 'multiple_stati'),\n",
366 | "    208: ('already_reported',),\n",
367 | "    226: ('im_used',),\n",
368 | "    # Redirection.\n",
369 | "    300: ('multiple_choices',),\n",
370 | "    301: ('moved_permanently', 'moved', '\\\\o-'),\n",
371 | "    302: ('found',),\n",
372 | "    303: ('see_other', 'other'),\n",
373 | "    304: ('not_modified',),\n",
374 | "    305: ('use_proxy',),\n",
375 | "    306: ('switch_proxy',),\n",
376 | "    307: ('temporary_redirect', 'temporary_moved', 'temporary'),\n",
377 | "    308: ('permanent_redirect',\n",
378 | "          'resume_incomplete', 'resume',),  # These 2 to be removed in 3.0\n",
379 | "    # Client Error.\n",
380 | "    400: ('bad_request', 'bad'),\n",
381 | "    401: ('unauthorized',),\n",
382 | "    402: ('payment_required', 'payment'),\n",
383 | "    403: ('forbidden',),\n",
384 | "    404: ('not_found', '-o-'),\n",
385 | "    405: ('method_not_allowed', 'not_allowed'),\n",
386 | "    406: ('not_acceptable',),\n",
387 | "    407: ('proxy_authentication_required', 'proxy_auth', 'proxy_authentication'),\n",
388 | "    408: ('request_timeout', 'timeout'),\n",
389 | "    409: ('conflict',),\n",
390 | "    410: ('gone',),\n",
391 | "    411: ('length_required',),\n",
392 | "    412: ('precondition_failed', 'precondition'),\n",
393 | "    413: ('request_entity_too_large',),\n",
394 | "    414: ('request_uri_too_large',),\n",
395 | "    415: ('unsupported_media_type', 'unsupported_media', 'media_type'),\n",
396 | "    416: ('requested_range_not_satisfiable', 'requested_range', 'range_not_satisfiable'),\n",
397 | "    417: ('expectation_failed',),\n",
398 | "    418: ('im_a_teapot', 'teapot', 'i_am_a_teapot'),\n",
399 | "    421: ('misdirected_request',),\n",
400 | "    422: ('unprocessable_entity', 'unprocessable'),\n",
401 | "    423: ('locked',),\n",
402 | "    424: ('failed_dependency', 'dependency'),\n",
403 | "    425: ('unordered_collection', 'unordered'),\n",
404 | "    426: ('upgrade_required', 'upgrade'),\n",
405 | "    428: ('precondition_required', 'precondition'),\n",
406 | "    429: ('too_many_requests', 'too_many'),\n",
407 | "    431: ('header_fields_too_large', 'fields_too_large'),\n",
408 | "    444: ('no_response', 'none'),\n",
409 | "    449: ('retry_with', 'retry'),\n",
410 | "    450: ('blocked_by_windows_parental_controls', 'parental_controls'),\n",
411 | "    451: ('unavailable_for_legal_reasons', 'legal_reasons'),\n",
412 | "    499: ('client_closed_request',),\n",
413 | "    # Server Error.\n",
414 | "    500: ('internal_server_error', 'server_error', '/o\\\\', '✗'),\n",
415 | "    501: ('not_implemented',),\n",
416 | "    502: ('bad_gateway',),\n",
417 | "    503: ('service_unavailable', 'unavailable'),\n",
418 | "    504: ('gateway_timeout',),\n",
419 | "    505: ('http_version_not_supported', 'http_version'),\n",
420 | "    506: ('variant_also_negotiates',),\n",
421 | "    507: ('insufficient_storage',),\n",
422 | "    509: ('bandwidth_limit_exceeded', 'bandwidth'),\n",
423 | "    510: ('not_extended',),\n",
424 | "    511: ('network_authentication_required', 'network_auth', 'network_authentication'),\n",
425 | "}"
426 | ]
427 | },
428 | {
429 | "cell_type": "markdown",
430 | "metadata": {},
431 | "source": [
432 | "# 高级操作"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "## 文件上传"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {
446 | "collapsed": false
447 | },
448 | "outputs": [],
449 | "source": [
450 | "import requests\n",
451 | "\n",
452 | "with open('favicon.ico', 'rb') as icon:  # close the file handle once the upload has been sent\n",
453 | "    response = requests.post(\"http://httpbin.org/post\", files={'file': icon})\n",
454 | "print(response.text)"
455 | ]
456 | },
457 | {
458 | "cell_type": "markdown",
459 | "metadata": {},
460 | "source": [
461 | "## 获取cookie"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": null,
467 | "metadata": {
468 | "collapsed": false,
469 | "scrolled": true
470 | },
471 | "outputs": [],
472 | "source": [
473 | "import requests\n",
474 | "\n",
475 | "response = requests.get(\"https://www.baidu.com\")\n",
476 | "print(response.cookies)\n",
477 | "for key, value in response.cookies.items():\n",
478 | " print(key + '=' + value)"
479 | ]
480 | },
481 | {
482 | "cell_type": "markdown",
483 | "metadata": {},
484 | "source": [
485 | "## 会话维持"
486 | ]
487 | },
488 | {
489 | "cell_type": "markdown",
490 | "metadata": {},
491 | "source": [
492 | "模拟登录"
493 | ]
494 | },
495 | {
496 | "cell_type": "code",
497 | "execution_count": null,
498 | "metadata": {
499 | "collapsed": false
500 | },
501 | "outputs": [],
502 | "source": [
503 | "import requests\n",
504 | "\n",
505 | "requests.get('http://httpbin.org/cookies/set/number/123456789')\n",
506 | "response = requests.get('http://httpbin.org/cookies')\n",
507 | "print(response.text)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": null,
513 | "metadata": {
514 | "collapsed": false
515 | },
516 | "outputs": [],
517 | "source": [
518 | "import requests\n",
519 | "\n",
520 | "with requests.Session() as s:  # one session carries cookies across requests and is closed on exit\n",
521 | "    s.get('http://httpbin.org/cookies/set/number/123456789')\n",
522 | "    response = s.get('http://httpbin.org/cookies')\n",
523 | "print(response.text)"
524 | ]
525 | },
526 | {
527 | "cell_type": "markdown",
528 | "metadata": {},
529 | "source": [
530 | "## 证书验证"
531 | ]
532 | },
533 | {
534 | "cell_type": "code",
535 | "execution_count": null,
536 | "metadata": {
537 | "collapsed": false
538 | },
539 | "outputs": [],
540 | "source": [
541 | "import requests\n",
542 | "\n",
543 | "response = requests.get('https://www.12306.cn')\n",
544 | "print(response.status_code)"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": null,
550 | "metadata": {
551 | "collapsed": false
552 | },
553 | "outputs": [],
554 | "source": [
555 | "import requests\n",
556 | "from requests.packages import urllib3\n",
557 | "urllib3.disable_warnings()\n",
558 | "response = requests.get('https://www.12306.cn', verify=False)\n",
559 | "print(response.status_code)"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": null,
565 | "metadata": {
566 | "collapsed": false
567 | },
568 | "outputs": [],
569 | "source": [
570 | "import requests\n",
571 | "\n",
572 | "response = requests.get('https://www.12306.cn', cert=('/path/server.crt', '/path/key'))\n",
573 | "print(response.status_code)"
574 | ]
575 | },
576 | {
577 | "cell_type": "markdown",
578 | "metadata": {},
579 | "source": [
580 | "## 代理设置"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": null,
586 | "metadata": {
587 | "collapsed": false
588 | },
589 | "outputs": [],
590 | "source": [
591 | "import requests\n",
592 | "\n",
593 | "proxies = {\n",
594 | " \"http\": \"http://127.0.0.1:9743\",\n",
595 | " \"https\": \"https://127.0.0.1:9743\",\n",
596 | "}\n",
597 | "\n",
598 | "response = requests.get(\"https://www.taobao.com\", proxies=proxies)\n",
599 | "print(response.status_code)"
600 | ]
601 | },
602 | {
603 | "cell_type": "code",
604 | "execution_count": null,
605 | "metadata": {
606 | "collapsed": false
607 | },
608 | "outputs": [],
609 | "source": [
610 | "import requests\n",
611 | "proxies = {\n",
612 | "    \"http\": \"http://user:password@127.0.0.1:9743/\",\n",
613 | "    \"https\": \"http://user:password@127.0.0.1:9743/\",  # the proxy is chosen by the target URL's scheme\n",
614 | "}\n",
615 | "response = requests.get(\"https://www.taobao.com\", proxies=proxies)\n",
616 | "print(response.status_code)"
617 | ]
618 | },
619 | {
620 | "cell_type": "code",
621 | "execution_count": null,
622 | "metadata": {
623 | "collapsed": true
624 | },
625 | "outputs": [],
626 | "source": [
627 | "%pip install \"requests[socks]\""
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": null,
633 | "metadata": {
634 | "collapsed": false
635 | },
636 | "outputs": [],
637 | "source": [
638 | "import requests\n",
639 | "\n",
640 | "proxies = {\n",
641 | " 'http': 'socks5://127.0.0.1:9742',\n",
642 | " 'https': 'socks5://127.0.0.1:9742'\n",
643 | "}\n",
644 | "response = requests.get(\"https://www.taobao.com\", proxies=proxies)\n",
645 | "print(response.status_code)"
646 | ]
647 | },
648 | {
649 | "cell_type": "markdown",
650 | "metadata": {},
651 | "source": [
652 | "## 超时设置"
653 | ]
654 | },
655 | {
656 | "cell_type": "code",
657 | "execution_count": null,
658 | "metadata": {
659 | "collapsed": false
660 | },
661 | "outputs": [],
662 | "source": [
663 | "import requests\n",
664 | "from requests.exceptions import Timeout\n",
665 | "try:\n",
666 | "    response = requests.get(\"http://httpbin.org/get\", timeout=0.5)\n",
667 | "    print(response.status_code)\n",
668 | "except Timeout:  # base class of both ConnectTimeout and ReadTimeout\n",
669 | "    print('Timeout')"
670 | ]
671 | },
672 | {
673 | "cell_type": "markdown",
674 | "metadata": {},
675 | "source": [
676 | "## 认证设置"
677 | ]
678 | },
679 | {
680 | "cell_type": "code",
681 | "execution_count": null,
682 | "metadata": {
683 | "collapsed": false
684 | },
685 | "outputs": [],
686 | "source": [
687 | "import requests\n",
688 | "from requests.auth import HTTPBasicAuth\n",
689 | "\n",
690 | "r = requests.get('http://120.27.34.24:9001', auth=HTTPBasicAuth('user', '123'))\n",
691 | "print(r.status_code)"
692 | ]
693 | },
694 | {
695 | "cell_type": "code",
696 | "execution_count": null,
697 | "metadata": {
698 | "collapsed": false
699 | },
700 | "outputs": [],
701 | "source": [
702 | "import requests\n",
703 | "\n",
704 | "r = requests.get('http://120.27.34.24:9001', auth=('user', '123'))\n",
705 | "print(r.status_code)"
706 | ]
707 | },
708 | {
709 | "cell_type": "markdown",
710 | "metadata": {},
711 | "source": [
712 | "## 异常处理"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 45,
718 | "metadata": {
719 | "collapsed": false
720 | },
721 | "outputs": [
722 | {
723 | "name": "stdout",
724 | "output_type": "stream",
725 | "text": [
726 | "Connection error\n"
727 | ]
728 | }
729 | ],
730 | "source": [
731 | "import requests\n",
732 | "from requests.exceptions import ReadTimeout, ConnectionError, RequestException\n",
733 | "try:\n",
734 | " response = requests.get(\"http://httpbin.org/get\", timeout = 0.5)\n",
735 | " print(response.status_code)\n",
736 | "except ReadTimeout:\n",
737 | " print('Timeout')\n",
738 | "except ConnectionError:\n",
739 | " print('Connection error')\n",
740 | "except RequestException:\n",
741 | " print('Error')"
742 | ]
743 | }
744 | ],
745 | "metadata": {
746 | "kernelspec": {
747 | "display_name": "Python 3",
748 | "language": "python",
749 | "name": "python3"
750 | },
751 | "language_info": {
752 | "codemirror_mode": {
753 | "name": "ipython",
754 | "version": 3
755 | },
756 | "file_extension": ".py",
757 | "mimetype": "text/x-python",
758 | "name": "python",
759 | "nbconvert_exporter": "python",
760 | "pygments_lexer": "ipython3",
761 | "version": "3.5.1"
762 | }
763 | },
764 | "nbformat": 4,
765 | "nbformat_minor": 0
766 | }
767 |
--------------------------------------------------------------------------------