├── .gitignore
├── .idea
│   ├── .name
│   ├── Spider.iml
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── baliPOI.txt
├── bash.exe.stackdump
├── ceshimysql.py
├── chengshijilu.txt
├── gufensoso
│   ├── __init__.py
│   └── yinyong.py
├── lundunPOI.txt
├── poispider
│   ├── __init__.py
│   ├── citylist.py
│   ├── dazhongdianping.py
│   ├── mafengwo.py
│   ├── proxyIP.py
│   ├── qyer.py
│   ├── qyer_one_city.py
│   └── tripadvisor.py
├── test.py
├── tst.py
├── webofscience.py
└── zhuaqu
    └── 伦敦
        └── lundun.txt

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | zhuaqu
3 | test.py
4 | tst.py

--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | Spider

--------------------------------------------------------------------------------
/.idea/Spider.iml:
--------------------------------------------------------------------------------
(JetBrains IDE module file; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
(JetBrains IDE encoding settings; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(JetBrains IDE project settings; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
(JetBrains IDE module list; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(JetBrains IDE VCS settings; XML markup not preserved in this dump)

--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
(JetBrains IDE workspace state; XML markup not preserved in this dump)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spider

--------------------------------------------------------------------------------
/baliPOI.txt:
--------------------------------------------------------------------------------
1 | 巴黎,景点,卢浮宫, Louvre Museum
2 | 巴黎,景点,埃菲尔铁塔, Eiffel Tower
3 | 巴黎,景点,巴黎圣母院,巴黎圣母院
4 | 巴黎,景点,凯旋门,凯旋门
5 | 巴黎,景点,塞纳河, Seine River
6 | 巴黎,购物,香榭丽舍大街, Champs Elysees
7 | 巴黎,娱乐,塞纳河游船,塞纳河游船
8 | 巴黎,景点,法国圣心大教堂,法国圣心大教堂
9 | 巴黎,购物,老佛爷百货,老佛爷百货
10 | 巴黎,景点,蒙马特高地, Montmartre
11 | 巴黎,景点,奥赛博物馆,奥赛博物馆
12 | 巴黎,景点,协和广场,协和广场
13 | 巴黎,景点,蓬皮杜国家艺术文化中心, The Pompidou Centre
14 | 巴黎,景点,亚历山大三世桥,亚历山大三世桥
15 | 巴黎,景点,艺术桥,艺术桥
16 | 巴黎,美食,Ladurée,Ladurée
17 | 巴黎,购物,巴黎春天百货,巴黎春天百货
18 | 巴黎,娱乐,巴黎歌剧院,巴黎歌剧院
19 | 巴黎,景点,杜乐丽花园, Tuileries Garden
20 | 巴黎,景点,巴黎迪士尼乐园, Paris Disneyland
21 | 巴黎,娱乐,红磨坊,红磨坊
22 | 巴黎,景点,卢森堡公园,卢森堡公园
23 | 巴黎,景点,蒙帕纳斯大厦, Tour Montparnasse
24 | 巴黎,景点,爱墙, The Love Of Wall
25 | 巴黎,景点,橘园美术馆,橘园美术馆
26 | 巴黎,景点,先贤祠,先贤祠
27 | 巴黎,景点,巴黎市政厅, City Hall
28 | 巴黎,景点,巴黎荣军院,巴黎荣军院
29 | 巴黎,美食,Angelina,Angelina
30 | 巴黎,购物,莎士比亚书店, Shakespeare&company
31 | 巴黎,交通,戴高乐机场,戴高乐机场
32 | 巴黎,交通,巴黎里昂火车站, Gare De Lyon
33 | 巴黎,景点,巴黎拉丁区, Latin Quarter
34 | 巴黎,景点,罗丹美术馆,罗丹美术馆
35 | 巴黎,美食,Amorino,Amorino
36 | 巴黎,交通,巴黎北火车站, Gare Du Nord
37 | 巴黎,景点,新桥,新桥
38 | 巴黎,景点,巴黎战神广场,巴黎战神广场
39 | 巴黎,景点,圣礼拜堂,圣礼拜堂
40 | 巴黎,景点,巴黎夏悠宫,巴黎夏悠宫
41 | 巴黎,美食, Leon De Bruxelles, Leon De Bruxelles
42 | 巴黎,景点,卡鲁索凯旋门,卡鲁索凯旋门
43 | 巴黎,景点,玛德莲教堂,玛德莲教堂
44 | 巴黎,美食,Paul,Paul
45 | 巴黎,购物,蒙田大道, Avenue Montaigne
46 | 巴黎,景点,拉雪兹公墓,拉雪兹公墓
47 | 巴黎,景点,巴黎大皇宫美术馆,巴黎大皇宫美术馆
48 | 巴黎,景点,旺多姆广场,旺多姆广场
49 | 巴黎,景点,西岱岛, Cite Island
50 | 巴黎,购物,小丘广场,小丘广场
51 | 巴黎,景点,玛黑区,玛黑区
52 |
巴黎,景点,利奥波德,利奥波德 53 | 巴黎,景点,孚日广场,孚日广场 54 | 巴黎,景点,巴黎古监狱,巴黎古监狱 55 | 巴黎,美食, Bouillon Chartier, Bouillon Chartier 56 | 巴黎,景点,人骨墓穴,人骨墓穴 57 | 巴黎,景点,巴黎军事博物馆,巴黎军事博物馆 58 | 巴黎,美食,Berthillon,Berthillon 59 | 巴黎,景点,圣路易岛,圣路易岛 60 | 巴黎,娱乐,双叟咖啡馆,双叟咖啡馆 61 | 巴黎,景点,巴黎小皇宫博物馆,巴黎小皇宫博物馆 62 | 巴黎,娱乐,双风车咖啡,双风车咖啡 63 | 巴黎,景点,卢克索方尖碑, Luxor Obelisk 64 | 巴黎,美食,金蜗牛, L'escargot Montorgueil 65 | 巴黎,交通,凡尔赛宫地铁站, Versailles 66 | 巴黎,美食, Chez Clement(champs, Chez Clement(champs 67 | 巴黎,美食,Hippopotamus,Hippopotamus 68 | 巴黎,景点,雨果故居,雨果故居 69 | 巴黎,购物,巴士底市集, Bastille Market 70 | 巴黎,景点,圣日耳曼德佩教堂,圣日耳曼德佩教堂 71 | 巴黎,购物,Lv(总店), Maison Louis Vuitton 72 | 巴黎,交通,歌剧院地铁站, Opéra地铁站 73 | 巴黎,景点,圣叙尔皮斯教堂,圣叙尔皮斯教堂 74 | 巴黎,美食,L茅on, de Bruxelles Champs Elys茅es 75 | 巴黎,景点,下水道博物馆,下水道博物馆 76 | 巴黎,景点,法国国家图书馆,法国国家图书馆 77 | 巴黎,美食,Epicure,Epicure 78 | 巴黎,交通,荣军院地铁站, Invalides地铁站 79 | 巴黎,景点,皇宫花园, Palais 80 | 巴黎,景点,吉美国立亚洲艺术博物馆, 集美博物馆 81 | 巴黎,美食, Pierre Herm茅 Cambon, Pierre Herm茅 Cambon 82 | 巴黎,景点,波旁宫,波旁宫 83 | 巴黎,美食,Op茅ra, Chez Clement 84 | 巴黎,景点,巴士底广场,巴士底广场 85 | 巴黎,景点,圣艾蒂安,圣艾蒂安 86 | 巴黎,美食,Ladur茅e, Bonaparte 87 | 巴黎,美食, Pierre Herme, Pierre Herme 88 | 巴黎,购物,跳蚤市场, Marche Aux Puces De St Ouen 89 | 巴黎,美食,双风车咖啡馆, Cafe Des Deux Moulins 90 | 巴黎,景点,法国国立中世纪博物馆,法国国立中世纪博物馆 91 | 巴黎,美食, Cafe De Flore, Cafe De Flore 92 | 巴黎,景点,德比尔哈克姆桥,德比尔哈克姆桥 93 | 巴黎,美食, Chez Clement(saint, Chez Clement(saint 94 | 巴黎,购物,河谷购物村,河谷购物村 95 | 巴黎,美食, Chez Papa, Chez Papa 96 | 巴黎,娱乐,和平咖啡馆,和平咖啡馆 97 | 巴黎,购物, Louis Vuitton, Louis Vuitton 98 | 巴黎,景点,爱丽舍宫,爱丽舍宫 99 | 巴黎,景点,毕加索美术馆,毕加索美术馆 100 | 巴黎,美食, Le Cinq, Le Cinq 101 | 巴黎,景点,法国国家自然历史博物馆,法国国家自然历史博物馆 102 | 巴黎,景点,圣保罗,圣保罗 103 | 巴黎,景点,奥斯曼大街,奥斯曼大街 104 | 巴黎,美食, Pierre Herm茅 Bonaparte, Pierre Herm茅 Bonaparte 105 | 巴黎,美食,拉杜雷, Ladure 106 | 巴黎,交通,巴黎火车东站, Gare De L’est 107 | 巴黎,美食,Ledoyen,Ledoyen 108 | 巴黎,景点,自由女神像,自由女神像 109 | 巴黎,景点,蒙马特公墓,蒙马特公墓 110 | 巴黎,交通, Parc De Sceaux地铁站, Parc De Sceaux地铁站 111 | 巴黎,景点,盖·布朗利博物馆, Musee Du Quai Branly 112 | 巴黎,美食, Cafe Constant, Cafe Constant 113 | 巴黎,景点,巴黎,巴黎 114 | 巴黎,交通,巴黎奥里机场, Orly International Airport 115 | 巴黎,景点,圣日耳曼大道, Boulevard St. Germain 116 | 巴黎,娱乐,罗兰加洛斯球场, Stade Roland Garros 117 | 巴黎,娱乐,夏约宫国家剧院, Theatre National De Chaillot 118 | 巴黎,美食, Taverne Karlsbrau, Taverne Karlsbrau 119 | 巴黎,景点,皇家桥, Pont Royal 120 | 巴黎,美食, Le Meurice, Le Meurice 121 | 巴黎,美食,Yoom,Yoom 122 | 巴黎,交通,巴黎蒙帕那斯火车站, Gare Montparnasse 123 | 巴黎,购物,OUTLETS打折村,OUTLETS打折村 124 | 巴黎,娱乐, Le Procope, Le Procope 125 | 巴黎,美食, Cafe Les Deux Magots, Cafe Les Deux Magots 126 | 巴黎,娱乐,丽都,丽都 127 | 巴黎,美食,蓝色列车餐厅, Le Train Bleu 128 | 巴黎,美食, Le Hide, Le Hide 129 | 巴黎,购物,玻玛榭百货, Le Bon Marche 130 | 巴黎,美食, Au Pied De Cochon, Au Pied De Cochon 131 | 巴黎,景点,法兰西学院,法兰西学院 132 | 巴黎,景点,巴黎动物园,巴黎动物园 133 | 巴黎,美食,儒勒凡尔纳餐厅, Le Jules Verne 134 | 巴黎,景点,战略博物馆,战略博物馆 135 | 巴黎,景点,巴黎国家海军博物馆,巴黎国家海军博物馆 136 | 巴黎,交通,凡尔赛门地铁站, Porte De Versailles地铁站 137 | 巴黎,景点,圣路易桥,圣路易桥 138 | 巴黎,景点,卢森堡宫, Palais Du Luxembourg 139 | 巴黎,景点,居斯塔,居斯塔 140 | 巴黎,景点,情趣博物馆, Museum Of Eroticism 141 | 巴黎,美食,Quick,Quick 142 | 巴黎,美食, Au Bougnat, Au Bougnat 143 | 巴黎,娱乐,疯马秀, Crazy Horse 144 | 巴黎,美食,诸葛烤鱼,诸葛烤鱼 145 | 巴黎,美食, Brioche Doree(香榭丽舍大街店), Brioche Doree(香榭丽舍大街店) 146 | 巴黎,美食, La Coupole, La Coupole 147 | 巴黎,景点,圣厄斯塔什教堂,圣厄斯塔什教堂 148 | 巴黎,美食, Cafe De La Paix, Cafe De La Paix 149 | 巴黎,美食, La Cinq, La Cinq 150 | 巴黎,美食, Alain Ducasse Au Plaza Athenee, Alain Ducasse Au Plaza Athenee 151 | 巴黎,美食,青木定治(Pérignon店), Sadaharu Aoki(pérignon店) 152 | 巴黎,交通,巴黎圣拉扎尔火车站, Gare St. 
Lazare 153 | 巴黎,美食, Le Bar A Huitres(saint, Le Bar A Huitres(saint 154 | 巴黎,美食,银塔餐厅,银塔餐厅 155 | 巴黎,美食,L'as, Du Fallafel 156 | 巴黎,景点,圣洛克教堂,圣洛克教堂 157 | 巴黎,交通,Châtelet, – Les Halles地铁站 158 | 巴黎,美食, Pizza Pino, Pizza Pino 159 | 巴黎,美食, Le Zimmer, Le Zimmer 160 | 巴黎,美食, Le Pre Catelan, Le Pre Catelan 161 | 巴黎,景点,圣马丁运河, Canal St 162 | 巴黎,美食,Fauchon,Fauchon 163 | 巴黎,购物,香奈儿, Chanel 164 | 巴黎,景点,圣米歇尔广场, La Place Saint 165 | 巴黎,交通,协和广场地铁站, Concorde地铁站 166 | 巴黎,购物,塞纳河书摊, Les Bouquinistes 167 | 巴黎,景点,圣赛芙韩教堂,圣赛芙韩教堂 168 | 巴黎,景点,巴黎天主圣三教堂,巴黎天主圣三教堂 169 | 巴黎,购物, Monoprix Paris France, Monoprix Paris France 170 | 巴黎,美食,野田岩, Nodaïwa 171 | 巴黎,美食, La Duree, La Duree 172 | 巴黎,景点,考古地下室, Archeological Crypt Of The Parvis Of Not... 173 | 巴黎,景点,圣婴喷泉,圣婴喷泉 174 | 巴黎,美食, Alain Ducasse Paris, Alain Ducasse Paris 175 | 巴黎,交通,拉德芳斯地铁站, La Défense地铁站 176 | 巴黎,交通, Val D'europe地铁站, Val D'europe地铁站 177 | 巴黎,美食,L'avant, Comptoir 178 | 巴黎,交通,杜伊乐丽地铁站, Tuileries地铁站 179 | 巴黎,交通,市政厅地铁站, Hôtel De Ville地铁站 180 | 巴黎,美食,L'escargot,L'escargot 181 | 巴黎,美食,北京食堂,北京食堂 182 | 巴黎,景点,金字塔广场, Place Des Pyramides 183 | 巴黎,美食, Le Relais Louis Xiii, Le Relais Louis Xiii 184 | 巴黎,美食,华都酒家,华都酒家 185 | 巴黎,美食,锅色天香,锅色天香 186 | 巴黎,景点,尚蒂伊城堡, Domaine De chantilly 187 | 巴黎,交通,Saint,Saint 188 | 巴黎,美食, Les Rillettes, Les Rillettes 189 | 巴黎,美食, Le Chalet Du Parc, Le Chalet Du Parc 190 | 巴黎,购物, Les Halles, Les Halles 191 | 巴黎,景点,阿拉伯文化博物馆, Museum Of The Arab World (institut... 192 | 巴黎,美食,双剧场餐厅, Bistro Des Deux Theatres 193 | 巴黎,美食, La Compagnie Des Crepes, La Compagnie Des Crepes 194 | 巴黎,景点,蒙马特博物馆,蒙马特博物馆 195 | 巴黎,美食,Hippopotamus,Hippopotamus 196 | 巴黎,美食, Le Clos Y, Le Clos Y 197 | 巴黎,景点,圣让蒙马特教堂, église Saint 198 | 巴黎,娱乐,丁香园咖啡馆,丁香园咖啡馆 199 | 巴黎,美食, Les Omeres餐厅, Les Omeres餐厅 200 | 巴黎,景点,巴黎国立高等美术学院,巴黎国立高等美术学院 201 | 巴黎,美食,Angelina,Angelina 202 | 巴黎,美食, Il Etait Une Oie Dans Le Sud Ouest, Il Etait Une Oie Dans Le Sud Ouest 203 | 巴黎,交通,金字塔地铁站, Pyramides地铁站 204 | 巴黎,美食,左岸咖啡,左岸咖啡 205 | 巴黎,美食, La Patisserie Des Reves, La Patisserie Des Reves 206 | 巴黎,交通,奥斯特里茨车地铁站, Paris 207 | 巴黎,美食, Le P'tit Troquet, Le P'tit Troquet 208 | 巴黎,景点,圣雅克塔, Saint 209 | 巴黎,美食, Kebab Paradise, Kebab Paradise 210 | 巴黎,景点,查尔斯戴高乐桥,查尔斯戴高乐桥 211 | 巴黎,景点,巴黎植物园,巴黎植物园 212 | 巴黎,购物,陈氏商场, Tang Freres 213 | 巴黎,美食,Flunch,Flunch 214 | 巴黎,美食, Chez Leon De Bruxelles, Chez Leon De Bruxelles 215 | 巴黎,交通,王宫,王宫 216 | 巴黎,美食,脿, Le Grenier Pain 217 | 巴黎,购物,花鸟市场, Flower Market 218 | 巴黎,美食, Le Bar A Huitres Place Des Vosges, Le Bar A Huitres Place Des Vosges 219 | 巴黎,美食, Auberge Nicolas Flamel, Auberge Nicolas Flamel 220 | 巴黎,美食,Lafayette,Lafayette 221 | 巴黎,美食,万里香, Restaurant Chez Shen 222 | 巴黎,娱乐,马列咖啡, La Café Marly 223 | 巴黎,景点,蒙特公园,蒙特公园 224 | 巴黎,交通,巴士底地铁站, Bastille地铁站 225 | 巴黎,美食,马克西姆西餐厅, Make Ximu Restaurant 226 | 巴黎,美食, La Bouteille D'or, La Bouteille D'or 227 | 巴黎,美食, Chez Fernand, Chez Fernand 228 | 巴黎,美食,卡普辛大咖啡馆,卡普辛大咖啡馆 229 | 巴黎,美食, Le Violon D'ingres, Le Violon D'ingres 230 | 巴黎,美食, Restaurant Mariette, Restaurant Mariette 231 | 巴黎,美食, Le Vernet, Le Vernet 232 | 巴黎,景点,卡那瓦雷博物馆,卡那瓦雷博物馆 233 | 巴黎,美食,路易塞特餐厅, Chez louisette 234 | 巴黎,美食,Taillevent,Taillevent 235 | 巴黎,景点,巴黎现代艺术博物馆, Modern Art Museum Paris 236 | 巴黎,美食, Le Brasserie Mollard, Le Brasserie Mollard 237 | 巴黎,交通,巴黎北地铁站, Nord地铁站 238 | 巴黎,购物,欧洲谷, Le Val D'europe 239 | 巴黎,景点,毕哈肯桥, Pont De Bir 240 | 巴黎,美食, Le Relais De L'entrecote, Le Relais De L'entrecote 241 | 巴黎,交通,斯特拉斯堡,斯特拉斯堡 242 | 巴黎,美食,Bofinger,Bofinger 243 | 巴黎,美食, La Chope, La Chope 244 | 巴黎,美食, Montblanc Boutique, Montblanc Boutique 245 | 巴黎,购物,戴罗勒商店, Deyrolle 246 | 巴黎,美食, Le Berkeley, Le Berkeley 247 | 
巴黎,美食,L'escale,L'escale 248 | 巴黎,景点,拉维列特公园,拉维列特公园 249 | 巴黎,美食,Ladur茅e,Ladur茅e 250 | 巴黎,购物, Aligre Market, Aligre Market 251 | 巴黎,美食, Le Grand Cafe Capucines, Le Grand Cafe Capucines 252 | 巴黎,娱乐,狡兔酒吧,狡兔酒吧 253 | 巴黎,景点,玛摩丹美术馆,玛摩丹美术馆 254 | 巴黎,景点,美国圣三一大教堂, American Cathedral Of The Holy Tri... 255 | 巴黎,交通,阿尔玛桥地铁站, Pont De L'alma地铁站 256 | 巴黎,交通,洛雷特圣母院地铁站, Notre 257 | 巴黎,美食, Arnaud Larher Patissier Chocolatier, Arnaud Larher Patissier Chocolatier 258 | 巴黎,景点,裁判监狱所, Conciergerie 259 | 巴黎,美食,Quick,Quick 260 | 巴黎,美食, Le Wilson, Le Wilson 261 | 巴黎,美食, Chez Clement, Chez Clement 262 | 巴黎,美食, Chez Mamy, Chez Mamy 263 | 巴黎,美食,L'ange, 20 Restaurant 264 | 巴黎,景点,巴黎高等师范学院,巴黎高等师范学院 265 | 巴黎,景点,克鲁尼中世纪博物馆, Musée De Cluny 266 | 巴黎,娱乐,Verlet,Verlet 267 | 巴黎,美食, Au Petit Sud Ouest, Au Petit Sud Ouest 268 | 巴黎,美食, La Poule Au Pot, La Poule Au Pot 269 | 巴黎,美食, La Maison De L'aubrac, La Maison De L'aubrac 270 | 巴黎,美食, Brasserie De La Tour Eiffel, Brasserie De La Tour Eiffel 271 | 巴黎,景点,利奥波德,利奥波德 272 | 巴黎,美食,Higuma,Higuma 273 | 巴黎,交通,Versailles,Versailles 274 | 巴黎,景点,耶拿桥, Pont D'iéna 275 | 巴黎,美食,中华快餐,中华快餐 276 | 巴黎,美食, Le Cafe Des Beaux Arts, Le Cafe Des Beaux Arts 277 | 巴黎,交通,Bussy,Bussy 278 | 巴黎,景点,圣奥古斯丁大教堂, église Saint 279 | 巴黎,美食,L聽ambroise聽pare,L聽ambroise聽pare 280 | 巴黎,美食, Chez Robert Et Louise, Chez Robert Et Louise 281 | 巴黎,景点,格雷万蜡像馆,格雷万蜡像馆 282 | 巴黎,景点,圣母升天教堂,圣母升天教堂 283 | 巴黎,美食, La Fontaine De Mars, La Fontaine De Mars 284 | 巴黎,美食, Le Richer, Le Richer 285 | -------------------------------------------------------------------------------- /bash.exe.stackdump: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonecherry/Spider/1d86eb7556fcfa8be0b23cdc55fb82049eb1c63e/bash.exe.stackdump -------------------------------------------------------------------------------- /ceshimysql.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import MySQLdb 3 | db = 'southamerica' 4 | sqli = 'INSERT INTO ceshi VALUES(1,"haode")' 5 | try: 6 | conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8',db='southamerica',) 7 | cur = conn.cursor() 8 | # cur.execute("INSERT INTO southamerica.巴西(chinesename,englishname) VALUES( '艺术桥','艺术桥' )") 9 | cur.execute(sqli) 10 | cur.execute("insert into ceshi values(100,'Tom')") 11 | 12 | # cur.execute('set interactive_timeout=96*3600') 13 | except MySQLdb.Error, e: 14 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 15 | 16 | 17 | cur.close() 18 | conn.commit() 19 | conn.close() 20 | print 'ok' -------------------------------------------------------------------------------- /chengshijilu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonecherry/Spider/1d86eb7556fcfa8be0b23cdc55fb82049eb1c63e/chengshijilu.txt -------------------------------------------------------------------------------- /gufensoso/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonecherry/Spider/1d86eb7556fcfa8be0b23cdc55fb82049eb1c63e/gufensoso/__init__.py -------------------------------------------------------------------------------- /gufensoso/yinyong.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | import sys 3 | from lxml import etree 4 | import requests 5 | 6 | #getsource用来获取网页源代码 7 | def getsource(url): 8 | html = requests.get(url) 9 | return 
html.text 10 | 11 | if __name__ == '__main__': 12 | url = 'http://gg3.cytbj.com/scholar?q=社会网络动态分析' 13 | html = getsource(url) 14 | selector = etree.HTML(html) 15 | referenceNum = selector.xpath('//div[@class="gs_fl"]/a/text()')[0] 16 | print referenceNum[0:6] 17 | print referenceNum[6:] 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /lundunPOI.txt: -------------------------------------------------------------------------------- 1 | 伦敦,景点,大英博物馆, British Museum 2 | 伦敦,景点,塔桥, Tower Bridge 3 | 伦敦,景点,伦敦眼, London Eye 4 | 伦敦,景点,议会大厦, Houses Of Parliament 5 | 伦敦,景点,海德公园, Hyde Park 6 | 伦敦,景点,白金汉宫, Buckingham Palace 7 | 伦敦,景点,西敏寺, Westminster Abbey 8 | 伦敦,景点,大本钟, Big Ben 9 | 伦敦,景点,泰晤士河, Thames River 10 | 伦敦,购物,牛津街, Oxford Street 11 | 伦敦,景点,英国国家美术馆, National Gallery 12 | 伦敦,美食,唐人街, Chinatown 13 | 伦敦,购物,哈罗德百货, Harrods 14 | 伦敦,景点,圣保罗大教堂, St.paul's Cathedral 15 | 伦敦,景点,伦敦塔, Tower Of London 16 | 伦敦,景点,特拉法加广场, Trafalgar Square 17 | 伦敦,美食,Nando's,Nando's 18 | 伦敦,景点,9¾站台, Platform 9¾ 19 | 伦敦,交通,国王十字火车站, King's Cross Railway Station 20 | 伦敦,景点,自然历史博物馆, Natural History Museum 21 | 伦敦,景点,福尔摩斯博物馆, Sherlock Holmes Museum 22 | 伦敦,景点,伦敦杜莎夫人蜡像馆, Madame Tussauds London 23 | 伦敦,景点,圣詹姆斯公园, St James' Park 24 | 伦敦,购物,邦德街, Bond Street 25 | 伦敦,购物,伦敦M豆世界, M&m’s World London 26 | 伦敦,景点,唐宁街10号, 10 Downing Street 27 | 伦敦,景点,皮卡迪里广场, Piccadilly Circus 28 | 伦敦,购物,科文特花园, Covent Garden 29 | 伦敦,景点,维多利亚与艾伯特博物馆, Victoria & Albert Museum 30 | 伦敦,购物,塞尔福里奇百货公司, Selfridges 31 | 伦敦,购物,摄政街, Regent Street 32 | 伦敦,购物,博罗市场, Borough Market 33 | 伦敦,景点,碎片大厦, The Shard 34 | 伦敦,景点,格林公园, Green Park 35 | 伦敦,景点,诺丁山, Notting Hill 36 | 伦敦,景点,摄政公园, The Regent’s Park 37 | 伦敦,景点,千禧桥, Millennium Bridge 38 | 伦敦,景点,威斯敏斯特大教堂, Westminster Cathedral 39 | 伦敦,景点,泰特现代美术馆, Tate Modern 40 | 伦敦,景点,肯辛顿宫, Kensington Palace 41 | 伦敦,景点,肯辛顿花园, Kensington Gardens 42 | 伦敦,购物,福南梅森, Fortnum & Mason 43 | 伦敦,娱乐,女王陛下剧院, Her Majesty's Theatre 44 | 伦敦,美食,龙虾餐厅, Burger And Lobster(soho) 45 | 伦敦,交通,帕丁顿站, Paddington Station 46 | 伦敦,景点,伦敦动物园, Zsl London Zoo 47 | 伦敦,购物,波特贝罗路跳蚤市场, Portobello Road Market 48 | 伦敦,景点,国家肖像美术画廊, National Portrait Gallery 49 | 伦敦,购物,新考文特花园市场, New Covent Garden Market 50 | 伦敦,娱乐,酋长球场, Emirates Stadium 51 | 伦敦,购物,Hamleys,Hamleys 52 | 伦敦,景点,皇家马厩, Royal Mews 53 | 伦敦,景点,斯坦福桥球场, Stamford Bridge Stadium 54 | 伦敦,景点,旧皇家海军学院, Old Royal Naval College 55 | 伦敦,娱乐,莎士比亚圆形剧场, Shakespeare's Globe 56 | 伦敦,景点,大英图书馆, British Library 57 | 伦敦,景点,维多利亚与艾伯特博物馆儿童馆, Victoria And Albert Mu... 58 | 伦敦,娱乐,华纳工厂, Warner Bros. Studio Tour London 59 | 伦敦,购物,卡姆登市场, Camden Market 60 | 伦敦,购物,布里克巷, Brick Lane Market 61 | 伦敦,景点,圣詹姆士公园, St James's Park 62 | 伦敦,娱乐,皇家艾尔伯特音乐厅, Royal Albert Hall 63 | 伦敦,景点,国家航海博物馆, National Maritime Museum 64 | 伦敦,景点,市政厅, City Hall 65 | 伦敦,景点,温布顿草地网球博物馆, Wimbledon Lawn Tennis Museum 66 | 伦敦,景点,汉普顿宫, Hampton Court Palace 67 | 伦敦,景点,波洛克玩具博物馆, Pollock's Toy Museum & Shop 68 | 伦敦,景点,伦敦官府大道皇家骑兵卫队, Horse Guards Parade At Whi... 69 | 伦敦,景点,女王官邸, Queen's House 70 | 伦敦,购物,东伦敦花市, Columbia Road Flower Market 71 | 伦敦,景点,伦敦交通博物馆, London Transport Museum 72 | 伦敦,景点,滑铁卢桥, Waterloo Bridge 73 | 伦敦,景点,珍宝塔, Jewel Tower 74 | 伦敦,景点,金丝雀码头, Canary Wharf 75 | 伦敦,景点,科学博物馆, Science Museum 76 | 伦敦,交通,邱园地铁站, Kewgardens地铁站 77 | 伦敦,景点,伦敦大火纪念塔, Monument To The Great Fire Of London 78 | 伦敦,美食,Rock, & Sole Plaice 79 | 伦敦,景点,伦敦博物馆,伦敦博物馆 80 | 伦敦,美食,Angus, Steak House 81 | 伦敦,交通,希思罗机场1、2、3号航站楼地铁站, Heathrowterminals1,... 
82 | 伦敦,景点,丘吉尔作战室, Churchill War Rooms 83 | 伦敦,交通,维多利亚火车站, Victoria Railway Station 84 | 伦敦,景点,英格兰银行博物馆, Bank Of England Museum 85 | 伦敦,景点,海事博物館, National Marine Museum 86 | 伦敦,购物,Burberry伦敦工厂店, Burberry Factory Outlet 87 | 伦敦,景点,大理石拱门, Marble Arch 88 | 伦敦,美食,Laduree,Laduree 89 | 伦敦,美食,正宗兰州拉面,正宗兰州拉面 90 | 伦敦,美食,Restaurant, Gordon Ramsay 91 | 伦敦,美食,Pret, A Manger 92 | 伦敦,美食,Angus, Steakhouse(cranbourn St) 93 | 伦敦,景点,圣乔治教堂, St. George's Church 94 | 伦敦,交通,贝克街地铁站, Bakerstreet地铁站 95 | 伦敦,美食,Borough, Market 96 | 伦敦,购物,Westfield,Westfield 97 | 伦敦,购物,Primark, Stores (oxford) 98 | 伦敦,购物,柯芬花园, Covent Garden 99 | 伦敦,交通,帕丁顿车站, Paddington Station 100 | 伦敦,娱乐,皇家歌剧院, Royal Opera House 101 | 伦敦,娱乐,女王剧院, Queen’s Theatre 102 | 伦敦,购物,美体小铺, The Body Shop 103 | 伦敦,景点,里士满公园, Richmond Park 104 | 伦敦,娱乐,Lyceum, Theatre 105 | 伦敦,美食,Snog,Snog 106 | 伦敦,美食,人民公社, Baozi Inn 107 | 伦敦,购物,格林威治市场, Greenwich Market 108 | 伦敦,景点,帝国战争博物馆, Imperial War Museum 109 | 伦敦,购物,Beatles Store, Beatles Store 110 | 伦敦,美食,Windsor Castle,Windsor Castle 111 | 伦敦,交通,莱斯特广场地铁站, Leicestersquare地铁站 112 | 伦敦,景点,亚伯特纪念碑, Albert Memorial 113 | 伦敦,景点,查令十字街, Charing Cross Road 114 | 伦敦,购物,自由店, Liberty 115 | 伦敦,购物,伯灵顿市场街, Burlington Arcade 116 | 伦敦,景点,Wallace Collection, Wallace Collection 117 | 伦敦,美食,文兴酒家,文兴酒家 118 | 伦敦,美食,Rules Restaurant, Rules Restaurant 119 | 伦敦,景点,温布尔登1号球场,温布尔登1号球场 120 | 伦敦,美食,Wasabi,Wasabi 121 | 伦敦,美食,Afternoon Tea, Afternoon Tea 122 | 伦敦,景点,温布尔登镇, Wimbledon 123 | 伦敦,交通,伦敦桥地铁站, Londonbridge地铁站 124 | 伦敦,景点,The Household Cavalry Museum, The Household Cavalry Museum 125 | 伦敦,景点,伦敦水族馆, Sea Life London Aquarium 126 | 伦敦,美食,老成都,老成都 127 | 伦敦,景点,旧皇家海军学院彩绘厅, The Painted Hall At The Old Ro... 128 | 伦敦,景点,童年博物馆, V&a Museum Of Childhood 129 | 伦敦,娱乐,The Sherlock Holmes Pub, The Sherlock Holmes Pub 130 | 伦敦,景点,海军拱门, Admiralty Arch 131 | 伦敦,景点,国会广场, Parliament Square 132 | 伦敦,娱乐,Lyceum, Theatre 133 | 伦敦,美食,Snog,Snog 134 | 伦敦,美食,人民公社, Baozi Inn 135 | 伦敦,购物,格林威治市场, Greenwich Market 136 | 伦敦,美食,Alain, Ducasse At The Dorchester 137 | 伦敦,景点,帝国战争博物馆, Imperial War Museum 138 | 伦敦,购物,Beatles, Store 139 | 伦敦,美食,Windsor, Castle 140 | 伦敦,交通,莱斯特广场地铁站, Leicestersquare地铁站 141 | 伦敦,景点,亚伯特纪念碑, Albert Memorial 142 | 伦敦,景点,查令十字街, Charing Cross Road 143 | 伦敦,购物,自由店, Liberty 144 | 伦敦,购物,伯灵顿市场街, Burlington Arcade 145 | 伦敦,景点,Wallace, Collection 146 | 伦敦,美食,文兴酒家,文兴酒家 147 | 伦敦, 148 | 娱乐,Lyceum, Theatre 149 | 伦敦, 150 | 美食,Snog,Snog 151 | 伦敦, 152 | 美食,人民公社, Baozi Inn 153 | 伦敦, 154 | 购物,格林威治市场, Greenwich Market 155 | 伦敦, 156 | 美食,Alain, Ducasse At The Dorchester 157 | 伦敦, 158 | 景点,帝国战争博物馆, Imperial War Museum 159 | 伦敦, 160 | 购物,Beatles, Store 161 | 伦敦, 162 | 美食,Windsor, Castle 163 | 伦敦, 164 | 交通,莱斯特广场地铁站, Leicestersquare地铁站 165 | 伦敦, 166 | 景点,亚伯特纪念碑, Albert Memorial 167 | 伦敦, 168 | 景点,查令十字街, Charing Cross Road 169 | 伦敦, 170 | 购物,自由店, Liberty 171 | 伦敦, 172 | 购物,伯灵顿市场街, Burlington Arcade 173 | 伦敦, 174 | 景点,Wallace, Collection 175 | 伦敦, 176 | 美食,文兴酒家,文兴酒家 177 | -------------------------------------------------------------------------------- /poispider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jonecherry/Spider/1d86eb7556fcfa8be0b23cdc55fb82049eb1c63e/poispider/__init__.py -------------------------------------------------------------------------------- /poispider/citylist.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import random 3 | import re 4 | import sys 5 | import urllib2 6 | 7 | import MySQLdb 8 | import requests 9 | from 
lxml import etree 10 | 11 | import proxyIP 12 | 13 | reload(sys) 14 | sys.setdefaultencoding("utf-8") 15 | 16 | #getsource用来获取网页源代码 17 | def getsource(url): 18 | 19 | html = requests.get(url) 20 | html.encoding = 'utf-8' 21 | return html.text 22 | def getit(url): 23 | proxy_handle = urllib2.ProxyHandler({'http': random.choice(proxyIP.proxy_list)}) 24 | opener = urllib2.build_opener(proxy_handle) 25 | urllib2.install_opener(opener) 26 | response = urllib2.urlopen(url) 27 | return response 28 | #获取信息块 29 | def getblock(source): 30 | blocks = re.findall('(

)',source,re.S) 31 | return blocks 32 | 33 | if __name__ == '__main__': 34 | countries = ['cuba','mexico','canada','thailand','singapore','south-korea','usa'] 35 | country = countries[0] 36 | starturl = 'http://place.qyer.com/'+country+'/citylist-0-0-1/' 37 | print '起始页:',starturl 38 | country_id = 1048 39 | parent_region_id = 1048 40 | region_type = 2 41 | db = 'map' 42 | # 数据表 43 | tb = 'map_region' 44 | # 连接数据库 45 | try: 46 | # conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 47 | conn = MySQLdb.connect(host='172.22.185.130', user='root', passwd='123456', port=3306, charset='utf8') 48 | cur = conn.cursor() 49 | cur.execute('set interactive_timeout=96*3600') 50 | conn.select_db(db) 51 | except MySQLdb.Error,e: 52 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 53 | temphtml = getsource(starturl) 54 | selector = etree.HTML(temphtml) 55 | pagenums = selector.xpath('//div[@class="ui_page"]/a/@data-page') 56 | # 需爬取得页数 57 | pagenum = pagenums[len(pagenums)-2] 58 | 59 | for i in range(1, int(pagenum) + 1): 60 | url = 'http://place.qyer.com/'+country+'/citylist-0-0-%d'%(i) 61 | print '当前列表页:',url 62 | html = getsource(url) 63 | selector2 = etree.HTML(html) 64 | blocks = getblock(html) 65 | for j,block in enumerate(blocks): 66 | selector1 = etree.HTML(block) 67 | city = selector1.xpath('//a/text()')[0].strip() 68 | cityenglishname = selector1.xpath('//span/text()')[0].strip() 69 | 70 | sub_url = selector1.xpath('//a/@href')[0] 71 | 72 | sub_html = getsource(sub_url) 73 | sub_selector = etree.HTML(sub_html) 74 | pa_num = sub_selector.xpath('//div[@class="plcTopBarStat fontYaHei"]/em/text()') 75 | if not pa_num: 76 | pa_num = 0 77 | else: 78 | pa_num = pa_num[0] 79 | # print pa_num 80 | 81 | sqli = "INSERT INTO " + db + "." 
+ tb + "(region_ch_name,region_en_name,parent_region_id,country_id,region_type,visited_count)" + " VALUES(%s,%s,%s,%s,%s,%s)" 82 | 83 | # 判断数据库是否已经存在城市数据,决定是插入数据还是更新数据。 84 | sqli1 = "select * from "+db+"."+tb+" where region_ch_name = "+"'%s'"%(city) 85 | 86 | num_result = cur.execute(sqli1) 87 | if num_result: 88 | pass 89 | if not num_result: 90 | print '城市', cityenglishname 91 | cur.execute(sqli,(city, cityenglishname, parent_region_id, country_id, region_type, pa_num)) 92 | conn.commit() 93 | print '------------------------------------------------' 94 | cur.close() 95 | conn.close() 96 | print '------------finished--------------' 97 | -------------------------------------------------------------------------------- /poispider/dazhongdianping.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from lxml import etree 3 | import requests 4 | import os 5 | import sys 6 | import MySQLdb 7 | import re 8 | import random 9 | import time 10 | reload(sys) 11 | sys.setdefaultencoding("utf-8") 12 | 13 | def xiaoxie(city_en_name): 14 | city_en_name = city_en_name.split() 15 | en_name = '' 16 | for name in city_en_name: 17 | en_name = en_name + name.lower() 18 | return en_name 19 | #getsource用来获取网页源代码 20 | def getsource(url): 21 | headlist = [{'User-Agent':'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.'}, 22 | {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}, 23 | {'User-Agent':'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'}, 24 | {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}, 25 | {'User-Agent':'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}, 26 | {'User-Agent':'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'}, 27 | {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}, 28 | {'User-Agent':'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'}, 29 | {'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}, 30 | {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'}, 31 | {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}] 32 | html = requests.get(url,headers = headlist[random.randint(0,10)]) 33 | html.encoding = 'utf-8' 34 | return html.text 35 | #获取信息块 36 | def getblock(source): 37 | blocks = re.findall('(
)',source,re.S) 38 | return blocks 39 | 40 | def pankong(re): 41 | if len(re)==0: 42 | re = '' 43 | else: 44 | re = re [0] 45 | re = re.strip() 46 | return re 47 | def getnum(commentsnum): 48 | i = 0 49 | j = 0 50 | for char in commentsnum: 51 | if char == '(': 52 | break 53 | i = i + 1 54 | 55 | for char in commentsnum: 56 | if char == ')': 57 | break 58 | j = j + 1 59 | si = int(i) + 1 60 | ei = int(j) 61 | return commentsnum[si:ei] 62 | def qushuzi(str): 63 | num = '' 64 | if not str: 65 | return num 66 | else: 67 | for sr in str: 68 | if sr.isdigit(): 69 | num = num+sr 70 | return num 71 | def subjiexi(url): 72 | subhtml = getsource(url) 73 | subselector = etree.HTML(subhtml) 74 | 75 | # 地址 76 | poi_address1 = subselector.xpath('//span[@itemprop="locality region"]/text()') 77 | poi_address1 = pankong(poi_address1) 78 | poi_address2 = subselector.xpath('//span[@itemprop="street-address"]/@title') 79 | poi_address2 = pankong(poi_address2) 80 | poi_address = poi_address1+poi_address2 81 | print '地址:',poi_address 82 | # 电话 83 | poi_telephone = subselector.xpath('//span[@itemprop="tel"]/text()') 84 | poi_telephone = pankong(poi_telephone) 85 | print '电话:',poi_telephone 86 | # 评论数 87 | psp = subselector.xpath('//div[@class="brief-info"]/span/text()') 88 | comments_count = pankong(psp) 89 | comments_count = qushuzi(comments_count) 90 | print '评论数:',comments_count 91 | # 评分 92 | poi_score = psp[2] 93 | for chi,ch in enumerate(poi_score): 94 | if ch.isdigit(): 95 | ni = chi 96 | break 97 | poi_score = poi_score[ni:] 98 | print '评分',poi_score 99 | 100 | # 列表页解析 101 | def jiexi(url,tag_id): 102 | tempurl = url +'1' 103 | print '列表页', tempurl 104 | dangqianselector = url_to_selector(tempurl) 105 | # 购物&美食 106 | if tag_id in [1,4]: 107 | print '购物or美食' 108 | try: 109 | zongyeshu = dangqianselector.xpath('//div[@class="Pages"]/a/@data-ga-page')[-2] 110 | except: 111 | pass 112 | else: 113 | for ye in range(1,int(zongyeshu)+1): 114 | lieurl = url+ "%s"%(ye) 115 | print '列表ye',lieurl 116 | html = getsource(lieurl) 117 | blocks = re.findall('()', html, re.S) 118 | print '这页poi数量',len(blocks) 119 | for block in blocks: 120 | selector = etree.HTML(block) 121 | #中文名称、英文名称、本地名称 122 | name = selector.xpath('///a[@class="BL"]/@title')[0] 123 | shouzimu = name[0].encode('utf-8') 124 | if shouzimu.isalpha(): 125 | poi_ch_name = '' 126 | poi_en_name = name 127 | else: 128 | poi_ch_name = name 129 | poi_en_name = '' 130 | poi_loc_name = poi_en_name 131 | # print '名称:',poi_ch_name,poi_en_name 132 | # 判断数据库是否已经存在该POI记录,决定是插入数据还是更新数据。 133 | sqli_ch = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name) 134 | sqli_en = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name) 135 | try: 136 | r1 = cur.execute(sqli_ch) 137 | r2 = cur.execute(sqli_en) 138 | except: 139 | pass 140 | else: 141 | if poi_ch_name == '': 142 | r1 = 0 143 | if poi_en_name == '': 144 | r2 = 0 145 | print '查询结果:' 146 | print '中文', r1, '英文', r2 147 | if r1 or r2: 148 | print '已经存在记录,更新数据... ...' 
149 | pass 150 | else: 151 | # 详情链接 152 | suburl = selector.xpath('//a[@class="BL"]/@href')[0] 153 | suburl = 'http://www.dianping.com'+suburl 154 | print '详情链接', suburl 155 | time.sleep(tingliu) 156 | subhtml = getsource(suburl) 157 | subselector = etree.HTML(subhtml) 158 | # 地址 159 | poi_address1 = subselector.xpath('//span[@itemprop="locality region"]/text()') 160 | poi_address1 = pankong(poi_address1) 161 | poi_address2 = subselector.xpath('//span[@itemprop="street-address"]/@title') 162 | poi_address2 = pankong(poi_address2) 163 | poi_address = poi_address1 + poi_address2 164 | # print '地址:', poi_address 165 | # 电话 166 | poi_telephone = subselector.xpath('//span[@itemprop="tel"]/text()') 167 | poi_telephone = pankong(poi_telephone) 168 | # print '电话:', poi_telephone 169 | # 评论数 170 | psp = subselector.xpath('//div[@class="brief-info"]/span/text()') 171 | comments_count = pankong(psp) 172 | comments_count = qushuzi(comments_count) 173 | # print '评论数:', comments_count 174 | # 评分 175 | try: 176 | poi_score = psp[2] 177 | except: 178 | poi_score = '' 179 | else: 180 | for chi, ch in enumerate(poi_score): 181 | if ch.isdigit(): 182 | ni = chi 183 | break 184 | poi_score = poi_score[ni:] 185 | # print '评分', poi_score 186 | 187 | if r1 or r2: 188 | print '已经存在记录,更新数据... ...' 189 | pass 190 | else: 191 | print '新增POI... ...' 192 | # print '中文:' + poi_ch_name, '英文:' + poi_en_name, '本地语言名称' + poi_en_name, '城市id' + str(poi_region_id), '类型:' + str(tag_id), '评论数' + str(comments_count), '评分' + str(poi_score), '地址' + poi_address, '电话' + poi_telephone 193 | sqli = "INSERT INTO " + db + "." + tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_tag_id,poi_score,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 194 | cur.execute(sqli, (poi_ch_name, poi_en_name, poi_loc_name, poi_region_id, tag_id,poi_score, poi_address, poi_telephone,comments_count, source)) 195 | conn.commit() 196 | print '----------------------------------------' 197 | # 景点 198 | elif tag_id == 3: 199 | print '景点' 200 | try: 201 | zongyeshu = dangqianselector.xpath('//div[@class="Pages"]/a/@data-ga-page')[-2] 202 | except: 203 | pass 204 | else: 205 | for ye in range(1, int(zongyeshu) + 1): 206 | jingurl = url + "%s"%(ye) 207 | html = getsource(jingurl) 208 | 209 | blocks = re.findall('(
  • )',html,re.S) 210 | for block in blocks: 211 | selector = etree.HTML(block) 212 | # 中文名称、英文名称、本地名称 213 | name = selector.xpath('//div[@class="poi-title"]/h4/a/text()')[0] 214 | shouzimu = name[0].encode('utf-8') 215 | if shouzimu.isalpha(): 216 | poi_ch_name = '' 217 | poi_en_name = name 218 | else: 219 | poi_ch_name = name 220 | poi_en_name = '' 221 | poi_loc_name = poi_en_name 222 | print '中文名称:',poi_ch_name,'英文名称:',poi_en_name 223 | # 判断数据库是否已经存在该POI记录,决定是插入数据还是更新数据。 224 | sqli_ch = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name) 225 | sqli_en = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name) 226 | try: 227 | r1 = cur.execute(sqli_ch) 228 | r2 = cur.execute(sqli_en) 229 | except: 230 | pass 231 | else: 232 | if poi_ch_name == '': 233 | r1 = 0 234 | if poi_en_name == '': 235 | r2 = 0 236 | print '查询结果:' 237 | print '中文', r1, '英文', r2 238 | if r1 or r2: 239 | print '已经存在记录,更新数据... ...' 240 | pass 241 | else: 242 | # 详情链接 243 | suburl = selector.xpath('//div[@class="poi-title"]/h4/a/@href')[0] 244 | suburl = 'http://www.dianping.com'+suburl 245 | print '详情链接', suburl 246 | time.sleep(tingliu) 247 | subhtml = getsource(suburl) 248 | subselector = etree.HTML(subhtml) 249 | 250 | # 地址 251 | poi_address1 = subselector.xpath('//span[@itemprop="locality region"]/text()') 252 | poi_address1 = pankong(poi_address1) 253 | poi_address2 = subselector.xpath('//span[@itemprop="street-address"]/@title') 254 | poi_address2 = pankong(poi_address2) 255 | poi_address = poi_address1 + poi_address2 256 | # print '地址:', poi_address 257 | # 电话 258 | poi_telephone = subselector.xpath('//span[@itemprop="tel"]/text()') 259 | poi_telephone = pankong(poi_telephone) 260 | # print '电话:', poi_telephone 261 | # 评论数 262 | psp = subselector.xpath('//div[@class="brief-info"]/span/text()') 263 | comments_count = pankong(psp) 264 | comments_count = qushuzi(comments_count) 265 | # print '评论数:', comments_count 266 | # 评分 267 | if not len(psp)==5: 268 | poi_score = '' 269 | else: 270 | poi_score = psp[2] 271 | for chi, ch in enumerate(poi_score): 272 | if ch.isdigit(): 273 | ni = chi 274 | break 275 | poi_score = poi_score[ni:] 276 | # print '评分', poi_score 277 | 278 | if r1 : 279 | print '已经存在记录,更新数据... ...' 280 | pass 281 | else: 282 | print '新增POI... ...' 283 | # print '中文:' + poi_ch_name, '英文:' + poi_en_name, '本地语言名称' + poi_en_name, '城市id' + str(poi_region_id), '类型:' + str(tag_id), '评论数' + str(comments_count), '评分' + str(poi_score), '地址' + poi_address, '电话' + poi_telephone 284 | sqli = "INSERT INTO " + db + "." + tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_tag_id,poi_score,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 285 | cur.execute(sqli, (poi_ch_name, poi_en_name, poi_loc_name, poi_region_id, tag_id, poi_score, poi_address,poi_telephone, comments_count, source)) 286 | conn.commit() 287 | print '----------------------------------------' 288 | # 酒店 289 | elif tag_id == 2: 290 | print '酒店' 291 | try: 292 | zongyeshu = dangqianselector.xpath('//div[@class="page"]/a/@data-ga-page')[-2] 293 | except: 294 | pass 295 | else: 296 | for ye in range(1, int(zongyeshu) + 1): 297 | jingurl = url + "%s" % (ye) 298 | html = getsource(jingurl) 299 | 300 | blocks = re.findall('(
  • )', html, re.S) 301 | for block in blocks: 302 | 303 | selector = etree.HTML(block) 304 | # 详情链接 305 | suburl = selector.xpath('//h2[@class="hotel-name"]/a/@href')[0] 306 | suburl = 'http://www.dianping.com' + suburl 307 | print '详情链接', suburl 308 | time.sleep(tingliu) 309 | subhtml = getsource(suburl) 310 | subselector = etree.HTML(subhtml) 311 | # 中文名称、英文名称、本地名称 312 | name0 = subselector.xpath('//h1[@class="shop-name"]/text()') 313 | name0 = pankong(name0) 314 | name1 = subselector.xpath('//span[@class="shop-enname"]/text()') 315 | name1 = pankong(name1) 316 | 317 | shouzimu = name0[0] .encode('utf-8') 318 | if shouzimu.isalpha(): 319 | poi_ch_name = '' 320 | poi_en_name = name0 321 | else: 322 | poi_ch_name = name0 323 | poi_en_name = name1 324 | poi_loc_name = poi_en_name 325 | print '中文名称:', poi_ch_name, '英文名称:', poi_en_name 326 | 327 | 328 | # 判断数据库是否已经存在该POI记录,决定是插入数据还是更新数据。 329 | sqli_ch = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name) 330 | sqli_en = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name) 331 | try: 332 | r1 = cur.execute(sqli_ch) 333 | r2 = cur.execute(sqli_en) 334 | except: 335 | pass 336 | else: 337 | if poi_ch_name == '': 338 | r1 = 0 339 | if poi_en_name == '': 340 | r2 = 0 341 | print '查询结果:' 342 | print '中文',r1,'英文',r2 343 | if r1 or r2: 344 | print '已经存在记录,更新数据... ...' 345 | pass 346 | else: 347 | # 地址 348 | poi_address = subselector.xpath('//p[@class="shop-address"]/text()') 349 | poi_address =pankong(poi_address) 350 | poi_address = poi_address[3:] 351 | # 评分 352 | poi_score = subselector.xpath('//p[@class="info shop-star"]/span[1]/@class') 353 | if not poi_score: 354 | poi_score = '' 355 | else: 356 | poi_score = qushuzi(poi_score[0]) 357 | poi_score = round(int(poi_score)/5,2) 358 | 359 | # 评论数 360 | comments_count = subselector.xpath('//p[@class="info shop-star"]/span[@class="item"]/text()') 361 | if not comments_count: 362 | comments_count = '' 363 | else: 364 | comments = '' 365 | for dd in comments_count[0]: 366 | if dd.isdigit(): 367 | comments = comments +dd 368 | comments_count = comments 369 | # print '评论数:',comments_count 370 | # 电话 371 | poi_telephone = '' 372 | 373 | print '新增POI... ...' 374 | # print '中文:' + poi_ch_name, '英文:' + poi_en_name, '本地语言名称' + poi_en_name, '城市id' + str(poi_region_id), '类型:' + str(tag_id), '评论数' + str(comments_count), '评分' + str(poi_score), '地址' + poi_address, '电话' + poi_telephone 375 | sqli = "INSERT INTO " + db + "." 
+ tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_tag_id,poi_score,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 376 | cur.execute(sqli, (poi_ch_name, poi_en_name, poi_loc_name, poi_region_id, tag_id, poi_score, poi_address,poi_telephone, comments_count, source)) 377 | conn.commit() 378 | print '----------------------------------------' 379 | else: 380 | pass 381 | 382 | 383 | def url_to_selector(url): 384 | html = getsource(url) 385 | return etree.HTML(html) 386 | 387 | if __name__ == '__main__': 388 | # 设置白名单,过滤国家 389 | chengshibaimingdan = range(1,826) 390 | # 来源 391 | source = '大众点评' 392 | db = 'map' 393 | # 数据表 394 | tb = 'map_poi1' 395 | tingliu = 5 396 | # 标签id 美食:1,酒店:2,景点:3,购物:4,娱乐:5,交通:6 397 | if not os.path.exists('zhuaqu'): 398 | os.mkdir('zhuaqu') 399 | if not os.path.exists(os.path.join('zhuaqu',tb)): 400 | os.mkdir(os.path.join('zhuaqu',tb)) 401 | jilu = open(os.path.join('zhuaqu',tb,'jilu.txt'),'a') 402 | # 连接数据库 403 | try: 404 | conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 405 | # conn = MySQLdb.connect(host='172.22.185.78', user='root', passwd='123456', port=3306, charset='utf8') 406 | cur = conn.cursor() 407 | cur.execute('set interactive_timeout=96*3600') 408 | conn.select_db(db) 409 | except MySQLdb.Error,e: 410 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 411 | # 抽取待抓取的城市列表 412 | sqli0 = "select region_en_name,region_id from map.map_region" 413 | num_city = cur.execute(sqli0) 414 | cities = cur.fetchmany(num_city) 415 | for city in cities: 416 | poi_region_id = city[1] 417 | city = city[0] 418 | city = city.strip() 419 | city = xiaoxie(city) 420 | if poi_region_id in chengshibaimingdan: 421 | print '已经完成',city,'的抓取' 422 | pass 423 | else: 424 | starturl = 'http://www.dianping.com/'+city+'/food/p1' 425 | starthtml = getsource(starturl) 426 | # print starthtml 427 | try: 428 | startselector = etree.HTML(starthtml) 429 | except: 430 | pass 431 | else: 432 | poiyeshu = startselector.xpath('//div[@class="Pages"]') 433 | if len(poiyeshu)== 0: 434 | print '城市',city,'在点评上没有数据' 435 | pass 436 | else: 437 | print '开始抓取', city 438 | url_food = 'http://www.dianping.com/'+city+'/food/p' 439 | url_shopping = 'http://www.dianping.com/'+city+'/shopping/p' 440 | url_jingdian = 'http://www.dianping.com/'+city+'/attraction?district=&category=&pageNum=' 441 | url_jiudian = 'http://www.dianping.com/'+city+'/hotel/p' 442 | urllist = [url_food,url_shopping,url_jingdian,url_jingdian] 443 | for ui,url in enumerate(urllist): 444 | if ui == 0: 445 | tag_id = 1 446 | elif ui == 1: 447 | tag_id = 4 448 | elif ui == 2: 449 | tag_id = 3 450 | elif ui == 3: 451 | tag_id = 2 452 | else: 453 | tag_id = '' 454 | jiexi(url,tag_id) 455 | 456 | cur.close() 457 | conn.close() 458 | jilu.close() 459 | print 'finished' -------------------------------------------------------------------------------- /poispider/mafengwo.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from lxml import etree 3 | import requests 4 | import os 5 | import sys 6 | import MySQLdb 7 | import re 8 | reload(sys) 9 | sys.setdefaultencoding("utf-8") 10 | 11 | 12 | #getsource用来获取网页源代码 13 | def getsource(url): 14 | html = requests.get(url) 15 | html.encoding = 'utf-8' 16 | return html.text 17 | #获取信息块 18 | def getblock(source): 19 | blocks = re.findall('(
    )',source,re.S) 20 | return blocks 21 | 22 | def getnum(commentsnum): 23 | i = 0 24 | j = 0 25 | for char in commentsnum: 26 | if char == '(': 27 | break 28 | i = i + 1 29 | 30 | for char in commentsnum: 31 | if char == ')': 32 | break 33 | j = j + 1 34 | si = int(i) + 1 35 | ei = int(j) 36 | return commentsnum[si:ei] 37 | 38 | 39 | # 页面解析 40 | def jiexi(html): 41 | blocks = getblock(html) 42 | for block in blocks: 43 | selector = etree.HTML(block) 44 | poi = selector.xpath('//h3')[0] 45 | poi = poi.xpath('string(.)') 46 | # poi = poi.replace('\n','') 47 | 48 | elements = poi.split('-') 49 | leixin = elements[0] 50 | leixin = leixin.replace(' ', '') 51 | leixin = leixin.replace('\n', '') 52 | 53 | temp = elements[1].split() 54 | 55 | if len(temp) == 0: 56 | continue 57 | shouzimu = str(temp[0]) 58 | 59 | if len(temp)==1 and not shouzimu.isalpha(): 60 | zhongwen = temp[0] 61 | yingwen = '' 62 | elif shouzimu.isalpha(): 63 | yingwen = '' 64 | for i in range(0, len(temp)): 65 | yingwen = yingwen + ' ' + temp[i] 66 | zhongwen = '' 67 | else: 68 | zhongwen = temp[0] 69 | yingwen = '' 70 | for i in range(1, len(temp)): 71 | yingwen = yingwen + ' ' + temp[i] 72 | 73 | # 判断数据库是否已经存在该POI记录,决定是插入数据还是更新数据。 74 | sqli1 = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (zhongwen) 75 | sqli2 = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (yingwen) 76 | try: 77 | r1 = cur.execute(sqli1) 78 | r2 = cur.execute(sqli2) 79 | except: 80 | pass 81 | else: 82 | # 国家、城市 83 | poiinfos = selector.xpath('//li/a/text()') 84 | country_city = poiinfos[0] 85 | country_city = country_city.split('-') 86 | country = country_city[0] 87 | city = country_city[1] 88 | 89 | # 查询城市id 90 | sqli1 = 'select region_id from map.map_region where region_ch_name = '+"'%s'"%(city) 91 | cur.execute(sqli1) 92 | id = cur.fetchmany(1) 93 | 94 | if len(id)==0: 95 | region_id = '' 96 | else: 97 | region_id = id[0][0] 98 | 99 | 100 | # 评论数 101 | pinglunshu = getnum(poiinfos[1]) 102 | # 相关游记数 103 | relatedyoujishu = getnum(poiinfos[2]) 104 | 105 | # tag_id 106 | if leixin == '美食': 107 | tag_id = 1 108 | elif leixin == '酒店': 109 | tag_id = 2 110 | elif leixin == '景点': 111 | tag_id = 3 112 | elif leixin == '购物': 113 | tag_id =4 114 | elif leixin == '娱乐': 115 | tag_id = 5 116 | elif leixin == '交通': 117 | tag_id = 6 118 | else: 119 | tag_id = '' 120 | 121 | 122 | # 详情链接 123 | link = selector.xpath('//h3/a/@href')[0] 124 | print '详情链接',link 125 | try: 126 | subhtml = getsource(link) 127 | subselector = etree.HTML(subhtml) 128 | except: 129 | pass 130 | else: 131 | if leixin == '景点': 132 | quguoshoucang = subselector.xpath('//span[@class="pa-num"]/text()') 133 | # 去过数 134 | quguonum = quguoshoucang[1] 135 | # 收藏数 136 | shoucangshu = quguoshoucang[0] 137 | # 评分 138 | poi_score = '' 139 | # 排名 140 | poi_rank = '' 141 | # 地址 142 | poi_address = '' 143 | poi_telephone_tag = subselector.xpath('//dl[@class="intro"]/dd/span/text()') 144 | if '电话' in poi_telephone_tag: 145 | for ti,tele_tag in enumerate(poi_telephone_tag): 146 | if tele_tag == '电话': 147 | telei = str(ti+1) 148 | xpath_tele = '//dl[@class="intro"]/dd['+telei+']/p/text()' 149 | poi_telephone = subselector.xpath(xpath_tele)[0] 150 | else: 151 | poi_telephone ='' 152 | 153 | else: 154 | quguonum = '' 155 | shoucangshu = '' 156 | # 评分 157 | poi_score = subselector.xpath('//span[@class="score-info"]/em/text()') 158 | if not poi_score: 159 | poi_score = '' 160 | else: 161 | poi_score = poi_score[0] 162 | # 等级 163 | poi_rank = 
subselector.xpath('//div[@class="ranking"]/em/text()') 164 | if not poi_rank: 165 | poi_rank = '' 166 | else: 167 | poi_rank = poi_rank[0][3:] 168 | # 地址,电话 169 | box_info = subselector.xpath('//div[@class="m-box m-info"]/ul[@class="clearfix"]/li/text()') 170 | # print '++++++++',len(box_info) 171 | if len(box_info)>=4: 172 | poi_address = subselector.xpath('//div[@class="m-box m-info"]/ul[@class="clearfix"]/li[1]/text()')[1].strip() 173 | for chari,addchar in enumerate(poi_address): 174 | if addchar == ":": 175 | tempi = chari 176 | poi_address = poi_address[tempi+1:] 177 | poi_telephone = subselector.xpath('//div[@class="m-box m-info"]/ul[@class="clearfix"]/li[2]/text()')[1].strip() 178 | else: 179 | poi_telephone ='' 180 | poi_address = '' 181 | 182 | # print '国家:' + country, '城市:' + city, '中文:' + zhongwen, '英文:' + yingwen, '城市id' + str(region_id), '类型id:' + str(tag_id), '类型:' + leixin, \ 183 | # '评论数' + str(pinglunshu), '相关游记数' + str(relatedyoujishu), '去过数' + str(quguonum), '收藏数' + str(shoucangshu), '评分' + str(poi_score), \ 184 | # '排名' + str(poi_rank), '电话' + str(poi_telephone), '地址' + poi_address 185 | 186 | if r1 or r2: 187 | print '已经存在记录,更新数据... ...' 188 | pass 189 | else: 190 | print '新增POI... ...' 191 | sqli = "INSERT INTO " + db + "." + tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_tag_id,poi_score,poi_rank,poi_address,poi_telephone,visited_count,comments_count,collection_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 192 | # print sqli 193 | 194 | cur.execute(sqli,(zhongwen,yingwen,yingwen,region_id,tag_id,poi_score,poi_rank,poi_address,poi_telephone,quguonum,pinglunshu,shoucangshu,source)) 195 | conn.commit() 196 | print '----------------------------------------' 197 | if __name__ == '__main__': 198 | # tag 199 | taglist = {'美食':1,'酒店':2,'景点':3,'购物':4,'娱乐':5,'交通':6} 200 | 201 | # 设置白名单,过滤国家 202 | chengshibaimingdan = ['美国'] 203 | # 来源 204 | source = '蚂蜂窝' 205 | 206 | db = 'map' 207 | # 数据表 208 | tb = 'map_poi_0' 209 | area = 'usa' 210 | 211 | if not os.path.exists('zhuaqu'): 212 | os.mkdir('zhuaqu') 213 | if not os.path.exists(os.path.join('zhuaqu',tb)): 214 | os.mkdir(os.path.join('zhuaqu',tb)) 215 | jilu = open(os.path.join('zhuaqu',tb,'jilu.txt'),'a') 216 | # 连接数据库 217 | try: 218 | conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 219 | cur = conn.cursor() 220 | cur.execute('set interactive_timeout=96*3600') 221 | conn.select_db(db) 222 | except MySQLdb.Error,e: 223 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 224 | # 抽取待抓取的城市列表 225 | sqli0 = "select region_ch_name from map.map_region" 226 | num_city = cur.execute(sqli0) 227 | cities = cur.fetchmany(num_city) 228 | 229 | for city in cities: 230 | city = city[0] 231 | city = city.strip() 232 | if city in chengshibaimingdan: 233 | pass 234 | else: 235 | url0 = 'http://www.mafengwo.cn/group/s.php?q=%s&p=1&t=poi&kt=1'%(city) 236 | print url0 237 | html0 = getsource(url0) 238 | if not html0: 239 | pass 240 | else: 241 | #获取爬取的页数 242 | selector0 = etree.HTML(html0) 243 | # temp0 = selector0.xpath('//div[@class="m-pagination"]/span[@class="count"]/text()') 244 | poinum = selector0.xpath('//p[@class="ser-result-primary"]/text()')[0] 245 | poinum = poinum.strip() 246 | for k,zifu in enumerate(poinum): 247 | if poinum[k-1]=='结'and poinum[k]=='果': 248 | si = k 249 | if poinum[k] =='条': 250 | ei = k 251 | try: 252 | poinum = int(poinum[si+1:ei]) 253 | except: 254 | pass 255 | else: 256 | print 'poi数量:',poinum 257 | 258 | if poinum < 
750: 259 | yema = poinum / 15 260 | else: 261 | yema = 50 262 | 263 | if yema == 0: 264 | print '----------%s 1页---------'%(city) 265 | url = 'http://www.mafengwo.cn/group/s.php?q=%s&p=%d&t=poi' % (city, 1) 266 | html = getsource(url) 267 | # jilu.write(html) 268 | jiexi(html) 269 | else: 270 | for i in range(yema+1): 271 | url = 'http://www.mafengwo.cn/group/s.php?q=%s&p=%d&t=poi'%(city,i) 272 | html = getsource(url) 273 | # jilu.write(html) 274 | jiexi(html) 275 | 276 | cur.close() 277 | conn.close() 278 | jilu.close() 279 | print 'finished' -------------------------------------------------------------------------------- /poispider/proxyIP.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | 3 | proxy_list = [ 4 | {'http': "http://59.53.67.215:80"}, 5 | {'http': "http://60.161.14.77:8001"}, 6 | {'http': "http://61.144.14.68:80"}, 7 | {'http': "http://61.144.68.180:9999"}, 8 | {'http': "http://61.164.108.84:8844"}, 9 | {'http': "http://61.166.55.153:11808"} 10 | ] -------------------------------------------------------------------------------- /poispider/qyer.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from lxml import etree 3 | import requests 4 | import os 5 | import sys 6 | import MySQLdb 7 | import re 8 | reload(sys) 9 | sys.setdefaultencoding("utf-8") 10 | 11 | #用来获取网页源代码 12 | def getsource(url): 13 | html = requests.get(url) 14 | html.encoding = 'utf-8' 15 | return html.text 16 | #获取信息块 17 | def getcountryblock(source): 18 | blocks = re.findall('(

    )',source,re.S) 19 | return blocks 20 | 21 | def getpoiblock(source): 22 | blocks = re.findall('(
  • )',source,re.S) 23 | return blocks 24 | # 对匹配为空进行处理 25 | def pankong(poi_xx): 26 | if len(poi_xx)==0: 27 | poi_xx = '' 28 | else: 29 | poi_xx = poi_xx[0] 30 | return poi_xx 31 | 32 | if __name__ == '__main__': 33 | countries = ['cuba'] 34 | 35 | for country in countries: 36 | starturl = 'http://place.qyer.com/'+country+'/citylist-0-0-1/' 37 | db = 'map' 38 | # 数据表 39 | tb = 'map_poi' 40 | # 希望跳过抓取的城市 41 | hulvcities = range(1,1049) 42 | hulvcities.append(987) 43 | hulvcities.append(915) 44 | 45 | # 连接数据库 46 | try: 47 | # conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 48 | conn = MySQLdb.connect(host='172.22.185.130', user='root', passwd='123456', port=3306, charset='utf8') 49 | cur = conn.cursor() 50 | cur.execute('set interactive_timeout=96*3600') 51 | conn.select_db(db) 52 | except MySQLdb.Error, e: 53 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 54 | temphtml = getsource(starturl) 55 | selector = etree.HTML(temphtml) 56 | pagenums = selector.xpath('//div[@class="ui_page"]/a/@data-page') 57 | # 需爬取得页数 58 | pagenum = pagenums[len(pagenums) - 2] 59 | 60 | for i in range(1, int(pagenum) + 1): 61 | url = 'http://place.qyer.com/'+country+'/citylist-0-0-%d' % (i) 62 | print url 63 | html = getsource(url) 64 | selector2 = etree.HTML(html) 65 | blocks = getcountryblock(html) 66 | for j, block in enumerate(blocks): 67 | selector1 = etree.HTML(block) 68 | city = selector1.xpath('//a/text()')[0].strip() 69 | cityenglishname = selector1.xpath('//span/text()')[0].strip() 70 | 71 | # 对应的城市id 72 | sqli1 = "select region_id from " + db + ".map_region" + " where region_ch_name = " + "'%s'" % (city) 73 | num_result = cur.execute(sqli1) 74 | if not num_result: 75 | region_id = '' 76 | else: 77 | region_id = cur.fetchmany(1) 78 | region_id = region_id[0][0] 79 | 80 | # 过滤已经抓取完成的城市 81 | if region_id in hulvcities: 82 | print city, '已经完成抓取,跳过' 83 | pass 84 | else: 85 | # 城市主页 86 | sub_url = selector1.xpath('//a/@href')[0] 87 | sighturl = sub_url+'sight/' 88 | foodurl = sub_url+'food/' 89 | shoppingurl = sub_url+'shopping/' 90 | city_urls = [sighturl,foodurl,shoppingurl] 91 | for ci,cityurl in enumerate(city_urls): 92 | sub_html = getsource(cityurl) 93 | sub_selector = etree.HTML(sub_html) 94 | # 爬取的页数 95 | poiyeshu = sub_selector.xpath('//div[@class="ui_page"]/a/@data-page') 96 | try: 97 | poiyeshu = poiyeshu[-2] 98 | except: 99 | pass 100 | else: 101 | for ye in range(1,int(poiyeshu)+1): 102 | dangqianurl = cityurl+"?page=%s"%(ye) 103 | print '列表页url',dangqianurl 104 | dangqianhtml = getsource(dangqianurl) 105 | poiblocks = getpoiblock(dangqianhtml) 106 | print '当前页的poi数',len(poiblocks) 107 | for poiblock in poiblocks: 108 | dangqianselector = etree.HTML(poiblock) 109 | 110 | # 中文、英文、本地名称 111 | name0 = dangqianselector.xpath('//h3[@class="title fontYaHei"]/a/text()')[0].strip() 112 | name1 = dangqianselector.xpath('//h3[@class="title fontYaHei"]/a/span/text()') 113 | if len(name1)==0: 114 | name1 = '' 115 | else: 116 | name1 = name1[0].strip() 117 | 118 | shouzimu = name0[0].encode('utf-8') 119 | if shouzimu.isalpha(): 120 | poi_en_name = name0 121 | poi_ch_name = '' 122 | else: 123 | poi_en_name = name1 124 | poi_ch_name = name0 125 | poi_loc_name = poi_en_name 126 | 127 | # 类别id 128 | if ci == 0: 129 | tag_id = 3 130 | elif ci == 1: 131 | tag_id =1 132 | elif ci == 2: 133 | tag_id = 4 134 | # 评分 135 | poi_score = dangqianselector.xpath('//span[@class="grade"]/text()') 136 | if len(poi_score)==0: 137 | poi_score='' 138 | else: 139 | poi_score = 
poi_score[0] 140 | 141 | # 排名 142 | poi_rank = dangqianselector.xpath('//em[@class="rank orange"]/text()') 143 | poi_rank = pankong(poi_rank) 144 | newstr = '' 145 | for sr in poi_rank: 146 | if sr.isdigit(): 147 | newstr = newstr + sr 148 | poi_rank = newstr 149 | 150 | # 详情页url 151 | xiangqingurl = dangqianselector.xpath('//h3[@class="title fontYaHei"]/a/@href')[0] 152 | print '详情页',xiangqingurl 153 | try: 154 | xiangqinghtml = getsource(xiangqingurl) 155 | except: 156 | pass 157 | else: 158 | xiangqingselector = etree.HTML(xiangqinghtml) 159 | poi_tips_biaoti = xiangqingselector.xpath('//div[@class="poiDet-main"]/ul[@class="poiDet-tips"]/li/span/text()') 160 | biaotilist = [] 161 | for tipi,biaoti in enumerate(poi_tips_biaoti): 162 | biaoti = biaoti.strip() 163 | biaotilist.append(biaoti) 164 | 165 | for ti,biaoti in enumerate(biaotilist): 166 | if biaoti =='': 167 | biaotilist.pop(ti) 168 | for bi,biaoti in enumerate(biaotilist): 169 | if biaoti == '地址:': 170 | addi = bi + 1 171 | if biaoti == '电话:': 172 | telei = bi + 1 173 | if '地址:'in biaotilist: 174 | # 地址 175 | xpath_str_add = "//ul[@class='poiDet-tips']/li["+str(addi)+"]/div/p/text()" 176 | poi_address = xiangqingselector.xpath(xpath_str_add) 177 | poi_address = pankong(poi_address) 178 | else: 179 | poi_address = '' 180 | 181 | if '电话:'in biaotilist: 182 | # 电话 183 | xpath_str_tele = "//ul[@class='poiDet-tips']/li[" + str(telei) + "]/div/p/text()" 184 | poi_telephone = xiangqingselector.xpath(xpath_str_tele) 185 | poi_telephone = pankong(poi_telephone) 186 | if not poi_telephone: 187 | poi_telephone = '' 188 | else: 189 | poi_telephone = '' 190 | # 评论数 191 | pinglunshu = dangqianselector.xpath('//div[@class="info"]/span[@class="dping"]/a/text()') 192 | if len(pinglunshu)==0: 193 | pinglunshu ='' 194 | else: 195 | pinglunshu = pinglunshu[0] 196 | 197 | comments_count = pinglunshu.strip() 198 | newstr1 = '' 199 | for sr1 in comments_count: 200 | if sr1.isdigit(): 201 | newstr1 = newstr1 + sr1 202 | comments_count = newstr1 203 | # 来源 204 | source = 'qyer' 205 | sqli = "INSERT INTO " + db + "." + tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_tag_id,poi_score,poi_rank,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 206 | 207 | # 判断数据库是否已经存在城市数据,决定是插入数据还是更新数据。 208 | sqli1 = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name) 209 | sqli2 = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name) 210 | # print '中文查询',sqli1 211 | # print '英文查询',sqli2 212 | try: 213 | r1 = cur.execute(sqli1) 214 | r2 = cur.execute(sqli2) 215 | except: 216 | pass 217 | if poi_ch_name =='': 218 | r1 = 0 219 | if poi_en_name == '': 220 | r2 = 0 221 | print '查询结果:','中文',r1,'英文',r2 222 | if r1 or r2: 223 | print '已经存在记录,迭代数据 ... ...' 224 | pass 225 | else: 226 | print '插入新POI... ...' 
227 | cur.execute(sqli,(poi_ch_name, poi_en_name, poi_loc_name, region_id, tag_id,poi_score,poi_rank,poi_address,poi_telephone,comments_count,source)) 228 | conn.commit() 229 | print '------------------------------------------------' 230 | cur.close() 231 | conn.close() 232 | print '------------finished--------------' 233 | -------------------------------------------------------------------------------- /poispider/qyer_one_city.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from lxml import etree 3 | import requests 4 | import os 5 | import sys 6 | import MySQLdb 7 | import re 8 | reload(sys) 9 | sys.setdefaultencoding("utf-8") 10 | 11 | #用来获取网页源代码 12 | def getsource(url): 13 | html = requests.get(url) 14 | html.encoding = 'utf-8' 15 | return html.text 16 | #获取信息块 17 | def getcountryblock(source): 18 | blocks = re.findall('(

    )',source,re.S) 19 | return blocks 20 | 21 | def getpoiblock(source): 22 | blocks = re.findall('(
  • )',source,re.S) 23 | return blocks 24 | # 对匹配为空进行处理 25 | def pankong(poi_xx): 26 | if len(poi_xx)==0: 27 | poi_xx = '' 28 | else: 29 | poi_xx = poi_xx[0] 30 | return poi_xx 31 | 32 | if __name__ == '__main__': 33 | country = 'singapore' 34 | city = '新加坡' 35 | starturl = 'http://place.qyer.com/singapore/' 36 | 37 | db = 'map' 38 | # 数据表 39 | tb = 'map_poi' 40 | # 希望跳过抓取的城市 41 | hulvcities = range(1,826) 42 | 43 | # 连接数据库 44 | try: 45 | # conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 46 | conn = MySQLdb.connect(host='172.22.185.78', user='root', passwd='123456', port=3306, charset='utf8') 47 | cur = conn.cursor() 48 | cur.execute('set interactive_timeout=96*3600') 49 | conn.select_db(db) 50 | except MySQLdb.Error, e: 51 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 52 | 53 | 54 | # 对应的城市id 55 | sqli1 = "select region_id from " + db + ".map_region" + " where region_ch_name = " + "'%s'" % (city) 56 | num_result = cur.execute(sqli1) 57 | if not num_result: 58 | region_id = '' 59 | else: 60 | region_id = cur.fetchmany(1) 61 | region_id = region_id[0][0] 62 | 63 | # 过滤已经抓取完成的城市 64 | if region_id in hulvcities: 65 | print city, '已经完成抓取,跳过' 66 | pass 67 | else: 68 | # 城市主页 69 | sighturl = starturl+'sight/' 70 | foodurl = starturl+'food/' 71 | shoppingurl = starturl+'shopping/' 72 | city_urls = [sighturl,foodurl,shoppingurl] 73 | for ci,cityurl in enumerate(city_urls): 74 | print cityurl 75 | sub_html = getsource(cityurl) 76 | sub_selector = etree.HTML(sub_html) 77 | # 爬取的页数 78 | poiyeshu = sub_selector.xpath('//div[@class="ui_page"]/a/@data-page') 79 | try: 80 | poiyeshu = poiyeshu[-2] 81 | except: 82 | pass 83 | else: 84 | for ye in range(1,int(poiyeshu)+1): 85 | dangqianurl = cityurl+"?page=%s"%(ye) 86 | print '列表页url',dangqianurl 87 | dangqianhtml = getsource(dangqianurl) 88 | poiblocks = getpoiblock(dangqianhtml) 89 | print '当前页的poi数',len(poiblocks) 90 | for poiblock in poiblocks: 91 | dangqianselector = etree.HTML(poiblock) 92 | 93 | # 中文、英文、本地名称 94 | name0 = dangqianselector.xpath('//h3[@class="title fontYaHei"]/a/text()')[0].strip() 95 | name1 = dangqianselector.xpath('//h3[@class="title fontYaHei"]/a/span/text()') 96 | if len(name1)==0: 97 | name1 = '' 98 | else: 99 | name1 = name1[0].strip() 100 | 101 | shouzimu = name0[0].encode('utf-8') 102 | if shouzimu.isalpha(): 103 | poi_en_name = name0 104 | poi_ch_name = '' 105 | else: 106 | poi_en_name = name1 107 | poi_ch_name = name0 108 | poi_loc_name = poi_en_name 109 | 110 | # 类别id 111 | if ci == 0: 112 | tag_id = 3 113 | elif ci == 1: 114 | tag_id =1 115 | elif ci == 2: 116 | tag_id = 4 117 | # 评分 118 | poi_score = dangqianselector.xpath('//span[@class="grade"]/text()') 119 | if len(poi_score)==0: 120 | poi_score='' 121 | else: 122 | poi_score = poi_score[0] 123 | 124 | # 排名 125 | poi_rank = dangqianselector.xpath('//em[@class="rank orange"]/text()') 126 | poi_rank = pankong(poi_rank) 127 | newstr = '' 128 | for sr in poi_rank: 129 | if sr.isdigit(): 130 | newstr = newstr + sr 131 | poi_rank = newstr 132 | 133 | # 详情页url 134 | xiangqingurl = dangqianselector.xpath('//h3[@class="title fontYaHei"]/a/@href')[0] 135 | print '详情页',xiangqingurl 136 | try: 137 | xiangqinghtml = getsource(xiangqingurl) 138 | except: 139 | pass 140 | else: 141 | xiangqingselector = etree.HTML(xiangqinghtml) 142 | poi_tips_biaoti = xiangqingselector.xpath('//div[@class="poiDet-main"]/ul[@class="poiDet-tips"]/li/span/text()') 143 | biaotilist = [] 144 | for tipi,biaoti in enumerate(poi_tips_biaoti): 145 | 
biaoti = biaoti.strip() 146 | biaotilist.append(biaoti) 147 | 148 | for ti,biaoti in enumerate(biaotilist): 149 | if biaoti =='': 150 | biaotilist.pop(ti) 151 | for bi,biaoti in enumerate(biaotilist): 152 | if biaoti == '地址:': 153 | addi = bi + 1 154 | if biaoti == '电话:': 155 | telei = bi + 1 156 | if '地址:'in biaotilist: 157 | # 地址 158 | xpath_str_add = "//ul[@class='poiDet-tips']/li["+str(addi)+"]/div/p/text()" 159 | poi_address = xiangqingselector.xpath(xpath_str_add) 160 | poi_address = pankong(poi_address) 161 | else: 162 | poi_address = '' 163 | 164 | if '电话:'in biaotilist: 165 | # 电话 166 | xpath_str_tele = "//ul[@class='poiDet-tips']/li[" + str(telei) + "]/div/p/text()" 167 | poi_telephone = xiangqingselector.xpath(xpath_str_tele) 168 | poi_telephone = pankong(poi_telephone) 169 | if not poi_telephone: 170 | poi_telephone = '' 171 | else: 172 | poi_telephone = '' 173 | # 评论数 174 | pinglunshu = dangqianselector.xpath('//div[@class="info"]/span[@class="dping"]/a/text()') 175 | if len(pinglunshu)==0: 176 | pinglunshu ='' 177 | else: 178 | pinglunshu = pinglunshu[0] 179 | 180 | comments_count = pinglunshu.strip() 181 | newstr1 = '' 182 | for sr1 in comments_count: 183 | if sr1.isdigit(): 184 | newstr1 = newstr1 + sr1 185 | comments_count = newstr1 186 | # 来源 187 | source = 'qyer' 188 | sqli = "INSERT INTO " + db + "." + tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_tag_id,poi_score,poi_rank,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 189 | 190 | # 判断数据库是否已经存在城市数据,决定是插入数据还是更新数据。 191 | sqli1 = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name) 192 | sqli2 = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name) 193 | # print '中文查询',sqli1 194 | # print '英文查询',sqli2 195 | try: 196 | r1 = cur.execute(sqli1) 197 | r2 = cur.execute(sqli2) 198 | except: 199 | pass 200 | if poi_ch_name =='': 201 | r1 = 0 202 | if poi_en_name == '': 203 | r2 = 0 204 | print '查询结果:','中文',r1,'英文',r2 205 | if r1 or r2: 206 | print '已经存在记录,迭代数据 ... ...' 207 | pass 208 | else: 209 | print '插入新POI... ...' 
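# Hedged sketch (not part of the original script): the branch above only prints
# a message when a POI already exists and then moves on. If the "迭代数据"
# (refresh existing data) step were actually implemented, a parameterized UPDATE
# against the same columns used by the INSERT below could look like:
#
#   update_sql = ("UPDATE " + db + "." + tb +
#                 " SET poi_score = %s, poi_rank = %s, comments_count = %s"
#                 " WHERE poi_ch_name = %s OR poi_en_name = %s")
#   cur.execute(update_sql, (poi_score, poi_rank, comments_count,
#                            poi_ch_name, poi_en_name))
#   conn.commit()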
210 | cur.execute(sqli,(poi_ch_name, poi_en_name, poi_loc_name, region_id, tag_id,poi_score,poi_rank,poi_address,poi_telephone,comments_count,source)) 211 | conn.commit() 212 | print '------------------------------------------------' 213 | cur.close() 214 | conn.close() 215 | print '------------finished--------------' 216 | -------------------------------------------------------------------------------- /poispider/tripadvisor.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from lxml import etree 3 | import requests 4 | import os 5 | import sys 6 | import MySQLdb 7 | import re 8 | import random 9 | import time 10 | reload(sys) 11 | sys.setdefaultencoding("utf-8") 12 | 13 | def qushuzi(str): 14 | num = '' 15 | if not str: 16 | return num 17 | else: 18 | for sr in str: 19 | if sr.isdigit(): 20 | num = num+sr 21 | return num 22 | #用来获取网页源代码 23 | def getsource(url): 24 | headlist = [{'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.'}, 25 | {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}, 26 | {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'}, 27 | {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}, 28 | {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}, 29 | {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'}, 30 | {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}, 31 | {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'}, 32 | {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}, 33 | {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'}, 34 | {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'}, 35 | {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}] 36 | num = len(headlist)-1 37 | html = requests.get(url, headers=headlist[random.randint(0, num)]) 38 | html.encoding = 'utf-8' 39 | return html.text 40 | #获取信息块 41 | def getcityblock(source): 42 | blocks = re.findall('()',source,re.S) 48 | elif ci == 1: 49 | blocks = re.findall('(
    .*?
    )', source, re.S) 50 | elif ci == 2: 51 | blocks = re.findall('(
    .*?
    )',source,re.S) 52 | else: 53 | blocks = [] 54 | return blocks 55 | # 对匹配为空进行处理 56 | def pankong(poi_xx): 57 | if len(poi_xx)==0: 58 | poi_xx = '' 59 | else: 60 | poi_xx = poi_xx[0] 61 | return poi_xx 62 | def tiaoguo(selector): 63 | guolvcibiao = ['市'] 64 | city = selector.xpath('//li[@class="cityName tabItem dropDown hvrIE6"]/span/span/text()') 65 | city = pankong(city) 66 | # 对应的城市id 67 | sqli = "select region_id from " + db + ".map_region" + " where region_ch_name = " + "'%s'" % (city) 68 | num_result = cur.execute(sqli) 69 | if not num_result: 70 | region_id = '' 71 | else: 72 | region_id = cur.fetchmany(1) 73 | region_id = region_id[0][0] 74 | # 过滤已经抓取完成的城市 75 | if region_id in hulvcities: 76 | return city,region_id,True 77 | else: 78 | return city,region_id,False 79 | def yemianjiexi(block,ci): 80 | xiangqingselector = '' 81 | poi_ch_name = '' 82 | poi_en_name = '' 83 | poi_loc_name = '' 84 | poi_telephone ='' 85 | poi_address ='' 86 | poi_rank='' 87 | comments_count='' 88 | 89 | selector = etree.HTML(block) 90 | # 详情页信息获取异常标记 91 | tiaoguopoi = 0 92 | 93 | if ci == 0: 94 | # 详情页url 95 | 96 | xiangqingurl = selector.xpath('//div[@class="listing_title"]/a/@href')[0] 97 | xiangqingurl = starturl + xiangqingurl 98 | # print '详情页', xiangqingurl 99 | try: 100 | xiangqinghtml = getsource(xiangqingurl) 101 | except: 102 | tiaoguopoi = 1 103 | return tiaoguopoi, xiangqingselector, poi_ch_name, poi_en_name, poi_loc_name, poi_telephone, poi_address, poi_rank, comments_count 104 | else: 105 | time.sleep(tingliu) 106 | xiangqingselector = etree.HTML(xiangqinghtml) 107 | 108 | # 名称 109 | name0 = selector.xpath('//div[@class="listing_title"]/a/text()') 110 | name0 = pankong(name0) 111 | 112 | name1 = xiangqingselector.xpath('//span[@class="altHead"]/text()') 113 | name1 = pankong(name1) 114 | shouzimu0 = name0[0].encode('utf-8') 115 | if not name1: 116 | if shouzimu0.isalpha(): 117 | poi_en_name = name0 118 | poi_ch_name = '' 119 | else: 120 | poi_en_name = '' 121 | poi_ch_name = name0 122 | else: 123 | if shouzimu0.isalpha(): 124 | poi_en_name = name0 125 | poi_ch_name = name1 126 | else: 127 | poi_en_name = name1 128 | poi_ch_name = name0 129 | 130 | poi_loc_name = poi_en_name 131 | 132 | comments_count = xiangqingselector.xpath('//a[@class="more taLnk"]/@content') 133 | comments_count = pankong(comments_count) 134 | poi_rank = xiangqingselector.xpath('//b[@class="rank"]/text()') 135 | poi_rank = pankong(poi_rank) 136 | if poi_rank: 137 | poi_rank = qushuzi(poi_rank) 138 | # 酒店类的电话获取还未完成,后期优化 139 | poi_telephone = '' 140 | # 地址 141 | street_address = xiangqingselector.xpath('//span[@class="street-address"]/text()') 142 | street_address = pankong(street_address) 143 | 144 | extended_address = xiangqingselector.xpath('//span[@class="extended-address"]/text()') 145 | extended_address = pankong(extended_address) 146 | 147 | addresslocality = xiangqingselector.xpath('//span[@property="addressLocality"]/text()') 148 | addresslocality = pankong(addresslocality) 149 | 150 | addressregion = xiangqingselector.xpath('//span[@property="addressRegion"]/text()') 151 | addressregion = pankong(addressregion) 152 | 153 | postcode = xiangqingselector.xpath('//span[@property="postalCode"]/text()') 154 | postcode = pankong(postcode) 155 | 156 | elif ci == 1: 157 | # 详情页url 158 | xiangqingurl = selector.xpath('//div[@class="property_title"]/a/@href')[0] 159 | xiangqingurl = starturl + xiangqingurl 160 | # print '详情页', xiangqingurl 161 | try: 162 | xiangqinghtml = getsource(xiangqingurl) 163 | except: 164 | 
tiaoguopoi = 1 165 | return tiaoguopoi, xiangqingselector, poi_ch_name, poi_en_name, poi_loc_name, poi_telephone, poi_address, poi_rank, comments_count 166 | else: 167 | time.sleep(tingliu) 168 | xiangqingselector = etree.HTML(xiangqinghtml) 169 | # 名称 170 | name0 = selector.xpath('//a[@target="_blank"]/text()') 171 | name0 = pankong(name0).strip() 172 | if not name0: 173 | name0 = xiangqingselector.xpath('//h1[@id="HEADING"]/text()') 174 | name0 = pankong(name0).strip() 175 | name1 = xiangqingselector.xpath('//span[@class="altHead"]/text()') 176 | name1 = pankong(name1) 177 | shouzimu0 = name0[0].encode('utf-8') 178 | if not name1: 179 | if shouzimu0.isalpha(): 180 | poi_en_name = name0 181 | poi_ch_name = '' 182 | else: 183 | poi_en_name = '' 184 | poi_ch_name = name0 185 | else: 186 | if shouzimu0.isalpha(): 187 | poi_en_name = name0 188 | poi_ch_name = name1 189 | else: 190 | poi_en_name = name1 191 | poi_ch_name = name0 192 | poi_loc_name = poi_en_name 193 | 194 | comments_count = xiangqingselector.xpath('//a[@class="more"]/@content') 195 | comments_count = pankong(comments_count) 196 | poi_rank = xiangqingselector.xpath('//b[@class="rank_text wrap"]/span/text()') 197 | poi_rank = pankong(poi_rank) 198 | if poi_rank: 199 | poi_rank = qushuzi(poi_rank) 200 | poi_telephone = xiangqingselector.xpath('//div[@class="phoneNumber"]/text()') 201 | poi_telephone = pankong(poi_telephone) 202 | if poi_telephone: 203 | for i,ch in enumerate(poi_telephone): 204 | if ch.isdigit(): 205 | tempi = i 206 | break 207 | poi_telephone = poi_telephone[tempi:] 208 | 209 | # 地址 210 | street_address = xiangqingselector.xpath('//span[@class="street-address"]/text()') 211 | street_address = pankong(street_address) 212 | 213 | extended_address = xiangqingselector.xpath('//span[@class="extended-address"]/text()') 214 | extended_address = pankong(extended_address) 215 | 216 | addresslocality = xiangqingselector.xpath('//span[@property="addressLocality"]/text()') 217 | addresslocality = pankong(addresslocality) 218 | 219 | addressregion = xiangqingselector.xpath('//span[@property="addressRegion"]/text()') 220 | addressregion = pankong(addressregion) 221 | 222 | postcode = xiangqingselector.xpath('//span[@property="postalCode"]/text()') 223 | postcode = pankong(postcode) 224 | 225 | elif ci == 2: 226 | # 详情页url 227 | xiangqingurl = selector.xpath('//a[@class="property_title"]/@href')[0] 228 | xiangqingurl = starturl + xiangqingurl 229 | # print '详情页', xiangqingurl 230 | try: 231 | xiangqinghtml = getsource(xiangqingurl) 232 | except: 233 | tiaoguopoi = 1 234 | return tiaoguopoi,xiangqingselector,poi_ch_name,poi_en_name,poi_loc_name,poi_telephone,poi_address,poi_rank,comments_count 235 | else: 236 | time.sleep(tingliu) 237 | xiangqingselector = etree.HTML(xiangqinghtml) 238 | 239 | # 名称 240 | poi_ch_name = '' 241 | poi_en_name = selector.xpath('//a[@target="_blank"]/text()') 242 | poi_en_name = pankong(poi_en_name) 243 | poi_en_name = poi_en_name.strip() 244 | poi_loc_name = poi_en_name 245 | 246 | comments_count = xiangqingselector.xpath('//a[@class="more"]/@content') 247 | comments_count = pankong(comments_count) 248 | poi_rank = xiangqingselector.xpath('//b[@class="rank_text wrap"]/span/text()') 249 | poi_rank = pankong(poi_rank) 250 | if poi_rank: 251 | poi_rank = qushuzi(poi_rank) 252 | poi_telephone = xiangqingselector.xpath('//div[@class="fl phoneNumber"]/text()') 253 | poi_telephone = pankong(poi_telephone) 254 | 255 | # 地址 256 | street_address = 
xiangqingselector.xpath('//span[@class="street-address"]/text()') 257 | street_address = pankong(street_address) 258 | 259 | extended_address = xiangqingselector.xpath('//span[@class="extended-address"]/text()') 260 | extended_address = pankong(extended_address) 261 | 262 | addresslocality = xiangqingselector.xpath('//span[@property="addressLocality"]/text()') 263 | addresslocality = pankong(addresslocality) 264 | 265 | addressregion = xiangqingselector.xpath('//span[@property="addressRegion"]/text()') 266 | addressregion = pankong(addressregion) 267 | 268 | postcode = xiangqingselector.xpath('//span[@property="postalCode"]/text()') 269 | postcode = pankong(postcode) 270 | 271 | if extended_address: 272 | poi_address = street_address + ',' + extended_address+','+addresslocality + ',' + addressregion + postcode 273 | else: 274 | poi_address = street_address + ',' + addresslocality + ',' + addressregion + postcode 275 | 276 | if not poi_telephone: 277 | poi_telephone = '' 278 | else: 279 | pass 280 | return tiaoguopoi,xiangqingselector,poi_ch_name,poi_en_name,poi_loc_name,poi_telephone,poi_address,poi_rank,comments_count 281 | 282 | if __name__ == '__main__': 283 | starturl = 'http://www.tripadvisor.cn' 284 | tingliu = 2 285 | db = 'map' 286 | # 数据表 287 | tb = 'map_poi' 288 | # 希望跳过抓取的城市 289 | hulvcities = range(1,2) 290 | # 跳过多少个城市 291 | num_tiao = 3 292 | # 来源 293 | source = 'tripadvisor' 294 | # 连接数据库 295 | try: 296 | # conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 297 | conn = MySQLdb.connect(host='172.22.185.130', user='root', passwd='123456', port=3306, charset='utf8') 298 | cur = conn.cursor() 299 | cur.execute('set interactive_timeout=96*3600') 300 | conn.select_db(db) 301 | except MySQLdb.Error, e: 302 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 303 | 304 | for i in range(2,51): 305 | # 城市列表页 306 | # url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo=191&offset=%d&desktop=true'% (i) 307 | url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo=150768&offset=%d&desktop=true' % (i) 308 | print '城市列表页:',url 309 | html = getsource(url) 310 | time.sleep(tingliu) 311 | blocks = getcityblock(html)# 设置当前列表页跳过前面多少个城市 312 | 313 | for j, block in enumerate(blocks): 314 | if num_tiao == 0: 315 | pass 316 | elif num_tiao>j: 317 | continue 318 | selector1 = etree.HTML(block) 319 | 320 | # 城市主页 321 | sub_url = selector1.xpath('//a/@href')[0] 322 | zhuye_url = starturl+sub_url 323 | print '城市主页:',zhuye_url 324 | zhuye_html = getsource(zhuye_url) 325 | time.sleep(tingliu) 326 | zhuye_selector = etree.HTML(zhuye_html) 327 | poi_region,region_id, tiaoma = tiaoguo(zhuye_selector) 328 | if tiaoma: 329 | pass 330 | else: 331 | hotel_url = zhuye_selector.xpath('//li[@class="hotels twoLines"]/a/@href')[0] 332 | jingdian_url = zhuye_selector.xpath('//li[@class="attractions twoLines"]/a/@href')[0] 333 | canting_url = zhuye_selector.xpath('//li[@class="restaurants twoLines"]/a/@href')[0] 334 | city_urls = [hotel_url,jingdian_url,canting_url] 335 | for ci,cityurl in enumerate(city_urls): 336 | city_url = starturl+cityurl 337 | print '行业url',city_url 338 | if ci ==0: 339 | tag_id = 2 340 | if ci == 1: 341 | tag_id = 3 342 | if ci == 2: 343 | tag_id = 1 344 | dizhiyuanshu = city_url.split('-') 345 | sub_html = getsource(city_url) 346 | time.sleep(tingliu) 347 | sub_selector = etree.HTML(sub_html) 348 | # 爬取的页数 349 | poiyeshu = sub_selector.xpath('//a[@class="pageNum last taLnk"]/@data-page-number') 350 | if len(poiyeshu) == 0: 351 | poiyeshu = 
1
352 | else:
353 | poiyeshu = poiyeshu[0]
354 |
355 | for ye in range(1,int(poiyeshu)+1):
356 | index = 30*(ye-1)
357 | if index != 0:
358 | if ci == 0:
359 | dangqianurl = dizhiyuanshu[0]+'-'+dizhiyuanshu[1]+'-'+'oa'+str(index)+'-'+dizhiyuanshu[2]+'-'+dizhiyuanshu[3]
360 | if ci == 1:
361 | dangqianurl = dizhiyuanshu[0]+'-'+dizhiyuanshu[1]+'-'+dizhiyuanshu[2]+'-'+'oa'+str(index)+'-'+dizhiyuanshu[3]
362 | if ci == 2:
363 | dangqianurl = dizhiyuanshu[0]+'-'+dizhiyuanshu[1]+'-'+'oa'+str(index)+'-'+dizhiyuanshu[2]
364 | else:
365 | dangqianurl = city_url
366 | print '列表页url',dangqianurl
367 | dangqianhtml = getsource(dangqianurl)
368 | time.sleep(tingliu)
369 | poiblocks = getpoiblock(dangqianhtml,ci)
370 | print '当前页的poi数',len(poiblocks)
371 | for poiblock in poiblocks:
372 | try:
373 | tiaoguopoi,xiangqingselector,poi_ch_name, poi_en_name, poi_loc_name, poi_telephone, poi_address, poi_rank,comments_count = yemianjiexi(poiblock,ci)
374 | except:
375 | pass
376 | else:
377 | # tiaoguopoi is the flag set while fetching the POI detail page: if the fetch failed it is 1 and this POI is skipped
378 | if tiaoguopoi:
379 | pass
380 | else:
381 | sqli = "INSERT INTO " + db + "." + tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_region,poi_tag_id,poi_rank,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
382 |
383 | # Check whether this record already exists in the database, to decide between inserting and updating.
384 | sqli1 = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name)
385 | sqli2 = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name)
386 |
387 | try:
388 | r1 = cur.execute(sqli1)
389 | r2 = cur.execute(sqli2)
390 | except:
391 | pass
392 |
393 | if not poi_ch_name :
394 | r1 = 0
395 | if not poi_en_name :
396 | r2 = 0
397 | # print '中文:',poi_ch_name,'英文:',poi_en_name
398 | # print '查询结果:','中文',r1,'英文',r2
399 |
400 | if r1 or r2:
401 | print '已经存在记录,迭代数据 ... ...'
402 | pass
403 | else:
404 | print '插入新POI... ...'
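# Hedged note (not part of the original script): if either duplicate-check
# SELECT above raises, the bare `except: pass` leaves r1/r2 holding values from
# the previous POI, so the existence test can be wrong. Initialising both
# counters before the try block avoids that, e.g.:
#
#   r1 = r2 = 0
#   try:
#       r1 = cur.execute(sqli1)
#       r2 = cur.execute(sqli2)
#   except MySQLdb.Error:
#       pass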
405 | print '中文:' + poi_ch_name,'英文:' + poi_en_name, '本地语言名称:' + poi_en_name, '城市id:' + str(region_id),'城市:'+poi_region, '类型:' + str(tag_id), '评论数:' + str(comments_count), '排名:' + str(poi_rank), '地址:' + str(poi_address), '电话:' + str(poi_telephone) 406 | cur.execute(sqli,(poi_ch_name, poi_en_name, poi_loc_name, region_id,poi_region, tag_id,poi_rank,poi_address,poi_telephone,comments_count,source)) 407 | conn.commit() 408 | print '------------------------------------------------' 409 | cur.close() 410 | conn.close() 411 | print '------------finished--------------' 412 | -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | #encoding=utf-8 2 | from lxml import etree 3 | import requests 4 | import os 5 | import sys 6 | import MySQLdb 7 | import re 8 | import random 9 | import time 10 | reload(sys) 11 | sys.setdefaultencoding("utf-8") 12 | 13 | def qushuzi(str): 14 | num = '' 15 | if not str: 16 | return num 17 | else: 18 | for sr in str: 19 | if sr.isdigit(): 20 | num = num+sr 21 | return num 22 | #用来获取网页源代码 23 | def getsource(url): 24 | headlist = [{'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.'}, 25 | {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50'}, 26 | {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;'}, 27 | {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}, 28 | {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)'}, 29 | {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'}, 30 | {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1'}, 31 | {'User-Agent': 'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11'}, 32 | {'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'}, 33 | {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'}, 34 | {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)'}, 35 | {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}] 36 | num = len(headlist)-1 37 | html = requests.get(url, headers=headlist[random.randint(0, num)]) 38 | html.encoding = 'utf-8' 39 | return html.text 40 | #获取信息块 41 | def getcityblock(source): 42 | blocks = re.findall('(
    )',source,re.S) 48 | elif ci == 1: 49 | blocks = re.findall('(
    .*?
    )', source, re.S) 50 | elif ci == 2: 51 | blocks = re.findall('(
    .*?
    )',source,re.S) 52 | else: 53 | blocks = [] 54 | return blocks 55 | # 对匹配为空进行处理 56 | def pankong(poi_xx): 57 | if len(poi_xx)==0: 58 | poi_xx = '' 59 | else: 60 | poi_xx = poi_xx[0] 61 | return poi_xx 62 | def tiaoguo(selector): 63 | guolvcibiao = ['市'] 64 | city = selector.xpath('//li[@class="cityName tabItem dropDown hvrIE6"]/span/span/text()') 65 | city = pankong(city) 66 | # 对应的城市id 67 | sqli = "select region_id from " + db + ".map_region" + " where region_ch_name = " + "'%s'" % (city) 68 | num_result = cur.execute(sqli) 69 | if not num_result: 70 | region_id = '' 71 | else: 72 | region_id = cur.fetchmany(1) 73 | region_id = region_id[0][0] 74 | # 过滤已经抓取完成的城市 75 | if region_id in hulvcities: 76 | return city,region_id,True 77 | else: 78 | return city,region_id,False 79 | def yemianjiexi(block,ci): 80 | selector = etree.HTML(block) 81 | 82 | if ci == 0: 83 | # 详情页url 84 | 85 | xiangqingurl = selector.xpath('//div[@class="listing_title"]/a/@href')[0] 86 | xiangqingurl = starturl + xiangqingurl 87 | print '详情页', xiangqingurl 88 | xiangqinghtml = getsource(xiangqingurl) 89 | time.sleep(tingliu) 90 | xiangqingselector = etree.HTML(xiangqinghtml) 91 | 92 | # 名称 93 | name0 = selector.xpath('//div[@class="listing_title"]/a/text()') 94 | name0 = pankong(name0) 95 | 96 | name1 = xiangqingselector.xpath('//span[@class="altHead"]/text()') 97 | name1 = pankong(name1) 98 | shouzimu0 = name0[0].encode('utf-8') 99 | if not name1: 100 | if shouzimu0.isalpha(): 101 | poi_en_name = name0 102 | poi_ch_name = '' 103 | else: 104 | poi_en_name = '' 105 | poi_ch_name = name0 106 | else: 107 | if shouzimu0.isalpha(): 108 | poi_en_name = name0 109 | poi_ch_name = name1 110 | else: 111 | poi_en_name = name1 112 | poi_ch_name = name0 113 | 114 | poi_loc_name = poi_en_name 115 | 116 | comments_count = xiangqingselector.xpath('//a[@class="more taLnk"]/@content') 117 | comments_count = pankong(comments_count) 118 | poi_rank = xiangqingselector.xpath('//b[@class="rank"]/text()') 119 | poi_rank = pankong(poi_rank) 120 | if poi_rank: 121 | poi_rank = qushuzi(poi_rank) 122 | poi_telephone = xiangqingselector.xpath('//div[@class="fl"]') 123 | poi_telephone = pankong(poi_telephone) 124 | # 地址 125 | street_address = xiangqingselector.xpath('//span[@class="street-address"]/text()') 126 | street_address = pankong(street_address) 127 | 128 | extended_address = xiangqingselector.xpath('//span[@class="extended-address"]/text()') 129 | extended_address = pankong(extended_address) 130 | 131 | addresslocality = xiangqingselector.xpath('//span[@property="addressLocality"]/text()') 132 | addresslocality = pankong(addresslocality) 133 | 134 | addressregion = xiangqingselector.xpath('//span[@property="addressRegion"]/text()') 135 | addressregion = pankong(addressregion) 136 | 137 | postcode = xiangqingselector.xpath('//span[@property="postalCode"]/text()') 138 | postcode = pankong(postcode) 139 | 140 | elif ci == 1: 141 | # 详情页url 142 | xiangqingurl = selector.xpath('//div[@class="property_title"]/a/@href')[0] 143 | xiangqingurl = starturl + xiangqingurl 144 | print '详情页', xiangqingurl 145 | xiangqinghtml = getsource(xiangqingurl) 146 | time.sleep(tingliu) 147 | xiangqingselector = etree.HTML(xiangqinghtml) 148 | # 名称 149 | name0 = selector.xpath('//a[@target="_blank"]/text()') 150 | name0 = pankong(name0).strip() 151 | if not name0: 152 | name0 = xiangqingselector.xpath('//h1[@id="HEADING"]/text()') 153 | name0 = pankong(name0).strip() 154 | name1 = xiangqingselector.xpath('//span[@class="altHead"]/text()') 155 | name1 = 
pankong(name1) 156 | shouzimu0 = name0[0].encode('utf-8') 157 | if not name1: 158 | if shouzimu0.isalpha(): 159 | poi_en_name = name0 160 | poi_ch_name = '' 161 | else: 162 | poi_en_name = '' 163 | poi_ch_name = name0 164 | else: 165 | if shouzimu0.isalpha(): 166 | poi_en_name = name0 167 | poi_ch_name = name1 168 | else: 169 | poi_en_name = name1 170 | poi_ch_name = name0 171 | poi_loc_name = poi_en_name 172 | 173 | comments_count = xiangqingselector.xpath('//a[@class="more"]/@content') 174 | comments_count = pankong(comments_count) 175 | poi_rank = xiangqingselector.xpath('//b[@class="rank_text wrap"]/span/text()') 176 | poi_rank = pankong(poi_rank) 177 | if poi_rank: 178 | poi_rank = qushuzi(poi_rank) 179 | poi_telephone = xiangqingselector.xpath('//div[@class="phoneNumber"]/text()') 180 | poi_telephone = pankong(poi_telephone) 181 | if poi_telephone: 182 | for i,ch in enumerate(poi_telephone): 183 | if ch.isdigit(): 184 | tempi = i 185 | break 186 | poi_telephone = poi_telephone[tempi:] 187 | 188 | # 地址 189 | street_address = xiangqingselector.xpath('//span[@class="street-address"]/text()') 190 | street_address = pankong(street_address) 191 | 192 | extended_address = xiangqingselector.xpath('//span[@class="extended-address"]/text()') 193 | extended_address = pankong(extended_address) 194 | 195 | addresslocality = xiangqingselector.xpath('//span[@property="addressLocality"]/text()') 196 | addresslocality = pankong(addresslocality) 197 | 198 | addressregion = xiangqingselector.xpath('//span[@property="addressRegion"]/text()') 199 | addressregion = pankong(addressregion) 200 | 201 | postcode = xiangqingselector.xpath('//span[@property="postalCode"]/text()') 202 | postcode = pankong(postcode) 203 | 204 | elif ci == 2: 205 | # 详情页url 206 | xiangqingurl = selector.xpath('//a[@class="property_title"]/@href')[0] 207 | xiangqingurl = starturl + xiangqingurl 208 | print '详情页', xiangqingurl 209 | xiangqinghtml = getsource(xiangqingurl) 210 | time.sleep(tingliu) 211 | xiangqingselector = etree.HTML(xiangqinghtml) 212 | 213 | # 名称 214 | poi_ch_name = '' 215 | poi_en_name = selector.xpath('//a[@target="_blank"]/text()') 216 | poi_en_name = pankong(poi_en_name) 217 | poi_en_name = poi_en_name.strip() 218 | poi_loc_name = poi_en_name 219 | 220 | comments_count = xiangqingselector.xpath('//a[@class="more"]/@content') 221 | comments_count = pankong(comments_count) 222 | poi_rank = xiangqingselector.xpath('//b[@class="rank_text wrap"]/span/text()') 223 | poi_rank = pankong(poi_rank) 224 | if poi_rank: 225 | poi_rank = qushuzi(poi_rank) 226 | poi_telephone = xiangqingselector.xpath('//div[@class="fl phoneNumber"]/text()') 227 | poi_telephone = pankong(poi_telephone) 228 | 229 | # 地址 230 | street_address = xiangqingselector.xpath('//span[@class="street-address"]/text()') 231 | street_address = pankong(street_address) 232 | 233 | extended_address = xiangqingselector.xpath('//span[@class="extended-address"]/text()') 234 | extended_address = pankong(extended_address) 235 | 236 | addresslocality = xiangqingselector.xpath('//span[@property="addressLocality"]/text()') 237 | addresslocality = pankong(addresslocality) 238 | 239 | addressregion = xiangqingselector.xpath('//span[@property="addressRegion"]/text()') 240 | addressregion = pankong(addressregion) 241 | 242 | postcode = xiangqingselector.xpath('//span[@property="postalCode"]/text()') 243 | postcode = pankong(postcode) 244 | 245 | if extended_address: 246 | poi_address = street_address + ',' + extended_address+','+addresslocality + ',' + addressregion + 
postcode 247 | else: 248 | poi_address = street_address + ',' + addresslocality + ',' + addressregion + postcode 249 | 250 | if not poi_telephone: 251 | poi_telephone = '' 252 | else: 253 | pass 254 | return xiangqingselector,poi_ch_name,poi_en_name,poi_loc_name,poi_telephone,poi_address,poi_rank,comments_count 255 | 256 | if __name__ == '__main__': 257 | 258 | starturl = 'http://www.tripadvisor.cn' 259 | tingliu = 3 260 | db = 'map' 261 | # 数据表 262 | tb = 'map_poi4' 263 | # 希望跳过抓取的城市 264 | hulvcities = range(1,2) 265 | # 来源 266 | source = 'tripadvisor' 267 | # 连接数据库 268 | try: 269 | conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456', port=3306, charset='utf8') 270 | # conn = MySQLdb.connect(host='172.22.185.78', user='root', passwd='123456', port=3306, charset='utf8') 271 | cur = conn.cursor() 272 | cur.execute('set interactive_timeout=96*3600') 273 | conn.select_db(db) 274 | except MySQLdb.Error, e: 275 | print "Mysql Error %d: %s" % (e.args[0], e.args[1]) 276 | 277 | for i in range(4,51): 278 | url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo=191&offset=%d&desktop=true'% (i) 279 | print '城市列表页:',url 280 | html = getsource(url) 281 | time.sleep(tingliu) 282 | blocks = getcityblock(html) 283 | for j, block in enumerate(blocks): 284 | selector1 = etree.HTML(block) 285 | 286 | # 城市主页 287 | sub_url = selector1.xpath('//a/@href')[0] 288 | zhuye_url = starturl+sub_url 289 | print '城市主页:',zhuye_url 290 | zhuye_html = getsource(zhuye_url) 291 | time.sleep(tingliu) 292 | zhuye_selector = etree.HTML(zhuye_html) 293 | poi_region,region_id, tiaoma = tiaoguo(zhuye_selector) 294 | if tiaoma: 295 | pass 296 | else: 297 | # hotel_url = zhuye_selector.xpath('//li[@class="hotels twoLines"]/a/@href')[0] 298 | # jingdian_url = zhuye_selector.xpath('//li[@class="attractions twoLines"]/a/@href')[0] 299 | canting_url = zhuye_selector.xpath('//li[@class="restaurants twoLines"]/a/@href')[0] 300 | # city_urls = [hotel_url,jingdian_url,canting_url] 301 | city_urls = [canting_url] 302 | for ci,cityurl in enumerate(city_urls): 303 | city_url = starturl+cityurl 304 | print '行业url',city_url 305 | if ci ==0: 306 | tag_id = 2 307 | if ci == 1: 308 | tag_id = 3 309 | if ci == 2: 310 | tag_id = 1 311 | ci = 2 312 | dizhiyuanshu = city_url.split('-') 313 | sub_html = getsource(city_url) 314 | time.sleep(tingliu) 315 | sub_selector = etree.HTML(sub_html) 316 | # 爬取的页数 317 | poiyeshu = sub_selector.xpath('//a[@class="pageNum last taLnk"]/@data-page-number') 318 | if len(poiyeshu) == 0: 319 | poiyeshu = 1 320 | else: 321 | poiyeshu = poiyeshu[0] 322 | 323 | for ye in range(1,int(poiyeshu)+1): 324 | index = 30*(ye-1) 325 | if index != 0: 326 | if ci == 0: 327 | dangqianurl = dizhiyuanshu[0]+'-'+dizhiyuanshu[1]+'-'+'oa'+str(index)+'-'+dizhiyuanshu[2]+'-'+dizhiyuanshu[3] 328 | if ci == 1: 329 | dangqianhtml = dizhiyuanshu[0]+'-'+dizhiyuanshu[1]+'-'+dizhiyuanshu[2]+'-'+'oa'+str(index)+'-'+dizhiyuanshu[3] 330 | if ci == 2: 331 | dangqianhtml = dizhiyuanshu[0]+'-'+dizhiyuanshu[1]+'-'+'oa'+str(index)+'-'+dizhiyuanshu[2] 332 | else: 333 | dangqianurl = city_url 334 | print '列表页url',dangqianurl 335 | dangqianhtml = getsource(dangqianurl) 336 | time.sleep(tingliu) 337 | poiblocks = getpoiblock(dangqianhtml,ci) 338 | print '当前页的poi数',len(poiblocks) 339 | for poiblock in poiblocks: 340 | 341 | xiangqingselector,poi_ch_name, poi_en_name, poi_loc_name, poi_telephone, poi_address, poi_rank,comments_count = yemianjiexi(poiblock,ci) 342 | 343 | sqli = "INSERT INTO " + db + "." 
+ tb + "(poi_ch_name,poi_en_name,poi_loc_name,poi_region_id,poi_region,poi_tag_id,poi_rank,poi_address,poi_telephone,comments_count,source_website)" + " VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" 344 | 345 | # 判断数据库是否已经存在城市数据,决定是插入数据还是更新数据。 346 | sqli1 = "select * from " + db + "." + tb + " where poi_ch_name = " + "'%s'" % (poi_ch_name) 347 | sqli2 = "select * from " + db + "." + tb + " where poi_en_name = " + "'%s'" % (poi_en_name) 348 | 349 | try: 350 | r1 = cur.execute(sqli1) 351 | r2 = cur.execute(sqli2) 352 | except: 353 | pass 354 | 355 | if not poi_ch_name : 356 | r1 = 0 357 | if not poi_en_name : 358 | r2 = 0 359 | print '中文:',poi_ch_name,'英文:',poi_en_name 360 | print '查询结果:','中文',r1,'英文',r2 361 | 362 | if r1 or r2: 363 | print '已经存在记录,迭代数据 ... ...' 364 | pass 365 | else: 366 | print '插入新POI... ...' 367 | print '中文:' + poi_ch_name,'英文:' + poi_en_name, '本地语言名称:' + poi_en_name, '城市id:' + str(region_id),'城市:'+poi_region, '类型:' + str(tag_id), '评论数:' + str(comments_count), '排名:' + str(poi_rank), '地址:' + poi_address, '电话:' + poi_telephone 368 | cur.execute(sqli,(poi_ch_name, poi_en_name, poi_loc_name, region_id,poi_region, tag_id,poi_rank,poi_address,poi_telephone,comments_count,source)) 369 | conn.commit() 370 | print '------------------------------------------------' 371 | cur.close() 372 | conn.close() 373 | print '------------finished--------------' 374 | -------------------------------------------------------------------------------- /tst.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import random 3 | import re 4 | import sys 5 | import urllib2 6 | 7 | # import MySQLdb 8 | import requests 9 | # from lxml import etree 10 | import socket 11 | # import proxyIP 12 | 13 | reload(sys) 14 | sys.setdefaultencoding("utf-8") 15 | 16 | #getsource用来获取网页源代码 17 | def getsource(url): 18 | headers = { 19 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', 20 | # 详情页 cookie 21 | # 'Cookie':'JSESSIONID=EE4F457A78D5ADB9B416BA9BD1260691; BIGipServerESI-WEB-APP-8080=748214282.36895.0000; PSSID="A1-wS2nsbT8yAaiz92yMmlx2BpnjzUUjx2BdSTY-18x2dyfQZ1yAMOOmix2BC4bxxs5fRAx3Dx3DS0YEN7J3xxOrhIYxxYDlHQyQx3Dx3D-9vvmzcndpRgQCGPd1c2qPQx3Dx3D-wx2BJQh9GKVmtdJw3700KssQx3Dx3D"; CUSTOMER="WUHAN UNIV"; E_GROUP_NAME="IC2 Platform"; esi.isLocalStorageCleared=true; Type=documents; Show=null; esi.docsActiveTab=0; esi.Type=; esi.FilterValue=; esi.GroupBy=; esi.FilterBy=; esi.Show=; esi.authorsList=; esi.frontList=; esi.fieldsList=; esi.instList=; esi.journalList=; esi.terriList=; esi.titleList=', 22 | # 列表页 cookie 23 | 'Cookie' : 'Type=documents; Show=null; esi.docsActiveTab=0; esi.Type=; esi.FilterValue=; esi.GroupBy=; esi.FilterBy=; esi.Show=; esi.authorsList=; esi.frontList=; esi.fieldsList=; esi.instList=; esi.journalList=; esi.terriList=; esi.titleList=; clientId=CID1242b7d4b50d1e0ed3ac323a908e08bd; esi.isLocalStorageCleared=true; qq%5Flogin%5Fstate=54D44A36B7E6D2165B2090C42F8CB27E; JSESSIONID=811CB477BFE1C370AB4A99C711A0C0E1; _pk_ref.2.a2d0=%5B%22%22%2C%22%22%2C1480485842%2C%22http%3A%2F%2Fwww.lib.whu.edu.cn%2Fdc%2Furlto_proxy.asp%3Fid%3D638%26url%3Dhttp%3A%2F%2Firas.lib.whu.edu.cn%3A8080%2Fgo%3Fid%3DESI%26source_id%3DWHU03752%26u%3D%26title%3DESI%C2%A3%C2%A8Essential%20Science%20Indicators%C2%A3%C2%A9%22%5D; _pk_id.2.a2d0=b5d6c9744eade5ad.1480474056.2.1480485842.1480485842.; _pk_ses.2.a2d0=*' 24 | } 25 | socket.setdefaulttimeout(60) 26 | 27 | html = requests.get(url,headers = 
headers) 28 | html.encoding = 'utf-8' 29 | return html.text 30 | 31 | if __name__ == '__main__': 32 | # 详情页 url 33 | # url = 'http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=TSMetrics&SrcApp=TSM_TEST&DestApp=WOS_CPL&DestLinkType=FullRecord&KeyUT=ISI:000262948300002' 34 | # 列表页 url 35 | url = 'http://iras.lib.whu.edu.cn:8080/rwt/ESI/https/MW3XTLUJN3SXT7DFPMYHI4DQNW3X85UTMW4YI3LTPMYGG55N/IndicatorsDataAction.action?_dc=1480468834254&type=documents&author=&researchField=CLINICAL%20MEDICINE&institution=&journal=&territory=&article_UT=&researchFront=&articleTitle=&docType=Top&year=&page=1&start=10&limit=10&sort=%5B%7B%22property%22%3A%22citations%22%2C%22direction%22%3A%22DESC%22%7D%5D' 36 | result = getsource(url) 37 | print result 38 | 39 | -------------------------------------------------------------------------------- /webofscience.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import random 3 | import re 4 | import sys 5 | import urllib2 6 | 7 | # import MySQLdb 8 | import requests 9 | # from lxml import etree 10 | import socket 11 | # import proxyIP 12 | 13 | reload(sys) 14 | sys.setdefaultencoding("utf-8") 15 | 16 | #getsource用来获取网页源代码 17 | def getsource(url): 18 | headers = { 19 | 'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', 20 | # 详情页 cookie 21 | # 'Cookie':'JSESSIONID=EE4F457A78D5ADB9B416BA9BD1260691; BIGipServerESI-WEB-APP-8080=748214282.36895.0000; PSSID="A1-wS2nsbT8yAaiz92yMmlx2BpnjzUUjx2BdSTY-18x2dyfQZ1yAMOOmix2BC4bxxs5fRAx3Dx3DS0YEN7J3xxOrhIYxxYDlHQyQx3Dx3D-9vvmzcndpRgQCGPd1c2qPQx3Dx3D-wx2BJQh9GKVmtdJw3700KssQx3Dx3D"; CUSTOMER="WUHAN UNIV"; E_GROUP_NAME="IC2 Platform"; esi.isLocalStorageCleared=true; Type=documents; Show=null; esi.docsActiveTab=0; esi.Type=; esi.FilterValue=; esi.GroupBy=; esi.FilterBy=; esi.Show=; esi.authorsList=; esi.frontList=; esi.fieldsList=; esi.instList=; esi.journalList=; esi.terriList=; esi.titleList=', 22 | # 列表页 cookie 23 | 'Cookie' : 'Type=documents; Show=null; esi.docsActiveTab=0; esi.Type=; esi.FilterValue=; esi.GroupBy=; esi.FilterBy=; esi.Show=; esi.authorsList=; esi.frontList=; esi.fieldsList=; esi.instList=; esi.journalList=; esi.terriList=; esi.titleList=; clientId=CID1242b7d4b50d1e0ed3ac323a908e08bd; esi.isLocalStorageCleared=true; qq%5Flogin%5Fstate=54D44A36B7E6D2165B2090C42F8CB27E; JSESSIONID=811CB477BFE1C370AB4A99C711A0C0E1; _pk_ref.2.a2d0=%5B%22%22%2C%22%22%2C1480485842%2C%22http%3A%2F%2Fwww.lib.whu.edu.cn%2Fdc%2Furlto_proxy.asp%3Fid%3D638%26url%3Dhttp%3A%2F%2Firas.lib.whu.edu.cn%3A8080%2Fgo%3Fid%3DESI%26source_id%3DWHU03752%26u%3D%26title%3DESI%C2%A3%C2%A8Essential%20Science%20Indicators%C2%A3%C2%A9%22%5D; _pk_id.2.a2d0=b5d6c9744eade5ad.1480474056.2.1480485842.1480485842.; _pk_ses.2.a2d0=*' 24 | } 25 | socket.setdefaulttimeout(60) 26 | 27 | html = requests.get(url,headers = headers) 28 | html.encoding = 'utf-8' 29 | return html.text 30 | 31 | if __name__ == '__main__': 32 | # 详情页 url 33 | # url = 'http://gateway.webofknowledge.com/gateway/Gateway.cgi?GWVersion=2&SrcAuth=TSMetrics&SrcApp=TSM_TEST&DestApp=WOS_CPL&DestLinkType=FullRecord&KeyUT=ISI:000262948300002' 34 | # 列表页 url 35 | url = 
'http://iras.lib.whu.edu.cn:8080/rwt/ESI/https/MW3XTLUJN3SXT7DFPMYHI4DQNW3X85UTMW4YI3LTPMYGG55N/IndicatorsDataAction.action?_dc=1480468834254&type=documents&author=&researchField=CLINICAL%20MEDICINE&institution=&journal=&territory=&article_UT=&researchFront=&articleTitle=&docType=Top&year=&page=1&start=10&limit=10&sort=%5B%7B%22property%22%3A%22citations%22%2C%22direction%22%3A%22DESC%22%7D%5D' 36 | result = getsource(url) 37 | print result 38 | 39 | --------------------------------------------------------------------------------