├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   ├── new_media_crawler.iml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── article_crawler
│   ├── .DS_Store
│   ├── __init__.py
│   ├── fb_crawler.py
│   ├── instagram_crawler.py
│   ├── main.py
│   ├── twitter_crawler.py
│   ├── weibo_crawler.py
│   ├── weibo_crawler_old.py
│   ├── weixin_crawler.py
│   ├── youku_crawler.py
│   └── youtube_crawler.py
└── fans_crawler
    ├── .DS_Store
    ├── __init__.py
    ├── fb_crawler.py
    ├── instagram_crawler.py
    ├── main.py
    ├── twitter_crawler.py
    ├── weibo_crawler.py
    ├── weibo_crawler_old.py
    ├── weixin_crawler.py
    ├── youku_crawler.py
    └── youtube_crawler.py
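Both sub-packages share the same layout: one module per platform plus a main.py dispatcher that aggregates every platform's result into a single JSON string (see article_crawler/main.py and fans_crawler/main.py below). A minimal driver sketch — hypothetical, not part of the repository — assuming the repository root is on sys.path, the per-platform credentials in each module have been filled in, and the same Python 2 interpreter the scripts target:

# run_all.py -- hypothetical driver script, not part of the repository
import json

from article_crawler.main import main as crawl_articles  # weekly per-post engagement totals
from fans_crawler.main import main as crawl_fans          # current per-platform fan counts

if __name__ == '__main__':
    articles = json.loads(crawl_articles())  # [{'platform': ..., 'data': {...}, 'date': ...}, ...]
    fans = json.loads(crawl_fans())          # [{'platform': ..., 'fans': ..., 'date': ...}, ...]
    print json.dumps({'articles': articles, 'fans': fans}, indent=2)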
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Crawlers for fan counts, view counts, like counts and comment counts on Facebook, Weibo, Twitter, YouTube and Youku.
--------------------------------------------------------------------------------
/article_crawler/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/article_crawler/.DS_Store
--------------------------------------------------------------------------------
/article_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/article_crawler/__init__.py
--------------------------------------------------------------------------------
/article_crawler/fb_crawler.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import urllib2
import json
import time
import datetime


def get_by_api():
    app_id = ''
    app_secret = ''
    access_token = app_id + '|' + app_secret
    username = 'Insta360VRVideoCamera'
    url = 'https://graph.facebook.com/' + username + '/posts?fields=shares,message,comments.limit(0).summary(true),likes.limit(0).summary(true),created_time,id,link&limit=100&access_token=' + access_token
    headers = {}
    headers['Host'] = 'graph.facebook.com'
    headers['Connection'] = 'keep-alive'
    headers['Upgrade-Insecure-Requests'] = '1'
    headers['Cache-Control'] = 'max-age=0'
    now = time.mktime(datetime.date.today().timetuple())
    week_ago = now - (3600 * 24 * 7)
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    share_total = 0
    like_total = 0
    comment_total = 0
    while True:
        request = urllib2.Request(url = url, headers = headers)
        response = urllib2.urlopen(request)
        page = response.read()
        jsonData = json.loads(page, encoding="utf-8")
        data = jsonData['data']
        for item in data:
            share = item['shares']['count'] if item.has_key('shares') else 0
            temp = time.mktime(time.strptime(item['created_time'], "%Y-%m-%dT%H:%M:%S+0000"))
            if temp >= week_ago:
                share_total += int(share)
                like_total += int(item['likes']['summary']['total_count'])
                comment_total += int(item['comments']['summary']['total_count'])
        if len(data) == 0:
            break
        paging = jsonData['paging'] if jsonData.has_key('paging') else {}
        url = paging['next'] if paging.has_key('next') else ''
        if not url:
            # no further page: stop instead of requesting an empty URL
            break
    result = {
        'platform': 'facebook',
        'date': today,
        'comment': comment_total,
        'like': like_total,
        'share': share_total,
        'dislike': 0,
        'view': 0
    }
    jsonResult = json.dumps(result)
    print jsonResult
    return jsonResult


if __name__ == '__main__':
    get_by_api()
--------------------------------------------------------------------------------
/article_crawler/instagram_crawler.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
import urllib2
import json
import datetime
import time


def get_by_api():
    username = 'insta360official'
    url = 'https://www.instagram.com/' + username + '/media/'
    now = time.mktime(datetime.date.today().timetuple())
    week_ago = now - (3600 * 24 * 7)
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    like_total = 0
    comment_total = 0
    view_total = 0
    request = urllib2.Request(url = url)
    response = urllib2.urlopen(request)
    page = response.read()
    print page
    jsonData = json.loads(page, encoding="utf-8")
    data = jsonData['items']
    for item in data:
        temp = int(item['created_time'])
        if temp >= week_ago:
            like_total += int(item['likes']['count'])
            comment_total += int(item['comments']['count'])
            if item.has_key('video_views'):
                view_total += int(item['video_views'])

    result = {
        'platform': 'instagram',
        'date': today,
        'comment': comment_total,
        'like': like_total,
        'share': 0,
        'dislike': 0,
        'view': view_total
    }
    jsonResult = json.dumps(result)
    print jsonResult
    return jsonResult


def OAuth():
    url = 'https://www.instagram.com/oauth/authorize/?client_id=&redirect_uri=http://www.baidu.com&response_type=token'
    request = urllib2.Request(url = url)
    response = urllib2.urlopen(request)
    redirect_url = response.geturl()
    request = urllib2.Request(url=redirect_url)
    response = urllib2.urlopen(request)
    redirect_url = response.geturl()
    print redirect_url
    # data = json.loads(page, encoding="utf-8")

| 58 | def get_by_request(): 59 | username = 'insta360official' 60 | url = 'https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22https%3A%2F%2Fwww.instagram.com%2F' + username + '%2F%22%20and%20xpath%3D%22%2Fhtml%2Fbody%2Fscript%5B1%5D%22&format=json' 61 | headers = {} 62 | headers['Host'] = 'query.yahooapis.com' 63 | headers['Connection'] = 'keep-alive' 64 | headers['Origin'] = 'https://livecounts.net' 65 | headers['Pragma'] = 'no-cache' 66 | headers['Referer'] = 'https://livecounts.net/instagram/cielni' 67 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' 68 | request = urllib2.Request(url=url, headers=headers) 69 | response = urllib2.urlopen(request) 70 | page = response.read() 71 | print page 72 | jsonData = json.loads(page, encoding="utf-8") 73 | content = jsonData['query']['results']['script']['content'] 74 | print content 75 | content = content[21:-1] 76 | print content 77 | content = json.loads(content, encoding="utf-8") 78 | fans = content['entry_data']['ProfilePage'][0]['user']['followed_by']['count'] 79 | print fans 80 | if __name__ == "__main__": 81 | # OAuth() 82 | # get_by_request() 83 | get_by_api() 84 | -------------------------------------------------------------------------------- /article_crawler/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | 4 | from fb_crawler import get_by_api as get_fb 5 | from weibo_crawler_old import get_by_api as get_sina 6 | from twitter_crawler import get_by_api as get_twitter 7 | from youtube_crawler import YoukuCrawler 8 | from youku_crawler import get_by_api as get_youku 9 | from weixin_crawler import get_by_request as get_weixin 10 | from instagram_crawler import get_by_api as get_instagram 11 | 12 | def main(): 13 | platform = ['facebook', 'weibo', 'twitter', 'youtube', 'youku', 'weixin', 'instagram'] 14 | result = [] 15 | for i in platform: 16 | data = '{}' 17 | 18 | if i == 'facebook': 19 | data = get_fb() 20 | elif i == 'weibo': 21 | data = get_sina() 22 | elif i == 'twitter': 23 | data = get_twitter() 24 | elif i == 'youtube': 25 | c = YoukuCrawler() 26 | data = c.get_videos_info() 27 | elif i == 'youku': 28 | data = get_youku() 29 | elif i == 'weixin': 30 | data = get_weixin() 31 | elif i == 'instagram': 32 | data = get_instagram() 33 | 34 | data = json.loads(data) 35 | today = datetime.datetime.now().strftime('%Y-%m-%d') 36 | temp = {'platform': i, 'data': data, 'date': today} 37 | result.append(temp) 38 | jsonResult = json.dumps(result) 39 | print jsonResult 40 | return jsonResult 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /article_crawler/twitter_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import urllib 4 | import json 5 | import time 6 | import datetime 7 | import ssl 8 | from functools import wraps 9 | 10 | def get_by_api(): 11 | username = 'insta360' 12 | url = 'https://api.twitter.com/1.1/statuses/user_timeline.json?count=200&trim_user=true&contributor_details=false&exclude_replies=true&include_rts=fasle&screen_name=' + username 13 | oauth = OAuth() 14 | headers = {} 15 | headers['Host'] = 'api.twitter.com' 16 | headers['X-Target-URI'] = 'https://api.twitter.com' 17 | headers['Content-Type'] = 'application/x-www-form-urlencoded' 18 | 
headers['Connection'] = 'keep-alive' 19 | headers['Authorization'] = oauth 20 | index = 1 21 | now = time.mktime(datetime.date.today().timetuple()) 22 | week_ago = now - (3600 * 24 * 7) 23 | today = datetime.datetime.now().strftime('%Y-%m-%d') 24 | share_total = 0 25 | like_total = 0 26 | while(True): 27 | request = urllib2.Request(url = url + '&page=' + str(index), headers = headers) 28 | response = urllib2.urlopen(request) 29 | page = response.read() 30 | data = json.loads(page, encoding="utf-8") 31 | for item in data: 32 | temp = time.mktime(time.strptime(item['created_at'], "%a %b %d %H:%M:%S +0000 %Y")) 33 | if temp >= week_ago: 34 | share_total += int(item['retweet_count']) 35 | like_total += int(item['favorite_count']) 36 | index += 1 37 | if len(data) == 0: 38 | break 39 | result = { 40 | 'platform': 'twitter', 41 | 'date': today, 42 | 'comment': 0, 43 | 'like': like_total, 44 | 'share': share_total, 45 | 'dislike': 0, 46 | 'view': 0 47 | } 48 | jsonResult = json.dumps(result) 49 | print jsonResult 50 | return jsonResult 51 | 52 | 53 | def sslwrap(func): 54 | @wraps(func) 55 | def bar(*args, **kw): 56 | kw['ssl_version'] = ssl._PROTOCOL_NAMES 57 | return func(*args, **kw) 58 | return bar 59 | 60 | 61 | def OAuth(): 62 | ssl.wrap_socket = sslwrap(ssl.wrap_socket) 63 | url = 'https://api.twitter.com/oauth2/token' 64 | value = {} 65 | value['grant_type'] = 'client_credentials' 66 | value['client_id'] = '' 67 | value['client_secret'] = '' 68 | data = urllib.urlencode(value) 69 | request = urllib2.Request(url = url, data = data) 70 | response = urllib2.urlopen(request) 71 | page = response.read() 72 | data = json.loads(page, encoding="utf-8") 73 | result = data['token_type'] + ' ' + data['access_token'] 74 | return result 75 | 76 | if __name__ == '__main__': 77 | get_by_api() 78 | -------------------------------------------------------------------------------- /article_crawler/weibo_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import time 4 | import re 5 | import datetime 6 | import json 7 | import sys 8 | from selenium import webdriver 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.common.exceptions import NoSuchElementException, TimeoutException 11 | import selenium.webdriver.support.ui as ui 12 | reload(sys) 13 | sys.setdefaultencoding("utf-8") 14 | 15 | ''' 16 | 版本过低 17 | pip install -U selenium 18 | WebDriverException: Message: Can't load the profile. 19 | Profile Dir: %s If you specified a log_file in the FirefoxBinary constructor, 20 | check it for details. 
21 | ''' 22 | 23 | # 先调用无界面浏览器PhantomJS或Firefox 24 | # driver = webdriver.PhantomJS(executable_path="G:\phantomjs-1.9.1-windows\phantomjs.exe") 25 | driver = webdriver.PhantomJS() 26 | # options = webdriver.ChromeOptions() 27 | # options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"]) 28 | # driver = webdriver.Chrome(chrome_options=options) 29 | wait = ui.WebDriverWait(driver, 10) 30 | 31 | 32 | # ******************************************************************************** 33 | # 第一步: 登陆weibo.cn 获取新浪微博的cookie 34 | # 该方法针对weibo.cn有效(明文形式传输数据) weibo.com见学弟设置POST和Header方法 35 | # LoginWeibo(username, password) 参数用户名 密码 36 | 37 | # https://www.zhihu.com/question/21451510 38 | # http://www.cnblogs.com/fnng/p/3606934.html 39 | # 验证码暂停时间手动输入 40 | # ******************************************************************************** 41 | 42 | def LoginWeibo(username, password): 43 | # ********************************************************************** 44 | # 直接访问driver.get("http://weibo.cn/5824697471")会跳转到登陆页面 用户id 45 | # 46 | # 用户名 47 | # 密码 "password_4903" 中数字会变动,故采用绝对路径方法,否则不能定位到元素 48 | # 49 | # 勾选记住登录状态check默认是保留 故注释掉该代码 不保留Cookie 则'expiry'=None 50 | # ********************************************************************** 51 | 52 | # 输入用户名/密码登录 53 | print u'准备登陆Weibo.cn网站...' 54 | driver.get("http://weibo.com/") 55 | driver.maximize_window() 56 | wait = WebDriverWait(driver, 10) 57 | try: 58 | wait.until(lambda x: x.find_element_by_id("loginname")) 59 | except TimeoutException: 60 | return 61 | 62 | # time.sleep(5) 63 | 64 | elem_user = driver.find_element_by_id("loginname") 65 | elem_user.send_keys(username) # 用户名 66 | elem_pwd = driver.find_element_by_name("password") 67 | elem_pwd.send_keys(password) # 密码 68 | # elem_rem = driver.find_element_by_id("login_form_savestate") 69 | # elem_rem.click() # 记住登录状态,默认是记住,所以不需要 70 | 71 | elem_sub = driver.find_element_by_xpath("//*[@id='pl_login_form']/div/div[3]/div[6]/a") 72 | elem_sub.click() # 点击登陆 73 | time.sleep(5) 74 | 75 | # 获取Coockie 推荐 http://www.cnblogs.com/fnng/p/3269450.html 76 | # print driver.current_url 77 | # print driver.get_cookies() #获得cookie信息 dict存储 78 | # print u'输出Cookie键值对信息:' 79 | # for cookie in driver.get_cookies(): 80 | # #print cookie 81 | # for key in cookie: 82 | # print key, cookie[key] 83 | 84 | # driver.get_cookies()类型list 仅包含一个元素cookie类型dict 85 | print u'登陆成功...' 86 | 87 | 88 | def VisitPersonPage(user_id): 89 | 90 | print u'准备访问个人网站.....' 
91 | driver.get("http://weibo.cn/" + user_id) 92 | result = [] 93 | print '\n' 94 | print u'获取微博内容信息' 95 | num = 1 96 | while num <= 10: 97 | url_wb = "http://weibo.cn/" + user_id + "?filter=0&page=" + str(num) 98 | driver.get(url_wb) 99 | info = driver.find_elements_by_xpath("//div[@class='c']") 100 | for value in info: 101 | print value.text 102 | info = value.text 103 | 104 | # 跳过最后一行数据为class=c 105 | # Error: 'NoneType' object has no attribute 'groups' 106 | if u'设置:皮肤.图片' not in info: 107 | if info.startswith(u'转发'): 108 | print u'转发微博' 109 | status = '转发' 110 | else: 111 | print u'原创微博' 112 | status = '原创' 113 | 114 | # 获取最后一个点赞数 因为转发是后有个点赞数 115 | str1 = info.split(u" 赞")[-1] 116 | # print str1 117 | like = 0 118 | if str1: 119 | val1 = re.match(r'\[(.*?)\]', str1).groups()[0] 120 | like = val1 121 | 122 | str2 = info.split(u" 转发")[-1] 123 | share = 0 124 | if str2: 125 | val2 = re.match(r'\[(.*?)\]', str2).groups()[0] 126 | share = val2 127 | 128 | str3 = info.split(u" 评论")[-1] 129 | comment = 0 130 | if str3: 131 | val3 = re.match(r'\[(.*?)\]', str3).groups()[0] 132 | comment = val3 133 | 134 | str4 = info.split(u" 收藏 ")[-1] 135 | flag = str4.find(u"来自") 136 | temp_time = str4[:(flag - 1)] 137 | # print temp_time 138 | created_time = format_time(temp_time) 139 | date = created_time[0:10] 140 | message = info[:info.rindex(u" 赞")] 141 | 142 | try: 143 | url = value.find_element_by_xpath('div[2]/a[1]').get_attribute("href") 144 | except NoSuchElementException: 145 | url = '' 146 | temp = { 147 | 'account': user_id, 148 | 'message': message, 149 | # 'id': item['id_str'], 150 | 'public_time': created_time, 151 | 'date': date, 152 | 'share': share, 153 | 'like': like, 154 | 'comment': comment, 155 | 'link': url, 156 | 'status': status 157 | } 158 | result.append(temp) 159 | else: 160 | break 161 | else: 162 | print u'next page...\n' 163 | num += 1 164 | print '\n\n' 165 | jsonResult = json.dumps(result) 166 | print jsonResult 167 | return jsonResult 168 | 169 | def format_time(string): 170 | now = datetime.datetime.now() 171 | result = now.strftime('%Y-%m-%d %H:%M:%S') 172 | if u'分钟前' in string: 173 | d = int(string[0:1]) 174 | temp = now - datetime.timedelta(minutes= d) 175 | result =temp.strftime('%Y-%m-%d %H:%M:%S') 176 | 177 | elif u'今天' in string: 178 | t = string[-5:] 179 | temp = now.strftime('%Y-%m-%d') 180 | result = temp + ' ' + t + ':00' 181 | 182 | elif u'月' in string: 183 | temp = time.strptime(string, "%m月%d日 %H:%M".decode('utf-8')) 184 | result = str(now.year) + '-' + time.strftime("%m-%d %H:%M:%S", temp) 185 | 186 | elif len(string) == 19: 187 | result = string 188 | 189 | 190 | return result 191 | # ******************************************************************************* 192 | # 程序入口 预先调用 193 | # ******************************************************************************* 194 | 195 | 196 | def get_by_selenium(): 197 | # 定义变量 198 | username = '####' # 输入你的用户名 199 | password = '####' # 输入你的密码 200 | 201 | # 操作函数 202 | LoginWeibo(username, password) # 登陆微博 203 | 204 | # driver.add_cookie({'name':'name', 'value':'_T_WM'}) 205 | # driver.add_cookie({'name':'value', 'value':'c86fbdcd26505c256a1504b9273df8ba'}) 206 | user_id = 'insta360' 207 | # 注意 208 | # 因为sina微博增加了验证码,但是你用Firefox登陆一次输入验证码,再调用该程序即可,因为Cookies已经保证 209 | # 会直接跳转到明星微博那部分,即: http://weibo.cn/guangxianliuyan 210 | 211 | return VisitPersonPage(user_id) # 访问个人页面 212 | 213 | if __name__ == '__main__': 214 | get_by_selenium() 215 | 216 | 
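VisitPersonPage() above recovers per-post counts from weibo.cn's plain-text post footer: it splits on the last occurrence of " 赞", " 转发" and " 评论" and reads the bracketed number that follows each marker. A small standalone check of that parsing step — the sample footer string is invented for illustration, not captured from weibo.cn:

# -*- coding: UTF-8 -*-
# Hypothetical self-test of the footer parsing used in VisitPersonPage().
import re

sample = u'示例微博内容 赞[12] 转发[3] 评论[5] 收藏 10月01日 12:00 来自微博 weibo.com'

def count_after(text, marker):
    # Same approach as the crawler: take the text after the last occurrence of
    # the marker and read the number inside the square brackets (0 if absent).
    tail = text.split(marker)[-1]
    m = re.match(r'\[(.*?)\]', tail)
    return int(m.groups()[0]) if m else 0

print count_after(sample, u' 赞'), count_after(sample, u' 转发'), count_after(sample, u' 评论')
# expected: 12 3 5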
-------------------------------------------------------------------------------- /article_crawler/weibo_crawler_old.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib 3 | import urllib2 4 | import datetime 5 | import time 6 | import json 7 | import requests 8 | import base64 9 | from selenium import webdriver 10 | from selenium.common.exceptions import TimeoutException 11 | from selenium.webdriver.support.ui import WebDriverWait 12 | 13 | def get_by_api(): 14 | url = 'https://api.weibo.com/2/statuses/user_timeline.json?page=1' 15 | username = '**********' 16 | password = '**********' 17 | value = {} 18 | value['trim_user'] = '1' 19 | value['count'] = '100' 20 | value['source'] = '218121934' 21 | data = urllib.urlencode(value) 22 | base64string = base64.encodestring( 23 | '%s:%s' % (username, password))[:-1] # 注意哦,这里最后会自动添加一个\n 24 | authheader = "Basic %s" % base64string 25 | header = {} 26 | header['Authorization'] = authheader 27 | now = time.mktime(datetime.date.today().timetuple()) 28 | week_ago = now - (3600 * 24 * 7) 29 | today = datetime.datetime.now().strftime('%Y-%m-%d') 30 | share_total = 0 31 | like_total = 0 32 | comment_total = 0 33 | results = requests.get(url=url, params=data, headers=header) 34 | page = results.content 35 | print page 36 | jsonData = json.loads(page, encoding="utf-8") 37 | data = jsonData['statuses'] 38 | for item in data: 39 | temp = time.mktime(time.strptime(item['created_at'], "%a %b %d %H:%M:%S +0800 %Y")) 40 | if temp >= week_ago: 41 | share_total += int(item['reposts_count']) 42 | like_total += int(item['attitudes_count']) 43 | comment_total += int(item['comments_count']) 44 | result = { 45 | 'platform': 'weibo', 46 | 'date': today, 47 | 'comment': comment_total, 48 | 'like': like_total, 49 | 'share': share_total, 50 | 'dislike': 0, 51 | 'view': 0 52 | } 53 | jsonResult = json.dumps(result) 54 | print jsonResult 55 | return jsonResult 56 | 57 | def get_by_selenium(): 58 | username = 'insta360' 59 | url = 'http://weibo.cn/'+ username 60 | cap = webdriver.DesiredCapabilities.PHANTOMJS 61 | cap["phantomjs.page.settings.resourceTimeout"] = 1000 62 | cap["phantomjs.page.settings.loadImages"] = False 63 | cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True 64 | cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" 65 | cap["XSSAuditingEnabled"] = True 66 | cap["host"] = 'weibo.cn' 67 | # cap["cookie"] = '_T_WM=d2e28a98d3031cf98e282a29740b5f24; SUB=_2A2566MQNDeTxGeRJ7VYX8CzFyDmIHXVWEuxFrDV6PUJbkdAKLU_GkW1OqRtS_kr8ak-kdubq12_Bbpo41w..; gsid_CTandWM=4uona6911nQUejIzV9kdEbBcmf5' 68 | driver = webdriver.PhantomJS(desired_capabilities=cap, 69 | service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', 70 | '--web-security=true']) 71 | # driver = webdriver.Chrome() 72 | driver.get('http://baidu.com') 73 | driver.add_cookie({'name': '_T_WM', 'value': 'd2e28a98d3031cf98e282a29740b5f24'}) 74 | driver.add_cookie({'name': 'SUB', 'value': '_2A2566MQNDeTxGeRJ7VYX8CzFyDmIHXVWEuxFrDV6PUJbkdAKLU_GkW1OqRtS_kr8ak-kdubq12_Bbpo41w..'}) 75 | driver.add_cookie({'name': 'gsid_CTandWM', 'value': '4uona6911nQUejIzV9kdEbBcmf5'}) 76 | driver.get(url) 77 | 78 | wait = WebDriverWait(driver, 20) 79 | print driver.page_source 80 | try: 81 | result = int( 82 | wait.until(lambda x: x.find_element_by_xpath('/html/body/div[3]/div/a[2]').text[3:-1])) 83 | except TimeoutException: 84 | result = 0 85 | print result 86 | time.sleep(10) 87 | driver.quit() 88 
| return result 89 | 90 | if __name__ == "__main__": 91 | get_by_api() 92 | -------------------------------------------------------------------------------- /article_crawler/weixin_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import urllib 4 | import json 5 | import time 6 | import datetime 7 | 8 | 9 | def get_by_request(): 10 | username = 'Insta360_official' 11 | url = 'http://www.newrank.cn/xdnphb/detail/getAccountArticle' 12 | headers = {} 13 | headers['Host'] = 'www.newrank.cn' 14 | headers['Referer'] = 'http://www.newrank.cn/public/info/detail.html?account=' + username 15 | headers['Cookie'] = 'userFaceTip=userFaceTip; CNZZDATA1253878005=1419576409-1475115174-%7C1475115174; Hm_lvt_a19fd7224d30e3c8a6558dcb38c4beed=1475116869; Hm_lpvt_a19fd7224d30e3c8a6558dcb38c4beed=1475116869; userFaceTip=userFaceTip' 16 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0' 17 | headers['X-Requested-With'] = 'XMLHttpRequest' 18 | headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8' 19 | 20 | value = {} 21 | value['flag'] = 'true' 22 | value['uuid'] = '91B514A33A4D2FA4C1E923ABDA595A90' 23 | value['nonce'] = '3679c0e73' 24 | value['xyz'] = '6cdb1d7fbdeea8afe76a21479f46f0b2' 25 | data = urllib.urlencode(value) 26 | request = urllib2.Request(url = url,data = data, headers = headers) 27 | response = urllib2.urlopen(request) 28 | page = response.read() 29 | now = time.mktime(datetime.date.today().timetuple()) 30 | week_ago = now - (3600 * 24 * 7) 31 | today = datetime.datetime.now().strftime('%Y-%m-%d') 32 | like_total = 0 33 | view_total = 0 34 | result = json.loads(page, encoding="utf-8") 35 | articles = result['value']['lastestArticle'] 36 | for article in articles: 37 | temp = time.mktime(time.strptime(article['publicTime'], "%Y-%m-%d %H:%M:%S")) 38 | if temp >= week_ago: 39 | view_total += int(article['clicksCount']) 40 | like_total += int(article['likeCount']) 41 | result = { 42 | 'platform': 'weixin', 43 | 'date': today, 44 | 'comment': 0, 45 | 'like': like_total, 46 | 'share': 0, 47 | 'dislike': 0, 48 | 'view': view_total 49 | } 50 | jsonResult = json.dumps(result) 51 | print jsonResult 52 | return jsonResult 53 | 54 | if __name__ == '__main__': 55 | get_by_request() -------------------------------------------------------------------------------- /article_crawler/youku_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | import urllib2 4 | import json 5 | import time 6 | import datetime 7 | 8 | 9 | def get_by_api(): 10 | client_id = '' 11 | username = 'Insta360' 12 | url = 'https://openapi.youku.com/v2/videos/by_user.json?client_id=' + client_id + '&user_name=' + username + '&count=20' 13 | now = time.mktime(datetime.date.today().timetuple()) 14 | week_ago = now - (3600 * 24 * 7) 15 | today = datetime.datetime.now().strftime('%Y-%m-%d') 16 | view_total = 0 17 | like_total = 0 18 | dislike_total = 0 19 | comment_total = 0 20 | request = urllib2.Request(url = url) 21 | response = urllib2.urlopen(request) 22 | page = response.read() 23 | jsonData = json.loads(page, encoding="utf-8") 24 | data = jsonData['videos'] 25 | for item in data: 26 | temp = time.mktime(time.strptime(item['published'], "%Y-%m-%d %H:%M:%S")) 27 | if temp >= week_ago: 28 | view_total += int(item['view_count']) 29 | dislike_total += int(item['down_count']) 30 | like_total += 
int(item['up_count']) 31 | comment_total += int(item['comment_count']) 32 | 33 | result = { 34 | 'platform': 'youku', 35 | 'date': today, 36 | 'comment': comment_total, 37 | 'like': like_total, 38 | 'share': 0, 39 | 'dislike': dislike_total, 40 | 'view': view_total 41 | } 42 | jsonResult = json.dumps(result) 43 | print jsonResult 44 | return jsonResult 45 | 46 | if __name__ == '__main__': 47 | get_by_api() 48 | 49 | -------------------------------------------------------------------------------- /article_crawler/youtube_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import time 4 | import json 5 | import datetime 6 | import requests 7 | 8 | 9 | class YoukuCrawler: 10 | def __init__(self): 11 | self.video_ids = [] 12 | self.maxResults = 50 13 | playlist_id = '' 14 | self.app_key = '' 15 | self.list_api = 'https://www.googleapis.com/youtube/v3/playlistItems?maxResults=' + str(self.maxResults) + '&part=snippet&playlistId=' + playlist_id + '&key=' + self.app_key 16 | # self.info_api = 'https://www.googleapis.com/youtube/v3/videos?maxResults=50&part=snippet,statistics' + '&key=' + self.app_key 17 | self.info_api = 'https://www.googleapis.com/youtube/v3/videos' 18 | now = time.mktime(datetime.date.today().timetuple()) 19 | self.week_ago = now - (3600 * 24 * 7) 20 | self.view_total = 0 21 | self.like_total = 0 22 | self.dislike_total = 0 23 | self.comment_total = 0 24 | def main(self): 25 | self.get_video_ids() 26 | return self.get_videos_info() 27 | 28 | def get_video_ids(self): 29 | url = self.list_api 30 | request = urllib2.Request(url=url) 31 | response = urllib2.urlopen(request) 32 | page = response.read() 33 | result = json.loads(page, encoding="utf-8") 34 | # total = int(result['pageInfo']['totalResults']) 35 | # perPage = int(result['pageInfo']['resultsPerPage']) 36 | # self.totalPage = (total/perPage) + (0 if (total%perPage)==0 else 1) 37 | videos = result['items'] 38 | for video in videos: 39 | self.video_ids.append(video['snippet']['resourceId']['videoId']) 40 | 41 | while(result.has_key('nextPageToken')): 42 | url = self.list_api + '&pageToken=' + result['nextPageToken'] 43 | request = urllib2.Request(url=url) 44 | response = urllib2.urlopen(request) 45 | page = response.read() 46 | result = json.loads(page, encoding="utf-8") 47 | videos = result['items'] 48 | for video in videos: 49 | self.video_ids.append(video['snippet']['resourceId']['videoId']) 50 | 51 | 52 | def get_videos_info(self): 53 | url = self.info_api 54 | query = '' 55 | count = 0 56 | for i in self.video_ids: 57 | count += 1 58 | query = query + i + ',' 59 | if count % self.maxResults == 0 or count == len(self.video_ids): 60 | query = query[:-1] 61 | results = requests.get(url, 62 | params={'id': query, 'maxResults': self.maxResults, 'part': 'snippet,statistics', 'key': self.app_key}) 63 | page = results.content 64 | videos = json.loads(page, encoding="utf-8")['items'] 65 | for video in videos: 66 | try: 67 | like_count = int(video['statistics']['likeCount']) 68 | except KeyError: 69 | like_count = 0 70 | try: 71 | dislike_count = int(video['statistics']['dislikeCount']) 72 | except KeyError: 73 | dislike_count = 0 74 | temp = time.mktime(time.strptime(video['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%S.000Z")) 75 | if temp >= self.week_ago: 76 | self.dislike_total += dislike_count 77 | self.like_total += like_count 78 | self.comment_total += int(video['statistics']['commentCount']) 79 | self.view_total += 
int(video['statistics']['viewCount']) 80 | query = '' 81 | today = datetime.datetime.now().strftime('%Y-%m-%d') 82 | result = { 83 | 'platform': 'youtube', 84 | 'date': today, 85 | 'comment': self.comment_total, 86 | 'like': self.like_total, 87 | 'share': 0, 88 | 'dislike': self.dislike_total, 89 | 'view': self.view_total 90 | } 91 | jsonResult = json.dumps(result) 92 | print jsonResult 93 | return jsonResult 94 | 95 | if __name__ == "__main__": 96 | c = YoukuCrawler() 97 | c.main() 98 | -------------------------------------------------------------------------------- /fans_crawler/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/fans_crawler/.DS_Store -------------------------------------------------------------------------------- /fans_crawler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/fans_crawler/__init__.py -------------------------------------------------------------------------------- /fans_crawler/fb_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | import urllib2 4 | import json 5 | from selenium import webdriver 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | import ssl 9 | from functools import wraps 10 | 11 | 12 | def get_by_api(): 13 | app_id = '' 14 | app_secret = '' 15 | access_token = app_id + '|' + app_secret 16 | username = 'Insta360VRVideoCamera' 17 | url = 'https://graph.facebook.com/' + username + '/?fields=fan_count&access_token=' + access_token 18 | headers = {} 19 | headers['Host'] = 'graph.facebook.com' 20 | headers['Connection'] = 'keep-alive' 21 | headers['Upgrade-Insecure-Requests'] = '1' 22 | headers['Cache-Control'] = 'max-age=0' 23 | 24 | request = urllib2.Request(url = url, headers = headers) 25 | response = urllib2.urlopen(request) 26 | page = response.read() 27 | # print page 28 | jsonData = json.loads(page, encoding="utf-8") 29 | fans = jsonData['fan_count'] 30 | print fans 31 | return fans 32 | 33 | 34 | def sslwrap(func): 35 | @wraps(func) 36 | def bar(*args, **kw): 37 | kw['ssl_version'] = ssl._PROTOCOL_NAMES 38 | return func(*args, **kw) 39 | return bar 40 | 41 | 42 | def get_by_request(): 43 | ssl.wrap_socket = sslwrap(ssl.wrap_socket) 44 | username = 'Insta360VRVideoCamera' 45 | url = 'https://www.facebook.com/plugins/fan.php?id=' + username 46 | headers = {} 47 | headers['Host'] = 'www.facebook.com' 48 | # headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0' 49 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 50 | # headers['Connection'] = 'keep-alive' 51 | # headers['Upgrade-Insecure-Requests'] = '1' 52 | request = urllib2.Request(url = url,headers=headers) 53 | response = urllib2.urlopen(request) 54 | page = response.read() 55 | pattern = re.compile("
(.{0,10})\s", re.S) 56 | items = re.findall(pattern, page) 57 | # print page.decode("UTF-8") 58 | fans = int(items[0].replace(',','')) 59 | print fans 60 | return fans 61 | 62 | 63 | def get_by_selenium(): 64 | username = 'Insta360VRVideoCamera' 65 | url = 'https://www.facebook.com/plugins/fan.php?id=' + username 66 | cap = webdriver.DesiredCapabilities.PHANTOMJS 67 | cap["phantomjs.page.settings.resourceTimeout"] = 1000 68 | cap["phantomjs.page.settings.loadImages"] = False 69 | cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True 70 | cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" 71 | cap["XSSAuditingEnabled"] = True 72 | driver = webdriver.PhantomJS(desired_capabilities=cap, 73 | service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', 74 | '--web-security=true']) 75 | # driver = webdriver.Chrome() 76 | driver.get(url) 77 | wait = WebDriverWait(driver, 20) 78 | # print driver.page_source 79 | try: 80 | string = wait.until(lambda x: x.find_elements_by_class_name('_1drq')[0].text) 81 | except TimeoutException: 82 | string = 0 83 | pattern = re.compile("\d", re.S) 84 | items = re.findall(pattern, string) 85 | temp = '' 86 | for item in items: 87 | temp += item 88 | fans = int(temp) 89 | print fans 90 | driver.quit() 91 | return fans 92 | 93 | if __name__ == "__main__": 94 | # get_by_request() 95 | # get_by_selenium() 96 | get_by_api() 97 | -------------------------------------------------------------------------------- /fans_crawler/instagram_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import json 4 | import urllib 5 | 6 | 7 | def get_by_api(): 8 | user_id = '' 9 | access_token = '' 10 | url = 'https://api.instagram.com/v1/users/' + user_id + '/?access_token=' + access_token 11 | # oauth = OAuth() 12 | request = urllib2.Request(url = url) 13 | response = urllib2.urlopen(request) 14 | page = response.read() 15 | print page 16 | data = json.loads(page, encoding="utf-8") 17 | fans = data['data']['counts']['followed_by'] 18 | print fans 19 | return fans 20 | 21 | 22 | def OAuth(): 23 | url = 'https://www.instagram.com/oauth/authorize/?client_id=a84f3a3ec8c44dfbbe9d2e3f07dc9c97&redirect_uri=http://www.baidu.com&response_type=token' 24 | request = urllib2.Request(url = url) 25 | response = urllib2.urlopen(request) 26 | redirect_url = response.geturl() 27 | request = urllib2.Request(url=redirect_url) 28 | response = urllib2.urlopen(request) 29 | redirect_url = response.geturl() 30 | print redirect_url 31 | # data = json.loads(page, encoding="utf-8") 32 | 33 | 34 | def get_by_request(): 35 | username = 'insta360official' 36 | url = 'https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22https%3A%2F%2Fwww.instagram.com%2F' + username + '%2F%22%20and%20xpath%3D%22%2Fhtml%2Fbody%2Fscript%5B1%5D%22&format=json' 37 | headers = {} 38 | headers['Host'] = 'query.yahooapis.com' 39 | headers['Connection'] = 'keep-alive' 40 | headers['Origin'] = 'https://livecounts.net' 41 | headers['Pragma'] = 'no-cache' 42 | headers['Referer'] = 'https://livecounts.net/instagram/cielni' 43 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' 44 | request = urllib2.Request(url=url, headers=headers) 45 | response = urllib2.urlopen(request) 46 | page = response.read() 47 | print page 48 | jsonData = json.loads(page, encoding="utf-8") 49 | 
content = jsonData['query']['results']['script']['content'] 50 | print content 51 | content = content[21:-1] 52 | print content 53 | content = json.loads(content, encoding="utf-8") 54 | fans = content['entry_data']['ProfilePage'][0]['user']['followed_by']['count'] 55 | print fans 56 | if __name__ == "__main__": 57 | # OAuth() 58 | # get_by_request() 59 | get_by_api() 60 | -------------------------------------------------------------------------------- /fans_crawler/main.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import json 3 | 4 | from fb_crawler import get_by_api as get_fb_fans 5 | from weibo_crawler import get_by_request as get_sina_fans 6 | from twitter_crawler import get_by_request as get_twitter_fans 7 | from youtube_crawler import get_by_api as get_youtube_fans 8 | from youku_crawler import get_by_api as get_youku_fans 9 | from weixin_crawler import get_by_api as get_weixin_fans 10 | from instagram_crawler import get_by_request as get_instagram_fans 11 | 12 | def main(): 13 | platform = ['facebook', 'weibo', 'twitter', 'youtube', 'youku', 'weixin', 'instagram'] 14 | result = [] 15 | for i in platform: 16 | fans = 0 17 | 18 | if i == 'facebook': 19 | fans = get_fb_fans() 20 | elif i == 'weibo': 21 | fans = get_sina_fans() 22 | elif i == 'twitter': 23 | fans = get_twitter_fans() 24 | elif i == 'youtube': 25 | fans = get_youtube_fans() 26 | elif i == 'youku': 27 | fans = get_youku_fans() 28 | elif i == 'weixin': 29 | fans = get_weixin_fans() 30 | elif i == 'instagram': 31 | fans = get_instagram_fans() 32 | 33 | today = datetime.datetime.now().strftime('%Y-%m-%d') 34 | temp = {'platform': i, 'fans': fans, 'date': today} 35 | result.append(temp) 36 | jsonResult = json.dumps(result) 37 | print jsonResult 38 | return jsonResult 39 | 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /fans_crawler/twitter_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import json 4 | import ssl 5 | import urllib 6 | from functools import wraps 7 | 8 | 9 | def get_by_request(): 10 | username = 'insta360' 11 | url = 'https://cdn.syndication.twimg.com/widgets/followbutton/info.json?screen_names=' + username 12 | # headers = {} 13 | # headers['Host'] = 'www.facebook.com' 14 | # headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0' 15 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 16 | # headers['Connection'] = 'keep-alive' 17 | # headers['Upgrade-Insecure-Requests'] = '1' 18 | request = urllib2.Request(url = url) 19 | response = urllib2.urlopen(request) 20 | page = response.read() 21 | result = json.loads(page, encoding="utf-8") 22 | fans = result[0]['followers_count'] 23 | print fans 24 | return fans 25 | 26 | 27 | def get_by_api(): 28 | username = 'insta360' 29 | url = 'https://api.twitter.com/1.1/users/show.json?include_entities=fasle&screen_name=' + username 30 | oauth = OAuth() 31 | headers = {} 32 | headers['Host'] = 'api.twitter.com' 33 | headers['X-Target-URI'] = 'https://api.twitter.com' 34 | headers['Content-Type'] = 'application/x-www-form-urlencoded' 35 | headers['Connection'] = 'keep-alive' 36 | headers['Authorization'] = oauth 37 | request = urllib2.Request(url = url, headers = headers) 38 | response = urllib2.urlopen(request) 39 | page = response.read() 40 | print 
page 41 | data = json.loads(page, encoding="utf-8") 42 | fans = data['followers_count'] 43 | print fans 44 | return fans 45 | 46 | 47 | def OAuth(): 48 | ssl.wrap_socket = sslwrap(ssl.wrap_socket) 49 | url = 'https://api.twitter.com/oauth2/token' 50 | value = {} 51 | value['grant_type'] = 'client_credentials' 52 | value['client_id'] = '' 53 | value['client_secret'] = '' 54 | data = urllib.urlencode(value) 55 | request = urllib2.Request(url = url, data = data) 56 | response = urllib2.urlopen(request) 57 | page = response.read() 58 | data = json.loads(page, encoding="utf-8") 59 | result = data['token_type'] + ' ' + data['access_token'] 60 | return result 61 | 62 | 63 | def sslwrap(func): 64 | @wraps(func) 65 | def bar(*args, **kw): 66 | kw['ssl_version'] = ssl._PROTOCOL_NAMES 67 | return func(*args, **kw) 68 | return bar 69 | 70 | 71 | if __name__ == "__main__": 72 | get_by_request() 73 | # get_by_api() 74 | -------------------------------------------------------------------------------- /fans_crawler/weibo_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | import urllib2 4 | from selenium import webdriver 5 | from selenium.common.exceptions import TimeoutException 6 | from selenium.webdriver.support.ui import WebDriverWait 7 | 8 | def get_by_request(): 9 | username = 'insta360' 10 | url = 'http://weibo.cn/'+ username 11 | headers = {} 12 | headers['Host'] = 'weibo.cn' 13 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0' 14 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 15 | headers['Cookie'] = '_T_WM=d2e28a98d3031cf98e282a29740b5f24' 16 | # headers['Connection'] = 'keep-alive' 17 | # headers['Upgrade-Insecure-Requests'] = '1' 18 | request = urllib2.Request(url = url, headers=headers) 19 | response = urllib2.urlopen(request) 20 | page = response.read() 21 | pattern = re.compile("\[(.{0,10})\] <\\\\\/h2>", re.S) 22 | items = re.findall(pattern, page) 23 | # print page.decode("UTF-8") 24 | fans = int(items[1]) 25 | print fans 26 | return fans 27 | 28 | 29 | def get_by_selenium(): 30 | username = 'insta360' 31 | url = 'http://weibo.com/'+ username + '?is_all=1' 32 | # cap = webdriver.DesiredCapabilities.PHANTOMJS 33 | # cap["phantomjs.page.settings.resourceTimeout"] = 1000 34 | # cap["phantomjs.page.settings.loadImages"] = False 35 | # cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True 36 | # cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" 37 | # cap["XSSAuditingEnabled"] = True 38 | # cap["host"] = 'weibo.com' 39 | # cap["cookie"] = 'UOR=www.umeng.com,widget.weibo.com,www.insta360.com; SINAGLOBAL=6982249232630.452.1472299450582; ULV=1475028466086:3:2:2:8231266012653.427.1475028466020:1474966940284; SUB=_2AkMgtrrUf8NhqwJRmP0czWrmZY53wgjEieLBAH7sJRMxHRl-yT83qm8AtRCo0NEVwCee4iQkVabYZqZ8gEhMng..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWT6ckK7WZ-8GkEahm6SKw1; TC-Page-G0=0cd4658437f38175b9211f1336161d7d; _s_tentry=-; Apache=8231266012653.427.1475028466020' 40 | # driver = webdriver.PhantomJS(desired_capabilities=cap, 41 | # service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', 42 | # '--web-security=true']) 43 | driver = webdriver.Chrome() 44 | driver.get(url) 45 | wait = WebDriverWait(driver, 20) 46 | # print driver.page_source 47 | try: 48 | result = int( 49 | wait.until(lambda x: 
x.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[2]/strong').text)) 50 | except TimeoutException: 51 | result = 0 52 | print result 53 | driver.quit() 54 | return result 55 | 56 | if __name__ == "__main__": 57 | get_by_request() 58 | # get_by_selenium() -------------------------------------------------------------------------------- /fans_crawler/weixin_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import json 4 | import ssl 5 | import urllib 6 | 7 | 8 | def get_by_api(): 9 | url = 'https://api.weixin.qq.com/cgi-bin/user/get' 10 | token = get_token() 11 | value = {} 12 | value['access_token'] = token 13 | value['next_openid'] = '' 14 | data = urllib.urlencode(value) 15 | request = urllib2.Request(url = url, data = data) 16 | response = urllib2.urlopen(request) 17 | page = response.read() 18 | print page 19 | data = json.loads(page, encoding="utf-8") 20 | fans = 0 21 | try: 22 | fans = data['total'] 23 | except KeyError: 24 | pass 25 | print fans 26 | return fans 27 | 28 | 29 | def get_token(): 30 | url = 'https://api.weixin.qq.com/cgi-bin/token' 31 | value = {} 32 | value['grant_type'] = 'client_credential' 33 | value['appid'] = '' 34 | value['secret'] = '' 35 | data = urllib.urlencode(value) 36 | request = urllib2.Request(url = url, data = data) 37 | response = urllib2.urlopen(request) 38 | page = response.read() 39 | print page 40 | data = json.loads(page, encoding="utf-8") 41 | result = '' 42 | try: 43 | result = data['access_token'] 44 | except KeyError: 45 | pass 46 | return result 47 | 48 | if __name__ == "__main__": 49 | get_by_api() 50 | -------------------------------------------------------------------------------- /fans_crawler/youku_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import re 4 | import json 5 | from selenium import webdriver 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | 9 | def get_by_request(): 10 | url = 'http://i.youku.com/i/UMjk1ODg3NDgwOA==' 11 | headers = {} 12 | headers['Host'] = 'i.youku.com' 13 | headers['Referer'] = 'http://www.insta360.com/' 14 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0' 15 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 16 | # headers['Connection'] = 'keep-alive' 17 | headers['Upgrade-Insecure-Requests'] = '1' 18 | request = urllib2.Request(url = url, headers=headers) 19 | response = urllib2.urlopen(request) 20 | page = response.read() 21 | # print page 22 | pattern = re.compile("
  • ", re.S) 23 | items = re.findall(pattern, page) 24 | fans = int(items[0]) 25 | print fans 26 | return fans 27 | 28 | def get_by_api(): 29 | url = 'https://openapi.youku.com/v2/users/friendship/followers.json?client_id=&user_id=' 30 | request = urllib2.Request(url = url) 31 | response = urllib2.urlopen(request) 32 | page = response.read() 33 | result = json.loads(page, encoding="utf-8") 34 | fans = int(result['total']) 35 | print fans 36 | return fans 37 | 38 | def get_by_selenium(): 39 | url = 'http://i.youku.com/i/UMjk1ODg3NDgwOA==' 40 | cap = webdriver.DesiredCapabilities.PHANTOMJS 41 | cap["phantomjs.page.settings.resourceTimeout"] = 1000 42 | cap["phantomjs.page.settings.loadImages"] = False 43 | cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True 44 | cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0" 45 | cap["XSSAuditingEnabled"] = True 46 | driver = webdriver.PhantomJS(desired_capabilities=cap, 47 | service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any', 48 | '--web-security=true']) 49 | # driver = webdriver.Chrome() 50 | driver.get(url) 51 | wait = WebDriverWait(driver, 20) 52 | print driver.page_source 53 | try: 54 | fans = int(wait.until(lambda x: x.find_elements_by_class_name('snum')[0].find_element_by_xpath('em').text)) 55 | except TimeoutException: 56 | fans = 0 57 | print fans 58 | driver.quit() 59 | return fans 60 | 61 | if __name__ == "__main__": 62 | get_by_api() 63 | # get_by_selenium() 64 | -------------------------------------------------------------------------------- /fans_crawler/youtube_crawler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import urllib2 3 | import re 4 | import json 5 | from selenium import webdriver 6 | from selenium.common.exceptions import TimeoutException 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | 9 | def get_by_request(): 10 | url = 'https://www.youtube.com/channel/UC3qWcF49rv8VMZO7Vg6kj5w' 11 | headers = {} 12 | headers['Host'] = 'www.youtube.com' 13 | headers['Referer'] = 'http://www.insta360.com/' 14 | # headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0' 15 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' 16 | # headers['Connection'] = 'keep-alive' 17 | # headers['Upgrade-Insecure-Requests'] = '1' 18 | request = urllib2.Request(url = url, headers=headers) 19 | response = urllib2.urlopen(request) 20 | page = response.read() 21 | # print page 22 | pattern = re.compile("subscribers\">(.*)