├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   ├── new_media_crawler.iml
│   ├── vcs.xml
│   └── workspace.xml
├── README.md
├── article_crawler
│   ├── .DS_Store
│   ├── __init__.py
│   ├── fb_crawler.py
│   ├── instagram_crawler.py
│   ├── main.py
│   ├── twitter_crawler.py
│   ├── weibo_crawler.py
│   ├── weibo_crawler_old.py
│   ├── weixin_crawler.py
│   ├── youku_crawler.py
│   └── youtube_crawler.py
└── fans_crawler
    ├── .DS_Store
    ├── __init__.py
    ├── fb_crawler.py
    ├── instagram_crawler.py
    ├── main.py
    ├── twitter_crawler.py
    ├── weibo_crawler.py
    ├── weibo_crawler_old.py
    ├── weixin_crawler.py
    ├── youku_crawler.py
    └── youtube_crawler.py
/README.md:
--------------------------------------------------------------------------------
1 | Crawlers for follower counts, views, likes, and comments on Facebook, Weibo, Twitter, YouTube, and Youku.
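2 | 
3 | Two entry points, one per directory. A minimal usage sketch (the code targets Python 2 and relies on urllib2, requests and selenium; each crawler module expects its own app id, token, or cookie to be filled in before running):
4 | 
5 | ```
6 | pip install requests selenium
7 | python article_crawler/main.py   # last-7-day comment/like/share/view totals per platform, printed as JSON
8 | python fans_crawler/main.py      # current follower counts per platform, printed as JSON
9 | ```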
--------------------------------------------------------------------------------
/article_crawler/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/article_crawler/.DS_Store
--------------------------------------------------------------------------------
/article_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/article_crawler/__init__.py
--------------------------------------------------------------------------------
/article_crawler/fb_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import json
4 | import time
5 | import datetime
6 |
7 |
8 | def get_by_api():
9 | app_id = ''
10 | app_secret = ''
11 | access_token = app_id + '|' + app_secret
12 | username = 'Insta360VRVideoCamera'
13 | url = 'https://graph.facebook.com/' + username + '/posts?fields=shares,message,comments.limit(0).summary(true),likes.limit(0).summary(true),created_time,id,link&limit=100&access_token=' + access_token
14 | headers = {}
15 | headers['Host'] = 'graph.facebook.com'
16 | headers['Connection'] = 'keep-alive'
17 | headers['Upgrade-Insecure-Requests'] = '1'
18 | headers['Cache-Control'] = 'max-age=0'
19 | now = time.mktime(datetime.date.today().timetuple())
20 | week_ago = now - (3600 * 24 * 7)
21 | today = datetime.datetime.now().strftime('%Y-%m-%d')
22 | share_total = 0
23 | like_total = 0
24 | comment_total = 0
25 | while True:
26 | request = urllib2.Request(url = url, headers = headers)
27 | response = urllib2.urlopen(request)
28 | page = response.read()
29 | jsonData = json.loads(page, encoding="utf-8")
30 | data = jsonData['data']
31 | for item in data:
32 | share = item['shares']['count'] if item.has_key('shares') else 0
33 | temp = time.mktime(time.strptime(item['created_time'], "%Y-%m-%dT%H:%M:%S+0000"))
34 | if temp >= week_ago:
35 | share_total += int(share)
36 | like_total += int(item['likes']['summary']['total_count'])
37 | comment_total += int(item['comments']['summary']['total_count'])
38 | if len(data) == 0:
39 | break
40 | paging = jsonData['paging'] if jsonData.has_key('paging') else {}
41 | url = paging['next'] if paging.has_key('next') else ''
42 | result = {
43 | 'platform': 'facebook',
44 | 'date': today,
45 | 'comment': comment_total,
46 | 'like': like_total,
47 | 'share': share_total,
48 | 'dislike': 0,
49 | 'view': 0
50 | }
51 | jsonResult = json.dumps(result)
52 | print jsonResult
53 | return jsonResult
54 |
55 | if __name__ == '__main__':
56 | get_by_api()
57 |
--------------------------------------------------------------------------------
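The paging loop in fb_crawler.py can also be written around the requests library, which this repo already uses elsewhere. A minimal sketch of the same weekly aggregation, assuming the caller supplies the page name and an app_id|app_secret access token (placeholders, not real credentials):

import time
import datetime
import requests


def weekly_fb_stats(username, access_token):
    # Same Graph API posts endpoint as fb_crawler.py; follow paging['next'] until no data remains.
    url = ('https://graph.facebook.com/' + username +
           '/posts?fields=shares,comments.limit(0).summary(true),'
           'likes.limit(0).summary(true),created_time&limit=100'
           '&access_token=' + access_token)
    week_ago = time.mktime(datetime.date.today().timetuple()) - 3600 * 24 * 7
    totals = {'share': 0, 'like': 0, 'comment': 0}
    while url:
        body = requests.get(url).json()
        posts = body.get('data', [])
        for post in posts:
            created = time.mktime(time.strptime(post['created_time'],
                                                '%Y-%m-%dT%H:%M:%S+0000'))
            if created >= week_ago:
                totals['share'] += post.get('shares', {}).get('count', 0)
                totals['like'] += post['likes']['summary']['total_count']
                totals['comment'] += post['comments']['summary']['total_count']
        if not posts:
            break
        url = body.get('paging', {}).get('next', '')
    return totals
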
/article_crawler/instagram_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import json
4 | import datetime
5 | import time
6 |
7 |
8 | def get_by_api():
9 | username = 'insta360official'
10 | url = 'https://www.instagram.com/' + username + '/media/'
11 | now = time.mktime(datetime.date.today().timetuple())
12 | week_ago = now - (3600 * 24 * 7)
13 | today = datetime.datetime.now().strftime('%Y-%m-%d')
14 | like_total = 0
15 | comment_total = 0
16 | view_total = 0
17 | request = urllib2.Request(url = url)
18 | response = urllib2.urlopen(request)
19 | page = response.read()
20 | print page
21 | jsonData = json.loads(page, encoding="utf-8")
22 | data = jsonData['items']
23 | for item in data:
24 | temp = int(item['created_time'])
25 | if temp >= week_ago:
26 | like_total += int(item['likes']['count'])
27 | comment_total += int(item['comments']['count'])
28 | if item.has_key('video_views'):
29 | view_total += int(item['video_views'])
30 |
31 | result = {
32 | 'platform': 'instagram',
33 | 'date': today,
34 | 'comment': comment_total,
35 | 'like': like_total,
36 | 'share': 0,
37 | 'dislike': 0,
38 | 'view': view_total
39 | }
40 | jsonResult = json.dumps(result)
41 | print jsonResult
42 | return jsonResult
43 |
44 |
45 |
46 | def OAuth():
47 | url = 'https://www.instagram.com/oauth/authorize/?client_id=&redirect_uri=http://www.baidu.com&response_type=token'
48 | request = urllib2.Request(url = url)
49 | response = urllib2.urlopen(request)
50 | redirect_url = response.geturl()
51 | request = urllib2.Request(url=redirect_url)
52 | response = urllib2.urlopen(request)
53 | redirect_url = response.geturl()
54 | print redirect_url
55 | # data = json.loads(page, encoding="utf-8")
56 |
57 |
58 | def get_by_request():
59 | username = 'insta360official'
60 | url = 'https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22https%3A%2F%2Fwww.instagram.com%2F' + username + '%2F%22%20and%20xpath%3D%22%2Fhtml%2Fbody%2Fscript%5B1%5D%22&format=json'
61 | headers = {}
62 | headers['Host'] = 'query.yahooapis.com'
63 | headers['Connection'] = 'keep-alive'
64 | headers['Origin'] = 'https://livecounts.net'
65 | headers['Pragma'] = 'no-cache'
66 | headers['Referer'] = 'https://livecounts.net/instagram/cielni'
67 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
68 | request = urllib2.Request(url=url, headers=headers)
69 | response = urllib2.urlopen(request)
70 | page = response.read()
71 | print page
72 | jsonData = json.loads(page, encoding="utf-8")
73 | content = jsonData['query']['results']['script']['content']
74 | print content
75 | content = content[21:-1]
76 | print content
77 | content = json.loads(content, encoding="utf-8")
78 | fans = content['entry_data']['ProfilePage'][0]['user']['followed_by']['count']
79 | print fans
80 | if __name__ == "__main__":
81 | # OAuth()
82 | # get_by_request()
83 | get_by_api()
84 |
--------------------------------------------------------------------------------
/article_crawler/main.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 |
4 | from fb_crawler import get_by_api as get_fb
5 | from weibo_crawler_old import get_by_api as get_sina
6 | from twitter_crawler import get_by_api as get_twitter
7 | from youtube_crawler import YoutubeCrawler
8 | from youku_crawler import get_by_api as get_youku
9 | from weixin_crawler import get_by_request as get_weixin
10 | from instagram_crawler import get_by_api as get_instagram
11 |
12 | def main():
13 | platform = ['facebook', 'weibo', 'twitter', 'youtube', 'youku', 'weixin', 'instagram']
14 | result = []
15 | for i in platform:
16 | data = '{}'
17 |
18 | if i == 'facebook':
19 | data = get_fb()
20 | elif i == 'weibo':
21 | data = get_sina()
22 | elif i == 'twitter':
23 | data = get_twitter()
24 | elif i == 'youtube':
25 | c = YoutubeCrawler()
26 | data = c.main()  # main() fetches the playlist's video ids before aggregating stats
27 | elif i == 'youku':
28 | data = get_youku()
29 | elif i == 'weixin':
30 | data = get_weixin()
31 | elif i == 'instagram':
32 | data = get_instagram()
33 |
34 | data = json.loads(data)
35 | today = datetime.datetime.now().strftime('%Y-%m-%d')
36 | temp = {'platform': i, 'data': data, 'date': today}
37 | result.append(temp)
38 | jsonResult = json.dumps(result)
39 | print jsonResult
40 | return jsonResult
41 |
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------
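main() wraps each crawler's weekly stats with the platform name and the run date; the JSON it prints has this shape (placeholder values shown for illustration only, not real data):

[
    {"platform": "facebook", "date": "YYYY-MM-DD",
     "data": {"platform": "facebook", "date": "YYYY-MM-DD",
              "comment": 0, "like": 0, "share": 0, "dislike": 0, "view": 0}},
    ...
]

One such entry is appended for each platform in ['facebook', 'weibo', 'twitter', 'youtube', 'youku', 'weixin', 'instagram'].
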
/article_crawler/twitter_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import urllib
4 | import json
5 | import time
6 | import datetime
7 | import ssl
8 | from functools import wraps
9 |
10 | def get_by_api():
11 | username = 'insta360'
12 | url = 'https://api.twitter.com/1.1/statuses/user_timeline.json?count=200&trim_user=true&contributor_details=false&exclude_replies=true&include_rts=false&screen_name=' + username
13 | oauth = OAuth()
14 | headers = {}
15 | headers['Host'] = 'api.twitter.com'
16 | headers['X-Target-URI'] = 'https://api.twitter.com'
17 | headers['Content-Type'] = 'application/x-www-form-urlencoded'
18 | headers['Connection'] = 'keep-alive'
19 | headers['Authorization'] = oauth
20 | index = 1
21 | now = time.mktime(datetime.date.today().timetuple())
22 | week_ago = now - (3600 * 24 * 7)
23 | today = datetime.datetime.now().strftime('%Y-%m-%d')
24 | share_total = 0
25 | like_total = 0
26 | while(True):
27 | request = urllib2.Request(url = url + '&page=' + str(index), headers = headers)
28 | response = urllib2.urlopen(request)
29 | page = response.read()
30 | data = json.loads(page, encoding="utf-8")
31 | for item in data:
32 | temp = time.mktime(time.strptime(item['created_at'], "%a %b %d %H:%M:%S +0000 %Y"))
33 | if temp >= week_ago:
34 | share_total += int(item['retweet_count'])
35 | like_total += int(item['favorite_count'])
36 | index += 1
37 | if len(data) == 0:
38 | break
39 | result = {
40 | 'platform': 'twitter',
41 | 'date': today,
42 | 'comment': 0,
43 | 'like': like_total,
44 | 'share': share_total,
45 | 'dislike': 0,
46 | 'view': 0
47 | }
48 | jsonResult = json.dumps(result)
49 | print jsonResult
50 | return jsonResult
51 |
52 |
53 | def sslwrap(func):
54 | @wraps(func)
55 | def bar(*args, **kw):
56 | kw['ssl_version'] = ssl.PROTOCOL_TLSv1  # force TLSv1; ssl._PROTOCOL_NAMES is a dict, not a protocol constant
57 | return func(*args, **kw)
58 | return bar
59 |
60 |
61 | def OAuth():
62 | ssl.wrap_socket = sslwrap(ssl.wrap_socket)
63 | url = 'https://api.twitter.com/oauth2/token'
64 | value = {}
65 | value['grant_type'] = 'client_credentials'
66 | value['client_id'] = ''
67 | value['client_secret'] = ''
68 | data = urllib.urlencode(value)
69 | request = urllib2.Request(url = url, data = data)
70 | response = urllib2.urlopen(request)
71 | page = response.read()
72 | data = json.loads(page, encoding="utf-8")
73 | result = data['token_type'] + ' ' + data['access_token']
74 | return result
75 |
76 | if __name__ == '__main__':
77 | get_by_api()
78 |
--------------------------------------------------------------------------------
/article_crawler/weibo_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | import time
4 | import re
5 | import datetime
6 | import json
7 | import sys
8 | from selenium import webdriver
9 | from selenium.webdriver.support.ui import WebDriverWait
10 | from selenium.common.exceptions import NoSuchElementException, TimeoutException
11 | import selenium.webdriver.support.ui as ui
12 | reload(sys)
13 | sys.setdefaultencoding("utf-8")
14 |
15 | '''
16 | If the installed selenium is too old, upgrade it first:
17 | pip install -U selenium
18 | Otherwise you may hit: WebDriverException: Message: Can't load the profile.
19 | Profile Dir: %s If you specified a log_file in the FirefoxBinary constructor,
20 | check it for details.
21 | '''
22 |
23 | # Start the headless browser first: PhantomJS (or Firefox)
24 | # driver = webdriver.PhantomJS(executable_path="G:\phantomjs-1.9.1-windows\phantomjs.exe")
25 | driver = webdriver.PhantomJS()
26 | # options = webdriver.ChromeOptions()
27 | # options.add_experimental_option("excludeSwitches", ["ignore-certificate-errors"])
28 | # driver = webdriver.Chrome(chrome_options=options)
29 | wait = ui.WebDriverWait(driver, 10)
30 |
31 |
32 | # ********************************************************************************
33 | # Step 1: log in to weibo.cn and collect the Sina Weibo cookies
34 | # This approach works for weibo.cn (the login form is sent in plain text); for weibo.com, set the POST body and headers instead
35 | # LoginWeibo(username, password) takes the account name and password
36 |
37 | # https://www.zhihu.com/question/21451510
38 | # http://www.cnblogs.com/fnng/p/3606934.html
39 | # If a captcha shows up, pause and enter it by hand
40 | # ********************************************************************************
41 |
42 | def LoginWeibo(username, password):
43 | # **********************************************************************
44 | # Visiting driver.get("http://weibo.cn/5824697471") directly redirects to the login page (the number is the user id)
45 | #
46 | # username field
47 | # the digits in the password field id ("password_4903") change between sessions, so an absolute XPath is used; otherwise the element cannot be located
48 | #
49 | # "keep me logged in" is checked by default, so that code is commented out; if the cookie is not kept, 'expiry' is None
50 | # **********************************************************************
51 |
52 | # enter the username/password and log in
53 | print u'Preparing to log in to weibo.cn...'
54 | driver.get("http://weibo.com/")
55 | driver.maximize_window()
56 | wait = WebDriverWait(driver, 10)
57 | try:
58 | wait.until(lambda x: x.find_element_by_id("loginname"))
59 | except TimeoutException:
60 | return
61 |
62 | # time.sleep(5)
63 |
64 | elem_user = driver.find_element_by_id("loginname")
65 | elem_user.send_keys(username) # username
66 | elem_pwd = driver.find_element_by_name("password")
67 | elem_pwd.send_keys(password) # password
68 | # elem_rem = driver.find_element_by_id("login_form_savestate")
69 | # elem_rem.click() # "keep me logged in" is on by default, so this is not needed
70 |
71 | elem_sub = driver.find_element_by_xpath("//*[@id='pl_login_form']/div/div[3]/div[6]/a")
72 | elem_sub.click() # click the login button
73 | time.sleep(5)
74 |
75 | # Grab the cookies; see http://www.cnblogs.com/fnng/p/3269450.html
76 | # print driver.current_url
77 | # print driver.get_cookies() # cookie info, stored as a dict
78 | # print u'Cookie key-value pairs:'
79 | # for cookie in driver.get_cookies():
80 | # #print cookie
81 | # for key in cookie:
82 | # print key, cookie[key]
83 |
84 | # driver.get_cookies() returns a list whose only element is a cookie dict
85 | print u'Login succeeded...'
86 |
87 |
88 | def VisitPersonPage(user_id):
89 |
90 | print u'Opening the profile page...'
91 | driver.get("http://weibo.cn/" + user_id)
92 | result = []
93 | print '\n'
94 | print u'Fetching weibo post details'
95 | num = 1
96 | while num <= 10:
97 | url_wb = "http://weibo.cn/" + user_id + "?filter=0&page=" + str(num)
98 | driver.get(url_wb)
99 | info = driver.find_elements_by_xpath("//div[@class='c']")
100 | for value in info:
101 | print value.text
102 | info = value.text
103 |
104 | # skip the last row, which also has class=c
105 | # Error: 'NoneType' object has no attribute 'groups'
106 | if u'设置:皮肤.图片' not in info:
107 | if info.startswith(u'转发'):
108 | print u'repost'
109 | status = '转发'  # repost
110 | else:
111 | print u'original post'
112 | status = '原创'  # original
113 |
114 | # take the last like count, since a repost carries its own like count at the end
115 | str1 = info.split(u" 赞")[-1]
116 | # print str1
117 | like = 0
118 | if str1:
119 | val1 = re.match(r'\[(.*?)\]', str1).groups()[0]
120 | like = val1
121 |
122 | str2 = info.split(u" 转发")[-1]
123 | share = 0
124 | if str2:
125 | val2 = re.match(r'\[(.*?)\]', str2).groups()[0]
126 | share = val2
127 |
128 | str3 = info.split(u" 评论")[-1]
129 | comment = 0
130 | if str3:
131 | val3 = re.match(r'\[(.*?)\]', str3).groups()[0]
132 | comment = val3
133 |
134 | str4 = info.split(u" 收藏 ")[-1]
135 | flag = str4.find(u"来自")
136 | temp_time = str4[:(flag - 1)]
137 | # print temp_time
138 | created_time = format_time(temp_time)
139 | date = created_time[0:10]
140 | message = info[:info.rindex(u" 赞")]
141 |
142 | try:
143 | url = value.find_element_by_xpath('div[2]/a[1]').get_attribute("href")
144 | except NoSuchElementException:
145 | url = ''
146 | temp = {
147 | 'account': user_id,
148 | 'message': message,
149 | # 'id': item['id_str'],
150 | 'public_time': created_time,
151 | 'date': date,
152 | 'share': share,
153 | 'like': like,
154 | 'comment': comment,
155 | 'link': url,
156 | 'status': status
157 | }
158 | result.append(temp)
159 | else:
160 | break
161 | else:
162 | print u'next page...\n'
163 | num += 1
164 | print '\n\n'
165 | jsonResult = json.dumps(result)
166 | print jsonResult
167 | return jsonResult
168 |
169 | def format_time(string):
170 | now = datetime.datetime.now()
171 | result = now.strftime('%Y-%m-%d %H:%M:%S')
172 | if u'分钟前' in string:
173 | d = int(string[:string.find(u'分')])  # the minute count may be more than one digit
174 | temp = now - datetime.timedelta(minutes= d)
175 | result =temp.strftime('%Y-%m-%d %H:%M:%S')
176 |
177 | elif u'今天' in string:
178 | t = string[-5:]
179 | temp = now.strftime('%Y-%m-%d')
180 | result = temp + ' ' + t + ':00'
181 |
182 | elif u'月' in string:
183 | temp = time.strptime(string, "%m月%d日 %H:%M".decode('utf-8'))
184 | result = str(now.year) + '-' + time.strftime("%m-%d %H:%M:%S", temp)
185 |
186 | elif len(string) == 19:
187 | result = string
188 |
189 |
190 | return result
191 | # *******************************************************************************
192 | # program entry point; call this first
193 | # *******************************************************************************
194 |
195 |
196 | def get_by_selenium():
197 | # variables
198 | username = '####' # your username
199 | password = '####' # your password
200 |
201 | # actions
202 | LoginWeibo(username, password) # log in to Weibo
203 |
204 | # driver.add_cookie({'name':'name', 'value':'_T_WM'})
205 | # driver.add_cookie({'name':'value', 'value':'c86fbdcd26505c256a1504b9273df8ba'})
206 | user_id = 'insta360'
207 | # note
208 | # Sina Weibo added a captcha; log in once through Firefox and enter the captcha there, then run this script again, since the cookies are already saved
209 | # it will then go straight to the target account's weibo page, e.g. http://weibo.cn/guangxianliuyan
210 |
211 | return VisitPersonPage(user_id) # visit the profile page
212 |
213 | if __name__ == '__main__':
214 | get_by_selenium()
215 |
216 |
--------------------------------------------------------------------------------
/article_crawler/weibo_crawler_old.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib
3 | import urllib2
4 | import datetime
5 | import time
6 | import json
7 | import requests
8 | import base64
9 | from selenium import webdriver
10 | from selenium.common.exceptions import TimeoutException
11 | from selenium.webdriver.support.ui import WebDriverWait
12 |
13 | def get_by_api():
14 | url = 'https://api.weibo.com/2/statuses/user_timeline.json?page=1'
15 | username = '**********'
16 | password = '**********'
17 | value = {}
18 | value['trim_user'] = '1'
19 | value['count'] = '100'
20 | value['source'] = '218121934'
21 | data = urllib.urlencode(value)
22 | base64string = base64.encodestring(
23 | '%s:%s' % (username, password))[:-1] # note: encodestring appends a trailing '\n', hence the [:-1]
24 | authheader = "Basic %s" % base64string
25 | header = {}
26 | header['Authorization'] = authheader
27 | now = time.mktime(datetime.date.today().timetuple())
28 | week_ago = now - (3600 * 24 * 7)
29 | today = datetime.datetime.now().strftime('%Y-%m-%d')
30 | share_total = 0
31 | like_total = 0
32 | comment_total = 0
33 | results = requests.get(url=url, params=data, headers=header)
34 | page = results.content
35 | print page
36 | jsonData = json.loads(page, encoding="utf-8")
37 | data = jsonData['statuses']
38 | for item in data:
39 | temp = time.mktime(time.strptime(item['created_at'], "%a %b %d %H:%M:%S +0800 %Y"))
40 | if temp >= week_ago:
41 | share_total += int(item['reposts_count'])
42 | like_total += int(item['attitudes_count'])
43 | comment_total += int(item['comments_count'])
44 | result = {
45 | 'platform': 'weibo',
46 | 'date': today,
47 | 'comment': comment_total,
48 | 'like': like_total,
49 | 'share': share_total,
50 | 'dislike': 0,
51 | 'view': 0
52 | }
53 | jsonResult = json.dumps(result)
54 | print jsonResult
55 | return jsonResult
56 |
57 | def get_by_selenium():
58 | username = 'insta360'
59 | url = 'http://weibo.cn/'+ username
60 | cap = webdriver.DesiredCapabilities.PHANTOMJS
61 | cap["phantomjs.page.settings.resourceTimeout"] = 1000
62 | cap["phantomjs.page.settings.loadImages"] = False
63 | cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True
64 | cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
65 | cap["XSSAuditingEnabled"] = True
66 | cap["host"] = 'weibo.cn'
67 | # cap["cookie"] = '_T_WM=d2e28a98d3031cf98e282a29740b5f24; SUB=_2A2566MQNDeTxGeRJ7VYX8CzFyDmIHXVWEuxFrDV6PUJbkdAKLU_GkW1OqRtS_kr8ak-kdubq12_Bbpo41w..; gsid_CTandWM=4uona6911nQUejIzV9kdEbBcmf5'
68 | driver = webdriver.PhantomJS(desired_capabilities=cap,
69 | service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any',
70 | '--web-security=true'])
71 | # driver = webdriver.Chrome()
72 | driver.get('http://baidu.com')
73 | driver.add_cookie({'name': '_T_WM', 'value': 'd2e28a98d3031cf98e282a29740b5f24'})
74 | driver.add_cookie({'name': 'SUB', 'value': '_2A2566MQNDeTxGeRJ7VYX8CzFyDmIHXVWEuxFrDV6PUJbkdAKLU_GkW1OqRtS_kr8ak-kdubq12_Bbpo41w..'})
75 | driver.add_cookie({'name': 'gsid_CTandWM', 'value': '4uona6911nQUejIzV9kdEbBcmf5'})
76 | driver.get(url)
77 |
78 | wait = WebDriverWait(driver, 20)
79 | print driver.page_source
80 | try:
81 | result = int(
82 | wait.until(lambda x: x.find_element_by_xpath('/html/body/div[3]/div/a[2]').text[3:-1]))
83 | except TimeoutException:
84 | result = 0
85 | print result
86 | time.sleep(10)
87 | driver.quit()
88 | return result
89 |
90 | if __name__ == "__main__":
91 | get_by_api()
92 |
--------------------------------------------------------------------------------
/article_crawler/weixin_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import urllib
4 | import json
5 | import time
6 | import datetime
7 |
8 |
9 | def get_by_request():
10 | username = 'Insta360_official'
11 | url = 'http://www.newrank.cn/xdnphb/detail/getAccountArticle'
12 | headers = {}
13 | headers['Host'] = 'www.newrank.cn'
14 | headers['Referer'] = 'http://www.newrank.cn/public/info/detail.html?account=' + username
15 | headers['Cookie'] = 'userFaceTip=userFaceTip; CNZZDATA1253878005=1419576409-1475115174-%7C1475115174; Hm_lvt_a19fd7224d30e3c8a6558dcb38c4beed=1475116869; Hm_lpvt_a19fd7224d30e3c8a6558dcb38c4beed=1475116869; userFaceTip=userFaceTip'
16 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
17 | headers['X-Requested-With'] = 'XMLHttpRequest'
18 | headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
19 |
20 | value = {}
21 | value['flag'] = 'true'
22 | value['uuid'] = '91B514A33A4D2FA4C1E923ABDA595A90'
23 | value['nonce'] = '3679c0e73'
24 | value['xyz'] = '6cdb1d7fbdeea8afe76a21479f46f0b2'
25 | data = urllib.urlencode(value)
26 | request = urllib2.Request(url = url,data = data, headers = headers)
27 | response = urllib2.urlopen(request)
28 | page = response.read()
29 | now = time.mktime(datetime.date.today().timetuple())
30 | week_ago = now - (3600 * 24 * 7)
31 | today = datetime.datetime.now().strftime('%Y-%m-%d')
32 | like_total = 0
33 | view_total = 0
34 | result = json.loads(page, encoding="utf-8")
35 | articles = result['value']['lastestArticle']
36 | for article in articles:
37 | temp = time.mktime(time.strptime(article['publicTime'], "%Y-%m-%d %H:%M:%S"))
38 | if temp >= week_ago:
39 | view_total += int(article['clicksCount'])
40 | like_total += int(article['likeCount'])
41 | result = {
42 | 'platform': 'weixin',
43 | 'date': today,
44 | 'comment': 0,
45 | 'like': like_total,
46 | 'share': 0,
47 | 'dislike': 0,
48 | 'view': view_total
49 | }
50 | jsonResult = json.dumps(result)
51 | print jsonResult
52 | return jsonResult
53 |
54 | if __name__ == '__main__':
55 | get_by_request()
--------------------------------------------------------------------------------
/article_crawler/youku_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | import urllib2
4 | import json
5 | import time
6 | import datetime
7 |
8 |
9 | def get_by_api():
10 | client_id = ''
11 | username = 'Insta360'
12 | url = 'https://openapi.youku.com/v2/videos/by_user.json?client_id=' + client_id + '&user_name=' + username + '&count=20'
13 | now = time.mktime(datetime.date.today().timetuple())
14 | week_ago = now - (3600 * 24 * 7)
15 | today = datetime.datetime.now().strftime('%Y-%m-%d')
16 | view_total = 0
17 | like_total = 0
18 | dislike_total = 0
19 | comment_total = 0
20 | request = urllib2.Request(url = url)
21 | response = urllib2.urlopen(request)
22 | page = response.read()
23 | jsonData = json.loads(page, encoding="utf-8")
24 | data = jsonData['videos']
25 | for item in data:
26 | temp = time.mktime(time.strptime(item['published'], "%Y-%m-%d %H:%M:%S"))
27 | if temp >= week_ago:
28 | view_total += int(item['view_count'])
29 | dislike_total += int(item['down_count'])
30 | like_total += int(item['up_count'])
31 | comment_total += int(item['comment_count'])
32 |
33 | result = {
34 | 'platform': 'youku',
35 | 'date': today,
36 | 'comment': comment_total,
37 | 'like': like_total,
38 | 'share': 0,
39 | 'dislike': dislike_total,
40 | 'view': view_total
41 | }
42 | jsonResult = json.dumps(result)
43 | print jsonResult
44 | return jsonResult
45 |
46 | if __name__ == '__main__':
47 | get_by_api()
48 |
49 |
--------------------------------------------------------------------------------
/article_crawler/youtube_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import time
4 | import json
5 | import datetime
6 | import requests
7 |
8 |
9 | class YoutubeCrawler:
10 | def __init__(self):
11 | self.video_ids = []
12 | self.maxResults = 50
13 | playlist_id = ''
14 | self.app_key = ''
15 | self.list_api = 'https://www.googleapis.com/youtube/v3/playlistItems?maxResults=' + str(self.maxResults) + '&part=snippet&playlistId=' + playlist_id + '&key=' + self.app_key
16 | # self.info_api = 'https://www.googleapis.com/youtube/v3/videos?maxResults=50&part=snippet,statistics' + '&key=' + self.app_key
17 | self.info_api = 'https://www.googleapis.com/youtube/v3/videos'
18 | now = time.mktime(datetime.date.today().timetuple())
19 | self.week_ago = now - (3600 * 24 * 7)
20 | self.view_total = 0
21 | self.like_total = 0
22 | self.dislike_total = 0
23 | self.comment_total = 0
24 | def main(self):
25 | self.get_video_ids()
26 | return self.get_videos_info()
27 |
28 | def get_video_ids(self):
29 | url = self.list_api
30 | request = urllib2.Request(url=url)
31 | response = urllib2.urlopen(request)
32 | page = response.read()
33 | result = json.loads(page, encoding="utf-8")
34 | # total = int(result['pageInfo']['totalResults'])
35 | # perPage = int(result['pageInfo']['resultsPerPage'])
36 | # self.totalPage = (total/perPage) + (0 if (total%perPage)==0 else 1)
37 | videos = result['items']
38 | for video in videos:
39 | self.video_ids.append(video['snippet']['resourceId']['videoId'])
40 |
41 | while(result.has_key('nextPageToken')):
42 | url = self.list_api + '&pageToken=' + result['nextPageToken']
43 | request = urllib2.Request(url=url)
44 | response = urllib2.urlopen(request)
45 | page = response.read()
46 | result = json.loads(page, encoding="utf-8")
47 | videos = result['items']
48 | for video in videos:
49 | self.video_ids.append(video['snippet']['resourceId']['videoId'])
50 |
51 |
52 | def get_videos_info(self):
53 | url = self.info_api
54 | query = ''
55 | count = 0
56 | for i in self.video_ids:
57 | count += 1
58 | query = query + i + ','
59 | if count % self.maxResults == 0 or count == len(self.video_ids):
60 | query = query[:-1]
61 | results = requests.get(url,
62 | params={'id': query, 'maxResults': self.maxResults, 'part': 'snippet,statistics', 'key': self.app_key})
63 | page = results.content
64 | videos = json.loads(page, encoding="utf-8")['items']
65 | for video in videos:
66 | try:
67 | like_count = int(video['statistics']['likeCount'])
68 | except KeyError:
69 | like_count = 0
70 | try:
71 | dislike_count = int(video['statistics']['dislikeCount'])
72 | except KeyError:
73 | dislike_count = 0
74 | temp = time.mktime(time.strptime(video['snippet']['publishedAt'], "%Y-%m-%dT%H:%M:%S.000Z"))
75 | if temp >= self.week_ago:
76 | self.dislike_total += dislike_count
77 | self.like_total += like_count
78 | self.comment_total += int(video['statistics']['commentCount'])
79 | self.view_total += int(video['statistics']['viewCount'])
80 | query = ''
81 | today = datetime.datetime.now().strftime('%Y-%m-%d')
82 | result = {
83 | 'platform': 'youtube',
84 | 'date': today,
85 | 'comment': self.comment_total,
86 | 'like': self.like_total,
87 | 'share': 0,
88 | 'dislike': self.dislike_total,
89 | 'view': self.view_total
90 | }
91 | jsonResult = json.dumps(result)
92 | print jsonResult
93 | return jsonResult
94 |
95 | if __name__ == "__main__":
96 | c = YoutubeCrawler()
97 | c.main()
98 |
--------------------------------------------------------------------------------
/fans_crawler/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/fans_crawler/.DS_Store
--------------------------------------------------------------------------------
/fans_crawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/NiShuang/new_media_fans_cralwer/e86382f153bfd9079623b8cafcb4563a9487013e/fans_crawler/__init__.py
--------------------------------------------------------------------------------
/fans_crawler/fb_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import re
3 | import urllib2
4 | import json
5 | from selenium import webdriver
6 | from selenium.common.exceptions import TimeoutException
7 | from selenium.webdriver.support.ui import WebDriverWait
8 | import ssl
9 | from functools import wraps
10 |
11 |
12 | def get_by_api():
13 | app_id = ''
14 | app_secret = ''
15 | access_token = app_id + '|' + app_secret
16 | username = 'Insta360VRVideoCamera'
17 | url = 'https://graph.facebook.com/' + username + '/?fields=fan_count&access_token=' + access_token
18 | headers = {}
19 | headers['Host'] = 'graph.facebook.com'
20 | headers['Connection'] = 'keep-alive'
21 | headers['Upgrade-Insecure-Requests'] = '1'
22 | headers['Cache-Control'] = 'max-age=0'
23 |
24 | request = urllib2.Request(url = url, headers = headers)
25 | response = urllib2.urlopen(request)
26 | page = response.read()
27 | # print page
28 | jsonData = json.loads(page, encoding="utf-8")
29 | fans = jsonData['fan_count']
30 | print fans
31 | return fans
32 |
33 |
34 | def sslwrap(func):
35 | @wraps(func)
36 | def bar(*args, **kw):
37 | kw['ssl_version'] = ssl.PROTOCOL_TLSv1  # force TLSv1; ssl._PROTOCOL_NAMES is a dict, not a protocol constant
38 | return func(*args, **kw)
39 | return bar
40 |
41 |
42 | def get_by_request():
43 | ssl.wrap_socket = sslwrap(ssl.wrap_socket)
44 | username = 'Insta360VRVideoCamera'
45 | url = 'https://www.facebook.com/plugins/fan.php?id=' + username
46 | headers = {}
47 | headers['Host'] = 'www.facebook.com'
48 | # headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
49 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
50 | # headers['Connection'] = 'keep-alive'
51 | # headers['Upgrade-Insecure-Requests'] = '1'
52 | request = urllib2.Request(url = url,headers=headers)
53 | response = urllib2.urlopen(request)
54 | page = response.read()
55 | pattern = re.compile("(.{0,10})\s", re.S)
56 | items = re.findall(pattern, page)
57 | # print page.decode("UTF-8")
58 | fans = int(items[0].replace(',',''))
59 | print fans
60 | return fans
61 |
62 |
63 | def get_by_selenium():
64 | username = 'Insta360VRVideoCamera'
65 | url = 'https://www.facebook.com/plugins/fan.php?id=' + username
66 | cap = webdriver.DesiredCapabilities.PHANTOMJS
67 | cap["phantomjs.page.settings.resourceTimeout"] = 1000
68 | cap["phantomjs.page.settings.loadImages"] = False
69 | cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True
70 | cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
71 | cap["XSSAuditingEnabled"] = True
72 | driver = webdriver.PhantomJS(desired_capabilities=cap,
73 | service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any',
74 | '--web-security=true'])
75 | # driver = webdriver.Chrome()
76 | driver.get(url)
77 | wait = WebDriverWait(driver, 20)
78 | # print driver.page_source
79 | try:
80 | string = wait.until(lambda x: x.find_elements_by_class_name('_1drq')[0].text)
81 | except TimeoutException:
82 | string = 0
83 | pattern = re.compile("\d", re.S)
84 | items = re.findall(pattern, string)
85 | temp = ''
86 | for item in items:
87 | temp += item
88 | fans = int(temp)
89 | print fans
90 | driver.quit()
91 | return fans
92 |
93 | if __name__ == "__main__":
94 | # get_by_request()
95 | # get_by_selenium()
96 | get_by_api()
97 |
--------------------------------------------------------------------------------
/fans_crawler/instagram_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import json
4 | import urllib
5 |
6 |
7 | def get_by_api():
8 | user_id = ''
9 | access_token = ''
10 | url = 'https://api.instagram.com/v1/users/' + user_id + '/?access_token=' + access_token
11 | # oauth = OAuth()
12 | request = urllib2.Request(url = url)
13 | response = urllib2.urlopen(request)
14 | page = response.read()
15 | print page
16 | data = json.loads(page, encoding="utf-8")
17 | fans = data['data']['counts']['followed_by']
18 | print fans
19 | return fans
20 |
21 |
22 | def OAuth():
23 | url = 'https://www.instagram.com/oauth/authorize/?client_id=a84f3a3ec8c44dfbbe9d2e3f07dc9c97&redirect_uri=http://www.baidu.com&response_type=token'
24 | request = urllib2.Request(url = url)
25 | response = urllib2.urlopen(request)
26 | redirect_url = response.geturl()
27 | request = urllib2.Request(url=redirect_url)
28 | response = urllib2.urlopen(request)
29 | redirect_url = response.geturl()
30 | print redirect_url
31 | # data = json.loads(page, encoding="utf-8")
32 |
33 |
34 | def get_by_request():
35 | username = 'insta360official'
36 | url = 'https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20html%20where%20url%3D%22https%3A%2F%2Fwww.instagram.com%2F' + username + '%2F%22%20and%20xpath%3D%22%2Fhtml%2Fbody%2Fscript%5B1%5D%22&format=json'
37 | headers = {}
38 | headers['Host'] = 'query.yahooapis.com'
39 | headers['Connection'] = 'keep-alive'
40 | headers['Origin'] = 'https://livecounts.net'
41 | headers['Pragma'] = 'no-cache'
42 | headers['Referer'] = 'https://livecounts.net/instagram/cielni'
43 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
44 | request = urllib2.Request(url=url, headers=headers)
45 | response = urllib2.urlopen(request)
46 | page = response.read()
47 | print page
48 | jsonData = json.loads(page, encoding="utf-8")
49 | content = jsonData['query']['results']['script']['content']
50 | print content
51 | content = content[21:-1]
52 | print content
53 | content = json.loads(content, encoding="utf-8")
54 | fans = content['entry_data']['ProfilePage'][0]['user']['followed_by']['count']
55 | print fans
56 | return fans  # fans_crawler/main.py expects the count to be returned
57 | if __name__ == "__main__":
58 | # OAuth()
59 | # get_by_request()
60 | get_by_api()
61 | 
--------------------------------------------------------------------------------
/fans_crawler/main.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import json
3 |
4 | from fb_crawler import get_by_api as get_fb_fans
5 | from weibo_crawler import get_by_request as get_sina_fans
6 | from twitter_crawler import get_by_request as get_twitter_fans
7 | from youtube_crawler import get_by_api as get_youtube_fans
8 | from youku_crawler import get_by_api as get_youku_fans
9 | from weixin_crawler import get_by_api as get_weixin_fans
10 | from instagram_crawler import get_by_request as get_instagram_fans
11 |
12 | def main():
13 | platform = ['facebook', 'weibo', 'twitter', 'youtube', 'youku', 'weixin', 'instagram']
14 | result = []
15 | for i in platform:
16 | fans = 0
17 |
18 | if i == 'facebook':
19 | fans = get_fb_fans()
20 | elif i == 'weibo':
21 | fans = get_sina_fans()
22 | elif i == 'twitter':
23 | fans = get_twitter_fans()
24 | elif i == 'youtube':
25 | fans = get_youtube_fans()
26 | elif i == 'youku':
27 | fans = get_youku_fans()
28 | elif i == 'weixin':
29 | fans = get_weixin_fans()
30 | elif i == 'instagram':
31 | fans = get_instagram_fans()
32 |
33 | today = datetime.datetime.now().strftime('%Y-%m-%d')
34 | temp = {'platform': i, 'fans': fans, 'date': today}
35 | result.append(temp)
36 | jsonResult = json.dumps(result)
37 | print jsonResult
38 | return jsonResult
39 |
40 |
41 | if __name__ == "__main__":
42 | main()
43 |
--------------------------------------------------------------------------------
/fans_crawler/twitter_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import json
4 | import ssl
5 | import urllib
6 | from functools import wraps
7 |
8 |
9 | def get_by_request():
10 | username = 'insta360'
11 | url = 'https://cdn.syndication.twimg.com/widgets/followbutton/info.json?screen_names=' + username
12 | # headers = {}
13 | # headers['Host'] = 'www.facebook.com'
14 | # headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
15 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
16 | # headers['Connection'] = 'keep-alive'
17 | # headers['Upgrade-Insecure-Requests'] = '1'
18 | request = urllib2.Request(url = url)
19 | response = urllib2.urlopen(request)
20 | page = response.read()
21 | result = json.loads(page, encoding="utf-8")
22 | fans = result[0]['followers_count']
23 | print fans
24 | return fans
25 |
26 |
27 | def get_by_api():
28 | username = 'insta360'
29 | url = 'https://api.twitter.com/1.1/users/show.json?include_entities=false&screen_name=' + username
30 | oauth = OAuth()
31 | headers = {}
32 | headers['Host'] = 'api.twitter.com'
33 | headers['X-Target-URI'] = 'https://api.twitter.com'
34 | headers['Content-Type'] = 'application/x-www-form-urlencoded'
35 | headers['Connection'] = 'keep-alive'
36 | headers['Authorization'] = oauth
37 | request = urllib2.Request(url = url, headers = headers)
38 | response = urllib2.urlopen(request)
39 | page = response.read()
40 | print page
41 | data = json.loads(page, encoding="utf-8")
42 | fans = data['followers_count']
43 | print fans
44 | return fans
45 |
46 |
47 | def OAuth():
48 | ssl.wrap_socket = sslwrap(ssl.wrap_socket)
49 | url = 'https://api.twitter.com/oauth2/token'
50 | value = {}
51 | value['grant_type'] = 'client_credentials'
52 | value['client_id'] = ''
53 | value['client_secret'] = ''
54 | data = urllib.urlencode(value)
55 | request = urllib2.Request(url = url, data = data)
56 | response = urllib2.urlopen(request)
57 | page = response.read()
58 | data = json.loads(page, encoding="utf-8")
59 | result = data['token_type'] + ' ' + data['access_token']
60 | return result
61 |
62 |
63 | def sslwrap(func):
64 | @wraps(func)
65 | def bar(*args, **kw):
66 | kw['ssl_version'] = ssl.PROTOCOL_TLSv1  # force TLSv1; ssl._PROTOCOL_NAMES is a dict, not a protocol constant
67 | return func(*args, **kw)
68 | return bar
69 |
70 |
71 | if __name__ == "__main__":
72 | get_by_request()
73 | # get_by_api()
74 |
--------------------------------------------------------------------------------
/fans_crawler/weibo_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import re
3 | import urllib2
4 | from selenium import webdriver
5 | from selenium.common.exceptions import TimeoutException
6 | from selenium.webdriver.support.ui import WebDriverWait
7 |
8 | def get_by_request():
9 | username = 'insta360'
10 | url = 'http://weibo.cn/'+ username
11 | headers = {}
12 | headers['Host'] = 'weibo.cn'
13 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
14 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
15 | headers['Cookie'] = '_T_WM=d2e28a98d3031cf98e282a29740b5f24'
16 | # headers['Connection'] = 'keep-alive'
17 | # headers['Upgrade-Insecure-Requests'] = '1'
18 | request = urllib2.Request(url = url, headers=headers)
19 | response = urllib2.urlopen(request)
20 | page = response.read()
21 | pattern = re.compile("\[(.{0,10})\]<\\\\\/h2>", re.S)
22 | items = re.findall(pattern, page)
23 | # print page.decode("UTF-8")
24 | fans = int(items[1])
25 | print fans
26 | return fans
27 |
28 |
29 | def get_by_selenium():
30 | username = 'insta360'
31 | url = 'http://weibo.com/'+ username + '?is_all=1'
32 | # cap = webdriver.DesiredCapabilities.PHANTOMJS
33 | # cap["phantomjs.page.settings.resourceTimeout"] = 1000
34 | # cap["phantomjs.page.settings.loadImages"] = False
35 | # cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True
36 | # cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
37 | # cap["XSSAuditingEnabled"] = True
38 | # cap["host"] = 'weibo.com'
39 | # cap["cookie"] = 'UOR=www.umeng.com,widget.weibo.com,www.insta360.com; SINAGLOBAL=6982249232630.452.1472299450582; ULV=1475028466086:3:2:2:8231266012653.427.1475028466020:1474966940284; SUB=_2AkMgtrrUf8NhqwJRmP0czWrmZY53wgjEieLBAH7sJRMxHRl-yT83qm8AtRCo0NEVwCee4iQkVabYZqZ8gEhMng..; SUBP=0033WrSXqPxfM72-Ws9jqgMF55529P9D9WWT6ckK7WZ-8GkEahm6SKw1; TC-Page-G0=0cd4658437f38175b9211f1336161d7d; _s_tentry=-; Apache=8231266012653.427.1475028466020'
40 | # driver = webdriver.PhantomJS(desired_capabilities=cap,
41 | # service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any',
42 | # '--web-security=true'])
43 | driver = webdriver.Chrome()
44 | driver.get(url)
45 | wait = WebDriverWait(driver, 20)
46 | # print driver.page_source
47 | try:
48 | result = int(
49 | wait.until(lambda x: x.find_element_by_xpath('//*[@id="Pl_Core_T8CustomTriColumn__3"]/div/div/div/table/tbody/tr/td[2]/strong').text))
50 | except TimeoutException:
51 | result = 0
52 | print result
53 | driver.quit()
54 | return result
55 |
56 | if __name__ == "__main__":
57 | get_by_request()
58 | # get_by_selenium()
--------------------------------------------------------------------------------
/fans_crawler/weixin_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import json
4 | import ssl
5 | import urllib
6 |
7 |
8 | def get_by_api():
9 | url = 'https://api.weixin.qq.com/cgi-bin/user/get'
10 | token = get_token()
11 | value = {}
12 | value['access_token'] = token
13 | value['next_openid'] = ''
14 | data = urllib.urlencode(value)
15 | request = urllib2.Request(url = url, data = data)
16 | response = urllib2.urlopen(request)
17 | page = response.read()
18 | print page
19 | data = json.loads(page, encoding="utf-8")
20 | fans = 0
21 | try:
22 | fans = data['total']
23 | except KeyError:
24 | pass
25 | print fans
26 | return fans
27 |
28 |
29 | def get_token():
30 | url = 'https://api.weixin.qq.com/cgi-bin/token'
31 | value = {}
32 | value['grant_type'] = 'client_credential'
33 | value['appid'] = ''
34 | value['secret'] = ''
35 | data = urllib.urlencode(value)
36 | request = urllib2.Request(url = url, data = data)
37 | response = urllib2.urlopen(request)
38 | page = response.read()
39 | print page
40 | data = json.loads(page, encoding="utf-8")
41 | result = ''
42 | try:
43 | result = data['access_token']
44 | except KeyError:
45 | pass
46 | return result
47 |
48 | if __name__ == "__main__":
49 | get_by_api()
50 |
--------------------------------------------------------------------------------
/fans_crawler/youku_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import re
4 | import json
5 | from selenium import webdriver
6 | from selenium.common.exceptions import TimeoutException
7 | from selenium.webdriver.support.ui import WebDriverWait
8 |
9 | def get_by_request():
10 | url = 'http://i.youku.com/i/UMjk1ODg3NDgwOA=='
11 | headers = {}
12 | headers['Host'] = 'i.youku.com'
13 | headers['Referer'] = 'http://www.insta360.com/'
14 | headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
15 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
16 | # headers['Connection'] = 'keep-alive'
17 | headers['Upgrade-Insecure-Requests'] = '1'
18 | request = urllib2.Request(url = url, headers=headers)
19 | response = urllib2.urlopen(request)
20 | page = response.read()
21 | # print page
22 | pattern = re.compile("", re.S)
23 | items = re.findall(pattern, page)
24 | fans = int(items[0])
25 | print fans
26 | return fans
27 |
28 | def get_by_api():
29 | url = 'https://openapi.youku.com/v2/users/friendship/followers.json?client_id=&user_id='
30 | request = urllib2.Request(url = url)
31 | response = urllib2.urlopen(request)
32 | page = response.read()
33 | result = json.loads(page, encoding="utf-8")
34 | fans = int(result['total'])
35 | print fans
36 | return fans
37 |
38 | def get_by_selenium():
39 | url = 'http://i.youku.com/i/UMjk1ODg3NDgwOA=='
40 | cap = webdriver.DesiredCapabilities.PHANTOMJS
41 | cap["phantomjs.page.settings.resourceTimeout"] = 1000
42 | cap["phantomjs.page.settings.loadImages"] = False
43 | cap["phantomjs.page.settings.localToRemoteUrlAccessEnabled"] = True
44 | cap["userAgent"] = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0"
45 | cap["XSSAuditingEnabled"] = True
46 | driver = webdriver.PhantomJS(desired_capabilities=cap,
47 | service_args=['--ignore-ssl-errors=true', '--ssl-protocol=any',
48 | '--web-security=true'])
49 | # driver = webdriver.Chrome()
50 | driver.get(url)
51 | wait = WebDriverWait(driver, 20)
52 | print driver.page_source
53 | try:
54 | fans = int(wait.until(lambda x: x.find_elements_by_class_name('snum')[0].find_element_by_xpath('em').text))
55 | except TimeoutException:
56 | fans = 0
57 | print fans
58 | driver.quit()
59 | return fans
60 |
61 | if __name__ == "__main__":
62 | get_by_api()
63 | # get_by_selenium()
64 |
--------------------------------------------------------------------------------
/fans_crawler/youtube_crawler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 | import urllib2
3 | import re
4 | import json
5 | from selenium import webdriver
6 | from selenium.common.exceptions import TimeoutException
7 | from selenium.webdriver.support.ui import WebDriverWait
8 |
9 | def get_by_request():
10 | url = 'https://www.youtube.com/channel/UC3qWcF49rv8VMZO7Vg6kj5w'
11 | headers = {}
12 | headers['Host'] = 'www.youtube.com'
13 | headers['Referer'] = 'http://www.insta360.com/'
14 | # headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0'
15 | # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
16 | # headers['Connection'] = 'keep-alive'
17 | # headers['Upgrade-Insecure-Requests'] = '1'
18 | request = urllib2.Request(url = url, headers=headers)
19 | response = urllib2.urlopen(request)
20 | page = response.read()
21 | # print page
22 | pattern = re.compile("subscribers\">(.*)