├── FlaskProject
├── .idea
│ ├── .gitignore
│ ├── flaskProject.iml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ └── modules.xml
├── __pycache__
│ └── app.cpython-38.pyc
├── app.py
├── data.txt
├── static
│ ├── css
│ │ └── style.css
│ ├── img
│ │ ├── generated_leaf.jpg
│ │ ├── generated_tree.jpg
│ │ ├── icon.png
│ │ ├── leaf.jpg
│ │ └── tree.jpg
│ ├── js
│ │ ├── echarts.min.js
│ │ └── main.js
│ └── vendor
│ │ ├── animate.css
│ │ │ ├── animate.css
│ │ │ └── animate.min.css
│ │ ├── aos
│ │ │ ├── aos.css
│ │ │ └── aos.js
│ │ ├── bootstrap
│ │ │ ├── css
│ │ │ │ ├── bootstrap-grid.css
│ │ │ │ ├── bootstrap-grid.css.map
│ │ │ │ ├── bootstrap-grid.min.css
│ │ │ │ ├── bootstrap-grid.min.css.map
│ │ │ │ ├── bootstrap-reboot.css
│ │ │ │ ├── bootstrap-reboot.css.map
│ │ │ │ ├── bootstrap-reboot.min.css
│ │ │ │ ├── bootstrap-reboot.min.css.map
│ │ │ │ ├── bootstrap.css
│ │ │ │ ├── bootstrap.css.map
│ │ │ │ ├── bootstrap.min.css
│ │ │ │ └── bootstrap.min.css.map
│ │ │ └── js
│ │ │ │ ├── bootstrap.bundle.js
│ │ │ │ ├── bootstrap.bundle.js.map
│ │ │ │ ├── bootstrap.bundle.min.js
│ │ │ │ ├── bootstrap.bundle.min.js.map
│ │ │ │ ├── bootstrap.js
│ │ │ │ ├── bootstrap.js.map
│ │ │ │ ├── bootstrap.min.js
│ │ │ │ └── bootstrap.min.js.map
│ │ ├── boxicons
│ │ │ ├── css
│ │ │ │ ├── animations.css
│ │ │ │ ├── boxicons.css
│ │ │ │ ├── boxicons.min.css
│ │ │ │ └── transformations.css
│ │ │ └── fonts
│ │ │ │ ├── boxicons.eot
│ │ │ │ ├── boxicons.svg
│ │ │ │ ├── boxicons.ttf
│ │ │ │ ├── boxicons.woff
│ │ │ │ └── boxicons.woff2
│ │ ├── counterup
│ │ │ └── counterup.min.js
│ │ ├── icofont
│ │ │ ├── fonts
│ │ │ │ ├── icofont.woff
│ │ │ │ └── icofont.woff2
│ │ │ └── icofont.min.css
│ │ ├── isotope-layout
│ │ │ ├── isotope.pkgd.js
│ │ │ └── isotope.pkgd.min.js
│ │ ├── jquery-sticky
│ │ │ └── jquery.sticky.js
│ │ ├── jquery.easing
│ │ │ └── jquery.easing.min.js
│ │ ├── jquery
│ │ │ ├── jquery.min.js
│ │ │ └── jquery.min.map
│ │ ├── php-email-form
│ │ │ └── validate.js
│ │ ├── venobox
│ │ │ ├── venobox.css
│ │ │ ├── venobox.js
│ │ │ └── venobox.min.js
│ │ └── waypoints
│ │ │ └── jquery.waypoints.min.js
├── templates
│ ├── index.html
│ ├── movie.html
│ ├── score.html
│ └── word.html
└── wordCloud.py
├── Master
├── .idea
│ ├── .gitignore
│ ├── Master.iml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ └── modules.xml
└── main.py
├── Pic
├── index.jpg
├── mongoDB_data.jpg
├── movies.jpg
├── proxy.jpg
├── redis_data.jpg
├── score.jpg
├── slave.jpg
└── words.jpg
├── README.md
├── Slave
├── .idea
│ ├── .gitignore
│ ├── Slave.iml
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ └── modules.xml
├── movies
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ ├── middlewares.cpython-38.pyc
│ │ ├── pipelines.cpython-38.pyc
│ │ └── settings.cpython-38.pyc
│ ├── items.py
│ ├── middlewares.py
│ ├── pipelines.py
│ ├── settings.py
│ ├── spiders
│ │ ├── __init__.py
│ │ ├── __pycache__
│ │ │ ├── __init__.cpython-38.pyc
│ │ │ └── douban_redis.cpython-38.pyc
│ │ └── douban_redis.py
│ └── start.py
└── scrapy.cfg
└── proxy_pool
├── .idea
│ ├── .gitignore
│ ├── inspectionProfiles
│ │ └── profiles_settings.xml
│ ├── misc.xml
│ ├── modules.xml
│ └── proxy_pool.iml
├── __pycache__
│ └── setting.cpython-38.pyc
├── api
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ └── proxyApi.cpython-38.pyc
│ └── proxyApi.py
├── db
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ ├── dbClient.cpython-38.pyc
│ │ └── redisClient.cpython-38.pyc
│ ├── dbClient.py
│ ├── redisClient.py
│ └── ssdbClient.py
├── docs
│ ├── Makefile
│ ├── changelog.rst
│ ├── conf.py
│ ├── dev
│ │ ├── ext_fetcher.rst
│ │ ├── ext_validator.rst
│ │ └── index.rst
│ ├── index.rst
│ ├── make.bat
│ └── user
│ │ ├── how_to_config.rst
│ │ ├── how_to_run.rst
│ │ ├── how_to_use.rst
│ │ └── index.rst
├── fetcher
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ └── proxyFetcher.cpython-38.pyc
│ └── proxyFetcher.py
├── handler
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ ├── configHandler.cpython-38.pyc
│ │ ├── logHandler.cpython-38.pyc
│ │ └── proxyHandler.cpython-38.pyc
│ ├── configHandler.py
│ ├── logHandler.py
│ └── proxyHandler.py
├── helper
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ ├── check.cpython-38.pyc
│ │ ├── fetch.cpython-38.pyc
│ │ ├── launcher.cpython-38.pyc
│ │ ├── proxy.cpython-38.pyc
│ │ ├── scheduler.cpython-38.pyc
│ │ └── validator.cpython-38.pyc
│ ├── check.py
│ ├── fetch.py
│ ├── launcher.py
│ ├── proxy.py
│ ├── scheduler.py
│ └── validator.py
├── proxyPool.py
├── requirements.txt
├── setting.py
├── test
│ ├── __init__.py
│ ├── testConfigHandler.py
│ ├── testDbClient.py
│ ├── testLogHandler.py
│ ├── testProxyClass.py
│ ├── testProxyFetcher.py
│ ├── testProxyValidator.py
│ ├── testRedisClient.py
│ └── testSsdbClient.py
└── util
│ ├── __init__.py
│ ├── __pycache__
│ │ ├── __init__.cpython-38.pyc
│ │ ├── lazyProperty.cpython-38.pyc
│ │ ├── singleton.cpython-38.pyc
│ │ ├── six.cpython-38.pyc
│ │ └── webRequest.cpython-38.pyc
│ ├── lazyProperty.py
│ ├── singleton.py
│ ├── six.py
│ └── webRequest.py
/FlaskProject/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/FlaskProject/.idea/flaskProject.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
18 |
19 |
--------------------------------------------------------------------------------
/FlaskProject/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/FlaskProject/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/FlaskProject/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/FlaskProject/__pycache__/app.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/__pycache__/app.cpython-38.pyc
--------------------------------------------------------------------------------
/FlaskProject/app.py:
--------------------------------------------------------------------------------
1 | from flask import Flask, render_template, make_response,jsonify
2 | import pymongo
3 |
4 | app = Flask(__name__)
5 |
6 | def myCollection():
7 | client = pymongo.MongoClient(host='localhost', port=27017)
8 | db = client.movies
9 | collection = db.douban
10 | return collection
11 |
12 | @app.route('/')
13 | def home():
14 | return index()
15 |
16 | @app.route('/index')
17 | def index():
18 | #电影 评分 词汇 团队成员
19 | movies_num = 0
20 | votes_num = 0
21 | words_num = 11655
22 | team_num = 8
23 | for item in myCollection().find():
24 | movies_num += 1
25 | votes_num += int(item['comment_num'])
26 | votes_num = int(votes_num / 10000)
27 | return render_template("index.html",movies_num=movies_num,votes_num=votes_num,words_num=words_num,team_num=team_num)
28 |
29 | @app.route('/movie')
30 | def movie():
31 | query = {}
32 | projection = {}
33 |
34 | projection["rank"] = u"$rank"
35 | projection["page_url"] = u"$page_url"
36 | projection["title"] = u"$title"
37 | projection["score"] = u"$score"
38 | projection["comment_num"] = u"$comment_num"
39 | projection["directedBy"] = u"$directedBy"
40 | projection["actors"] = u"$actors"
41 | projection["comment"] = u"$comment"
42 | projection["year"] = u"$year"
43 | projection["_id"] = 0
44 |
45 | cursor = myCollection().find(query, projection=projection)
46 | movies = []
47 | for doc in cursor:
48 | movies.append({
49 | 'rank': int(doc['rank']),
50 | 'link': doc['page_url'],
51 | 'title': doc['title'],
52 | 'score': doc['score'],
53 | 'comment_num': doc['comment_num'],
54 | 'directed_by': doc['directedBy'],
55 | # 'actors': doc['actors'],
56 | 'comment': doc['comment'],
57 | 'year': doc['year'],
58 | })
59 | movies.sort(key=lambda x: x['rank'], reverse=False)
60 |
61 | return render_template("movie.html",movies = movies)
62 |
63 |
64 | @app.route('/word')
65 | def word():
66 | return render_template("word.html")
67 |
68 | @app.route('/score')
69 | def score():
70 | # sql = "select score,count(score) from movie250 group by score"
71 | pipeline = [
72 | {
73 | u"$group": {
74 | u"_id": {
75 | u"score": u"$score"
76 | },
77 | u"COUNT(score)": {
78 | u"$sum": 1
79 | }
80 | }
81 | },
82 | {
83 | u"$project": {
84 | u"score": u"$_id.score",
85 | u"COUNT(score)": u"$COUNT(score)",
86 | u"_id": 0
87 | }
88 | }
89 | ]
90 | cursor = myCollection().aggregate(pipeline, allowDiskUse=True)
91 | score = [] # 评分
92 | num = [] # 每个评分统计出的电影数量
93 | score_num = {}
94 | for doc in cursor:
95 | score.append(doc['score'])
96 | score_num[doc['score']] = doc['COUNT(score)']
97 | score.sort()
98 | for count in range(len(score_num)):
99 | num.append(score_num[score[count]])
100 | count += 1
101 |
102 | return render_template("score.html",score=score,num=num)
103 |
104 | if __name__ == '__main__':
105 | app.run(debug=True)
--------------------------------------------------------------------------------
/FlaskProject/static/img/generated_leaf.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/generated_leaf.jpg
--------------------------------------------------------------------------------
/FlaskProject/static/img/generated_tree.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/generated_tree.jpg
--------------------------------------------------------------------------------
/FlaskProject/static/img/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/icon.png
--------------------------------------------------------------------------------
/FlaskProject/static/img/leaf.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/leaf.jpg
--------------------------------------------------------------------------------
/FlaskProject/static/img/tree.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/tree.jpg
--------------------------------------------------------------------------------
/FlaskProject/static/js/main.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Template Name: Mamba - v2.0.1
3 | * Template URL: https://bootstrapmade.com/mamba-one-page-bootstrap-template-free/
4 | * Author: BootstrapMade.com
5 | * License: https://bootstrapmade.com/license/
6 | */
7 | !(function($) {
8 | "use strict";
9 |
10 | // Toggle .header-scrolled class to #header when page is scrolled
11 | $(window).scroll(function() {
12 | if ($(this).scrollTop() > 100) {
13 | $('#header').addClass('header-scrolled');
14 | } else {
15 | $('#header').removeClass('header-scrolled');
16 | }
17 | });
18 |
19 | if ($(window).scrollTop() > 100) {
20 | $('#header').addClass('header-scrolled');
21 | }
22 |
23 | // Stick the header at top on scroll
24 | $("#header").sticky({
25 | topSpacing: 0,
26 | zIndex: '50'
27 | });
28 |
29 | // Smooth scroll for the navigation menu and links with .scrollto classes
30 | $(document).on('click', '.nav-menu a, .mobile-nav a, .scrollto', function(e) {
31 | if (location.pathname.replace(/^\//, '') == this.pathname.replace(/^\//, '') && location.hostname == this.hostname) {
32 | e.preventDefault();
33 | var target = $(this.hash);
34 | if (target.length) {
35 |
36 | var scrollto = target.offset().top;
37 | var scrolled = 2;
38 |
39 | if ($('#header-sticky-wrapper').length) {
40 | scrollto -= $('#header-sticky-wrapper').outerHeight() - scrolled;
41 | }
42 |
43 | if ($(this).attr("href") == '#header') {
44 | scrollto = 0;
45 | }
46 |
47 | $('html, body').animate({
48 | scrollTop: scrollto
49 | }, 1500, 'easeInOutExpo');
50 |
51 | if ($(this).parents('.nav-menu, .mobile-nav').length) {
52 | $('.nav-menu .active, .mobile-nav .active').removeClass('active');
53 | $(this).closest('li').addClass('active');
54 | }
55 |
56 | if ($('body').hasClass('mobile-nav-active')) {
57 | $('body').removeClass('mobile-nav-active');
58 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close');
59 | $('.mobile-nav-overly').fadeOut();
60 | }
61 | return false;
62 | }
63 | }
64 | });
65 |
66 | // Mobile Navigation
67 | if ($('.nav-menu').length) {
68 | var $mobile_nav = $('.nav-menu').clone().prop({
69 | class: 'mobile-nav d-lg-none'
70 | });
71 | $('body').append($mobile_nav);
72 | $('body').prepend('');
73 | $('body').append('');
74 |
75 | $(document).on('click', '.mobile-nav-toggle', function(e) {
76 | $('body').toggleClass('mobile-nav-active');
77 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close');
78 | $('.mobile-nav-overly').toggle();
79 | });
80 |
81 | $(document).on('click', '.mobile-nav .drop-down > a', function(e) {
82 | e.preventDefault();
83 | $(this).next().slideToggle(300);
84 | $(this).parent().toggleClass('active');
85 | });
86 |
87 | $(document).click(function(e) {
88 | var container = $(".mobile-nav, .mobile-nav-toggle");
89 | if (!container.is(e.target) && container.has(e.target).length === 0) {
90 | if ($('body').hasClass('mobile-nav-active')) {
91 | $('body').removeClass('mobile-nav-active');
92 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close');
93 | $('.mobile-nav-overly').fadeOut();
94 | }
95 | }
96 | });
97 | } else if ($(".mobile-nav, .mobile-nav-toggle").length) {
98 | $(".mobile-nav, .mobile-nav-toggle").hide();
99 | }
100 |
101 | // Intro carousel
102 | var heroCarousel = $("#heroCarousel");
103 | var heroCarouselIndicators = $("#hero-carousel-indicators");
104 | heroCarousel.find(".carousel-inner").children(".carousel-item").each(function(index) {
105 | (index === 0) ?
106 | heroCarouselIndicators.append(""):
107 | heroCarouselIndicators.append("");
108 | });
109 |
110 | heroCarousel.on('slid.bs.carousel', function(e) {
111 | $(this).find('h2').addClass('animated fadeInDown');
112 | $(this).find('p').addClass('animated fadeInUp');
113 | $(this).find('.btn-get-started').addClass('animated fadeInUp');
114 | });
115 |
116 | // Back to top button
117 | $(window).scroll(function() {
118 | if ($(this).scrollTop() > 100) {
119 | $('.back-to-top').fadeIn('slow');
120 | } else {
121 | $('.back-to-top').fadeOut('slow');
122 | }
123 | });
124 |
125 | $('.back-to-top').click(function() {
126 | $('html, body').animate({
127 | scrollTop: 0
128 | }, 1500, 'easeInOutExpo');
129 | return false;
130 | });
131 |
132 | // Initiate the venobox plugin
133 | $(window).on('load', function() {
134 | $('.venobox').venobox();
135 | });
136 |
137 | // jQuery counterUp
138 | $('[data-toggle="counter-up"]').counterUp({
139 | delay: 10,
140 | time: 1000
141 | });
142 |
143 | // Porfolio isotope and filter
144 | $(window).on('load', function() {
145 | var portfolioIsotope = $('.portfolio-container').isotope({
146 | itemSelector: '.portfolio-item',
147 | layoutMode: 'fitRows'
148 | });
149 |
150 | $('#portfolio-flters li').on('click', function() {
151 | $("#portfolio-flters li").removeClass('filter-active');
152 | $(this).addClass('filter-active');
153 |
154 | portfolioIsotope.isotope({
155 | filter: $(this).data('filter')
156 | });
157 | });
158 |
159 | // Initiate venobox (lightbox feature used in portofilo)
160 | $(document).ready(function() {
161 | $('.venobox').venobox();
162 | });
163 | });
164 |
165 | // Initi AOS
166 | AOS.init({
167 | duration: 1000,
168 | easing: "ease-in-out-back"
169 | });
170 |
171 | })(jQuery);
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/bootstrap/css/bootstrap-reboot.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap Reboot v4.4.1 (https://getbootstrap.com/)
3 | * Copyright 2011-2019 The Bootstrap Authors
4 | * Copyright 2011-2019 Twitter, Inc.
5 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
6 | * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md)
7 | */
8 | *,
9 | *::before,
10 | *::after {
11 | box-sizing: border-box;
12 | }
13 |
14 | html {
15 | font-family: sans-serif;
16 | line-height: 1.15;
17 | -webkit-text-size-adjust: 100%;
18 | -webkit-tap-highlight-color: rgba(0, 0, 0, 0);
19 | }
20 |
21 | article, aside, figcaption, figure, footer, header, hgroup, main, nav, section {
22 | display: block;
23 | }
24 |
25 | body {
26 | margin: 0;
27 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji";
28 | font-size: 1rem;
29 | font-weight: 400;
30 | line-height: 1.5;
31 | color: #212529;
32 | text-align: left;
33 | background-color: #fff;
34 | }
35 |
36 | [tabindex="-1"]:focus:not(:focus-visible) {
37 | outline: 0 !important;
38 | }
39 |
40 | hr {
41 | box-sizing: content-box;
42 | height: 0;
43 | overflow: visible;
44 | }
45 |
46 | h1, h2, h3, h4, h5, h6 {
47 | margin-top: 0;
48 | margin-bottom: 0.5rem;
49 | }
50 |
51 | p {
52 | margin-top: 0;
53 | margin-bottom: 1rem;
54 | }
55 |
56 | abbr[title],
57 | abbr[data-original-title] {
58 | text-decoration: underline;
59 | -webkit-text-decoration: underline dotted;
60 | text-decoration: underline dotted;
61 | cursor: help;
62 | border-bottom: 0;
63 | -webkit-text-decoration-skip-ink: none;
64 | text-decoration-skip-ink: none;
65 | }
66 |
67 | address {
68 | margin-bottom: 1rem;
69 | font-style: normal;
70 | line-height: inherit;
71 | }
72 |
73 | ol,
74 | ul,
75 | dl {
76 | margin-top: 0;
77 | margin-bottom: 1rem;
78 | }
79 |
80 | ol ol,
81 | ul ul,
82 | ol ul,
83 | ul ol {
84 | margin-bottom: 0;
85 | }
86 |
87 | dt {
88 | font-weight: 700;
89 | }
90 |
91 | dd {
92 | margin-bottom: .5rem;
93 | margin-left: 0;
94 | }
95 |
96 | blockquote {
97 | margin: 0 0 1rem;
98 | }
99 |
100 | b,
101 | strong {
102 | font-weight: bolder;
103 | }
104 |
105 | small {
106 | font-size: 80%;
107 | }
108 |
109 | sub,
110 | sup {
111 | position: relative;
112 | font-size: 75%;
113 | line-height: 0;
114 | vertical-align: baseline;
115 | }
116 |
117 | sub {
118 | bottom: -.25em;
119 | }
120 |
121 | sup {
122 | top: -.5em;
123 | }
124 |
125 | a {
126 | color: #007bff;
127 | text-decoration: none;
128 | background-color: transparent;
129 | }
130 |
131 | a:hover {
132 | color: #0056b3;
133 | text-decoration: underline;
134 | }
135 |
136 | a:not([href]) {
137 | color: inherit;
138 | text-decoration: none;
139 | }
140 |
141 | a:not([href]):hover {
142 | color: inherit;
143 | text-decoration: none;
144 | }
145 |
146 | pre,
147 | code,
148 | kbd,
149 | samp {
150 | font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
151 | font-size: 1em;
152 | }
153 |
154 | pre {
155 | margin-top: 0;
156 | margin-bottom: 1rem;
157 | overflow: auto;
158 | }
159 |
160 | figure {
161 | margin: 0 0 1rem;
162 | }
163 |
164 | img {
165 | vertical-align: middle;
166 | border-style: none;
167 | }
168 |
169 | svg {
170 | overflow: hidden;
171 | vertical-align: middle;
172 | }
173 |
174 | table {
175 | border-collapse: collapse;
176 | }
177 |
178 | caption {
179 | padding-top: 0.75rem;
180 | padding-bottom: 0.75rem;
181 | color: #6c757d;
182 | text-align: left;
183 | caption-side: bottom;
184 | }
185 |
186 | th {
187 | text-align: inherit;
188 | }
189 |
190 | label {
191 | display: inline-block;
192 | margin-bottom: 0.5rem;
193 | }
194 |
195 | button {
196 | border-radius: 0;
197 | }
198 |
199 | button:focus {
200 | outline: 1px dotted;
201 | outline: 5px auto -webkit-focus-ring-color;
202 | }
203 |
204 | input,
205 | button,
206 | select,
207 | optgroup,
208 | textarea {
209 | margin: 0;
210 | font-family: inherit;
211 | font-size: inherit;
212 | line-height: inherit;
213 | }
214 |
215 | button,
216 | input {
217 | overflow: visible;
218 | }
219 |
220 | button,
221 | select {
222 | text-transform: none;
223 | }
224 |
225 | select {
226 | word-wrap: normal;
227 | }
228 |
229 | button,
230 | [type="button"],
231 | [type="reset"],
232 | [type="submit"] {
233 | -webkit-appearance: button;
234 | }
235 |
236 | button:not(:disabled),
237 | [type="button"]:not(:disabled),
238 | [type="reset"]:not(:disabled),
239 | [type="submit"]:not(:disabled) {
240 | cursor: pointer;
241 | }
242 |
243 | button::-moz-focus-inner,
244 | [type="button"]::-moz-focus-inner,
245 | [type="reset"]::-moz-focus-inner,
246 | [type="submit"]::-moz-focus-inner {
247 | padding: 0;
248 | border-style: none;
249 | }
250 |
251 | input[type="radio"],
252 | input[type="checkbox"] {
253 | box-sizing: border-box;
254 | padding: 0;
255 | }
256 |
257 | input[type="date"],
258 | input[type="time"],
259 | input[type="datetime-local"],
260 | input[type="month"] {
261 | -webkit-appearance: listbox;
262 | }
263 |
264 | textarea {
265 | overflow: auto;
266 | resize: vertical;
267 | }
268 |
269 | fieldset {
270 | min-width: 0;
271 | padding: 0;
272 | margin: 0;
273 | border: 0;
274 | }
275 |
276 | legend {
277 | display: block;
278 | width: 100%;
279 | max-width: 100%;
280 | padding: 0;
281 | margin-bottom: .5rem;
282 | font-size: 1.5rem;
283 | line-height: inherit;
284 | color: inherit;
285 | white-space: normal;
286 | }
287 |
288 | progress {
289 | vertical-align: baseline;
290 | }
291 |
292 | [type="number"]::-webkit-inner-spin-button,
293 | [type="number"]::-webkit-outer-spin-button {
294 | height: auto;
295 | }
296 |
297 | [type="search"] {
298 | outline-offset: -2px;
299 | -webkit-appearance: none;
300 | }
301 |
302 | [type="search"]::-webkit-search-decoration {
303 | -webkit-appearance: none;
304 | }
305 |
306 | ::-webkit-file-upload-button {
307 | font: inherit;
308 | -webkit-appearance: button;
309 | }
310 |
311 | output {
312 | display: inline-block;
313 | }
314 |
315 | summary {
316 | display: list-item;
317 | cursor: pointer;
318 | }
319 |
320 | template {
321 | display: none;
322 | }
323 |
324 | [hidden] {
325 | display: none !important;
326 | }
327 | /*# sourceMappingURL=bootstrap-reboot.css.map */
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/bootstrap/css/bootstrap-reboot.min.css:
--------------------------------------------------------------------------------
1 | /*!
2 | * Bootstrap Reboot v4.4.1 (https://getbootstrap.com/)
3 | * Copyright 2011-2019 The Bootstrap Authors
4 | * Copyright 2011-2019 Twitter, Inc.
5 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE)
6 | * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md)
7 | */*,::after,::before{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus:not(:focus-visible){outline:0!important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]){color:inherit;text-decoration:none}a:not([href]):hover{color:inherit;text-decoration:none}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto -webkit-focus-ring-color}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{padding:0;border-style:none}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=date],input[type=datetime-local],input[type=month],input[type=time]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none!important}
8 | /*# sourceMappingURL=bootstrap-reboot.min.css.map */
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/boxicons/css/animations.css:
--------------------------------------------------------------------------------
1 | @-webkit-keyframes spin
2 | {
3 | 0%
4 | {
5 | -webkit-transform: rotate(0);
6 | transform: rotate(0);
7 | }
8 | 100%
9 | {
10 | -webkit-transform: rotate(359deg);
11 | transform: rotate(359deg);
12 | }
13 | }
14 | @keyframes spin
15 | {
16 | 0%
17 | {
18 | -webkit-transform: rotate(0);
19 | transform: rotate(0);
20 | }
21 | 100%
22 | {
23 | -webkit-transform: rotate(359deg);
24 | transform: rotate(359deg);
25 | }
26 | }
27 | @-webkit-keyframes burst
28 | {
29 | 0%
30 | {
31 | -webkit-transform: scale(1);
32 | transform: scale(1);
33 |
34 | opacity: 1;
35 | }
36 | 90%
37 | {
38 | -webkit-transform: scale(1.5);
39 | transform: scale(1.5);
40 |
41 | opacity: 0;
42 | }
43 | }
44 | @keyframes burst
45 | {
46 | 0%
47 | {
48 | -webkit-transform: scale(1);
49 | transform: scale(1);
50 |
51 | opacity: 1;
52 | }
53 | 90%
54 | {
55 | -webkit-transform: scale(1.5);
56 | transform: scale(1.5);
57 |
58 | opacity: 0;
59 | }
60 | }
61 | @-webkit-keyframes flashing
62 | {
63 | 0%
64 | {
65 | opacity: 1;
66 | }
67 | 45%
68 | {
69 | opacity: 0;
70 | }
71 | 90%
72 | {
73 | opacity: 1;
74 | }
75 | }
76 | @keyframes flashing
77 | {
78 | 0%
79 | {
80 | opacity: 1;
81 | }
82 | 45%
83 | {
84 | opacity: 0;
85 | }
86 | 90%
87 | {
88 | opacity: 1;
89 | }
90 | }
91 | @-webkit-keyframes fade-left
92 | {
93 | 0%
94 | {
95 | -webkit-transform: translateX(0);
96 | transform: translateX(0);
97 |
98 | opacity: 1;
99 | }
100 | 75%
101 | {
102 | -webkit-transform: translateX(-20px);
103 | transform: translateX(-20px);
104 |
105 | opacity: 0;
106 | }
107 | }
108 | @keyframes fade-left
109 | {
110 | 0%
111 | {
112 | -webkit-transform: translateX(0);
113 | transform: translateX(0);
114 |
115 | opacity: 1;
116 | }
117 | 75%
118 | {
119 | -webkit-transform: translateX(-20px);
120 | transform: translateX(-20px);
121 |
122 | opacity: 0;
123 | }
124 | }
125 | @-webkit-keyframes fade-right
126 | {
127 | 0%
128 | {
129 | -webkit-transform: translateX(0);
130 | transform: translateX(0);
131 |
132 | opacity: 1;
133 | }
134 | 75%
135 | {
136 | -webkit-transform: translateX(20px);
137 | transform: translateX(20px);
138 |
139 | opacity: 0;
140 | }
141 | }
142 | @keyframes fade-right
143 | {
144 | 0%
145 | {
146 | -webkit-transform: translateX(0);
147 | transform: translateX(0);
148 |
149 | opacity: 1;
150 | }
151 | 75%
152 | {
153 | -webkit-transform: translateX(20px);
154 | transform: translateX(20px);
155 |
156 | opacity: 0;
157 | }
158 | }
159 | @-webkit-keyframes fade-up
160 | {
161 | 0%
162 | {
163 | -webkit-transform: translateY(0);
164 | transform: translateY(0);
165 |
166 | opacity: 1;
167 | }
168 | 75%
169 | {
170 | -webkit-transform: translateY(-20px);
171 | transform: translateY(-20px);
172 |
173 | opacity: 0;
174 | }
175 | }
176 | @keyframes fade-up
177 | {
178 | 0%
179 | {
180 | -webkit-transform: translateY(0);
181 | transform: translateY(0);
182 |
183 | opacity: 1;
184 | }
185 | 75%
186 | {
187 | -webkit-transform: translateY(-20px);
188 | transform: translateY(-20px);
189 |
190 | opacity: 0;
191 | }
192 | }
193 | @-webkit-keyframes fade-down
194 | {
195 | 0%
196 | {
197 | -webkit-transform: translateY(0);
198 | transform: translateY(0);
199 |
200 | opacity: 1;
201 | }
202 | 75%
203 | {
204 | -webkit-transform: translateY(20px);
205 | transform: translateY(20px);
206 |
207 | opacity: 0;
208 | }
209 | }
210 | @keyframes fade-down
211 | {
212 | 0%
213 | {
214 | -webkit-transform: translateY(0);
215 | transform: translateY(0);
216 |
217 | opacity: 1;
218 | }
219 | 75%
220 | {
221 | -webkit-transform: translateY(20px);
222 | transform: translateY(20px);
223 |
224 | opacity: 0;
225 | }
226 | }
227 | @-webkit-keyframes tada
228 | {
229 | from
230 | {
231 | -webkit-transform: scale3d(1, 1, 1);
232 | transform: scale3d(1, 1, 1);
233 | }
234 |
235 | 10%,
236 | 20%
237 | {
238 | -webkit-transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg);
239 | transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg);
240 | }
241 |
242 | 30%,
243 | 50%,
244 | 70%,
245 | 90%
246 | {
247 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg);
248 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg);
249 | }
250 |
251 | 40%,
252 | 60%,
253 | 80%
254 | {
255 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, -10deg);
256 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, -10deg);
257 | }
258 |
259 | to
260 | {
261 | -webkit-transform: scale3d(1, 1, 1);
262 | transform: scale3d(1, 1, 1);
263 | }
264 | }
265 |
266 | @keyframes tada
267 | {
268 | from
269 | {
270 | -webkit-transform: scale3d(1, 1, 1);
271 | transform: scale3d(1, 1, 1);
272 | }
273 |
274 | 10%,
275 | 20%
276 | {
277 | -webkit-transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg);
278 | transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg);
279 | }
280 |
281 | 30%,
282 | 50%,
283 | 70%,
284 | 90%
285 | {
286 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg);
287 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg);
288 | }
289 |
290 | 40%,
291 | 60%,
292 | 80%
293 | {
294 | -webkit-transform: rotate3d(0, 0, 1, -10deg);
295 | transform: rotate3d(0, 0, 1, -10deg);
296 | }
297 |
298 | to
299 | {
300 | -webkit-transform: scale3d(1, 1, 1);
301 | transform: scale3d(1, 1, 1);
302 | }
303 | }
304 | .bx-spin
305 | {
306 | -webkit-animation: spin 2s linear infinite;
307 | animation: spin 2s linear infinite;
308 | }
309 | .bx-spin-hover:hover
310 | {
311 | -webkit-animation: spin 2s linear infinite;
312 | animation: spin 2s linear infinite;
313 | }
314 |
315 | .bx-tada
316 | {
317 | -webkit-animation: tada 1.5s ease infinite;
318 | animation: tada 1.5s ease infinite;
319 | }
320 | .bx-tada-hover:hover
321 | {
322 | -webkit-animation: tada 1.5s ease infinite;
323 | animation: tada 1.5s ease infinite;
324 | }
325 |
326 | .bx-flashing
327 | {
328 | -webkit-animation: flashing 1.5s infinite linear;
329 | animation: flashing 1.5s infinite linear;
330 | }
331 | .bx-flashing-hover:hover
332 | {
333 | -webkit-animation: flashing 1.5s infinite linear;
334 | animation: flashing 1.5s infinite linear;
335 | }
336 |
337 | .bx-burst
338 | {
339 | -webkit-animation: burst 1.5s infinite linear;
340 | animation: burst 1.5s infinite linear;
341 | }
342 | .bx-burst-hover:hover
343 | {
344 | -webkit-animation: burst 1.5s infinite linear;
345 | animation: burst 1.5s infinite linear;
346 | }
347 | .bx-fade-up
348 | {
349 | -webkit-animation: fade-up 1.5s infinite linear;
350 | animation: fade-up 1.5s infinite linear;
351 | }
352 | .bx-fade-up-hover:hover
353 | {
354 | -webkit-animation: fade-up 1.5s infinite linear;
355 | animation: fade-up 1.5s infinite linear;
356 | }
357 | .bx-fade-down
358 | {
359 | -webkit-animation: fade-down 1.5s infinite linear;
360 | animation: fade-down 1.5s infinite linear;
361 | }
362 | .bx-fade-down-hover:hover
363 | {
364 | -webkit-animation: fade-down 1.5s infinite linear;
365 | animation: fade-down 1.5s infinite linear;
366 | }
367 | .bx-fade-left
368 | {
369 | -webkit-animation: fade-left 1.5s infinite linear;
370 | animation: fade-left 1.5s infinite linear;
371 | }
372 | .bx-fade-left-hover:hover
373 | {
374 | -webkit-animation: fade-left 1.5s infinite linear;
375 | animation: fade-left 1.5s infinite linear;
376 | }
377 | .bx-fade-right
378 | {
379 | -webkit-animation: fade-right 1.5s infinite linear;
380 | animation: fade-right 1.5s infinite linear;
381 | }
382 | .bx-fade-right-hover:hover
383 | {
384 | -webkit-animation: fade-right 1.5s infinite linear;
385 | animation: fade-right 1.5s infinite linear;
386 | }
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/boxicons/css/transformations.css:
--------------------------------------------------------------------------------
1 | .bx-rotate-90
2 | {
3 | transform: rotate(90deg);
4 |
5 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=1)';
6 | }
7 | .bx-rotate-180
8 | {
9 | transform: rotate(180deg);
10 |
11 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=2)';
12 | }
13 | .bx-rotate-270
14 | {
15 | transform: rotate(270deg);
16 |
17 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=3)';
18 | }
19 | .bx-flip-horizontal
20 | {
21 | transform: scaleX(-1);
22 |
23 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)';
24 | }
25 | .bx-flip-vertical
26 | {
27 | transform: scaleY(-1);
28 |
29 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)';
30 | }
31 |
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/boxicons/fonts/boxicons.eot:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.eot
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/boxicons/fonts/boxicons.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.ttf
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff2
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/counterup/counterup.min.js:
--------------------------------------------------------------------------------
1 | /*!
2 | * jquery.counterup.js 2.1.0
3 | *
4 | * Copyright 2013, Benjamin Intal http://gambit.ph @bfintal
5 | * Released under the GPL v2 License
6 | *
7 | * Amended by Jeremy Paris, Ciro Mattia Gonano and others
8 | *
9 | * Date: Feb 24, 2017
10 | */
11 | (function($){"use strict";$.fn.counterUp=function(options){var settings=$.extend({time:400,delay:10,offset:100,beginAt:0,formatter:false,context:"window",callback:function(){}},options),s;return this.each(function(){var $this=$(this),counter={time:$(this).data("counterup-time")||settings.time,delay:$(this).data("counterup-delay")||settings.delay,offset:$(this).data("counterup-offset")||settings.offset,beginAt:$(this).data("counterup-beginat")||settings.beginAt,context:$(this).data("counterup-context")||settings.context};var counterUpper=function(){var nums=[];var divisions=counter.time/counter.delay;var num=$(this).attr("data-num")?$(this).attr("data-num"):$this.text();var isComma=/[0-9]+,[0-9]+/.test(num);num=num.replace(/,/g,"");var decimalPlaces=(num.split(".")[1]||[]).length;if(counter.beginAt>num)counter.beginAt=num;var isTime=/[0-9]+:[0-9]+:[0-9]+/.test(num);if(isTime){var times=num.split(":"),m=1;s=0;while(times.length>0){s+=m*parseInt(times.pop(),10);m*=60}}for(var i=divisions;i>=counter.beginAt/num*divisions;i--){var newNum=parseFloat(num/divisions*i).toFixed(decimalPlaces);if(isTime){newNum=parseInt(s/divisions*i);var hours=parseInt(newNum/3600)%24;var minutes=parseInt(newNum/60)%60;var seconds=parseInt(newNum%60,10);newNum=(hours<10?"0"+hours:hours)+":"+(minutes<10?"0"+minutes:minutes)+":"+(seconds<10?"0"+seconds:seconds)}if(isComma){while(/(\d+)(\d{3})/.test(newNum.toString())){newNum=newNum.toString().replace(/(\d+)(\d{3})/,"$1"+","+"$2")}}if(settings.formatter){newNum=settings.formatter.call(this,newNum)}nums.unshift(newNum)}$this.data("counterup-nums",nums);$this.text(counter.beginAt);var f=function(){if(!$this.data("counterup-nums")){settings.callback.call(this);return}$this.html($this.data("counterup-nums").shift());if($this.data("counterup-nums").length){setTimeout($this.data("counterup-func"),counter.delay)}else{$this.data("counterup-nums",null);$this.data("counterup-func",null);settings.callback.call(this)}};$this.data("counterup-func",f);setTimeout($this.data("counterup-func"),counter.delay)};$this.waypoint(function(direction){counterUpper();this.destroy()},{offset:counter.offset+"%",context:counter.context})})}})(jQuery);
12 |
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/icofont/fonts/icofont.woff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/icofont/fonts/icofont.woff
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/icofont/fonts/icofont.woff2:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/icofont/fonts/icofont.woff2
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/jquery.easing/jquery.easing.min.js:
--------------------------------------------------------------------------------
1 | (function(factory){if(typeof define==="function"&&define.amd){define(["jquery"],function($){return factory($)})}else if(typeof module==="object"&&typeof module.exports==="object"){exports=factory(require("jquery"))}else{factory(jQuery)}})(function($){$.easing.jswing=$.easing.swing;var pow=Math.pow,sqrt=Math.sqrt,sin=Math.sin,cos=Math.cos,PI=Math.PI,c1=1.70158,c2=c1*1.525,c3=c1+1,c4=2*PI/3,c5=2*PI/4.5;function bounceOut(x){var n1=7.5625,d1=2.75;if(x<1/d1){return n1*x*x}else if(x<2/d1){return n1*(x-=1.5/d1)*x+.75}else if(x<2.5/d1){return n1*(x-=2.25/d1)*x+.9375}else{return n1*(x-=2.625/d1)*x+.984375}}$.extend($.easing,{def:"easeOutQuad",swing:function(x){return $.easing[$.easing.def](x)},easeInQuad:function(x){return x*x},easeOutQuad:function(x){return 1-(1-x)*(1-x)},easeInOutQuad:function(x){return x<.5?2*x*x:1-pow(-2*x+2,2)/2},easeInCubic:function(x){return x*x*x},easeOutCubic:function(x){return 1-pow(1-x,3)},easeInOutCubic:function(x){return x<.5?4*x*x*x:1-pow(-2*x+2,3)/2},easeInQuart:function(x){return x*x*x*x},easeOutQuart:function(x){return 1-pow(1-x,4)},easeInOutQuart:function(x){return x<.5?8*x*x*x*x:1-pow(-2*x+2,4)/2},easeInQuint:function(x){return x*x*x*x*x},easeOutQuint:function(x){return 1-pow(1-x,5)},easeInOutQuint:function(x){return x<.5?16*x*x*x*x*x:1-pow(-2*x+2,5)/2},easeInSine:function(x){return 1-cos(x*PI/2)},easeOutSine:function(x){return sin(x*PI/2)},easeInOutSine:function(x){return-(cos(PI*x)-1)/2},easeInExpo:function(x){return x===0?0:pow(2,10*x-10)},easeOutExpo:function(x){return x===1?1:1-pow(2,-10*x)},easeInOutExpo:function(x){return x===0?0:x===1?1:x<.5?pow(2,20*x-10)/2:(2-pow(2,-20*x+10))/2},easeInCirc:function(x){return 1-sqrt(1-pow(x,2))},easeOutCirc:function(x){return sqrt(1-pow(x-1,2))},easeInOutCirc:function(x){return x<.5?(1-sqrt(1-pow(2*x,2)))/2:(sqrt(1-pow(-2*x+2,2))+1)/2},easeInElastic:function(x){return x===0?0:x===1?1:-pow(2,10*x-10)*sin((x*10-10.75)*c4)},easeOutElastic:function(x){return x===0?0:x===1?1:pow(2,-10*x)*sin((x*10-.75)*c4)+1},easeInOutElastic:function(x){return x===0?0:x===1?1:x<.5?-(pow(2,20*x-10)*sin((20*x-11.125)*c5))/2:pow(2,-20*x+10)*sin((20*x-11.125)*c5)/2+1},easeInBack:function(x){return c3*x*x*x-c1*x*x},easeOutBack:function(x){return 1+c3*pow(x-1,3)+c1*pow(x-1,2)},easeInOutBack:function(x){return x<.5?pow(2*x,2)*((c2+1)*2*x-c2)/2:(pow(2*x-2,2)*((c2+1)*(x*2-2)+c2)+2)/2},easeInBounce:function(x){return 1-bounceOut(1-x)},easeOutBounce:bounceOut,easeInOutBounce:function(x){return x<.5?(1-bounceOut(1-2*x))/2:(1+bounceOut(2*x-1))/2}})});
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/php-email-form/validate.js:
--------------------------------------------------------------------------------
1 | jQuery(document).ready(function($) {
2 | "use strict";
3 |
4 | //Contact
5 | $('form.php-email-form').submit(function() {
6 |
7 | var f = $(this).find('.form-group'),
8 | ferror = false,
9 | emailExp = /^[^\s()<>@,;:\/]+@\w[\w\.-]+\.[a-z]{2,}$/i;
10 |
11 | f.children('input').each(function() { // run all inputs
12 |
13 | var i = $(this); // current input
14 | var rule = i.attr('data-rule');
15 |
16 | if (rule !== undefined) {
17 | var ierror = false; // error flag for current input
18 | var pos = rule.indexOf(':', 0);
19 | if (pos >= 0) {
20 | var exp = rule.substr(pos + 1, rule.length);
21 | rule = rule.substr(0, pos);
22 | } else {
23 | rule = rule.substr(pos + 1, rule.length);
24 | }
25 |
26 | switch (rule) {
27 | case 'required':
28 | if (i.val() === '') {
29 | ferror = ierror = true;
30 | }
31 | break;
32 |
33 | case 'minlen':
34 | if (i.val().length < parseInt(exp)) {
35 | ferror = ierror = true;
36 | }
37 | break;
38 |
39 | case 'email':
40 | if (!emailExp.test(i.val())) {
41 | ferror = ierror = true;
42 | }
43 | break;
44 |
45 | case 'checked':
46 | if (! i.is(':checked')) {
47 | ferror = ierror = true;
48 | }
49 | break;
50 |
51 | case 'regexp':
52 | exp = new RegExp(exp);
53 | if (!exp.test(i.val())) {
54 | ferror = ierror = true;
55 | }
56 | break;
57 | }
58 | i.next('.validate').html((ierror ? (i.attr('data-msg') !== undefined ? i.attr('data-msg') : 'wrong Input') : '')).show('blind');
59 | }
60 | });
61 | f.children('textarea').each(function() { // run all inputs
62 |
63 | var i = $(this); // current input
64 | var rule = i.attr('data-rule');
65 |
66 | if (rule !== undefined) {
67 | var ierror = false; // error flag for current input
68 | var pos = rule.indexOf(':', 0);
69 | if (pos >= 0) {
70 | var exp = rule.substr(pos + 1, rule.length);
71 | rule = rule.substr(0, pos);
72 | } else {
73 | rule = rule.substr(pos + 1, rule.length);
74 | }
75 |
76 | switch (rule) {
77 | case 'required':
78 | if (i.val() === '') {
79 | ferror = ierror = true;
80 | }
81 | break;
82 |
83 | case 'minlen':
84 | if (i.val().length < parseInt(exp)) {
85 | ferror = ierror = true;
86 | }
87 | break;
88 | }
89 | i.next('.validate').html((ierror ? (i.attr('data-msg') != undefined ? i.attr('data-msg') : 'wrong Input') : '')).show('blind');
90 | }
91 | });
92 | if (ferror) return false;
93 | else var str = $(this).serialize();
94 |
95 | var this_form = $(this);
96 | var action = $(this).attr('action');
97 |
98 | if( ! action ) {
99 | this_form.find('.loading').slideUp();
100 | this_form.find('.error-message').slideDown().html('The form action property is not set!');
101 | return false;
102 | }
103 |
104 | this_form.find('.sent-message').slideUp();
105 | this_form.find('.error-message').slideUp();
106 | this_form.find('.loading').slideDown();
107 |
108 | $.ajax({
109 | type: "POST",
110 | url: action,
111 | data: str,
112 | success: function(msg) {
113 | if (msg == 'OK') {
114 | this_form.find('.loading').slideUp();
115 | this_form.find('.sent-message').slideDown();
116 | this_form.find("input:not(input[type=submit]), textarea").val('');
117 | } else {
118 | this_form.find('.loading').slideUp();
119 | this_form.find('.error-message').slideDown().html(msg);
120 | }
121 | }
122 | });
123 | return false;
124 | });
125 |
126 | });
127 |
--------------------------------------------------------------------------------
/FlaskProject/static/vendor/waypoints/jquery.waypoints.min.js:
--------------------------------------------------------------------------------
1 | /*!
2 | Waypoints - 4.0.1
3 | Copyright © 2011-2016 Caleb Troughton
4 | Licensed under the MIT license.
5 | https://github.com/imakewebthings/waypoints/blob/master/licenses.txt
6 | */
7 | !function(){"use strict";function t(o){if(!o)throw new Error("No options passed to Waypoint constructor");if(!o.element)throw new Error("No element option passed to Waypoint constructor");if(!o.handler)throw new Error("No handler option passed to Waypoint constructor");this.key="waypoint-"+e,this.options=t.Adapter.extend({},t.defaults,o),this.element=this.options.element,this.adapter=new t.Adapter(this.element),this.callback=o.handler,this.axis=this.options.horizontal?"horizontal":"vertical",this.enabled=this.options.enabled,this.triggerPoint=null,this.group=t.Group.findOrCreate({name:this.options.group,axis:this.axis}),this.context=t.Context.findOrCreateByElement(this.options.context),t.offsetAliases[this.options.offset]&&(this.options.offset=t.offsetAliases[this.options.offset]),this.group.add(this),this.context.add(this),i[this.key]=this,e+=1}var e=0,i={};t.prototype.queueTrigger=function(t){this.group.queueTrigger(this,t)},t.prototype.trigger=function(t){this.enabled&&this.callback&&this.callback.apply(this,t)},t.prototype.destroy=function(){this.context.remove(this),this.group.remove(this),delete i[this.key]},t.prototype.disable=function(){return this.enabled=!1,this},t.prototype.enable=function(){return this.context.refresh(),this.enabled=!0,this},t.prototype.next=function(){return this.group.next(this)},t.prototype.previous=function(){return this.group.previous(this)},t.invokeAll=function(t){var e=[];for(var o in i)e.push(i[o]);for(var n=0,r=e.length;r>n;n++)e[n][t]()},t.destroyAll=function(){t.invokeAll("destroy")},t.disableAll=function(){t.invokeAll("disable")},t.enableAll=function(){t.Context.refreshAll();for(var e in i)i[e].enabled=!0;return this},t.refreshAll=function(){t.Context.refreshAll()},t.viewportHeight=function(){return window.innerHeight||document.documentElement.clientHeight},t.viewportWidth=function(){return document.documentElement.clientWidth},t.adapters=[],t.defaults={context:window,continuous:!0,enabled:!0,group:"default",horizontal:!1,offset:0},t.offsetAliases={"bottom-in-view":function(){return this.context.innerHeight()-this.adapter.outerHeight()},"right-in-view":function(){return this.context.innerWidth()-this.adapter.outerWidth()}},window.Waypoint=t}(),function(){"use strict";function t(t){window.setTimeout(t,1e3/60)}function e(t){this.element=t,this.Adapter=n.Adapter,this.adapter=new this.Adapter(t),this.key="waypoint-context-"+i,this.didScroll=!1,this.didResize=!1,this.oldScroll={x:this.adapter.scrollLeft(),y:this.adapter.scrollTop()},this.waypoints={vertical:{},horizontal:{}},t.waypointContextKey=this.key,o[t.waypointContextKey]=this,i+=1,n.windowContext||(n.windowContext=!0,n.windowContext=new e(window)),this.createThrottledScrollHandler(),this.createThrottledResizeHandler()}var i=0,o={},n=window.Waypoint,r=window.onload;e.prototype.add=function(t){var e=t.options.horizontal?"horizontal":"vertical";this.waypoints[e][t.key]=t,this.refresh()},e.prototype.checkEmpty=function(){var t=this.Adapter.isEmptyObject(this.waypoints.horizontal),e=this.Adapter.isEmptyObject(this.waypoints.vertical),i=this.element==this.element.window;t&&e&&!i&&(this.adapter.off(".waypoints"),delete o[this.key])},e.prototype.createThrottledResizeHandler=function(){function t(){e.handleResize(),e.didResize=!1}var e=this;this.adapter.on("resize.waypoints",function(){e.didResize||(e.didResize=!0,n.requestAnimationFrame(t))})},e.prototype.createThrottledScrollHandler=function(){function t(){e.handleScroll(),e.didScroll=!1}var 
e=this;this.adapter.on("scroll.waypoints",function(){(!e.didScroll||n.isTouch)&&(e.didScroll=!0,n.requestAnimationFrame(t))})},e.prototype.handleResize=function(){n.Context.refreshAll()},e.prototype.handleScroll=function(){var t={},e={horizontal:{newScroll:this.adapter.scrollLeft(),oldScroll:this.oldScroll.x,forward:"right",backward:"left"},vertical:{newScroll:this.adapter.scrollTop(),oldScroll:this.oldScroll.y,forward:"down",backward:"up"}};for(var i in e){var o=e[i],n=o.newScroll>o.oldScroll,r=n?o.forward:o.backward;for(var s in this.waypoints[i]){var a=this.waypoints[i][s];if(null!==a.triggerPoint){var l=o.oldScroll=a.triggerPoint,p=l&&h,u=!l&&!h;(p||u)&&(a.queueTrigger(r),t[a.group.id]=a.group)}}}for(var c in t)t[c].flushTriggers();this.oldScroll={x:e.horizontal.newScroll,y:e.vertical.newScroll}},e.prototype.innerHeight=function(){return this.element==this.element.window?n.viewportHeight():this.adapter.innerHeight()},e.prototype.remove=function(t){delete this.waypoints[t.axis][t.key],this.checkEmpty()},e.prototype.innerWidth=function(){return this.element==this.element.window?n.viewportWidth():this.adapter.innerWidth()},e.prototype.destroy=function(){var t=[];for(var e in this.waypoints)for(var i in this.waypoints[e])t.push(this.waypoints[e][i]);for(var o=0,n=t.length;n>o;o++)t[o].destroy()},e.prototype.refresh=function(){var t,e=this.element==this.element.window,i=e?void 0:this.adapter.offset(),o={};this.handleScroll(),t={horizontal:{contextOffset:e?0:i.left,contextScroll:e?0:this.oldScroll.x,contextDimension:this.innerWidth(),oldScroll:this.oldScroll.x,forward:"right",backward:"left",offsetProp:"left"},vertical:{contextOffset:e?0:i.top,contextScroll:e?0:this.oldScroll.y,contextDimension:this.innerHeight(),oldScroll:this.oldScroll.y,forward:"down",backward:"up",offsetProp:"top"}};for(var r in t){var s=t[r];for(var a in this.waypoints[r]){var l,h,p,u,c,d=this.waypoints[r][a],f=d.options.offset,w=d.triggerPoint,y=0,g=null==w;d.element!==d.element.window&&(y=d.adapter.offset()[s.offsetProp]),"function"==typeof f?f=f.apply(d):"string"==typeof f&&(f=parseFloat(f),d.options.offset.indexOf("%")>-1&&(f=Math.ceil(s.contextDimension*f/100))),l=s.contextScroll-s.contextOffset,d.triggerPoint=Math.floor(y+l-f),h=w=s.oldScroll,u=h&&p,c=!h&&!p,!g&&u?(d.queueTrigger(s.backward),o[d.group.id]=d.group):!g&&c?(d.queueTrigger(s.forward),o[d.group.id]=d.group):g&&s.oldScroll>=d.triggerPoint&&(d.queueTrigger(s.forward),o[d.group.id]=d.group)}}return n.requestAnimationFrame(function(){for(var t in o)o[t].flushTriggers()}),this},e.findOrCreateByElement=function(t){return e.findByElement(t)||new e(t)},e.refreshAll=function(){for(var t in o)o[t].refresh()},e.findByElement=function(t){return o[t.waypointContextKey]},window.onload=function(){r&&r(),e.refreshAll()},n.requestAnimationFrame=function(e){var i=window.requestAnimationFrame||window.mozRequestAnimationFrame||window.webkitRequestAnimationFrame||t;i.call(window,e)},n.Context=e}(),function(){"use strict";function t(t,e){return t.triggerPoint-e.triggerPoint}function e(t,e){return e.triggerPoint-t.triggerPoint}function i(t){this.name=t.name,this.axis=t.axis,this.id=this.name+"-"+this.axis,this.waypoints=[],this.clearTriggerQueues(),o[this.axis][this.name]=this}var o={vertical:{},horizontal:{}},n=window.Waypoint;i.prototype.add=function(t){this.waypoints.push(t)},i.prototype.clearTriggerQueues=function(){this.triggerQueues={up:[],down:[],left:[],right:[]}},i.prototype.flushTriggers=function(){for(var i in this.triggerQueues){var 
o=this.triggerQueues[i],n="up"===i||"left"===i;o.sort(n?e:t);for(var r=0,s=o.length;s>r;r+=1){var a=o[r];(a.options.continuous||r===o.length-1)&&a.trigger([i])}}this.clearTriggerQueues()},i.prototype.next=function(e){this.waypoints.sort(t);var i=n.Adapter.inArray(e,this.waypoints),o=i===this.waypoints.length-1;return o?null:this.waypoints[i+1]},i.prototype.previous=function(e){this.waypoints.sort(t);var i=n.Adapter.inArray(e,this.waypoints);return i?this.waypoints[i-1]:null},i.prototype.queueTrigger=function(t,e){this.triggerQueues[e].push(t)},i.prototype.remove=function(t){var e=n.Adapter.inArray(t,this.waypoints);e>-1&&this.waypoints.splice(e,1)},i.prototype.first=function(){return this.waypoints[0]},i.prototype.last=function(){return this.waypoints[this.waypoints.length-1]},i.findOrCreate=function(t){return o[t.axis][t.name]||new i(t)},n.Group=i}(),function(){"use strict";function t(t){this.$element=e(t)}var e=window.jQuery,i=window.Waypoint;e.each(["innerHeight","innerWidth","off","offset","on","outerHeight","outerWidth","scrollLeft","scrollTop"],function(e,i){t.prototype[i]=function(){var t=Array.prototype.slice.call(arguments);return this.$element[i].apply(this.$element,t)}}),e.each(["extend","inArray","isEmptyObject"],function(i,o){t[o]=e[o]}),i.adapters.push({name:"jquery",Adapter:t}),i.Adapter=t}(),function(){"use strict";function t(t){return function(){var i=[],o=arguments[0];return t.isFunction(arguments[0])&&(o=t.extend({},arguments[1]),o.handler=arguments[0]),this.each(function(){var n=t.extend({},o,{element:this});"string"==typeof n.context&&(n.context=t(this).closest(n.context)[0]),i.push(new e(n))}),i}}var e=window.Waypoint;window.jQuery&&(window.jQuery.fn.waypoint=t(window.jQuery)),window.Zepto&&(window.Zepto.fn.waypoint=t(window.Zepto))}();
--------------------------------------------------------------------------------
/FlaskProject/templates/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | Douban Top250 Data Analysis
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
Douban Top250 Data Analysis
68 |
Built with Python, Scrapy, Scrapy-Redis, MongoDB, Flask, Echarts, WordCloud, and other technologies
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
86 |
87 |
96 |
97 |
106 |
107 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
--------------------------------------------------------------------------------
/FlaskProject/templates/movie.html:
--------------------------------------------------------------------------------
[The HTML markup of this template was not preserved in this export; only the visible text and Jinja2 fragments survive:]
  - Page title: 豆瓣Top250数据分析
  - Heading: 豆瓣Top250电影排行 (Douban Top 250 movie ranking)
  - Table header: 排名 | 电影名称 | 评分 | 评价人数 | 导演 | 精彩短评 | 上映时间(/年)  (rank | title | score | number of ratings | director | short review | release year)
  - Table body: {% for movie in movies %} rows rendering {{ movie['rank'] }}, {{ movie['title'] }}, {{ movie['score'] }}, {{ movie['comment_num'] }}, {{ movie['directed_by'] }}, {{ movie['comment'] }}, {{ movie['year'] }} {% endfor %}
--------------------------------------------------------------------------------
/FlaskProject/templates/score.html:
--------------------------------------------------------------------------------
[The HTML markup of this template was not preserved in this export; only the visible text content survives:]
  - Page title: 豆瓣Top250评分分布图 (Douban Top 250 score distribution)
  - Heading: 豆瓣Top250数据分析
--------------------------------------------------------------------------------
/FlaskProject/templates/word.html:
--------------------------------------------------------------------------------
[The HTML markup of this template was not preserved in this export; only the visible text content survives:]
  - Page title: 豆瓣Top250数据分析
  - Section heading: 词频统计 (word-frequency statistics)
  - Section subtitle: 根据250部电影提取出的词云块增强人们对经典电影的领悟 (a word cloud built from the 250 movies to deepen the appreciation of classic films)
  - Caption: 从电影中看百味人生 (seeing the flavours of life through film)
--------------------------------------------------------------------------------
/FlaskProject/wordCloud.py:
--------------------------------------------------------------------------------
1 | from wordcloud import WordCloud  # word cloud generation
2 | import jieba  # Chinese word segmentation
3 | from matplotlib import pyplot as plt  # plotting / visualisation
4 | from PIL import Image  # image handling
5 | import numpy as np  # array operations
6 | import pymongo  # MongoDB client
7 |
8 | client = pymongo.MongoClient(host='localhost', port=27017)
9 | db = client.movies
10 | collection = db.douban
11 | query = {}
12 | projection = {}
13 |
14 | projection["title"] = u"$title"
15 | projection["movie_type"] = u"$movie_type"
16 | projection["directedBy"] = u"$directedBy"
17 | projection["_id"] = 0
18 |
19 | cursor = collection.find(query, projection=projection)
20 | text = ""
21 | for doc in cursor:
22 |     for content in doc.values():
23 |         content = content.replace('/', ' ')  # str.replace returns a new string, so reassign it
24 |         text = text + content
25 |
26 | cut = jieba.cut(text)
27 | string = ' '.join(cut)
28 | print(len(string))
29 |
30 | img = Image.open(r'./static/img/tree.jpg')
31 | img_array = np.array(img)  # convert the mask image to an array
32 | wc = WordCloud(
33 |     background_color='white',
34 |     mask=img_array,
35 |     font_path="msyh.ttc"  # font file, e.g. from C:\Windows\Fonts
36 | )
37 | wc.generate_from_text(string)
38 |
39 | # draw the word cloud
40 |
41 | fig = plt.figure(1)
42 | plt.imshow(wc)
43 | plt.axis('off')  # hide the axes
44 |
45 | # plt.show()  # preview the generated word cloud
46 | plt.savefig('./static/img/generated_tree.jpg', dpi=500)
47 |
--------------------------------------------------------------------------------
/Master/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/Master/.idea/Master.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Master/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Master/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Master/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Master/main.py:
--------------------------------------------------------------------------------
1 | import redis
2 | import pymongo
3 | import json
4 |
5 | def main():
6 |     r = redis.Redis(host='localhost', port=6379, db=0)
7 |     client = pymongo.MongoClient(host='localhost', port=27017)
8 |     db = client.movies
9 |     collection = db.douban
10 |     while True:
11 |         source, data = r.blpop(["douban_redis:items"])  # block until a slave pushes a scraped item
12 |         item = json.loads(data)
13 |         print(item)
14 |         collection.replace_one(filter={"rank": item["rank"]}, replacement=item, upsert=True)  # upsert by rank
15 |
16 | if __name__ == '__main__':
17 |     main()
--------------------------------------------------------------------------------
/Pic/index.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/index.jpg
--------------------------------------------------------------------------------
/Pic/mongoDB_data.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/mongoDB_data.jpg
--------------------------------------------------------------------------------
/Pic/movies.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/movies.jpg
--------------------------------------------------------------------------------
/Pic/proxy.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/proxy.jpg
--------------------------------------------------------------------------------
/Pic/redis_data.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/redis_data.jpg
--------------------------------------------------------------------------------
/Pic/score.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/score.jpg
--------------------------------------------------------------------------------
/Pic/slave.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/slave.jpg
--------------------------------------------------------------------------------
/Pic/words.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/words.jpg
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 1. Environment setup
2 | ## 1.1 Package versions for the crawler
3 | - Python 3.8.13
4 | - Scrapy 2.6.2
5 | - Scrapy-redis 0.7.3
6 | - pymongo 4.2.0
7 | - redis 4.3.4
8 | ## 1.2 Databases
9 | - MongoDB
10 | - Redis
11 | ## 1.3 Front end / back end
12 | ### Front end
13 | - jinja2
14 | - Echarts
15 | ### Back end
16 | - flask 2.2.2
17 | ## 1.4 IP proxy pool
18 | Configured following [https://github.com/jhao104/proxy_pool](https://github.com/jhao104/proxy_pool).
19 | # 2. Project layout
20 | **---FlaskProject (data-visualisation code)**
21 | ------static (static assets used by the pages)
22 | ------templates (front-end display templates)
23 | ------app.py (back-end code)
24 | ------data.txt (sample of the scraped data)
25 | ------wordCloud.py (word-cloud generation code)
26 | **---Master (master-side code)**
27 | ------main.py (pulls the data out of Redis and writes it into MongoDB)
28 | **---Pic (screenshots of the running system)**
29 | **---proxy_pool (IP proxy pool)**
30 | **---Slave (slave-side code)**
31 | ------movies
32 | ---------spiders
33 | ------------douban_redis.py (the main scraping code)
34 | ---------middlewares.py (middlewares: IP proxying, rotating User-Agent, etc.)
35 | ---------settings.py (crawler configuration)
36 | # 3. Setting up the project
37 | The project is designed around a distributed architecture and is split into **master-side code** and **slave-side code**.
38 | ## 3.1 Slaves
39 | The slaves run the crawler: they scrape the site and store the data in the master's Redis database. Redis keeps track of the crawl progress (the URL queue), so the crawl can be paused and resumed, any number of slaves can be used, and every slave runs exactly the same code.
40 | For testing, the distributed setup can be reproduced on a single computer by running virtual machines. [CentOS 7](http://isoredirect.centos.org/centos/7/isos/x86_64/) is recommended for the VMs because it is lightweight and uses few resources. Each slave must set the master's IP and port in `settings.py` (see the sketch at the end of this section).
41 | Once a slave's environment is set up, `cd` into the `spiders` folder and run `scrapy runspider douban_redis.py` to start it; the slave then waits for the master to publish the start URL.
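
The scrapy-redis part of `Slave/movies/settings.py` boils down to the following excerpt; `REDIS_HOST` must point at the master's address:

```python
# Slave/movies/settings.py (excerpt) -- scrapy-redis wiring for the distributed crawl
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # shared dedup filter kept in Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # shared request queue kept in Redis
SCHEDULER_PERSIST = True                                    # keep the queue/filter so a crawl can pause and resume

ITEM_PIPELINES = {
    'scrapy_redis.pipelines.RedisPipeline': 300,            # push scraped items into the douban_redis:items list
}

REDIS_HOST = '192.168.108.1'  # the master's IP -- change to your own
REDIS_PORT = '6379'
```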
42 | ## 3.2 Master
43 | The master maintains the Redis database and moves the data from Redis into MongoDB.
44 | After starting the Redis service, run `lpush douban:start_urls https://movie.douban.com/top250` in `redis-cli.exe` to insert the start URL into Redis; once it is inserted, the slaves start crawling automatically.
45 | Running the `proxy_pool` code on the master fetches free proxy IPs and stores them in the `redis` database.
46 | The master's main.py pulls the scraped items out of Redis and writes them into MongoDB.
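
If you would rather seed the queue from Python than from `redis-cli`, an equivalent one-off script (assuming Redis is reachable on localhost:6379) is:

```python
# seed the crawl: equivalent to `lpush douban:start_urls https://movie.douban.com/top250` in redis-cli
import redis

r = redis.Redis(host='localhost', port=6379, db=0)
r.lpush('douban:start_urls', 'https://movie.douban.com/top250')  # slaves start crawling once this key appears
```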
47 | ## 3.3 Visualisation
48 | With flask installed, open the `FlaskProject` folder on the master and run `app.py` to start the back-end service. Then visit the default local address `http://127.0.0.1:5000/` in a browser to see the visualisations.
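
As a rough sketch of the pattern `app.py` follows, a Flask route reads MongoDB and hands the documents to a Jinja2 template (the route and field names below are illustrative, not the project's exact code):

```python
# illustrative sketch only -- the real routes live in FlaskProject/app.py
from flask import Flask, render_template
import pymongo

app = Flask(__name__)
collection = pymongo.MongoClient(host='localhost', port=27017).movies.douban

@app.route('/movie')
def movie():
    # hand every stored movie document to templates/movie.html
    movies = list(collection.find({}, {'_id': 0}))
    return render_template('movie.html', movies=movies)

if __name__ == '__main__':
    app.run(host='127.0.0.1', port=5000)
```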
49 | ## 3.4 IP proxy pool
50 | Based on: [https://github.com/jhao104/proxy_pool](https://github.com/jhao104/proxy_pool)
51 | After configuring the environment as described there, run the following commands to start the proxy pool.
52 | ```
53 | # start the scheduler
54 | python proxyPool.py schedule
55 |
56 | # start the web API
57 | python proxyPool.py server
58 | ```
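
Once the web API is running, a crawler can pull proxies from it over HTTP. A minimal client sketch follows; the host and port are proxy_pool's upstream defaults and should be adjusted to your own configuration:

```python
# minimal proxy_pool client sketch; 127.0.0.1:5010 is the upstream default, not necessarily this project's setting
import requests

PROXY_API = "http://127.0.0.1:5010"

def get_proxy():
    # /get returns a JSON object whose "proxy" field is an "ip:port" string
    return requests.get(f"{PROXY_API}/get/").json().get("proxy")

def delete_proxy(proxy):
    # /delete drops a proxy that has stopped working
    requests.get(f"{PROXY_API}/delete/", params={"proxy": proxy})
```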
59 | # 4. Screenshots
60 | ## 4.1 Slave
61 | ### Slave crawler running
62 | ![slave](./Pic/slave.jpg)
63 | ## 4.2 Master databases
64 | ### URLs cached in Redis
65 | ![redis_data](./Pic/redis_data.jpg)
66 | ### Proxy IPs cached in Redis
67 | ![proxy](./Pic/proxy.jpg)
68 | ### MongoDB
69 | ![mongoDB_data](./Pic/mongoDB_data.jpg)
70 | ## 4.3 Visualisation
71 | ### Home page
72 | ![index](./Pic/index.jpg)
73 | ### Movies
74 | ![movies](./Pic/movies.jpg)
75 | ### Scores
76 | ![score](./Pic/score.jpg)
77 | ### Word cloud
78 | ![words](./Pic/words.jpg)
--------------------------------------------------------------------------------
/Slave/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Editor-based HTTP Client requests
5 | /httpRequests/
6 | # Datasource local storage ignored files
7 | /dataSources/
8 | /dataSources.local.xml
9 |
--------------------------------------------------------------------------------
/Slave/.idea/Slave.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Slave/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Slave/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Slave/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/Slave/movies/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__init__.py
--------------------------------------------------------------------------------
/Slave/movies/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/Slave/movies/__pycache__/middlewares.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/middlewares.cpython-38.pyc
--------------------------------------------------------------------------------
/Slave/movies/__pycache__/pipelines.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/pipelines.cpython-38.pyc
--------------------------------------------------------------------------------
/Slave/movies/__pycache__/settings.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/settings.cpython-38.pyc
--------------------------------------------------------------------------------
/Slave/movies/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 |
6 | import scrapy
7 |
8 |
9 | class MoviesItem(scrapy.Item):
10 |     # define the fields for your item here like:
11 |     pass
12 |     # page_url = scrapy.Field()
13 |     # title = scrapy.Field()
14 |     # year = scrapy.Field()
15 |     # score = scrapy.Field()
16 |     # directedBy = scrapy.Field()
17 |     # actors = scrapy.Field()
18 |     # movie_type = scrapy.Field()
19 |     # comment = scrapy.Field()
20 |     # introduc = scrapy.Field()
21 |     # image_urls = scrapy.Field()
22 |     # image_name = scrapy.Field()
23 |
--------------------------------------------------------------------------------
/Slave/movies/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 |
6 | from scrapy import signals
7 | from fake_useragent import UserAgent
8 | import redis
9 | import random
10 | from scrapy.exceptions import NotConfigured
11 | from twisted.internet.error import ConnectError, TimeoutError
12 | import json
13 | # useful for handling different item types with a single interface
14 | from itemadapter import is_item, ItemAdapter
15 |
16 |
17 | class RandomProxyMiddleWare(object):
18 |     def __init__(self, settings):
19 |         # 2. initialise settings and related state
20 |         self.r = redis.Redis(host='localhost', port=6379, db=0)
21 |         self.proxy_key = settings.get('PROXY_REDIS_KEY')
22 |         self.max_failed = 1
23 |
24 |     @property
25 |     def proxies(self):
26 |         # return [i.decode('utf-8') for i in self.r.hkeys('use_proxy')]
27 |         # return [i.decode('utf-8') for i in self.r.hkeys('use_proxy')
28 |         #         if json.loads(self.r.hget('use_proxy', i.decode('utf-8')).decode('utf-8'))['https'] == True]
29 |         return []
30 |
31 |     @classmethod
32 |     def from_crawler(cls, crawler):
33 |         # 1. create the middleware instance
34 |         # the proxy support is enabled by default
35 |         if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
36 |             raise NotConfigured
37 |         return cls(crawler.settings)
38 |
39 |     def process_request(self, request, spider):
40 |         # 3. assign a random proxy IP to every request
41 |         if self.proxies and not request.meta.get('proxy'):
42 |             proxies_list = self.proxies
43 |             if proxies_list:
44 |                 request.meta['proxy'] = 'https://' + random.choice(proxies_list)
45 |
46 |     def process_response(self, request, response, spider):
47 |         # 4. the request succeeded
48 |         # if no proxy was used, return the response directly
49 |         if not request.meta.get('proxy'):
50 |             return response
51 |         cur_proxy = request.meta.get('proxy').replace('https://', '')
52 |         # check whether the IP has been banned by the target site
53 |         if response.status in (400, 401, 403):
54 |             # fetch the value stored for this ip:port
55 |             value = json.loads(self.r.hget(self.proxy_key, cur_proxy).decode('utf-8'))
56 |             value['fail_count'] += 1
57 |             self.r.hset(self.proxy_key, cur_proxy,
58 |                         str(value).replace("'", '"').replace('False', 'false').replace('True', 'true'))
59 |             # once an IP has failed a certain number of times
60 |             filed_times = json.loads(self.r.hget(self.proxy_key, cur_proxy).decode('utf-8'))['fail_count'] or 0
61 |             if int(filed_times) >= self.max_failed:
62 |                 print('got wrong http code (%s) when use %s' % (response.status, cur_proxy))
63 |                 # treat the IP as banned and remove it from the pool
64 |                 self.remove_proxy(cur_proxy)
65 |                 del request.meta['proxy']
66 |                 # return the request so it goes back to the scheduler
67 |                 return request
68 |         return response
69 |
70 |     def process_exception(self, request, exception, spider):
71 |         # 4.1 the request failed
72 |         cur_proxy = request.meta.get('proxy')
73 |         # if a proxy was used and the network error came from it, drop the proxy and reschedule the request
74 |         if cur_proxy and isinstance(exception, (ConnectError, TimeoutError)):
75 |             print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
76 |             self.remove_proxy(cur_proxy)
77 |             del request.meta['proxy']
78 |             return request
79 |
80 |     def remove_proxy(self, proxy):
81 |         if proxy in self.proxies:
82 |             self.r.hdel(self.proxy_key, proxy)
83 |
84 |
85 | class UserAgentMiddleware(object):
86 |     def process_request(self, request, spider):
87 |         request.headers.setdefault(b'User-Agent', UserAgent().random)
88 |
89 | class MoviesSpiderMiddleware:
90 |     # Not all methods need to be defined. If a method is not defined,
91 |     # scrapy acts as if the spider middleware does not modify the
92 |     # passed objects.
93 |
94 |     @classmethod
95 |     def from_crawler(cls, crawler):
96 |         # This method is used by Scrapy to create your spiders.
97 |         s = cls()
98 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
99 |         return s
100 |
101 |     def process_spider_input(self, response, spider):
102 |         # Called for each response that goes through the spider
103 |         # middleware and into the spider.
104 |
105 |         # Should return None or raise an exception.
106 |         return None
107 |
108 |     def process_spider_output(self, response, result, spider):
109 |         # Called with the results returned from the Spider, after
110 |         # it has processed the response.
111 |
112 |         # Must return an iterable of Request, or item objects.
113 |         for i in result:
114 |             yield i
115 |
116 |     def process_spider_exception(self, response, exception, spider):
117 |         # Called when a spider or process_spider_input() method
118 |         # (from other spider middleware) raises an exception.
119 |
120 |         # Should return either None or an iterable of Request or item objects.
121 |         pass
122 |
123 |     def process_start_requests(self, start_requests, spider):
124 |         # Called with the start requests of the spider, and works
125 |         # similarly to the process_spider_output() method, except
126 |         # that it doesn’t have a response associated.
127 |
128 |         # Must return only requests (not items).
129 |         for r in start_requests:
130 |             yield r
131 |
132 |     def spider_opened(self, spider):
133 |         spider.logger.info('Spider opened: %s' % spider.name)
134 |
135 |
136 | class MoviesDownloaderMiddleware:
137 |     # Not all methods need to be defined. If a method is not defined,
138 |     # scrapy acts as if the downloader middleware does not modify the
139 |     # passed objects.
140 |
141 |     @classmethod
142 |     def from_crawler(cls, crawler):
143 |         # This method is used by Scrapy to create your spiders.
144 |         s = cls()
145 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
146 |         return s
147 |
148 |     def process_request(self, request, spider):
149 |         # Called for each request that goes through the downloader
150 |         # middleware.
151 |
152 |         # Must either:
153 |         # - return None: continue processing this request
154 |         # - or return a Response object
155 |         # - or return a Request object
156 |         # - or raise IgnoreRequest: process_exception() methods of
157 |         #   installed downloader middleware will be called
158 |
159 |         # set the Referer so the request looks like it came from inside the site, to counter anti-crawling measures
160 |         referer = request.url
161 |         if referer:
162 |             request.headers['referer'] = referer
163 |
164 |         return None
165 |
166 |     def process_response(self, request, response, spider):
167 |         # Called with the response returned from the downloader.
168 |
169 |         # Must either;
170 |         # - return a Response object
171 |         # - return a Request object
172 |         # - or raise IgnoreRequest
173 |         return response
174 |
175 |     def process_exception(self, request, exception, spider):
176 |         # Called when a download handler or a process_request()
177 |         # (from other downloader middleware) raises an exception.
178 |
179 |         # Must either:
180 |         # - return None: continue processing this exception
181 |         # - return a Response object: stops process_exception() chain
182 |         # - return a Request object: stops process_exception() chain
183 |         pass
184 |
185 |     def spider_opened(self, spider):
186 |         spider.logger.info('Spider opened: %s' % spider.name)
187 |
--------------------------------------------------------------------------------
/Slave/movies/pipelines.py:
--------------------------------------------------------------------------------
1 | # Define your item pipelines here
2 | #
3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
5 |
6 |
7 | # useful for handling different item types with a single interface
8 | from itemadapter import ItemAdapter
9 | import pymongo
10 | from pymysql import connect
11 | from scrapy.pipelines.images import ImagesPipeline
12 | import scrapy
13 | from scrapy.exceptions import DropItem
14 |
15 | class MongoMoviesPipeline:
16 |     def open_spider(self, spider):
17 |         self.client = pymongo.MongoClient()
18 |
19 |     def process_item(self, item, spider):
20 |         # self.client.movies.douban.replace_one(filter={"page_url": item["page_url"]}, replacement=item, upsert=True)
21 |         return item
22 |
23 |     def close_spider(self, spider):
24 |         self.client.close()
25 |
26 | class ImagePipeline(ImagesPipeline):
27 |     def get_media_requests(self, item, info):
28 |         yield scrapy.Request(item['image_urls'], meta={"image_name": item['image_name']})
29 |
30 |     def file_path(self, request, response=None, info=None, *, item=None):
31 |         file_name = request.meta['image_name'] + ".jpg"
32 |         return file_name
33 |
34 |     def item_completed(self, results, item, info):
35 |         image_paths = [x['path'] for ok, x in results if ok]
36 |         if not image_paths:
37 |             raise DropItem("Item contains no images")
38 |         return item
39 |
40 | # class MysqlMoviesPipeline:
41 | #     def open_spider(self, spider):
42 | #         self.client = connect(host='localhost', port='3306', user='root', password='123456', db='movies', charset='utf8')
43 | #         self.cursor = self.client.cursor()
44 | #
45 | #     def process_item(self, item, spider):
46 | #         self.client.movies.douban.insert_one(item)
47 | #         return item
48 | #
49 | #     def close_spider(self, spider):
50 | #         self.cursor.close()
51 | #         self.client.close()
52 |
--------------------------------------------------------------------------------
/Slave/movies/settings.py:
--------------------------------------------------------------------------------
1 | # Scrapy settings for movies project
2 | #
3 | # For simplicity, this file contains only settings considered important or
4 | # commonly used. You can find more settings consulting the documentation:
5 | #
6 | # https://docs.scrapy.org/en/latest/topics/settings.html
7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
9 |
10 | BOT_NAME = 'movies'
11 |
12 | SPIDER_MODULES = ['movies.spiders']
13 | NEWSPIDER_MODULE = 'movies.spiders'
14 |
15 |
16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'
18 |
19 | # Obey robots.txt rules
20 | ROBOTSTXT_OBEY = True
21 | LOG_LEVEL = 'DEBUG'
22 |
23 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
24 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
25 | SCHEDULER_PERSIST = True
26 |
27 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue"
28 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
29 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack"
30 |
31 | ITEM_PIPELINES = {
32 |     'scrapy_redis.pipelines.RedisPipeline': 300,
33 | }
34 | REDIS_HOST = '192.168.108.1'
35 | REDIS_PORT = '6379'
36 |
37 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
38 | #CONCURRENT_REQUESTS = 32
39 |
40 | # Configure a delay for requests for the same website (default: 0)
41 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
42 | # See also autothrottle settings and docs
43 | DOWNLOAD_DELAY = 5
44 | # The download delay setting will honor only one of:
45 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
46 | #CONCURRENT_REQUESTS_PER_IP = 16
47 |
48 | # Disable cookies (enabled by default)
49 | #COOKIES_ENABLED = False
50 |
51 | # Disable Telnet Console (enabled by default)
52 | #TELNETCONSOLE_ENABLED = False
53 |
54 | # Override the default request headers:
55 | #DEFAULT_REQUEST_HEADERS = {
56 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
57 | # 'Accept-Language': 'en',
58 | #}
59 |
60 | # Enable or disable spider middlewares
61 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
62 | #SPIDER_MIDDLEWARES = {
63 | # 'movies.middlewares.MoviesSpiderMiddleware': 543,
64 | #}
65 |
66 | # Enable or disable downloader middlewares
67 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
68 | DOWNLOADER_MIDDLEWARES = {
69 |     'movies.middlewares.RandomProxyMiddleWare': 241,
70 |     'movies.middlewares.UserAgentMiddleware': 242,
71 |     'movies.middlewares.MoviesDownloaderMiddleware': 243,
72 | }
73 |
74 | PROXY_REDIS_KEY = 'use_proxy'
75 | HTTPPROXY_ENABLED = True
76 |
77 | # Enable or disable extensions
78 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
79 | #EXTENSIONS = {
80 | # 'scrapy.extensions.telnet.TelnetConsole': None,
81 | #}
82 |
83 | # Configure item pipelines
84 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
85 |
86 | # ITEM_PIPELINES = {
87 | # 'movies.pipelines.MongoMoviesPipeline': 300,
88 | # # 'movies.pipelines.ImagePipeline': 301,
89 | # }
90 | # IMAGES_STORE ='../images/'
91 | # IMAGES_URLS_FIELD = 'image_urls'  # the item field that holds the image URL
92 |
93 | # Enable and configure the AutoThrottle extension (disabled by default)
94 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
95 | #AUTOTHROTTLE_ENABLED = True
96 | # The initial download delay
97 | #AUTOTHROTTLE_START_DELAY = 5
98 | # The maximum download delay to be set in case of high latencies
99 | #AUTOTHROTTLE_MAX_DELAY = 60
100 | # The average number of requests Scrapy should be sending in parallel to
101 | # each remote server
102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
103 | # Enable showing throttling stats for every response received:
104 | #AUTOTHROTTLE_DEBUG = False
105 |
106 | # Enable and configure HTTP caching (disabled by default)
107 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
108 | #HTTPCACHE_ENABLED = True
109 | #HTTPCACHE_EXPIRATION_SECS = 0
110 | #HTTPCACHE_DIR = 'httpcache'
111 | #HTTPCACHE_IGNORE_HTTP_CODES = []
112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
113 |
--------------------------------------------------------------------------------
/Slave/movies/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Slave/movies/spiders/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/spiders/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/Slave/movies/spiders/__pycache__/douban_redis.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/spiders/__pycache__/douban_redis.cpython-38.pyc
--------------------------------------------------------------------------------
/Slave/movies/spiders/douban_redis.py:
--------------------------------------------------------------------------------
1 | import scrapy
2 | from scrapy_redis.spiders import RedisCrawlSpider
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import Rule
5 | import re
6 |
7 | class DoubanSpider(RedisCrawlSpider):
8 |     name = 'douban_redis'
9 |     allowed_domains = ['douban.com']
10 |     # start_urls = ['https://movie.douban.com/top250?start=0&filter=']
11 |     # start_urls = ['https://movie.douban.com/top250?start={}&filter='.format(num) for num in range(0, 226, 25)]
12 |     redis_key = 'douban:start_urls'
13 |
14 |     rules = (
15 |         Rule(LinkExtractor(restrict_xpaths=r'//div[@class="hd"]/a'), callback='parse_info'),
16 |         Rule(LinkExtractor(restrict_xpaths=r'//div[@class="paginator"]/a'), follow=True),
17 |     )
18 |
19 |     def parse_info(self, response):
20 |         page_url = response.url
21 |         title = response.xpath("//h1/span[@property='v:itemreviewed']/text()").extract_first()
22 |         year = response.xpath("//h1/span[@class='year']/text()").extract_first()
23 |         score = response.xpath("//strong[@class='ll rating_num']/text()").extract_first()
24 |         directedBy = response.xpath("//span[@class='attrs']/a[@rel='v:directedBy']/text()").extract_first()
25 |         actors = response.xpath("string(//span[@class='actor']/span[@class='attrs']/span)").extract_first()
26 |         if actors == '':
27 |             actors = response.xpath("string(//span[@class='actor']/span[@class='attrs'])").extract_first()
28 |         movie_type = '/'.join(response.xpath("//span[@property='v:genre']/text()").extract())
29 |         rank = re.findall(r"\d+", response.xpath("//span[@class='top250-no']/text()").extract_first())[0]
30 |         comment_num = response.xpath("//span[@property='v:votes']/text()").extract_first()
31 |         comments = response.xpath("//p/span[@class='short']/text()").extract()
32 |         comment = ''
33 |         # pick any short review shorter than 100 characters
34 |         for cmt in comments:
35 |             if len(cmt) < 100:
36 |                 comment = cmt
37 |         # no short review under 100 characters: fall back to the full review text
38 |         if comment == '':
39 |             comment = response.xpath("//p/span[@class='full']/text()").extract_first()
40 |         introduc = response.xpath("string(//div[@class='indent']/span[@class='all hidden'])").extract_first()
41 |         if introduc == '':
42 |             introduc = response.xpath("string(//div[@class='indent']/span[@property='v:summary'])").extract_first()
43 |         image_url = response.xpath("//img[@title='点击看更多海报']/@src").extract_first()
44 |         image_name = page_url.split('/')[-2]
45 |         print(title)
46 |         yield {
47 |             "page_url": page_url,
48 |             "title": title,
49 |             "year": year,
50 |             "score": score,
51 |             "directedBy": directedBy,
52 |             "actors": actors,
53 |             "movie_type": movie_type,
54 |             "rank": rank,
55 |             "comment": comment,
56 |             "comment_num": comment_num,
57 |             "introduc": introduc,
58 |             "image_urls": image_url,
59 |             "image_name": image_name
60 |         }
61 |
62 |
--------------------------------------------------------------------------------
/Slave/movies/start.py:
--------------------------------------------------------------------------------
1 | from scrapy.cmdline import execute
2 |
3 | # execute('scrapy crawl douban'.split())
4 | execute('scrapy crawl douban_redis'.split())
5 |
--------------------------------------------------------------------------------
/Slave/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = movies.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = movies
12 |
--------------------------------------------------------------------------------
/proxy_pool/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | # Datasource local storage ignored files
5 | /dataSources/
6 | /dataSources.local.xml
7 | # Editor-based HTTP Client requests
8 | /httpRequests/
9 |
--------------------------------------------------------------------------------
/proxy_pool/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/proxy_pool/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/proxy_pool/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/proxy_pool/.idea/proxy_pool.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/proxy_pool/__pycache__/setting.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/__pycache__/setting.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/api/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__.py
5 | Description :
6 | Author : JHao
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 |
--------------------------------------------------------------------------------
/proxy_pool/api/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/api/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/api/__pycache__/proxyApi.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/api/__pycache__/proxyApi.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/api/proxyApi.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 | """
4 | -------------------------------------------------
5 | File Name: ProxyApi.py
6 | Description : WebApi
7 | Author : JHao
8 | date: 2016/12/4
9 | -------------------------------------------------
10 | Change Activity:
11 | 2016/12/04: WebApi
12 | 2019/08/14: 集成Gunicorn启动方式
13 | 2020/06/23: 新增pop接口
14 | 2022/07/21: 更新count接口
15 | -------------------------------------------------
16 | """
17 | __author__ = 'JHao'
18 |
19 | import platform
20 | from werkzeug.wrappers import Response
21 | from flask import Flask, jsonify, request
22 |
23 | from util.six import iteritems
24 | from helper.proxy import Proxy
25 | from handler.proxyHandler import ProxyHandler
26 | from handler.configHandler import ConfigHandler
27 |
28 | app = Flask(__name__)
29 | conf = ConfigHandler()
30 | proxy_handler = ProxyHandler()
31 |
32 |
33 | class JsonResponse(Response):
34 | @classmethod
35 | def force_type(cls, response, environ=None):
36 | if isinstance(response, (dict, list)):
37 | response = jsonify(response)
38 |
39 | return super(JsonResponse, cls).force_type(response, environ)
40 |
41 |
42 | app.response_class = JsonResponse
43 |
44 | api_list = [
45 | {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"},
46 | {"url": "/pop", "params": "", "desc": "get and delete a proxy"},
47 | {"url": "/delete", "params": "proxy: 'e.g. 127.0.0.1:8080'", "desc": "delete an unable proxy"},
48 | {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"},
49 | {"url": "/count", "params": "", "desc": "return proxy count"}
50 | # 'refresh': 'refresh proxy pool',
51 | ]
52 |
53 |
54 | @app.route('/')
55 | def index():
56 | return {'url': api_list}
57 |
58 |
59 | @app.route('/get/')
60 | def get():
61 | https = request.args.get("type", "").lower() == 'https'
62 | proxy = proxy_handler.get(https)
63 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"}
64 |
65 |
66 | @app.route('/pop/')
67 | def pop():
68 | https = request.args.get("type", "").lower() == 'https'
69 | proxy = proxy_handler.pop(https)
70 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"}
71 |
72 |
73 | @app.route('/refresh/')
74 | def refresh():
75 | # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用
76 | return 'success'
77 |
78 |
79 | @app.route('/all/')
80 | def getAll():
81 | https = request.args.get("type", "").lower() == 'https'
82 | proxies = proxy_handler.getAll(https)
83 | return jsonify([_.to_dict for _ in proxies])
84 |
85 |
86 | @app.route('/delete/', methods=['GET'])
87 | def delete():
88 | proxy = request.args.get('proxy')
89 | status = proxy_handler.delete(Proxy(proxy))
90 | return {"code": 0, "src": status}
91 |
92 |
93 | @app.route('/count/')
94 | def getCount():
95 | proxies = proxy_handler.getAll()
96 | http_type_dict = {}
97 | source_dict = {}
98 | for proxy in proxies:
99 | http_type = 'https' if proxy.https else 'http'
100 | http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1
101 | for source in proxy.source.split('/'):
102 | source_dict[source] = source_dict.get(source, 0) + 1
103 | return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)}
104 |
105 |
106 | def runFlask():
107 | if platform.system() == "Windows":
108 | app.run(host=conf.serverHost, port=conf.serverPort)
109 | else:
110 | import gunicorn.app.base
111 |
112 | class StandaloneApplication(gunicorn.app.base.BaseApplication):
113 |
114 | def __init__(self, app, options=None):
115 | self.options = options or {}
116 | self.application = app
117 | super(StandaloneApplication, self).__init__()
118 |
119 | def load_config(self):
120 | _config = dict([(key, value) for key, value in iteritems(self.options)
121 | if key in self.cfg.settings and value is not None])
122 | for key, value in iteritems(_config):
123 | self.cfg.set(key.lower(), value)
124 |
125 | def load(self):
126 | return self.application
127 |
128 | _options = {
129 | 'bind': '%s:%s' % (conf.serverHost, conf.serverPort),
130 | 'workers': 4,
131 | 'accesslog': '-', # log to stdout
132 | 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"'
133 | }
134 | StandaloneApplication(app, _options).run()
135 |
136 |
137 | if __name__ == '__main__':
138 | runFlask()
139 |
--------------------------------------------------------------------------------
/proxy_pool/db/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__.py.py
5 | Description :
6 | Author : JHao
7 | date: 2016/12/2
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/2:
11 | -------------------------------------------------
12 | """
--------------------------------------------------------------------------------
/proxy_pool/db/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/db/__pycache__/dbClient.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/dbClient.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/db/__pycache__/redisClient.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/redisClient.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/db/dbClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 | """
4 | -------------------------------------------------
5 | File Name: DbClient.py
6 | Description : DB工厂类
7 | Author : JHao
8 | date: 2016/12/2
9 | -------------------------------------------------
10 | Change Activity:
11 | 2016/12/02: DB工厂类
12 | 2020/07/03: 取消raw_proxy储存
13 | -------------------------------------------------
14 | """
15 | __author__ = 'JHao'
16 |
17 | import os
18 | import sys
19 |
20 | from util.six import urlparse, withMetaclass
21 | from util.singleton import Singleton
22 |
23 | sys.path.append(os.path.dirname(os.path.abspath(__file__)))
24 |
25 |
26 | class DbClient(withMetaclass(Singleton)):
27 | """
28 | DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法
29 |
30 |
31 | 抽象方法定义:
32 | get(): 随机返回一个proxy;
33 | put(proxy): 存入一个proxy;
34 | pop(): 顺序返回并删除一个proxy;
35 | update(proxy): 更新指定proxy信息;
36 | delete(proxy): 删除指定proxy;
37 | exists(proxy): 判断指定proxy是否存在;
38 | getAll(): 返回所有代理;
39 | clean(): 清除所有proxy信息;
40 | getCount(): 返回proxy统计信息;
41 | changeTable(name): 切换操作对象
42 |
43 |
44 | 所有方法需要相应类去具体实现:
45 | ssdb: ssdbClient.py
46 | redis: redisClient.py
47 | mongodb: mongodbClient.py
48 |
49 | """
50 |
51 | def __init__(self, db_conn):
52 | """
53 | init
54 | :return:
55 | """
56 | self.parseDbConn(db_conn)
57 | self.__initDbClient()
58 |
59 | @classmethod
60 | def parseDbConn(cls, db_conn):
61 | db_conf = urlparse(db_conn)
62 | cls.db_type = db_conf.scheme.upper().strip()
63 | cls.db_host = db_conf.hostname
64 | cls.db_port = db_conf.port
65 | cls.db_user = db_conf.username
66 | cls.db_pwd = db_conf.password
67 | cls.db_name = db_conf.path[1:]
68 | return cls
69 |
70 | def __initDbClient(self):
71 | """
72 | init DB Client
73 | :return:
74 | """
75 | __type = None
76 | if "SSDB" == self.db_type:
77 | __type = "ssdbClient"
78 | elif "REDIS" == self.db_type:
79 | __type = "redisClient"
80 | else:
81 | pass
82 | assert __type, 'type error, Not support DB type: {}'.format(self.db_type)
83 | self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host,
84 | port=self.db_port,
85 | username=self.db_user,
86 | password=self.db_pwd,
87 | db=self.db_name)
88 |
89 | def get(self, https, **kwargs):
90 | return self.client.get(https, **kwargs)
91 |
92 | def put(self, key, **kwargs):
93 | return self.client.put(key, **kwargs)
94 |
95 | def update(self, key, value, **kwargs):
96 | return self.client.update(key, value, **kwargs)
97 |
98 | def delete(self, key, **kwargs):
99 | return self.client.delete(key, **kwargs)
100 |
101 | def exists(self, key, **kwargs):
102 | return self.client.exists(key, **kwargs)
103 |
104 | def pop(self, https, **kwargs):
105 | return self.client.pop(https, **kwargs)
106 |
107 | def getAll(self, https):
108 | return self.client.getAll(https)
109 |
110 | def clear(self):
111 | return self.client.clear()
112 |
113 | def changeTable(self, name):
114 | self.client.changeTable(name)
115 |
116 | def getCount(self):
117 | return self.client.getCount()
118 |
119 | def test(self):
120 | return self.client.test()
121 |
--------------------------------------------------------------------------------
/proxy_pool/db/redisClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -----------------------------------------------------
4 | File Name: redisClient.py
5 | Description : 封装Redis相关操作
6 | Author : JHao
7 | date: 2019/8/9
8 | ------------------------------------------------------
9 | Change Activity:
10 | 2019/08/09: 封装Redis相关操作
11 | 2020/06/23: 优化pop方法, 改用hscan命令
12 | 2021/05/26: 区别http/https代理
13 | ------------------------------------------------------
14 | """
15 | __author__ = 'JHao'
16 |
17 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError
18 | from redis.connection import BlockingConnectionPool
19 | from handler.logHandler import LogHandler
20 | from random import choice
21 | from redis import Redis
22 | import json
23 |
24 |
25 | class RedisClient(object):
26 | """
27 | Redis client
28 |
29 | Redis中代理存放的结构为hash:
30 | key为ip:port, value为代理属性的字典;
31 |
32 | """
33 |
34 | def __init__(self, **kwargs):
35 | """
36 | init
37 | :param host: host
38 | :param port: port
39 | :param password: password
40 | :param db: db
41 | :return:
42 | """
43 | self.name = ""
44 | kwargs.pop("username")
45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True,
46 | timeout=5,
47 | socket_timeout=5,
48 | **kwargs))
49 |
50 | def get(self, https):
51 | """
52 | 返回一个代理
53 | :return:
54 | """
55 | if https:
56 | items = self.__conn.hvals(self.name)
57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items))
58 | return choice(proxies) if proxies else None
59 | else:
60 | proxies = self.__conn.hkeys(self.name)
61 | proxy = choice(proxies) if proxies else None
62 | return self.__conn.hget(self.name, proxy) if proxy else None
63 |
64 | def put(self, proxy_obj):
65 | """
66 | 将代理放入hash, 使用changeTable指定hash name
67 | :param proxy_obj: Proxy obj
68 | :return:
69 | """
70 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
71 | return data
72 |
73 | def pop(self, https):
74 | """
75 | 弹出一个代理
76 | :return: dict {proxy: value}
77 | """
78 | proxy = self.get(https)
79 | if proxy:
80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", ""))
81 | return proxy if proxy else None
82 |
83 | def delete(self, proxy_str):
84 | """
85 | 移除指定代理, 使用changeTable指定hash name
86 | :param proxy_str: proxy str
87 | :return:
88 | """
89 | return self.__conn.hdel(self.name, proxy_str)
90 |
91 | def exists(self, proxy_str):
92 | """
93 | 判断指定代理是否存在, 使用changeTable指定hash name
94 | :param proxy_str: proxy str
95 | :return:
96 | """
97 | return self.__conn.hexists(self.name, proxy_str)
98 |
99 | def update(self, proxy_obj):
100 | """
101 | 更新 proxy 属性
102 | :param proxy_obj:
103 | :return:
104 | """
105 | return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
106 |
107 | def getAll(self, https):
108 | """
109 | 字典形式返回所有代理, 使用changeTable指定hash name
110 | :return:
111 | """
112 | items = self.__conn.hvals(self.name)
113 | if https:
114 | return list(filter(lambda x: json.loads(x).get("https"), items))
115 | else:
116 | return items
117 |
118 | def clear(self):
119 | """
120 | 清空所有代理, 使用changeTable指定hash name
121 | :return:
122 | """
123 | return self.__conn.delete(self.name)
124 |
125 | def getCount(self):
126 | """
127 | 返回代理数量
128 | :return:
129 | """
130 | proxies = self.getAll(https=False)
131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))}
132 |
133 | def changeTable(self, name):
134 | """
135 | 切换操作对象
136 | :param name:
137 | :return:
138 | """
139 | self.name = name
140 |
141 | def test(self):
142 | log = LogHandler('redis_client')
143 | try:
144 | self.getCount()
145 | except TimeoutError as e:
146 | log.error('redis connection time out: %s' % str(e), exc_info=True)
147 | return e
148 | except ConnectionError as e:
149 | log.error('redis connection error: %s' % str(e), exc_info=True)
150 | return e
151 | except ResponseError as e:
152 | log.error('redis connection error: %s' % str(e), exc_info=True)
153 | return e
154 |
155 |
156 |
--------------------------------------------------------------------------------
/proxy_pool/db/ssdbClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # !/usr/bin/env python
3 | """
4 | -------------------------------------------------
5 | File Name: ssdbClient.py
6 | Description : 封装SSDB操作
7 | Author : JHao
8 | date: 2016/12/2
9 | -------------------------------------------------
10 | Change Activity:
11 | 2016/12/2:
12 | 2017/09/22: PY3中 redis-py返回的数据是bytes型
13 | 2017/09/27: 修改pop()方法 返回{proxy:value}字典
14 | 2020/07/03: 2.1.0 优化代码结构
15 | 2021/05/26: 区分http和https代理
16 | -------------------------------------------------
17 | """
18 | __author__ = 'JHao'
19 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError
20 | from redis.connection import BlockingConnectionPool
21 | from handler.logHandler import LogHandler
22 | from random import choice
23 | from redis import Redis
24 | import json
25 |
26 |
27 | class SsdbClient(object):
28 | """
29 | SSDB client
30 |
31 | SSDB中代理存放的结构为hash:
32 | key为代理的ip:por, value为代理属性的字典;
33 | """
34 |
35 | def __init__(self, **kwargs):
36 | """
37 | init
38 | :param host: host
39 | :param port: port
40 | :param password: password
41 | :return:
42 | """
43 | self.name = ""
44 | kwargs.pop("username")
45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True,
46 | timeout=5,
47 | socket_timeout=5,
48 | **kwargs))
49 |
50 | def get(self, https):
51 | """
52 | 从hash中随机返回一个代理
53 | :return:
54 | """
55 | if https:
56 | items_dict = self.__conn.hgetall(self.name)
57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values()))
58 | return choice(proxies) if proxies else None
59 | else:
60 | proxies = self.__conn.hkeys(self.name)
61 | proxy = choice(proxies) if proxies else None
62 | return self.__conn.hget(self.name, proxy) if proxy else None
63 |
64 | def put(self, proxy_obj):
65 | """
66 | 将代理放入hash
67 | :param proxy_obj: Proxy obj
68 | :return:
69 | """
70 | result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
71 | return result
72 |
73 | def pop(self, https):
74 | """
75 | 顺序弹出一个代理
76 | :return: proxy
77 | """
78 | proxy = self.get(https)
79 | if proxy:
80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", ""))
81 | return proxy if proxy else None
82 |
83 | def delete(self, proxy_str):
84 | """
85 | 移除指定代理, 使用changeTable指定hash name
86 | :param proxy_str: proxy str
87 | :return:
88 | """
89 | self.__conn.hdel(self.name, proxy_str)
90 |
91 | def exists(self, proxy_str):
92 | """
93 | 判断指定代理是否存在, 使用changeTable指定hash name
94 | :param proxy_str: proxy str
95 | :return:
96 | """
97 | return self.__conn.hexists(self.name, proxy_str)
98 |
99 | def update(self, proxy_obj):
100 | """
101 | 更新 proxy 属性
102 | :param proxy_obj:
103 | :return:
104 | """
105 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json)
106 |
107 | def getAll(self, https):
108 | """
109 | 字典形式返回所有代理, 使用changeTable指定hash name
110 | :return:
111 | """
112 | item_dict = self.__conn.hgetall(self.name)
113 | if https:
114 | return list(filter(lambda x: json.loads(x).get("https"), item_dict.values()))
115 | else:
116 | return item_dict.values()
117 |
118 | def clear(self):
119 | """
120 | 清空所有代理, 使用changeTable指定hash name
121 | :return:
122 | """
123 | return self.__conn.delete(self.name)
124 |
125 | def getCount(self):
126 | """
127 | 返回代理数量
128 | :return:
129 | """
130 | proxies = self.getAll(https=False)
131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))}
132 |
133 | def changeTable(self, name):
134 | """
135 | 切换操作对象
136 | :param name:
137 | :return:
138 | """
139 | self.name = name
140 |
141 | def test(self):
142 | log = LogHandler('ssdb_client')
143 | try:
144 | self.getCount()
145 | except TimeoutError as e:
146 | log.error('ssdb connection time out: %s' % str(e), exc_info=True)
147 | return e
148 | except ConnectionError as e:
149 | log.error('ssdb connection error: %s' % str(e), exc_info=True)
150 | return e
151 | except ResponseError as e:
152 | log.error('ssdb connection error: %s' % str(e), exc_info=True)
153 | return e
154 |
--------------------------------------------------------------------------------
/proxy_pool/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/proxy_pool/docs/changelog.rst:
--------------------------------------------------------------------------------
1 | .. _changelog:
2 |
3 | ChangeLog
4 | ==========
5 |
6 | 2.4.1 (2022-07-17)
7 | ------------------
8 |
9 | 1. 新增代理源 **FreeProxyList**; (2022-07-21)
10 | 2. 新增代理源 **FateZero**; (2022-08-01)
11 | 3. 新增代理属性 ``region``; (2022-08-16)
12 |
13 | 2.4.0 (2021-11-17)
14 | ------------------
15 |
16 | 1. 移除无效代理源 **神鸡代理**; (2021-11-16)
17 | 2. 移除无效代理源 **极速代理**; (2021-11-16)
18 | 3. 移除代理源 **西拉代理**; (2021-11-16)
19 | 4. 新增代理源 **蝶鸟IP**; (2021-11-16)
20 | 5. 新增代理源 **PROXY11**; (2021-11-16)
21 | 6. 多线程采集代理; (2021-11-17)
22 |
23 | 2.3.0 (2021-05-27)
24 | ------------------
25 |
26 | 1. 修复Dockerfile时区问题; (2021-04-12)
27 | 2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13)
28 | 3. 新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27)
29 |
30 | 2.2.0 (2021-04-08)
31 | ------------------
32 |
33 | 1. 启动时检查数据库连通性;
34 | 2. 新增免费代理源 **米扑代理**;
35 | 3. 新增免费代理源 **Pzzqz**;
36 | 4. 新增免费代理源 **神鸡代理**;
37 | 5. 新增免费代理源 **极速代理**;
38 | 6. 新增免费代理源 **小幻代理**;
39 |
40 | 2.1.1 (2021-02-23)
41 | ------------------
42 |
43 | 1. Fix Bug `#493`_, 新增时区配置; (2020-08-12)
44 | 2. 修复 **66代理** 采集; (2020-11-04)
45 | 3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04)
46 | 4. 新增 **代理盒子** 免费源; (2020-11-04)
47 | 5. 新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23)
48 |
49 | .. _#493: https://github.com/jhao104/proxy_pool/issues/493
50 |
51 | 2.1.0 (2020.07)
52 | ------------------
53 |
54 | 1. 新增免费代理源 **西拉代理** (2020-03-30)
55 | 2. Fix Bug `#356`_ `#401`_
56 | 3. 优化Docker镜像体积; (2020-06-19)
57 | 4. 优化配置方式;
58 | 5. 优化代码结构;
59 | 6. 不再储存raw_proxy, 抓取后直接验证入库;
60 |
61 | .. _#401: https://github.com/jhao104/proxy_pool/issues/401
62 | .. _#356: https://github.com/jhao104/proxy_pool/issues/356
63 |
64 | 2.0.1 (2019.10)
65 | -----------------
66 |
67 | 1. Added free proxy source **89免费代理**;
68 | #. Added free proxy source **齐云代理**;
69 |
70 | 2.0.0 (2019.08)
71 | ------------------
72 |
73 | 1. WebApi can be started through Gunicorn; not yet supported on Windows;
74 | #. Improved the Proxy scheduler;
75 | #. Extended the Proxy attributes;
76 | #. Added a CLI tool to make starting proxyPool easier;
77 |
78 | 1.14 (2019.07)
79 | -----------------
80 |
81 | 1. Fixed the ``ProxyValidSchedule`` hang caused by Queue blocking;
82 | #. Updated fetching for proxy source **云代理**;
83 | #. Updated fetching for proxy source **码农代理**;
84 | #. Updated fetching for proxy source **代理66**, using the ``PyExecJS`` module to handle the 加速乐 dynamic cookie encryption;
85 |
86 | 1.13 (2019.02)
87 | -----------------
88 |
89 | 1. Replaced the .ini configuration file with a .py file;
90 |
91 | #. Improved the proxy fetching module;
92 |
93 | 1.12 (2018.04)
94 | -----------------
95 |
96 | 1. Improved proxy format checking;
97 |
98 | #. Added proxy sources;
99 |
100 | #. Fix bug `#122`_ `#126`_
101 |
102 | .. _#122: https://github.com/jhao104/proxy_pool/issues/122
103 | .. _#126: https://github.com/jhao104/proxy_pool/issues/126
104 |
105 | 1.11 (2017.08)
106 | -----------------
107 |
108 | 1. Validate useful_pool with multiple threads;
109 |
110 | 1.10 (2016.11)
111 | -----------------
112 |
113 | 1. First release;
114 |
115 | #. PY2/PY3 support;
116 |
117 | #. Basic proxy pool functionality;
118 |
--------------------------------------------------------------------------------
/proxy_pool/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 | import sphinx_rtd_theme
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'ProxyPool'
21 | copyright = '2020, jhao104'
22 | author = 'jhao104'
23 |
24 | master_doc = 'index'
25 |
26 | # The full version, including alpha/beta/rc tags
27 | release = '2.1.0'
28 |
29 | # -- General configuration ---------------------------------------------------
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
33 | # ones.
34 | extensions = [
35 | ]
36 |
37 | # If true, sectionauthor and moduleauthor directives will be shown in the
38 | # output. They are ignored by default.
39 | show_authors = False
40 |
41 | # The name of the Pygments (syntax highlighting) style to use.
42 | pygments_style = "sphinx"
43 |
44 | # Add any paths that contain templates here, relative to this directory.
45 | templates_path = ['_templates']
46 |
47 | # The language for content autogenerated by Sphinx. Refer to documentation
48 | # for a list of supported languages.
49 | #
50 | # This is also used if you do content translation via gettext catalogs.
51 | # Usually you set "language" from the command line for these cases.
52 | language = 'zh_CN'
53 |
54 | # List of patterns, relative to source directory, that match files and
55 | # directories to ignore when looking for source files.
56 | # This pattern also affects html_static_path and html_extra_path.
57 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
58 |
59 | # -- Options for HTML output -------------------------------------------------
60 |
61 | # The theme to use for HTML and HTML Help pages. See the documentation for
62 | # a list of builtin themes.
63 | #
64 | html_theme = 'sphinx_rtd_theme'
65 |
66 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
67 |
68 | # Add any paths that contain custom static files (such as style sheets) here,
69 | # relative to this directory. They are copied after the builtin static files,
70 | # so a file named "default.css" will overwrite the builtin "default.css".
71 | html_static_path = ['_static']
72 |
--------------------------------------------------------------------------------
/proxy_pool/docs/dev/ext_fetcher.rst:
--------------------------------------------------------------------------------
1 | .. ext_fetcher
2 |
3 | Extending proxy sources
4 | ------------------------
5 |
6 | The project ships with a few free proxy sources by default, but the quality of free sources is limited, so the proxies collected out of the box may not be ideal. For this reason the project lets you plug in your own proxy sources.
7 |
8 | To add a new proxy fetching method, proceed as follows:
9 |
10 | 1. First add a custom static method to the `ProxyFetcher`_ class. The method must be a generator (yield) that returns proxy strings in ``host:port`` format, for example:
11 |
12 | .. code-block:: python
13 |
14 |     class ProxyFetcher(object):
15 |         # ....
16 |         # custom proxy source
17 |         @staticmethod
18 |         def freeProxyCustom01():  # any name that does not clash with an existing method
19 |             # fetch proxies from a website, an API or a database
20 |             # assume you have already obtained a list of proxies
21 |             proxies = ["x.x.x.x:3128", "x.x.x.x:80"]
22 |             for proxy in proxies:
23 |                 yield proxy
24 |             # make sure every proxy is yielded in the correct host:port format
25 |
26 | 2. After adding the method, edit the ``PROXY_FETCHER`` option in the configuration file `setting.py`_ and append the name of the new method:
27 |
28 | .. code-block:: python
29 |
30 |     PROXY_FETCHER = [
31 |         # ....
32 |         "freeProxyCustom01"  # must match the name of the method you just added
33 |     ]
34 |
35 | .. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20
36 | .. _setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47
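A minimal, self-contained sketch of such a fetcher, assuming a hypothetical plain-text source that serves one ``host:port`` entry per line (the URL below is illustrative, not part of the project):

.. code-block:: python

    import re
    import requests  # the project itself wraps this in util.webRequest

    PROXY_RE = re.compile(r"^\d{1,3}(?:\.\d{1,3}){3}:\d{1,5}$")

    class ProxyFetcher(object):
        @staticmethod
        def freeProxyCustom02():
            # hypothetical endpoint returning one "host:port" per line
            resp = requests.get("https://example.com/proxies.txt", timeout=10)
            for line in resp.text.splitlines():
                line = line.strip()
                if PROXY_RE.match(line):  # keep only well-formed host:port entries
                    yield line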
--------------------------------------------------------------------------------
/proxy_pool/docs/dev/ext_validator.rst:
--------------------------------------------------------------------------------
1 | .. ext_validator
2 |
3 | Proxy validation
4 | -----------------
5 |
6 | Built-in validators
7 | >>>>>>>>>>>>>>>>>>>>
8 |
9 | All proxy validation methods used by the project are defined in `validator.py`_ and are categorised by the decorators provided
10 | by the `ProxyValidator`_ class. A validator returns ``True`` if the check passes and ``False`` if it fails.
11 |
12 | * Validators fall into three groups: ``preValidator``, ``httpValidator`` and ``httpsValidator``:
13 |
14 | * **preValidator**: pre-check, run after a proxy is fetched and before any network validation; currently `formatValidator`_ checks that the proxy string is a well-formed IP:port;
15 | * **httpValidator**: availability check; a proxy that passes is considered usable; currently implemented by `httpTimeOutValidator`_;
16 | * **httpsValidator**: checks whether the proxy supports https; currently implemented by `httpsTimeOutValidator`_.
17 |
18 |
19 | .. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py
20 | .. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29
21 | .. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51
22 | .. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58
23 | .. _httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71
24 |
25 | Each group may contain several methods, and a group counts as passed only when **all** of its methods return ``True``. The groups run in order: **httpValidator** first, and only if it passes does **httpsValidator** run.
26 | Only proxies that pass the `preValidator` checks enter the availability check; a proxy that passes `httpValidator` is considered usable and is queued for insertion into the pool, and a proxy that additionally passes `httpsValidator` is considered to support https, so its `https` attribute is set to `True`.
27 |
28 | Custom validators
29 | >>>>>>>>>>>>>>>>>>
30 |
31 | `validator.py`_ already contains examples of custom validators. A custom function must return True or False, and the decorators provided by `ProxyValidator`_ determine which group it belongs to. Two examples:
32 |
33 | * 1. A custom availability check (``addHttpValidator``):
34 |
35 | .. code-block:: python
36 |
37 |     @ProxyValidator.addHttpValidator
38 |     def customValidatorExample01(proxy):
39 |         """custom proxy availability validator"""
40 |         proxies = {"http": "http://{proxy}".format(proxy=proxy)}
41 |         try:
42 |             r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5)
43 |             return True if r.status_code == 200 and len(r.content) > 200 else False
44 |         except Exception as e:
45 |             return False
46 |
47 | * 2. A custom check for whether a proxy supports https (``addHttpsValidator``):
48 |
49 | .. code-block:: python
50 |
51 |     @ProxyValidator.addHttpsValidator
52 |     def customValidatorExample02(proxy):
53 |         """custom https-support validator"""
54 |         proxies = {"https": "https://{proxy}".format(proxy=proxy)}
55 |         try:
56 |             r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False)
57 |             return True if r.status_code == 200 and len(r.content) > 200 else False
58 |         except Exception as e:
59 |             return False
60 |
61 | Note that when the availability check runs, every function decorated with ``ProxyValidator.addHttpValidator`` is executed in the order it was defined, and the proxy is judged usable only if all of them return True. The functions registered with ``addHttpsValidator`` work the same way.
62 |
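The same registration pattern applies to pre-checks. A minimal sketch of a custom ``addPreValidator`` rule, here (an illustrative assumption, not project code) rejecting private and loopback addresses before any network check is made:

.. code-block:: python

    from ipaddress import ip_address

    @ProxyValidator.addPreValidator
    def rejectPrivateNetworks(proxy):
        """illustrative pre-validator: drop private / loopback proxies"""
        try:
            addr = ip_address(proxy.split(":")[0])
            return not (addr.is_private or addr.is_loopback)
        except ValueError:
            return False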
--------------------------------------------------------------------------------
/proxy_pool/docs/dev/index.rst:
--------------------------------------------------------------------------------
1 | ===============
2 | Developer Guide
3 | ===============
4 |
5 | .. module:: dev
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | ext_fetcher
11 | ext_validator
12 |
--------------------------------------------------------------------------------
/proxy_pool/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. ProxyPool documentation master file, created by
2 | sphinx-quickstart on Wed Jul 8 16:13:42 2020.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | ProxyPool
7 | =====================================
8 |
9 | ::
10 |
11 | ****************************************************************
12 | *** ______ ********************* ______ *********** _ ********
13 | *** | ___ \_ ******************** | ___ \ ********* | | ********
14 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ********
15 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ********
16 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ ****
17 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ ****
18 | **** __ / / *****
19 | ************************* /___ / *******************************
20 | ************************* ********************************
21 | ****************************************************************
22 |
23 | A proxy IP pool for Python crawlers
24 |
25 | Installation
26 | -------------
27 |
28 | * Download the code
29 |
30 | .. code-block:: console
31 |
32 |     $ git clone git@github.com:jhao104/proxy_pool.git
33 |
34 | * Install the dependencies
35 |
36 | .. code-block:: console
37 |
38 |     $ pip install -r requirements.txt
39 |
40 | * Update the configuration
41 |
42 | .. code-block:: python
43 |
44 |     HOST = "0.0.0.0"
45 |     PORT = 5000
46 |
47 |     DB_CONN = 'redis://@127.0.0.1:8888'
48 |
49 |     PROXY_FETCHER = [
50 |         "freeProxy01",
51 |         "freeProxy02",
52 |         # ....
53 |     ]
54 |
55 | * Start the project
56 |
57 | .. code-block:: console
58 |
59 |     $ python proxyPool.py schedule
60 |     $ python proxyPool.py server
61 |
62 | Usage
63 | ------
64 |
65 | * API
66 |
67 | ========== ======== ============================= ======================================================
68 | Api        Method   Description                   Params
69 | ========== ======== ============================= ======================================================
70 | /          GET      API description               none
71 | /get       GET      return a random proxy         optional: `?type=https` filters https-capable proxies
72 | /pop       GET      return and delete a proxy     optional: `?type=https` filters https-capable proxies
73 | /all       GET      return all proxies            optional: `?type=https` filters https-capable proxies
74 | /count     GET      return the number of proxies  none
75 | /delete    GET      delete the given proxy        `?proxy=host:port`
76 | ========== ======== ============================= ======================================================
77 |
78 |
79 | * Crawler example
80 |
81 | .. code-block:: python
82 |
83 |     import requests
84 |
85 |     def get_proxy():
86 |         return requests.get("http://127.0.0.1:5010/get?type=https").json()
87 |
88 |     def delete_proxy(proxy):
89 |         requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
90 |
91 |     # your spider code
92 |
93 |     def getHtml():
94 |         # ....
95 |         retry_count = 5
96 |         proxy = get_proxy().get("proxy")
97 |         while retry_count > 0:
98 |             try:
99 |                 html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)})
100 |                 # fetch through the proxy
101 |                 return html
102 |             except Exception:
103 |                 retry_count -= 1
104 |                 # remove the broken proxy from the pool
105 |                 delete_proxy(proxy)
106 |         return None
107 |
108 | Contents
109 | --------
110 |
111 | .. toctree::
112 | :maxdepth: 2
113 |
114 | user/index
115 | dev/index
116 | changelog
117 |
--------------------------------------------------------------------------------
/proxy_pool/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/proxy_pool/docs/user/how_to_config.rst:
--------------------------------------------------------------------------------
1 | .. how_to_config
2 |
3 | Configuration reference
4 | ------------------------
5 |
6 | The configuration file ``setting.py`` lives in the project root. The options fall into four groups: **server**, **database**, **fetcher** and **validation**.
7 |
8 | Server configuration
9 | >>>>>>>>>>>>>>>>>>>>>
10 |
11 | * ``HOST``
12 |
13 | The IP the API server listens on: use ``127.0.0.1`` for local access only, or ``0.0.0.0`` to allow remote access.
14 |
15 | * ``PORT``
16 |
17 | The port the API server listens on.
18 |
19 | Database configuration
20 | >>>>>>>>>>>>>>>>>>>>>>>
21 |
22 | * ``DB_CONN``
23 |
24 | The URI of the database used to store proxy IPs, in the form ``db_type://[[user]:[pwd]]@ip:port/[db]``.
25 |
26 | Currently supported db_type values: ``ssdb`` and ``redis``.
27 |
28 | Examples:
29 |
30 | .. code-block:: python
31 |
32 |     # SSDB IP: 127.0.0.1 Port: 8888
33 |     DB_CONN = 'ssdb://@127.0.0.1:8888'
34 |     # SSDB IP: 127.0.0.1 Port: 8888 Password: 123456
35 |     DB_CONN = 'ssdb://:123456@127.0.0.1:8888'
36 |
37 |     # Redis IP: 127.0.0.1 Port: 6379
38 |     DB_CONN = 'redis://@127.0.0.1:6379'
39 |     # Redis IP: 127.0.0.1 Port: 6379 Password: 123456
40 |     DB_CONN = 'redis://:123456@127.0.0.1:6379'
41 |     # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15
42 |     DB_CONN = 'redis://:123456@127.0.0.1:6379/15'
43 |
44 |
45 | * ``TABLE_NAME``
46 |
47 | The name of the container that stores the proxies; for both ssdb and redis the storage structure is a hash.
48 |
49 | Fetcher configuration
50 | >>>>>>>>>>>>>>>>>>>>>>
51 |
52 | * ``PROXY_FETCHER``
53 |
54 | The names of the enabled proxy fetching methods; the methods live in the ``ProxyFetcher`` class in ``fetcher/proxyFetcher.py``.
55 |
56 | Since the stability of the individual proxy sources is hard to guarantee, you can comment a method's name out of this list when that source stops working.
57 |
58 | If you add new fetching methods, add their names to this list as well; see :doc:`/dev/ext_fetcher` for details.
59 |
60 | The scheduler reloads this setting every time it runs a fetch job, so the set of fetching methods is always up to date.
61 |
62 | Validation configuration
63 | >>>>>>>>>>>>>>>>>>>>>>>>>
64 |
65 | * ``HTTP_URL``
66 |
67 | The address used to check whether a proxy works; defaults to ``http://httpbin.org`` and can be changed to suit your use case.
68 |
69 | * ``HTTPS_URL``
70 |
71 | The address used to check whether a proxy supports HTTPS; defaults to ``https://www.qq.com`` and can be changed to suit your use case.
72 |
73 | * ``VERIFY_TIMEOUT``
74 |
75 | Timeout for proxy validation, in seconds; defaults to ``10``. A proxy is considered unusable if fetching ``HTTP(S)_URL`` through it takes longer than ``VERIFY_TIMEOUT``.
76 |
77 | * ``MAX_FAIL_COUNT``
78 |
79 | The maximum number of validation failures allowed; defaults to ``0``, i.e. a proxy is deleted after a single failure.
80 |
81 | * ``POOL_SIZE_MIN``
82 |
83 | If the number of proxies is below `POOL_SIZE_MIN` when the scheduled validation job runs, the fetcher is run first.
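As an illustration of the ``DB_CONN`` URI layout (not project code), its pieces can be pulled apart with the standard library:

.. code-block:: python

    from urllib.parse import urlparse

    parsed = urlparse("redis://:123456@127.0.0.1:6379/15")
    print(parsed.scheme)                   # db_type -> "redis"
    print(parsed.password)                 # pwd     -> "123456"
    print(parsed.hostname, parsed.port)    # ip/port -> "127.0.0.1" 6379
    print(parsed.path.lstrip("/"))         # db      -> "15"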
--------------------------------------------------------------------------------
/proxy_pool/docs/user/how_to_run.rst:
--------------------------------------------------------------------------------
1 | .. how_to_run
2 |
3 |
4 | How to run
5 | -----------
6 |
7 | Download the code
8 | >>>>>>>>>>>>>>>>>>
9 |
10 | The project has to be downloaded and run locally. Fetch it with ``git``:
11 |
12 | .. code-block:: console
13 |
14 |     $ git clone git@github.com:jhao104/proxy_pool.git
15 |
16 | Or download a specific ``release`` version:
17 |
18 | .. code-block:: console
19 |
20 |     https://github.com/jhao104/proxy_pool/releases
21 |
22 | Install dependencies
23 | >>>>>>>>>>>>>>>>>>>>>
24 |
25 | In the project directory, install the required libraries with ``pip``:
26 |
27 | .. code-block:: console
28 |
29 |     $ pip install -r requirements.txt
30 |
31 |
32 | Update the configuration
33 | >>>>>>>>>>>>>>>>>>>>>>>>>
34 |
35 | The configuration file ``setting.py`` lives in the project root:
36 |
37 | .. code-block:: python
38 |
39 |     # API server configuration
40 |
41 |     HOST = "0.0.0.0"  # IP
42 |     PORT = 5000       # listening port
43 |
44 |     # database configuration
45 |
46 |     DB_CONN = 'redis://@127.0.0.1:8888/0'
47 |
48 |     # ProxyFetcher configuration
49 |
50 |     PROXY_FETCHER = [
51 |         "freeProxy01",  # the enabled proxy fetching methods; all fetch methods live in fetcher/proxyFetcher.py
52 |         "freeProxy02",
53 |         # ....
54 |     ]
55 |
56 | See :doc:`/user/how_to_config` for the full list of options.
57 |
58 | Start the project
59 | >>>>>>>>>>>>>>>>>>
60 |
61 | Once the environment is configured and everything is in place, the project is started through ``proxyPool.py``, the project's CLI entry point.
62 | The full application has two parts: the ``schedule`` scheduler and the ``server`` API service. The scheduler fetches and validates proxies, while the API service exposes them over HTTP.
63 |
64 | Start the scheduler and the API service separately from the command line:
65 |
66 | .. code-block:: console
67 |
68 |     # start the scheduler
69 |     $ python proxyPool.py schedule
70 |
71 |     # start the web API service
72 |     $ python proxyPool.py server
73 |
74 |
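Once both processes are running, a quick sanity check is to query the counting endpoint (a sketch assuming the HOST/PORT values shown above):

.. code-block:: python

    import requests

    resp = requests.get("http://127.0.0.1:5000/count", timeout=5)
    print(resp.json())   # the proxy count grows as the scheduler collects proxies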
--------------------------------------------------------------------------------
/proxy_pool/docs/user/how_to_use.rst:
--------------------------------------------------------------------------------
1 | .. how_to_use
2 |
3 | How to use
4 | -----------
5 |
6 | There are currently two ways for crawler code to consume the proxy pool: through the HTTP API, or by reading the database directly.
7 |
8 | Calling the API
9 | >>>>>>>>>>>>>>>>
10 |
11 | After starting the ProxyPool ``server``, the following http endpoints are available:
12 |
13 | ============ ======== ============================= ===============
14 | Api          Method   Description                   Arg
15 | ============ ======== ============================= ===============
16 | /            GET      API description               none
17 | /get         GET      return a random proxy         none
18 | /get_all     GET      return all proxies            none
19 | /get_status  GET      return the number of proxies  none
20 | /delete      GET      delete the given proxy        proxy=host:port
21 | ============ ======== ============================= ===============
22 |
23 | In your code you can wrap these endpoints to use proxies, for example:
24 |
25 | .. code-block:: python
26 |
27 |     import requests
28 |
29 |     def get_proxy():
30 |         return requests.get("http://127.0.0.1:5010/get/").json()
31 |
32 |     def delete_proxy(proxy):
33 |         requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy))
34 |
35 |     # your spider code
36 |
37 |     def getHtml():
38 |         # ....
39 |         retry_count = 5
40 |         proxy = get_proxy().get("proxy")
41 |         while retry_count > 0:
42 |             try:
43 |                 # fetch through the proxy
44 |                 html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)})
45 |                 return html
46 |             except Exception:
47 |                 retry_count -= 1
48 |                 # remove the broken proxy from the pool
49 |                 delete_proxy(proxy)
50 |         return None
51 |
52 | In this example the ``server`` runs locally on ``127.0.0.1`` port ``5010``; the ``/get`` endpoint is used to obtain a proxy and ``/delete`` to remove one.
53 |
54 | Reading the database
55 | >>>>>>>>>>>>>>>>>>>>>
56 |
57 | Two database backends are currently supported: ``REDIS`` and ``SSDB``.
58 |
59 | * **REDIS** stores the pool in a ``hash`` whose name is the **TABLE_NAME** configuration option
60 |
61 | * **SSDB** stores the pool in a ``hash`` whose name is the **TABLE_NAME** configuration option
62 |
63 | You can read it directly in your own code, as sketched below.
64 |
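A minimal sketch of reading the pool straight from Redis (the connection values and the hash name are assumptions; use whatever ``DB_CONN`` and ``TABLE_NAME`` are set to in ``setting.py``). Each hash value is the JSON document produced by ``Proxy.to_json``:

.. code-block:: python

    import json
    import redis

    r = redis.Redis(host="127.0.0.1", port=6379, db=0, decode_responses=True)
    pool = r.hgetall("use_proxy")   # hash name == TABLE_NAME

    for value in pool.values():
        item = json.loads(value)
        print(item["proxy"], item.get("https"), item.get("region"))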
--------------------------------------------------------------------------------
/proxy_pool/docs/user/index.rst:
--------------------------------------------------------------------------------
1 | ==========
2 | User Guide
3 | ==========
4 |
5 | .. module:: user
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | how_to_run
11 | how_to_use
12 | how_to_config
13 |
--------------------------------------------------------------------------------
/proxy_pool/fetcher/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__.py
5 | Description :
6 | Author : JHao
7 | date: 2016/11/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/11/25:
11 | -------------------------------------------------
12 | """
--------------------------------------------------------------------------------
/proxy_pool/fetcher/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/fetcher/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/fetcher/__pycache__/proxyFetcher.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/fetcher/__pycache__/proxyFetcher.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/fetcher/proxyFetcher.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: proxyFetcher
5 | Description :
6 | Author : JHao
7 | date: 2016/11/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/11/25: proxyFetcher
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import re
16 | import json
17 | from time import sleep
18 |
19 | from util.webRequest import WebRequest
20 |
21 |
22 | class ProxyFetcher(object):
23 | """
24 | proxy getter
25 | """
26 |
27 | @staticmethod
28 | def freeProxy01():
29 | """
30 | 站大爷 https://www.zdaye.com/dayProxy.html
31 | """
32 | start_url = "https://www.zdaye.com/dayProxy.html"
33 | html_tree = WebRequest().get(start_url).tree
34 | latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip()
35 | from datetime import datetime
36 | interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S")
37 |         if interval.seconds < 300:  # only fetch pages updated within the last 5 minutes
38 | target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip()
39 | while target_url:
40 | _tree = WebRequest().get(target_url).tree
41 | for tr in _tree.xpath("//table//tr"):
42 | ip = "".join(tr.xpath("./td[1]/text()")).strip()
43 | port = "".join(tr.xpath("./td[2]/text()")).strip()
44 | yield "%s:%s" % (ip, port)
45 | next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href")
46 | target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False
47 | sleep(5)
48 |
49 | @staticmethod
50 | def freeProxy02():
51 | """
52 | 代理66 http://www.66ip.cn/
53 | """
54 | url = "http://www.66ip.cn/"
55 | resp = WebRequest().get(url, timeout=10).tree
56 | for i, tr in enumerate(resp.xpath("(//table)[3]//tr")):
57 | if i > 0:
58 | ip = "".join(tr.xpath("./td[1]/text()")).strip()
59 | port = "".join(tr.xpath("./td[2]/text()")).strip()
60 | yield "%s:%s" % (ip, port)
61 |
62 | @staticmethod
63 | def freeProxy03():
64 | """ 开心代理 """
65 | target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"]
66 | for url in target_urls:
67 | tree = WebRequest().get(url).tree
68 | for tr in tree.xpath("//table[@class='active']//tr")[1:]:
69 | ip = "".join(tr.xpath('./td[1]/text()')).strip()
70 | port = "".join(tr.xpath('./td[2]/text()')).strip()
71 | yield "%s:%s" % (ip, port)
72 |
73 | @staticmethod
74 | def freeProxy04():
75 | """ FreeProxyList https://www.freeproxylists.net/zh/ """
76 | url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
77 | tree = WebRequest().get(url, verify=False).tree
78 | from urllib import parse
79 |
80 | def parse_ip(input_str):
81 | html_str = parse.unquote(input_str)
82 | ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str)
83 | return ips[0] if ips else None
84 |
85 | for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"):
86 | ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip())
87 | port = "".join(tr.xpath('./td[2]/text()')).strip()
88 | if ip:
89 | yield "%s:%s" % (ip, port)
90 |
91 | @staticmethod
92 | def freeProxy05(page_count=1):
93 | """ 快代理 https://www.kuaidaili.com """
94 | url_pattern = [
95 | 'https://www.kuaidaili.com/free/inha/{}/',
96 | 'https://www.kuaidaili.com/free/intr/{}/'
97 | ]
98 | url_list = []
99 | for page_index in range(1, page_count + 1):
100 | for pattern in url_pattern:
101 | url_list.append(pattern.format(page_index))
102 |
103 | for url in url_list:
104 | tree = WebRequest().get(url).tree
105 | proxy_list = tree.xpath('.//table//tr')
106 |             sleep(1)  # sleep is required here, otherwise the second request returns no data
107 | for tr in proxy_list[1:]:
108 | yield ':'.join(tr.xpath('./td/text()')[0:2])
109 |
110 | @staticmethod
111 | def freeProxy06():
112 | """ FateZero http://proxylist.fatezero.org/ """
113 | url = "http://proxylist.fatezero.org/proxy.list"
114 | try:
115 | resp_text = WebRequest().get(url).text
116 | for each in resp_text.split("\n"):
117 | json_info = json.loads(each)
118 | if json_info.get("country") == "CN":
119 | yield "%s:%s" % (json_info.get("host", ""), json_info.get("port", ""))
120 | except Exception as e:
121 | print(e)
122 |
123 | @staticmethod
124 | def freeProxy07():
125 | """ 云代理 """
126 | urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"]
127 | for url in urls:
128 | r = WebRequest().get(url, timeout=10)
129 |             proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', r.text)
130 | for proxy in proxies:
131 | yield ":".join(proxy)
132 |
133 | @staticmethod
134 | def freeProxy08():
135 | """ 小幻代理 """
136 | urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html']
137 | for url in urls:
138 | r = WebRequest().get(url, timeout=10)
139 |             proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</a></td><td>(\d+)</td>', r.text)
140 | for proxy in proxies:
141 | yield ":".join(proxy)
142 |
143 | @staticmethod
144 | def freeProxy09(page_count=1):
145 | """ 免费代理库 """
146 | for i in range(1, page_count + 1):
147 | url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i)
148 | html_tree = WebRequest().get(url).tree
149 | for index, tr in enumerate(html_tree.xpath("//table//tr")):
150 | if index == 0:
151 | continue
152 | yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
153 |
154 | @staticmethod
155 | def freeProxy10():
156 | """ 89免费代理 """
157 | r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10)
158 | proxies = re.findall(
159 |             r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>',
160 | r.text)
161 | for proxy in proxies:
162 | yield ':'.join(proxy)
163 |
164 | # @staticmethod
165 | # def wallProxy01():
166 | # """
167 | # PzzQz https://pzzqz.com/
168 | # """
169 | # from requests import Session
170 | # from lxml import etree
171 | # session = Session()
172 | # try:
173 | # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text
174 | # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp)
175 | # if x_csrf_token:
176 | # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""}
177 | # proxy_resp = session.post("https://pzzqz.com/", verify=False,
178 | # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json()
179 | # tree = etree.HTML(proxy_resp["proxy_html"])
180 | # for tr in tree.xpath("//tr"):
181 | # ip = "".join(tr.xpath("./td[1]/text()"))
182 | # port = "".join(tr.xpath("./td[2]/text()"))
183 | # yield "%s:%s" % (ip, port)
184 | # except Exception as e:
185 | # print(e)
186 |
187 | # @staticmethod
188 | # def freeProxy10():
189 | # """
190 | # 墙外网站 cn-proxy
191 | # :return:
192 | # """
193 | # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
194 | # request = WebRequest()
195 | # for url in urls:
196 | # r = request.get(url, timeout=10)
197 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\w\W](\d+) | ', r.text)
198 | # for proxy in proxies:
199 | # yield ':'.join(proxy)
200 |
201 | # @staticmethod
202 | # def freeProxy11():
203 | # """
204 | # https://proxy-list.org/english/index.php
205 | # :return:
206 | # """
207 | # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
208 | # request = WebRequest()
209 | # import base64
210 | # for url in urls:
211 | # r = request.get(url, timeout=10)
212 | # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text)
213 | # for proxy in proxies:
214 | # yield base64.b64decode(proxy).decode()
215 |
216 | # @staticmethod
217 | # def freeProxy12():
218 | # urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']
219 | # request = WebRequest()
220 | # for url in urls:
221 | # r = request.get(url, timeout=10)
222 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) | [\s\S]*?(\d+) | ', r.text)
223 | # for proxy in proxies:
224 | # yield ':'.join(proxy)
225 |
226 |
227 | if __name__ == '__main__':
228 | p = ProxyFetcher()
229 | for _ in p.freeProxy06():
230 | print(_)
231 |
232 | # http://nntime.com/proxy-list-01.htm
233 |
--------------------------------------------------------------------------------
/proxy_pool/handler/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__.py
5 | Description :
6 | Author : JHao
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | # from handler.ProxyManager import ProxyManager
16 |
--------------------------------------------------------------------------------
/proxy_pool/handler/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/handler/__pycache__/configHandler.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/configHandler.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/handler/__pycache__/logHandler.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/logHandler.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/handler/__pycache__/proxyHandler.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/proxyHandler.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/handler/configHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: configHandler
5 | Description :
6 | Author : JHao
7 | date: 2020/6/22
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/22:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import os
16 | import setting
17 | from util.singleton import Singleton
18 | from util.lazyProperty import LazyProperty
19 | from util.six import reload_six, withMetaclass
20 |
21 |
22 | class ConfigHandler(withMetaclass(Singleton)):
23 |
24 | def __init__(self):
25 | pass
26 |
27 | @LazyProperty
28 | def serverHost(self):
29 | return os.environ.get("HOST", setting.HOST)
30 |
31 | @LazyProperty
32 | def serverPort(self):
33 | return os.environ.get("PORT", setting.PORT)
34 |
35 | @LazyProperty
36 | def dbConn(self):
37 | return os.getenv("DB_CONN", setting.DB_CONN)
38 |
39 | @LazyProperty
40 | def tableName(self):
41 | return os.getenv("TABLE_NAME", setting.TABLE_NAME)
42 |
43 | @property
44 | def fetchers(self):
45 | reload_six(setting)
46 | return setting.PROXY_FETCHER
47 |
48 | @LazyProperty
49 | def httpUrl(self):
50 | return os.getenv("HTTP_URL", setting.HTTP_URL)
51 |
52 | @LazyProperty
53 | def httpsUrl(self):
54 | return os.getenv("HTTPS_URL", setting.HTTPS_URL)
55 |
56 | @LazyProperty
57 | def verifyTimeout(self):
58 | return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT))
59 |
60 | # @LazyProperty
61 | # def proxyCheckCount(self):
62 | # return int(os.getenv("PROXY_CHECK_COUNT", setting.PROXY_CHECK_COUNT))
63 |
64 | @LazyProperty
65 | def maxFailCount(self):
66 | return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT))
67 |
68 | # @LazyProperty
69 | # def maxFailRate(self):
70 | # return int(os.getenv("MAX_FAIL_RATE", setting.MAX_FAIL_RATE))
71 |
72 | @LazyProperty
73 | def poolSizeMin(self):
74 | return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN))
75 |
76 | @LazyProperty
77 | def proxyRegion(self):
78 | return bool(os.getenv("PROXY_REGION", setting.PROXY_REGION))
79 |
80 | @LazyProperty
81 | def timezone(self):
82 | return os.getenv("TIMEZONE", setting.TIMEZONE)
83 |
84 |
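A short usage sketch (illustrative, not part of the file): ConfigHandler is a Singleton and most attributes are LazyProperty values, so each option is resolved once, with environment variables taking precedence over setting.py:

    import os
    from handler.configHandler import ConfigHandler

    os.environ["PORT"] = "5020"                # env var overrides setting.PORT
    conf = ConfigHandler()
    print(conf.serverHost, conf.serverPort)    # -> setting.HOST and "5020"
    assert ConfigHandler() is conf             # Singleton: the same instance everywhere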
--------------------------------------------------------------------------------
/proxy_pool/handler/logHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: LogHandler.py
5 | Description : logging module
6 | Author : JHao
7 | date: 2017/3/6
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/03/06: log handler
11 | 2017/09/21: console / file output made optional (both enabled by default)
12 | 2020/07/13: TimedRotatingFileHandler is not thread-safe on Windows, so it is no longer used there
13 | -------------------------------------------------
14 | """
15 | __author__ = 'JHao'
16 |
17 | import os
18 | import logging
19 | import platform
20 |
21 | from logging.handlers import TimedRotatingFileHandler
22 |
23 | # log levels
24 | CRITICAL = 50
25 | FATAL = CRITICAL
26 | ERROR = 40
27 | WARNING = 30
28 | WARN = WARNING
29 | INFO = 20
30 | DEBUG = 10
31 | NOTSET = 0
32 |
33 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__))
34 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir)
35 | LOG_PATH = os.path.join(ROOT_PATH, 'log')
36 |
37 | if not os.path.exists(LOG_PATH):
38 | try:
39 | os.mkdir(LOG_PATH)
40 | except FileExistsError:
41 | pass
42 |
43 |
44 | class LogHandler(logging.Logger):
45 | """
46 | LogHandler
47 | """
48 |
49 | def __init__(self, name, level=DEBUG, stream=True, file=True):
50 | self.name = name
51 | self.level = level
52 | logging.Logger.__init__(self, self.name, level=level)
53 | if stream:
54 | self.__setStreamHandler__()
55 | if file:
56 | if platform.system() != "Windows":
57 | self.__setFileHandler__()
58 |
59 | def __setFileHandler__(self, level=None):
60 | """
61 | set file handler
62 | :param level:
63 | :return:
64 | """
65 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name))
66 |         # log rotation: files are kept in the log directory, one file per day, 15 days retained
67 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15)
68 | file_handler.suffix = '%Y%m%d.log'
69 | if not level:
70 | file_handler.setLevel(self.level)
71 | else:
72 | file_handler.setLevel(level)
73 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
74 |
75 | file_handler.setFormatter(formatter)
76 | self.file_handler = file_handler
77 | self.addHandler(file_handler)
78 |
79 | def __setStreamHandler__(self, level=None):
80 | """
81 | set stream handler
82 | :param level:
83 | :return:
84 | """
85 | stream_handler = logging.StreamHandler()
86 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s')
87 | stream_handler.setFormatter(formatter)
88 | if not level:
89 | stream_handler.setLevel(self.level)
90 | else:
91 | stream_handler.setLevel(level)
92 | self.addHandler(stream_handler)
93 |
94 |
95 | if __name__ == '__main__':
96 | log = LogHandler('test')
97 | log.info('this is a test msg')
98 |
--------------------------------------------------------------------------------
/proxy_pool/handler/proxyHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: ProxyHandler.py
5 | Description :
6 | Author : JHao
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/03:
11 | 2020/05/26: distinguish http and https
12 | -------------------------------------------------
13 | """
14 | __author__ = 'JHao'
15 |
16 | from helper.proxy import Proxy
17 | from db.dbClient import DbClient
18 | from handler.configHandler import ConfigHandler
19 |
20 |
21 | class ProxyHandler(object):
22 | """ Proxy CRUD operator"""
23 |
24 | def __init__(self):
25 | self.conf = ConfigHandler()
26 | self.db = DbClient(self.conf.dbConn)
27 | self.db.changeTable(self.conf.tableName)
28 |
29 | def get(self, https=False):
30 | """
31 | return a proxy
32 | Args:
33 | https: True/False
34 | Returns:
35 | """
36 | proxy = self.db.get(https)
37 | return Proxy.createFromJson(proxy) if proxy else None
38 |
39 | def pop(self, https):
40 | """
41 | return and delete a useful proxy
42 | :return:
43 | """
44 | proxy = self.db.pop(https)
45 | if proxy:
46 | return Proxy.createFromJson(proxy)
47 | return None
48 |
49 | def put(self, proxy):
50 | """
51 | put proxy into use proxy
52 | :return:
53 | """
54 | self.db.put(proxy)
55 |
56 | def delete(self, proxy):
57 | """
58 | delete useful proxy
59 | :param proxy:
60 | :return:
61 | """
62 | return self.db.delete(proxy.proxy)
63 |
64 | def getAll(self, https=False):
65 | """
66 | get all proxy from pool as Proxy list
67 | :return:
68 | """
69 | proxies = self.db.getAll(https)
70 | return [Proxy.createFromJson(_) for _ in proxies]
71 |
72 | def exists(self, proxy):
73 | """
74 | check proxy exists
75 | :param proxy:
76 | :return:
77 | """
78 | return self.db.exists(proxy.proxy)
79 |
80 | def getCount(self):
81 | """
82 | return raw_proxy and use_proxy count
83 | :return:
84 | """
85 | total_use_proxy = self.db.getCount()
86 | return {'count': total_use_proxy}
87 |
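An illustrative usage sketch (assumes a reachable database configured via DB_CONN):

    from handler.proxyHandler import ProxyHandler

    handler = ProxyHandler()
    print(handler.getCount())            # e.g. {'count': ...}
    proxy = handler.get(https=True)      # an https-capable proxy, or None if the pool has none
    if proxy:
        print(proxy.proxy, proxy.region)
        handler.delete(proxy)            # remove it from the pool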
--------------------------------------------------------------------------------
/proxy_pool/helper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__init__.py
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/check.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/check.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/fetch.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/fetch.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/launcher.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/launcher.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/proxy.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/proxy.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/scheduler.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/scheduler.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/__pycache__/validator.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/validator.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/helper/check.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: check
5 | Description : run proxy validation
6 | Author : JHao
7 | date: 2019/8/6
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/08/06: run proxy validation
11 | 2021/05/25: validate http and https separately
12 | 2022/08/16: fetch the proxy's region information
13 | -------------------------------------------------
14 | """
15 | __author__ = 'JHao'
16 |
17 | from util.six import Empty
18 | from threading import Thread
19 | from datetime import datetime
20 | from util.webRequest import WebRequest
21 | from handler.logHandler import LogHandler
22 | from helper.validator import ProxyValidator
23 | from handler.proxyHandler import ProxyHandler
24 | from handler.configHandler import ConfigHandler
25 |
26 |
27 | class DoValidator(object):
28 |     """ run validation """
29 |
30 | conf = ConfigHandler()
31 |
32 | @classmethod
33 | def validator(cls, proxy, work_type):
34 | """
35 |         validation entry point
36 | Args:
37 | proxy: Proxy Object
38 | work_type: raw/use
39 | Returns:
40 | Proxy Object
41 | """
42 | http_r = cls.httpValidator(proxy)
43 | https_r = False if not http_r else cls.httpsValidator(proxy)
44 |
45 | proxy.check_count += 1
46 | proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
47 | proxy.last_status = True if http_r else False
48 | if http_r:
49 | if proxy.fail_count > 0:
50 | proxy.fail_count -= 1
51 | proxy.https = True if https_r else False
52 | if work_type == "raw":
53 | proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else ""
54 | else:
55 | proxy.fail_count += 1
56 | return proxy
57 |
58 | @classmethod
59 | def httpValidator(cls, proxy):
60 | for func in ProxyValidator.http_validator:
61 | if not func(proxy.proxy):
62 | return False
63 | return True
64 |
65 | @classmethod
66 | def httpsValidator(cls, proxy):
67 | for func in ProxyValidator.https_validator:
68 | if not func(proxy.proxy):
69 | return False
70 | return True
71 |
72 | @classmethod
73 | def preValidator(cls, proxy):
74 | for func in ProxyValidator.pre_validator:
75 | if not func(proxy):
76 | return False
77 | return True
78 |
79 | @classmethod
80 | def regionGetter(cls, proxy):
81 | try:
82 | url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0]
83 | r = WebRequest().get(url=url, retry_time=1, timeout=2).json
84 | return r['data']['address']
85 | except:
86 | return 'error'
87 |
88 |
89 | class _ThreadChecker(Thread):
90 |     """ multi-threaded proxy checker """
91 |
92 | def __init__(self, work_type, target_queue, thread_name):
93 | Thread.__init__(self, name=thread_name)
94 | self.work_type = work_type
95 | self.log = LogHandler("checker")
96 | self.proxy_handler = ProxyHandler()
97 | self.target_queue = target_queue
98 | self.conf = ConfigHandler()
99 |
100 | def run(self):
101 | self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name))
102 | while True:
103 | try:
104 | proxy = self.target_queue.get(block=False)
105 | except Empty:
106 | self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name))
107 | break
108 | proxy = DoValidator.validator(proxy, self.work_type)
109 | if self.work_type == "raw":
110 | self.__ifRaw(proxy)
111 | else:
112 | self.__ifUse(proxy)
113 | self.target_queue.task_done()
114 |
115 | def __ifRaw(self, proxy):
116 | if proxy.last_status:
117 | if self.proxy_handler.exists(proxy):
118 | self.log.info('RawProxyCheck - {}: {} exist'.format(self.name, proxy.proxy.ljust(23)))
119 | else:
120 | self.log.info('RawProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23)))
121 | self.proxy_handler.put(proxy)
122 | else:
123 | self.log.info('RawProxyCheck - {}: {} fail'.format(self.name, proxy.proxy.ljust(23)))
124 |
125 | def __ifUse(self, proxy):
126 | if proxy.last_status:
127 | self.log.info('UseProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23)))
128 | self.proxy_handler.put(proxy)
129 | else:
130 | if proxy.fail_count > self.conf.maxFailCount:
131 | self.log.info('UseProxyCheck - {}: {} fail, count {} delete'.format(self.name,
132 | proxy.proxy.ljust(23),
133 | proxy.fail_count))
134 | self.proxy_handler.delete(proxy)
135 | else:
136 | self.log.info('UseProxyCheck - {}: {} fail, count {} keep'.format(self.name,
137 | proxy.proxy.ljust(23),
138 | proxy.fail_count))
139 | self.proxy_handler.put(proxy)
140 |
141 |
142 | def Checker(tp, queue):
143 | """
144 | run Proxy ThreadChecker
145 | :param tp: raw/use
146 | :param queue: Proxy Queue
147 | :return:
148 | """
149 | thread_list = list()
150 | for index in range(20):
151 | thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2)))
152 |
153 | for thread in thread_list:
154 | thread.setDaemon(True)
155 | thread.start()
156 |
157 | for thread in thread_list:
158 | thread.join()
159 |
--------------------------------------------------------------------------------
/proxy_pool/helper/fetch.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: fetchScheduler
5 | Description :
6 | Author : JHao
7 | date: 2019/8/6
8 | -------------------------------------------------
9 | Change Activity:
10 | 2021/11/18: multi-threaded fetching
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | from threading import Thread
16 | from helper.proxy import Proxy
17 | from helper.check import DoValidator
18 | from handler.logHandler import LogHandler
19 | from handler.proxyHandler import ProxyHandler
20 | from fetcher.proxyFetcher import ProxyFetcher
21 | from handler.configHandler import ConfigHandler
22 |
23 |
24 | class _ThreadFetcher(Thread):
25 |
26 | def __init__(self, fetch_source, proxy_dict):
27 | Thread.__init__(self)
28 | self.fetch_source = fetch_source
29 | self.proxy_dict = proxy_dict
30 | self.fetcher = getattr(ProxyFetcher, fetch_source, None)
31 | self.log = LogHandler("fetcher")
32 | self.conf = ConfigHandler()
33 | self.proxy_handler = ProxyHandler()
34 |
35 | def run(self):
36 | self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source))
37 | try:
38 | for proxy in self.fetcher():
39 | self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23)))
40 | proxy = proxy.strip()
41 | if proxy in self.proxy_dict:
42 | self.proxy_dict[proxy].add_source(self.fetch_source)
43 | else:
44 | self.proxy_dict[proxy] = Proxy(
45 | proxy, source=self.fetch_source)
46 | except Exception as e:
47 | self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source))
48 | self.log.error(str(e))
49 |
50 |
51 | class Fetcher(object):
52 | name = "fetcher"
53 |
54 | def __init__(self):
55 | self.log = LogHandler(self.name)
56 | self.conf = ConfigHandler()
57 |
58 | def run(self):
59 | """
60 | fetch proxy with proxyFetcher
61 | :return:
62 | """
63 | proxy_dict = dict()
64 | thread_list = list()
65 | self.log.info("ProxyFetch : start")
66 |
67 | for fetch_source in self.conf.fetchers:
68 | self.log.info("ProxyFetch - {func}: start".format(func=fetch_source))
69 | fetcher = getattr(ProxyFetcher, fetch_source, None)
70 | if not fetcher:
71 | self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source))
72 | continue
73 | if not callable(fetcher):
74 | self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source))
75 | continue
76 | thread_list.append(_ThreadFetcher(fetch_source, proxy_dict))
77 |
78 | for thread in thread_list:
79 | thread.setDaemon(True)
80 | thread.start()
81 |
82 | for thread in thread_list:
83 | thread.join()
84 |
85 | self.log.info("ProxyFetch - all complete!")
86 | for _ in proxy_dict.values():
87 | if DoValidator.preValidator(_.proxy):
88 | yield _
89 |
--------------------------------------------------------------------------------
/proxy_pool/helper/launcher.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: launcher
5 | Description : launcher
6 | Author : JHao
7 | date: 2021/3/26
8 | -------------------------------------------------
9 | Change Activity:
10 | 2021/3/26: launcher
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import sys
16 | from db.dbClient import DbClient
17 | from handler.logHandler import LogHandler
18 | from handler.configHandler import ConfigHandler
19 |
20 | log = LogHandler('launcher')
21 |
22 |
23 | def startServer():
24 | __beforeStart()
25 | from api.proxyApi import runFlask
26 | runFlask()
27 |
28 |
29 | def startScheduler():
30 | __beforeStart()
31 | from helper.scheduler import runScheduler
32 | runScheduler()
33 |
34 |
35 | def __beforeStart():
36 | __showVersion()
37 | __showConfigure()
38 | if __checkDBConfig():
39 | log.info('exit!')
40 | sys.exit()
41 |
42 |
43 | def __showVersion():
44 | from setting import VERSION
45 | log.info("ProxyPool Version: %s" % VERSION)
46 |
47 |
48 | def __showConfigure():
49 | conf = ConfigHandler()
50 | log.info("ProxyPool configure HOST: %s" % conf.serverHost)
51 | log.info("ProxyPool configure PORT: %s" % conf.serverPort)
52 | log.info("ProxyPool configure PROXY_FETCHER: %s" % conf.fetchers)
53 |
54 |
55 | def __checkDBConfig():
56 | conf = ConfigHandler()
57 | db = DbClient(conf.dbConn)
58 | log.info("============ DATABASE CONFIGURE ================")
59 | log.info("DB_TYPE: %s" % db.db_type)
60 | log.info("DB_HOST: %s" % db.db_host)
61 | log.info("DB_PORT: %s" % db.db_port)
62 | log.info("DB_NAME: %s" % db.db_name)
63 | log.info("DB_USER: %s" % db.db_user)
64 | log.info("=================================================")
65 | return db.test()
66 |
--------------------------------------------------------------------------------
/proxy_pool/helper/proxy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: Proxy
5 | Description : proxy object wrapper
6 | Author : JHao
7 | date: 2019/7/11
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/7/11: proxy object wrapper
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import json
16 |
17 |
18 | class Proxy(object):
19 |
20 | def __init__(self, proxy, fail_count=0, region="", anonymous="",
21 | source="", check_count=0, last_status="", last_time="", https=False):
22 | self._proxy = proxy
23 | self._fail_count = fail_count
24 | self._region = region
25 | self._anonymous = anonymous
26 | self._source = source.split('/')
27 | self._check_count = check_count
28 | self._last_status = last_status
29 | self._last_time = last_time
30 | self._https = https
31 |
32 | @classmethod
33 | def createFromJson(cls, proxy_json):
34 | _dict = json.loads(proxy_json)
35 | return cls(proxy=_dict.get("proxy", ""),
36 | fail_count=_dict.get("fail_count", 0),
37 | region=_dict.get("region", ""),
38 | anonymous=_dict.get("anonymous", ""),
39 | source=_dict.get("source", ""),
40 | check_count=_dict.get("check_count", 0),
41 | last_status=_dict.get("last_status", ""),
42 | last_time=_dict.get("last_time", ""),
43 | https=_dict.get("https", False)
44 | )
45 |
46 | @property
47 | def proxy(self):
48 |         """ the proxy, as ip:port """
49 | return self._proxy
50 |
51 | @property
52 | def fail_count(self):
53 |         """ number of failed checks """
54 | return self._fail_count
55 |
56 | @property
57 | def region(self):
58 |         """ region (country/city) """
59 | return self._region
60 |
61 | @property
62 | def anonymous(self):
63 |         """ anonymity level """
64 | return self._anonymous
65 |
66 | @property
67 | def source(self):
68 |         """ proxy source(s) """
69 | return '/'.join(self._source)
70 |
71 | @property
72 | def check_count(self):
73 |         """ number of checks performed """
74 | return self._check_count
75 |
76 | @property
77 | def last_status(self):
78 |         """ result of the last check: True -> usable; False -> unusable """
79 | return self._last_status
80 |
81 | @property
82 | def last_time(self):
83 |         """ time of the last check """
84 | return self._last_time
85 |
86 | @property
87 | def https(self):
88 |         """ whether https is supported """
89 | return self._https
90 |
91 | @property
92 | def to_dict(self):
93 |         """ attributes as a dict """
94 | return {"proxy": self.proxy,
95 | "https": self.https,
96 | "fail_count": self.fail_count,
97 | "region": self.region,
98 | "anonymous": self.anonymous,
99 | "source": self.source,
100 | "check_count": self.check_count,
101 | "last_status": self.last_status,
102 | "last_time": self.last_time}
103 |
104 | @property
105 | def to_json(self):
106 |         """ attributes as a JSON string """
107 | return json.dumps(self.to_dict, ensure_ascii=False)
108 |
109 | @fail_count.setter
110 | def fail_count(self, value):
111 | self._fail_count = value
112 |
113 | @check_count.setter
114 | def check_count(self, value):
115 | self._check_count = value
116 |
117 | @last_status.setter
118 | def last_status(self, value):
119 | self._last_status = value
120 |
121 | @last_time.setter
122 | def last_time(self, value):
123 | self._last_time = value
124 |
125 | @https.setter
126 | def https(self, value):
127 | self._https = value
128 |
129 | @region.setter
130 | def region(self, value):
131 | self._region = value
132 |
133 | def add_source(self, source_str):
134 | if source_str:
135 | self._source.append(source_str)
136 | self._source = list(set(self._source))
137 |
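A quick round-trip sketch (illustrative values) showing how the object serializes through to_json and createFromJson:

    from helper.proxy import Proxy

    p = Proxy("127.0.0.1:8080", source="freeProxy01")
    p.https = True
    restored = Proxy.createFromJson(p.to_json)
    assert restored.proxy == "127.0.0.1:8080" and restored.https is True
    print(restored.to_dict)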
--------------------------------------------------------------------------------
/proxy_pool/helper/scheduler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: proxyScheduler
5 | Description :
6 | Author : JHao
7 | date: 2019/8/5
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/08/05: proxyScheduler
11 | 2021/02/23: runProxyCheck triggers a fetch when fewer than POOL_SIZE_MIN proxies remain
12 | -------------------------------------------------
13 | """
14 | __author__ = 'JHao'
15 |
16 | from apscheduler.schedulers.blocking import BlockingScheduler
17 | from apscheduler.executors.pool import ProcessPoolExecutor
18 |
19 | from util.six import Queue
20 | from helper.fetch import Fetcher
21 | from helper.check import Checker
22 | from handler.logHandler import LogHandler
23 | from handler.proxyHandler import ProxyHandler
24 | from handler.configHandler import ConfigHandler
25 |
26 |
27 | def __runProxyFetch():
28 | proxy_queue = Queue()
29 | proxy_fetcher = Fetcher()
30 |
31 | for proxy in proxy_fetcher.run():
32 | proxy_queue.put(proxy)
33 |
34 | Checker("raw", proxy_queue)
35 |
36 |
37 | def __runProxyCheck():
38 | proxy_handler = ProxyHandler()
39 | proxy_queue = Queue()
40 | if proxy_handler.db.getCount().get("total", 0) < proxy_handler.conf.poolSizeMin:
41 | __runProxyFetch()
42 | for proxy in proxy_handler.getAll():
43 | proxy_queue.put(proxy)
44 | Checker("use", proxy_queue)
45 |
46 |
47 | def runScheduler():
48 | __runProxyFetch()
49 |
50 | timezone = ConfigHandler().timezone
51 | scheduler_log = LogHandler("scheduler")
52 | scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone)
53 |
54 | scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集")
55 | scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查")
56 | executors = {
57 | 'default': {'type': 'threadpool', 'max_workers': 20},
58 | 'processpool': ProcessPoolExecutor(max_workers=5)
59 | }
60 | job_defaults = {
61 | 'coalesce': False,
62 | 'max_instances': 10
63 | }
64 |
65 | scheduler.configure(executors=executors, job_defaults=job_defaults, timezone=timezone)
66 |
67 | scheduler.start()
68 |
69 |
70 | if __name__ == '__main__':
71 | runScheduler()
72 |
--------------------------------------------------------------------------------
/proxy_pool/helper/validator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: _validators
5 | Description : proxy validation methods
6 | Author : JHao
7 | date: 2021/5/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2021/5/25:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | from re import findall
16 | from requests import head
17 | from util.six import withMetaclass
18 | from util.singleton import Singleton
19 | from handler.configHandler import ConfigHandler
20 |
21 | conf = ConfigHandler()
22 |
23 | HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
24 | 'Accept': '*/*',
25 | 'Connection': 'keep-alive',
26 | 'Accept-Language': 'zh-CN,zh;q=0.8'}
27 |
28 |
29 | class ProxyValidator(withMetaclass(Singleton)):
30 | pre_validator = []
31 | http_validator = []
32 | https_validator = []
33 |
34 | @classmethod
35 | def addPreValidator(cls, func):
36 | cls.pre_validator.append(func)
37 | return func
38 |
39 | @classmethod
40 | def addHttpValidator(cls, func):
41 | cls.http_validator.append(func)
42 | return func
43 |
44 | @classmethod
45 | def addHttpsValidator(cls, func):
46 | cls.https_validator.append(func)
47 | return func
48 |
49 |
50 | @ProxyValidator.addPreValidator
51 | def formatValidator(proxy):
52 | """check the proxy format (ip:port)"""
53 | verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
54 | _proxy = findall(verify_regex, proxy)
55 | return True if len(_proxy) == 1 and _proxy[0] == proxy else False
56 |
57 |
58 | @ProxyValidator.addHttpValidator
59 | def httpTimeOutValidator(proxy):
60 | """ HTTP timeout check """
61 |
62 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)}
63 |
64 | try:
65 | r = head(conf.httpUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout)
66 | return True if r.status_code == 200 else False
67 | except Exception as e:
68 | return False
69 |
70 |
71 | @ProxyValidator.addHttpsValidator
72 | def httpsTimeOutValidator(proxy):
73 | """HTTPS timeout check"""
74 |
75 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)}
76 | try:
77 | r = head(conf.httpsUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout, verify=False)
78 | return True if r.status_code == 200 else False
79 | except Exception as e:
80 | return False
81 |
82 |
83 | @ProxyValidator.addHttpValidator
84 | def customValidatorExample(proxy):
85 | """Example of a custom validator: check whether the proxy is usable and return True/False"""
86 | return True
87 |
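customValidatorExample above marks the extension point; below is a slightly fuller, hedged sketch of a custom HTTPS check. It is written as if appended to helper/validator.py (so it reuses conf, HEADER and head from this module); the function name and the 2-second latency threshold are invented for illustration.

    # illustrative only: reject HTTPS proxies that answer slower than 2 seconds
    import time

    @ProxyValidator.addHttpsValidator
    def maxLatencyValidator(proxy):
        proxies = {"https": "https://{proxy}".format(proxy=proxy)}
        start = time.time()
        try:
            r = head(conf.httpsUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout, verify=False)
            return r.status_code == 200 and (time.time() - start) < 2
        except Exception:
            return False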
--------------------------------------------------------------------------------
/proxy_pool/proxyPool.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: proxy_pool
5 | Description : proxy pool startup entry point
6 | Author : JHao
7 | date: 2020/6/19
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/19:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import click
16 | from helper.launcher import startServer, startScheduler
17 | from setting import BANNER, VERSION
18 |
19 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'])
20 |
21 |
22 | @click.group(context_settings=CONTEXT_SETTINGS)
23 | @click.version_option(version=VERSION)
24 | def cli():
25 | """ProxyPool CLI tool"""
26 |
27 | @cli.command(name="schedule")
28 | def schedule():
29 | """ Start the scheduler """
30 | click.echo(BANNER)
31 | startScheduler()
32 |
33 |
34 | @cli.command(name="server")
35 | def server():
36 | """ Start the API server """
37 | click.echo(BANNER)
38 | startServer()
39 |
40 |
41 | if __name__ == '__main__':
42 | cli()
43 |
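In normal use the entry point is launched from a shell as "python proxyPool.py schedule" or "python proxyPool.py server". For a quick programmatic smoke test of the click group, a sketch like the following can be used; --help only prints usage, it does not start the scheduler or the API.

    # sketch: drive the CLI with click's test runner
    from click.testing import CliRunner
    from proxyPool import cli

    runner = CliRunner()
    result = runner.invoke(cli, ["--help"])
    print(result.output)   # lists the schedule and server commands defined above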
--------------------------------------------------------------------------------
/proxy_pool/requirements.txt:
--------------------------------------------------------------------------------
1 | APScheduler==3.2.0
2 | werkzeug==0.15.5
3 | Flask==2.2.2
4 | requests==2.20.0
5 | click==7.0
6 | gunicorn==19.9.0
7 | lxml
8 | redis
9 |
--------------------------------------------------------------------------------
/proxy_pool/setting.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: setting.py
5 | Description : configuration file
6 | Author : JHao
7 | date: 2019/2/15
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/2/15:
11 | -------------------------------------------------
12 | """
13 |
14 | BANNER = r"""
15 | ****************************************************************
16 | *** ______ ********************* ______ *********** _ ********
17 | *** | ___ \_ ******************** | ___ \ ********* | | ********
18 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ********
19 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ********
20 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ ****
21 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ ****
22 | **** __ / / *****
23 | ************************* /___ / *******************************
24 | ************************* ********************************
25 | ****************************************************************
26 | """
27 |
28 | VERSION = "2.4.0"
29 |
30 | # ############### server config ###############
31 | HOST = "0.0.0.0"
32 |
33 | PORT = 5010
34 |
35 | # ############### database config ###################
36 | # db connection uri
37 | # example:
38 | # Redis: redis://:password@ip:port/db
39 | # Ssdb: ssdb://:password@ip:port
40 | DB_CONN = 'redis://:@127.0.0.1:6379/0'
41 |
42 | # proxy table name
43 | TABLE_NAME = 'use_proxy'
44 |
45 |
46 | # ###### config the proxy fetch function ######
47 | PROXY_FETCHER = [
48 | "freeProxy01",
49 | "freeProxy02",
50 | "freeProxy03",
51 | "freeProxy04",
52 | "freeProxy05",
53 | "freeProxy06",
54 | "freeProxy07",
55 | "freeProxy08",
56 | "freeProxy09",
57 | "freeProxy10"
58 | ]
59 |
60 | # ############# proxy validator #################
61 | # target sites for proxy validation
62 | HTTP_URL = "http://httpbin.org"
63 |
64 | HTTPS_URL = "https://www.qq.com"
65 |
66 | # timeout for proxy validation
67 | VERIFY_TIMEOUT = 10
68 |
69 | # maximum number of failures allowed in the last PROXY_CHECK_COUNT checks; a proxy exceeding it is removed
70 | MAX_FAIL_COUNT = 0
71 |
72 | # maximum failure rate allowed in the last PROXY_CHECK_COUNT checks; a proxy exceeding it is removed
73 | # MAX_FAIL_RATE = 0.1
74 |
75 | # during proxyCheck, trigger a fetch when the proxy count falls below POOL_SIZE_MIN
76 | POOL_SIZE_MIN = 20
77 |
78 | # ############# proxy attributes #################
79 | # whether to enable the proxy region attribute
80 | PROXY_REGION = True
81 |
82 | # ############# scheduler config #################
83 |
84 | # Force a timezone for the scheduler (optional).
85 | # If this runs on a VM and
86 | #     "ValueError: Timezone offset does not match system offset"
87 | #   is raised during scheduling,
88 | # set a timezone for the scheduler here (see TIMEZONE below).
89 | # Otherwise the timezone is detected from the system automatically.
90 |
91 | TIMEZONE = "Asia/Shanghai"
92 |
--------------------------------------------------------------------------------
/proxy_pool/test/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__
5 | Description :
6 | Author : JHao
7 | date: 2019/2/15
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/2/15:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
--------------------------------------------------------------------------------
/proxy_pool/test/testConfigHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testGetConfig
5 | Description : testGetConfig
6 | Author : J_hao
7 | date: 2017/7/31
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/7/31:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'J_hao'
14 |
15 | from handler.configHandler import ConfigHandler
16 | from time import sleep
17 |
18 |
19 | def testConfig():
20 | """
21 | :return:
22 | """
23 | conf = ConfigHandler()
24 | print(conf.dbConn)
25 | print(conf.serverPort)
26 | print(conf.serverHost)
27 | print(conf.tableName)
28 | assert isinstance(conf.fetchers, list)
29 | print(conf.fetchers)
30 |
31 | for _ in range(2):
32 | print(conf.fetchers)
33 | sleep(5)
34 |
35 |
36 | if __name__ == '__main__':
37 | testConfig()
38 |
39 |
--------------------------------------------------------------------------------
/proxy_pool/test/testDbClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testDbClient
5 | Description :
6 | Author : JHao
7 | date: 2020/6/23
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/23:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | from db.dbClient import DbClient
16 |
17 |
18 | def testDbClient():
19 | # ############### ssdb ###############
20 | ssdb_uri = "ssdb://:password@127.0.0.1:8888"
21 | s = DbClient.parseDbConn(ssdb_uri)
22 | assert s.db_type == "SSDB"
23 | assert s.db_pwd == "password"
24 | assert s.db_host == "127.0.0.1"
25 | assert s.db_port == 8888
26 |
27 | # ############### redis ###############
28 | redis_uri = "redis://:password@127.0.0.1:6379/1"
29 | r = DbClient.parseDbConn(redis_uri)
30 | assert r.db_type == "REDIS"
31 | assert r.db_pwd == "password"
32 | assert r.db_host == "127.0.0.1"
33 | assert r.db_port == 6379
34 | assert r.db_name == "1"
35 | print("DbClient ok!")
36 |
37 |
38 | if __name__ == '__main__':
39 | testDbClient()
40 |
--------------------------------------------------------------------------------
/proxy_pool/test/testLogHandler.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testLogHandler
5 | Description :
6 | Author : J_hao
7 | date: 2017/8/2
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/8/2:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'J_hao'
14 |
15 | from handler.logHandler import LogHandler
16 |
17 |
18 | def testLogHandler():
19 | log = LogHandler('test')
20 | log.info('this is info')
21 | log.error('this is error')
22 |
23 |
24 | if __name__ == '__main__':
25 | testLogHandler()
26 |
--------------------------------------------------------------------------------
/proxy_pool/test/testProxyClass.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testProxyClass
5 | Description :
6 | Author : JHao
7 | date: 2019/8/8
8 | -------------------------------------------------
9 | Change Activity:
10 | 2019/8/8:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import json
16 | from helper.proxy import Proxy
17 |
18 |
19 | def testProxyClass():
20 | proxy = Proxy("127.0.0.1:8080")
21 |
22 | print(proxy.to_json)
23 |
24 | proxy.source = "test"
25 |
26 | proxy_str = json.dumps(proxy.to_dict, ensure_ascii=False)
27 |
28 | print(proxy_str)
29 |
30 | print(Proxy.createFromJson(proxy_str).to_dict)
31 |
32 |
33 | if __name__ == '__main__':
34 | testProxyClass()
35 |
--------------------------------------------------------------------------------
/proxy_pool/test/testProxyFetcher.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testProxyFetcher
5 | Description :
6 | Author : JHao
7 | date: 2020/6/23
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/23:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | from fetcher.proxyFetcher import ProxyFetcher
16 | from handler.configHandler import ConfigHandler
17 |
18 |
19 | def testProxyFetcher():
20 | conf = ConfigHandler()
21 | proxy_getter_functions = conf.fetchers
22 | proxy_counter = {_: 0 for _ in proxy_getter_functions}
23 | for proxyGetter in proxy_getter_functions:
24 | for proxy in getattr(ProxyFetcher, proxyGetter.strip())():
25 | if proxy:
26 | print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy))
27 | proxy_counter[proxyGetter] = proxy_counter.get(proxyGetter) + 1
28 | for key, value in proxy_counter.items():
29 | print(key, value)
30 |
31 |
32 | if __name__ == '__main__':
33 | testProxyFetcher()
34 |
--------------------------------------------------------------------------------
/proxy_pool/test/testProxyValidator.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testProxyValidator
5 | Description :
6 | Author : JHao
7 | date: 2021/5/25
8 | -------------------------------------------------
9 | Change Activity:
10 | 2021/5/25:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | from helper.validator import ProxyValidator
16 |
17 |
18 | def testProxyValidator():
19 | for _ in ProxyValidator.pre_validator:
20 | print(_)
21 | for _ in ProxyValidator.http_validator:
22 | print(_)
23 | for _ in ProxyValidator.https_validator:
24 | print(_)
25 |
26 |
27 | if __name__ == '__main__':
28 | testProxyValidator()
29 |
--------------------------------------------------------------------------------
/proxy_pool/test/testRedisClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testRedisClient
5 | Description :
6 | Author : JHao
7 | date: 2020/6/23
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/23:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 |
16 | def testRedisClient():
17 | from db.dbClient import DbClient
18 | from helper.proxy import Proxy
19 |
20 | uri = "redis://:pwd@127.0.0.1:6379"
21 | db = DbClient(uri)
22 | db.changeTable("use_proxy")
23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')
24 |
25 | print("put: ", db.put(proxy))
26 |
27 | print("get: ", db.get(https=None))
28 |
29 | print("exists: ", db.exists("27.38.96.101:9797"))
30 |
31 | print("exists: ", db.exists("27.38.96.101:8888"))
32 |
33 | print("pop: ", db.pop(https=None))
34 |
35 | print("getAll: ", db.getAll(https=None))
36 |
37 | print("getCount", db.getCount())
38 |
39 |
40 | if __name__ == '__main__':
41 | testRedisClient()
42 |
--------------------------------------------------------------------------------
/proxy_pool/test/testSsdbClient.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: testSsdbClient
5 | Description :
6 | Author : JHao
7 | date: 2020/7/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/7/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 |
16 | def testSsdbClient():
17 | from db.dbClient import DbClient
18 | from helper.proxy import Proxy
19 |
20 | uri = "ssdb://@127.0.0.1:8888"
21 | db = DbClient(uri)
22 | db.changeTable("use_proxy")
23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}')
24 |
25 | print("put: ", db.put(proxy))
26 |
27 | print("get: ", db.get(https=None))
28 |
29 | print("exists: ", db.exists("27.38.96.101:9797"))
30 |
31 | print("exists: ", db.exists("27.38.96.101:8888"))
32 |
33 | print("getAll: ", db.getAll(https=None))
34 |
35 | # print("pop: ", db.pop(https=None))
36 |
37 | print("clear: ", db.clear())
38 |
39 | print("getCount", db.getCount())
40 |
41 |
42 | if __name__ == '__main__':
43 | testSsdbClient()
44 |
--------------------------------------------------------------------------------
/proxy_pool/util/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: __init__
5 | Description :
6 | Author : JHao
7 | date: 2020/7/6
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/7/6:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
--------------------------------------------------------------------------------
/proxy_pool/util/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/util/__pycache__/lazyProperty.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/lazyProperty.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/util/__pycache__/singleton.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/singleton.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/util/__pycache__/six.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/six.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/util/__pycache__/webRequest.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/webRequest.cpython-38.pyc
--------------------------------------------------------------------------------
/proxy_pool/util/lazyProperty.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: lazyProperty
5 | Description :
6 | Author : JHao
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 |
16 | class LazyProperty(object):
17 | """
18 | LazyProperty
19 | explain: http://www.spiderpy.cn/blog/5/
20 | """
21 |
22 | def __init__(self, func):
23 | self.func = func
24 |
25 | def __get__(self, instance, owner):
26 | if instance is None:
27 | return self
28 | else:
29 | value = self.func(instance)
30 | setattr(instance, self.func.__name__, value)
31 | return value
32 |
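A minimal usage sketch for the descriptor above: the first access runs the wrapped method, setattr caches the result on the instance, and later accesses hit the cached attribute instead. The Config class and its pretend-expensive load are invented for illustration; it assumes a run from the proxy_pool root.

    from util.lazyProperty import LazyProperty

    class Config(object):

        @LazyProperty
        def settings(self):
            print("loading settings once...")
            return {"timeout": 10}

    c = Config()
    print(c.settings)   # runs the method and caches the dict on the instance
    print(c.settings)   # served from the cached attribute, no second load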
--------------------------------------------------------------------------------
/proxy_pool/util/singleton.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: singleton
5 | Description :
6 | Author : JHao
7 | date: 2016/12/3
8 | -------------------------------------------------
9 | Change Activity:
10 | 2016/12/3:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 |
16 | class Singleton(type):
17 | """
18 | Singleton Metaclass
19 | """
20 |
21 | _inst = {}
22 |
23 | def __call__(cls, *args, **kwargs):
24 | if cls not in cls._inst:
25 | cls._inst[cls] = super(Singleton, cls).__call__(*args, **kwargs)
26 | return cls._inst[cls]
27 |
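helper/validator.py builds its class with withMetaclass(Singleton), so constructing it repeatedly yields one shared object. A small sketch under the same assumption (Counter is invented for illustration, run from the proxy_pool root):

    from util.six import withMetaclass
    from util.singleton import Singleton

    class Counter(withMetaclass(Singleton)):
        def __init__(self):
            self.value = 0

    a = Counter()
    b = Counter()
    a.value = 42
    print(a is b, b.value)   # True 42 -- both names refer to the single cached instance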
--------------------------------------------------------------------------------
/proxy_pool/util/six.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: six
5 | Description :
6 | Author : JHao
7 | date: 2020/6/22
8 | -------------------------------------------------
9 | Change Activity:
10 | 2020/6/22:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'JHao'
14 |
15 | import sys
16 |
17 | PY2 = sys.version_info[0] == 2
18 | PY3 = sys.version_info[0] == 3
19 |
20 | if PY3:
21 | def iteritems(d, **kw):
22 | return iter(d.items(**kw))
23 | else:
24 | def iteritems(d, **kw):
25 | return d.iteritems(**kw)
26 |
27 | if PY3:
28 | from urllib.parse import urlparse
29 | else:
30 | from urlparse import urlparse
31 |
32 | if PY3:
33 | from imp import reload as reload_six
34 | else:
35 | reload_six = reload
36 |
37 | if PY3:
38 | from queue import Empty, Queue
39 | else:
40 | from Queue import Empty, Queue
41 |
42 |
43 | def withMetaclass(meta, *bases):
44 | """Create a base class with a metaclass."""
45 |
46 | # This requires a bit of explanation: the basic idea is to make a dummy
47 | # metaclass for one level of class instantiation that replaces itself with
48 | # the actual metaclass.
49 | class MetaClass(meta):
50 |
51 | def __new__(cls, name, this_bases, d):
52 | return meta(name, bases, d)
53 |
54 | return type.__new__(MetaClass, 'temporary_class', (), {})
55 |
--------------------------------------------------------------------------------
/proxy_pool/util/webRequest.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | -------------------------------------------------
4 | File Name: WebRequest
5 | Description : Network Requests Class
6 | Author : J_hao
7 | date: 2017/7/31
8 | -------------------------------------------------
9 | Change Activity:
10 | 2017/7/31:
11 | -------------------------------------------------
12 | """
13 | __author__ = 'J_hao'
14 |
15 | from requests.models import Response
16 | from lxml import etree
17 | import requests
18 | import random
19 | import time
20 |
21 | from handler.logHandler import LogHandler
22 |
23 | requests.packages.urllib3.disable_warnings()
24 |
25 |
26 | class WebRequest(object):
27 | name = "web_request"
28 |
29 | def __init__(self, *args, **kwargs):
30 | self.log = LogHandler(self.name, file=False)
31 | self.response = Response()
32 |
33 | @property
34 | def user_agent(self):
35 | """
36 | return a User-Agent chosen at random
37 | :return:
38 | """
39 | ua_list = [
40 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101',
41 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122',
42 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71',
43 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95',
44 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71',
45 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
46 | 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50',
47 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
48 | ]
49 | return random.choice(ua_list)
50 |
51 | @property
52 | def header(self):
53 | """
54 | basic header
55 | :return:
56 | """
57 | return {'User-Agent': self.user_agent,
58 | 'Accept': '*/*',
59 | 'Connection': 'keep-alive',
60 | 'Accept-Language': 'zh-CN,zh;q=0.8'}
61 |
62 | def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs):
63 | """
64 | get method
65 | :param url: target url
66 | :param header: headers
67 | :param retry_time: retry time
68 | :param retry_interval: retry interval
69 | :param timeout: network timeout
70 | :return:
71 | """
72 | headers = self.header
73 | if header and isinstance(header, dict):
74 | headers.update(header)
75 | while True:
76 | try:
77 | self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs)
78 | return self
79 | except Exception as e:
80 | self.log.error("requests: %s error: %s" % (url, str(e)))
81 | retry_time -= 1
82 | if retry_time <= 0:
83 |                     self.response = Response()
84 |                     self.response.status_code = 200
85 |                     return self
86 |                 self.log.info("retry after %s seconds" % retry_interval)
87 | time.sleep(retry_interval)
88 |
89 | @property
90 | def tree(self):
91 | return etree.HTML(self.response.content)
92 |
93 | @property
94 | def text(self):
95 | return self.response.text
96 |
97 | @property
98 | def json(self):
99 | try:
100 | return self.response.json()
101 | except Exception as e:
102 | self.log.error(str(e))
103 | return {}
104 |
105 |
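A short, hedged usage sketch for the wrapper above, pointed at the httpbin.org target already configured in setting.py (the /get path is illustrative); it assumes a run from the proxy_pool root.

    from util.webRequest import WebRequest

    r = WebRequest().get("http://httpbin.org/get", retry_time=2, timeout=5)
    print(r.response.status_code)   # the underlying requests.Response
    print(r.json)                   # parsed JSON body, or {} if parsing fails
    # r.tree gives an lxml element for HTML pages, r.text the raw body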
--------------------------------------------------------------------------------