├── FlaskProject ├── .idea │ ├── .gitignore │ ├── flaskProject.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml ├── __pycache__ │ └── app.cpython-38.pyc ├── app.py ├── data.txt ├── static │ ├── css │ │ └── style.css │ ├── img │ │ ├── generated_leaf.jpg │ │ ├── generated_tree.jpg │ │ ├── icon.png │ │ ├── leaf.jpg │ │ └── tree.jpg │ ├── js │ │ ├── echarts.min.js │ │ └── main.js │ └── vendor │ │ ├── animate.css │ │ ├── animate.css │ │ └── animate.min.css │ │ ├── aos │ │ ├── aos.css │ │ └── aos.js │ │ ├── bootstrap │ │ ├── css │ │ │ ├── bootstrap-grid.css │ │ │ ├── bootstrap-grid.css.map │ │ │ ├── bootstrap-grid.min.css │ │ │ ├── bootstrap-grid.min.css.map │ │ │ ├── bootstrap-reboot.css │ │ │ ├── bootstrap-reboot.css.map │ │ │ ├── bootstrap-reboot.min.css │ │ │ ├── bootstrap-reboot.min.css.map │ │ │ ├── bootstrap.css │ │ │ ├── bootstrap.css.map │ │ │ ├── bootstrap.min.css │ │ │ └── bootstrap.min.css.map │ │ └── js │ │ │ ├── bootstrap.bundle.js │ │ │ ├── bootstrap.bundle.js.map │ │ │ ├── bootstrap.bundle.min.js │ │ │ ├── bootstrap.bundle.min.js.map │ │ │ ├── bootstrap.js │ │ │ ├── bootstrap.js.map │ │ │ ├── bootstrap.min.js │ │ │ └── bootstrap.min.js.map │ │ ├── boxicons │ │ ├── css │ │ │ ├── animations.css │ │ │ ├── boxicons.css │ │ │ ├── boxicons.min.css │ │ │ └── transformations.css │ │ └── fonts │ │ │ ├── boxicons.eot │ │ │ ├── boxicons.svg │ │ │ ├── boxicons.ttf │ │ │ ├── boxicons.woff │ │ │ └── boxicons.woff2 │ │ ├── counterup │ │ └── counterup.min.js │ │ ├── icofont │ │ ├── fonts │ │ │ ├── icofont.woff │ │ │ └── icofont.woff2 │ │ └── icofont.min.css │ │ ├── isotope-layout │ │ ├── isotope.pkgd.js │ │ └── isotope.pkgd.min.js │ │ ├── jquery-sticky │ │ └── jquery.sticky.js │ │ ├── jquery.easing │ │ └── jquery.easing.min.js │ │ ├── jquery │ │ ├── jquery.min.js │ │ └── jquery.min.map │ │ ├── php-email-form │ │ └── validate.js │ │ ├── venobox │ │ ├── venobox.css │ │ ├── venobox.js │ │ └── venobox.min.js │ │ └── waypoints │ │ └── jquery.waypoints.min.js ├── templates │ ├── index.html │ ├── movie.html │ ├── score.html │ └── word.html └── wordCloud.py ├── Master ├── .idea │ ├── .gitignore │ ├── Master.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml └── main.py ├── Pic ├── index.jpg ├── mongoDB_data.jpg ├── movies.jpg ├── proxy.jpg ├── redis_data.jpg ├── score.jpg ├── slave.jpg └── words.jpg ├── README.md ├── Slave ├── .idea │ ├── .gitignore │ ├── Slave.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml ├── movies │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── middlewares.cpython-38.pyc │ │ ├── pipelines.cpython-38.pyc │ │ └── settings.cpython-38.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ └── douban_redis.cpython-38.pyc │ │ └── douban_redis.py │ └── start.py └── scrapy.cfg └── proxy_pool ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── proxy_pool.iml ├── __pycache__ └── setting.cpython-38.pyc ├── api ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── proxyApi.cpython-38.pyc └── proxyApi.py ├── db ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── dbClient.cpython-38.pyc │ └── redisClient.cpython-38.pyc ├── dbClient.py ├── redisClient.py └── ssdbClient.py ├── docs ├── Makefile ├── changelog.rst ├── conf.py ├── dev │ ├── 
ext_fetcher.rst │ ├── ext_validator.rst │ └── index.rst ├── index.rst ├── make.bat └── user │ ├── how_to_config.rst │ ├── how_to_run.rst │ ├── how_to_use.rst │ └── index.rst ├── fetcher ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── proxyFetcher.cpython-38.pyc └── proxyFetcher.py ├── handler ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── configHandler.cpython-38.pyc │ ├── logHandler.cpython-38.pyc │ └── proxyHandler.cpython-38.pyc ├── configHandler.py ├── logHandler.py └── proxyHandler.py ├── helper ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── check.cpython-38.pyc │ ├── fetch.cpython-38.pyc │ ├── launcher.cpython-38.pyc │ ├── proxy.cpython-38.pyc │ ├── scheduler.cpython-38.pyc │ └── validator.cpython-38.pyc ├── check.py ├── fetch.py ├── launcher.py ├── proxy.py ├── scheduler.py └── validator.py ├── proxyPool.py ├── requirements.txt ├── setting.py ├── test ├── __init__.py ├── testConfigHandler.py ├── testDbClient.py ├── testLogHandler.py ├── testProxyClass.py ├── testProxyFetcher.py ├── testProxyValidator.py ├── testRedisClient.py └── testSsdbClient.py └── util ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc ├── lazyProperty.cpython-38.pyc ├── singleton.cpython-38.pyc ├── six.cpython-38.pyc └── webRequest.cpython-38.pyc ├── lazyProperty.py ├── singleton.py ├── six.py └── webRequest.py /FlaskProject/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /FlaskProject/.idea/flaskProject.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 18 | 19 | -------------------------------------------------------------------------------- /FlaskProject/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /FlaskProject/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /FlaskProject/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /FlaskProject/__pycache__/app.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/__pycache__/app.cpython-38.pyc -------------------------------------------------------------------------------- /FlaskProject/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, make_response,jsonify 2 | import pymongo 3 | 4 | app = Flask(__name__) 5 | 6 | def myCollection(): 7 | client = pymongo.MongoClient(host='localhost', port=27017) 8 | db = client.movies 9 | collection = db.douban 10 | return collection 11 | 12 | @app.route('/') 13 | def home(): 14 | return index() 15 | 16 | @app.route('/index') 17 | def 
index(): 18 | # movies, ratings, words, team members 19 | movies_num = 0 20 | votes_num = 0 21 | words_num = 11655 22 | team_num = 8 23 | for item in myCollection().find(): 24 | movies_num += 1 25 | votes_num += int(item['comment_num']) 26 | votes_num = int(votes_num / 10000) 27 | return render_template("index.html",movies_num=movies_num,votes_num=votes_num,words_num=words_num,team_num=team_num) 28 | 29 | @app.route('/movie') 30 | def movie(): 31 | query = {} 32 | projection = {} 33 | 34 | projection["rank"] = u"$rank" 35 | projection["page_url"] = u"$page_url" 36 | projection["title"] = u"$title" 37 | projection["score"] = u"$score" 38 | projection["comment_num"] = u"$comment_num" 39 | projection["directedBy"] = u"$directedBy" 40 | projection["actors"] = u"$actors" 41 | projection["comment"] = u"$comment" 42 | projection["year"] = u"$year" 43 | projection["_id"] = 0 44 | 45 | cursor = myCollection().find(query, projection=projection) 46 | movies = [] 47 | for doc in cursor: 48 | movies.append({ 49 | 'rank': int(doc['rank']), 50 | 'link': doc['page_url'], 51 | 'title': doc['title'], 52 | 'score': doc['score'], 53 | 'comment_num': doc['comment_num'], 54 | 'directed_by': doc['directedBy'], 55 | # 'actors': doc['actors'], 56 | 'comment': doc['comment'], 57 | 'year': doc['year'], 58 | }) 59 | movies.sort(key=lambda x: x['rank'], reverse=False) 60 | 61 | return render_template("movie.html",movies = movies) 62 | 63 | 64 | @app.route('/word') 65 | def word(): 66 | return render_template("word.html") 67 | 68 | @app.route('/score') 69 | def score(): 70 | # sql = "select score,count(score) from movie250 group by score" 71 | pipeline = [ 72 | { 73 | u"$group": { 74 | u"_id": { 75 | u"score": u"$score" 76 | }, 77 | u"COUNT(score)": { 78 | u"$sum": 1 79 | } 80 | } 81 | }, 82 | { 83 | u"$project": { 84 | u"score": u"$_id.score", 85 | u"COUNT(score)": u"$COUNT(score)", 86 | u"_id": 0 87 | } 88 | } 89 | ] 90 | cursor = myCollection().aggregate(pipeline, allowDiskUse=True) 91 | score = [] # rating values 92 | num = [] # number of movies for each rating 93 | score_num = {} 94 | for doc in cursor: 95 | score.append(doc['score']) 96 | score_num[doc['score']] = doc['COUNT(score)'] 97 | score.sort() 98 | for count in range(len(score_num)): 99 | num.append(score_num[score[count]]) 100 | count += 1 101 | 102 | return render_template("score.html",score=score,num=num) 103 | 104 | if __name__ == '__main__': 105 | app.run(debug=True) -------------------------------------------------------------------------------- /FlaskProject/static/img/generated_leaf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/generated_leaf.jpg -------------------------------------------------------------------------------- /FlaskProject/static/img/generated_tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/generated_tree.jpg -------------------------------------------------------------------------------- /FlaskProject/static/img/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/icon.png -------------------------------------------------------------------------------- /FlaskProject/static/img/leaf.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/leaf.jpg -------------------------------------------------------------------------------- /FlaskProject/static/img/tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/tree.jpg -------------------------------------------------------------------------------- /FlaskProject/static/js/main.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Template Name: Mamba - v2.0.1 3 | * Template URL: https://bootstrapmade.com/mamba-one-page-bootstrap-template-free/ 4 | * Author: BootstrapMade.com 5 | * License: https://bootstrapmade.com/license/ 6 | */ 7 | !(function($) { 8 | "use strict"; 9 | 10 | // Toggle .header-scrolled class to #header when page is scrolled 11 | $(window).scroll(function() { 12 | if ($(this).scrollTop() > 100) { 13 | $('#header').addClass('header-scrolled'); 14 | } else { 15 | $('#header').removeClass('header-scrolled'); 16 | } 17 | }); 18 | 19 | if ($(window).scrollTop() > 100) { 20 | $('#header').addClass('header-scrolled'); 21 | } 22 | 23 | // Stick the header at top on scroll 24 | $("#header").sticky({ 25 | topSpacing: 0, 26 | zIndex: '50' 27 | }); 28 | 29 | // Smooth scroll for the navigation menu and links with .scrollto classes 30 | $(document).on('click', '.nav-menu a, .mobile-nav a, .scrollto', function(e) { 31 | if (location.pathname.replace(/^\//, '') == this.pathname.replace(/^\//, '') && location.hostname == this.hostname) { 32 | e.preventDefault(); 33 | var target = $(this.hash); 34 | if (target.length) { 35 | 36 | var scrollto = target.offset().top; 37 | var scrolled = 2; 38 | 39 | if ($('#header-sticky-wrapper').length) { 40 | scrollto -= $('#header-sticky-wrapper').outerHeight() - scrolled; 41 | } 42 | 43 | if ($(this).attr("href") == '#header') { 44 | scrollto = 0; 45 | } 46 | 47 | $('html, body').animate({ 48 | scrollTop: scrollto 49 | }, 1500, 'easeInOutExpo'); 50 | 51 | if ($(this).parents('.nav-menu, .mobile-nav').length) { 52 | $('.nav-menu .active, .mobile-nav .active').removeClass('active'); 53 | $(this).closest('li').addClass('active'); 54 | } 55 | 56 | if ($('body').hasClass('mobile-nav-active')) { 57 | $('body').removeClass('mobile-nav-active'); 58 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close'); 59 | $('.mobile-nav-overly').fadeOut(); 60 | } 61 | return false; 62 | } 63 | } 64 | }); 65 | 66 | // Mobile Navigation 67 | if ($('.nav-menu').length) { 68 | var $mobile_nav = $('.nav-menu').clone().prop({ 69 | class: 'mobile-nav d-lg-none' 70 | }); 71 | $('body').append($mobile_nav); 72 | $('body').prepend(''); 73 | $('body').append('
'); 74 | 75 | $(document).on('click', '.mobile-nav-toggle', function(e) { 76 | $('body').toggleClass('mobile-nav-active'); 77 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close'); 78 | $('.mobile-nav-overly').toggle(); 79 | }); 80 | 81 | $(document).on('click', '.mobile-nav .drop-down > a', function(e) { 82 | e.preventDefault(); 83 | $(this).next().slideToggle(300); 84 | $(this).parent().toggleClass('active'); 85 | }); 86 | 87 | $(document).click(function(e) { 88 | var container = $(".mobile-nav, .mobile-nav-toggle"); 89 | if (!container.is(e.target) && container.has(e.target).length === 0) { 90 | if ($('body').hasClass('mobile-nav-active')) { 91 | $('body').removeClass('mobile-nav-active'); 92 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close'); 93 | $('.mobile-nav-overly').fadeOut(); 94 | } 95 | } 96 | }); 97 | } else if ($(".mobile-nav, .mobile-nav-toggle").length) { 98 | $(".mobile-nav, .mobile-nav-toggle").hide(); 99 | } 100 | 101 | // Intro carousel 102 | var heroCarousel = $("#heroCarousel"); 103 | var heroCarouselIndicators = $("#hero-carousel-indicators"); 104 | heroCarousel.find(".carousel-inner").children(".carousel-item").each(function(index) { 105 | (index === 0) ? 106 | heroCarouselIndicators.append("
  • "): 107 | heroCarouselIndicators.append("
  • "); 108 | }); 109 | 110 | heroCarousel.on('slid.bs.carousel', function(e) { 111 | $(this).find('h2').addClass('animated fadeInDown'); 112 | $(this).find('p').addClass('animated fadeInUp'); 113 | $(this).find('.btn-get-started').addClass('animated fadeInUp'); 114 | }); 115 | 116 | // Back to top button 117 | $(window).scroll(function() { 118 | if ($(this).scrollTop() > 100) { 119 | $('.back-to-top').fadeIn('slow'); 120 | } else { 121 | $('.back-to-top').fadeOut('slow'); 122 | } 123 | }); 124 | 125 | $('.back-to-top').click(function() { 126 | $('html, body').animate({ 127 | scrollTop: 0 128 | }, 1500, 'easeInOutExpo'); 129 | return false; 130 | }); 131 | 132 | // Initiate the venobox plugin 133 | $(window).on('load', function() { 134 | $('.venobox').venobox(); 135 | }); 136 | 137 | // jQuery counterUp 138 | $('[data-toggle="counter-up"]').counterUp({ 139 | delay: 10, 140 | time: 1000 141 | }); 142 | 143 | // Porfolio isotope and filter 144 | $(window).on('load', function() { 145 | var portfolioIsotope = $('.portfolio-container').isotope({ 146 | itemSelector: '.portfolio-item', 147 | layoutMode: 'fitRows' 148 | }); 149 | 150 | $('#portfolio-flters li').on('click', function() { 151 | $("#portfolio-flters li").removeClass('filter-active'); 152 | $(this).addClass('filter-active'); 153 | 154 | portfolioIsotope.isotope({ 155 | filter: $(this).data('filter') 156 | }); 157 | }); 158 | 159 | // Initiate venobox (lightbox feature used in portofilo) 160 | $(document).ready(function() { 161 | $('.venobox').venobox(); 162 | }); 163 | }); 164 | 165 | // Initi AOS 166 | AOS.init({ 167 | duration: 1000, 168 | easing: "ease-in-out-back" 169 | }); 170 | 171 | })(jQuery); -------------------------------------------------------------------------------- /FlaskProject/static/vendor/bootstrap/css/bootstrap-reboot.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Reboot v4.4.1 (https://getbootstrap.com/) 3 | * Copyright 2011-2019 The Bootstrap Authors 4 | * Copyright 2011-2019 Twitter, Inc. 
5 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 6 | * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md) 7 | */ 8 | *, 9 | *::before, 10 | *::after { 11 | box-sizing: border-box; 12 | } 13 | 14 | html { 15 | font-family: sans-serif; 16 | line-height: 1.15; 17 | -webkit-text-size-adjust: 100%; 18 | -webkit-tap-highlight-color: rgba(0, 0, 0, 0); 19 | } 20 | 21 | article, aside, figcaption, figure, footer, header, hgroup, main, nav, section { 22 | display: block; 23 | } 24 | 25 | body { 26 | margin: 0; 27 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; 28 | font-size: 1rem; 29 | font-weight: 400; 30 | line-height: 1.5; 31 | color: #212529; 32 | text-align: left; 33 | background-color: #fff; 34 | } 35 | 36 | [tabindex="-1"]:focus:not(:focus-visible) { 37 | outline: 0 !important; 38 | } 39 | 40 | hr { 41 | box-sizing: content-box; 42 | height: 0; 43 | overflow: visible; 44 | } 45 | 46 | h1, h2, h3, h4, h5, h6 { 47 | margin-top: 0; 48 | margin-bottom: 0.5rem; 49 | } 50 | 51 | p { 52 | margin-top: 0; 53 | margin-bottom: 1rem; 54 | } 55 | 56 | abbr[title], 57 | abbr[data-original-title] { 58 | text-decoration: underline; 59 | -webkit-text-decoration: underline dotted; 60 | text-decoration: underline dotted; 61 | cursor: help; 62 | border-bottom: 0; 63 | -webkit-text-decoration-skip-ink: none; 64 | text-decoration-skip-ink: none; 65 | } 66 | 67 | address { 68 | margin-bottom: 1rem; 69 | font-style: normal; 70 | line-height: inherit; 71 | } 72 | 73 | ol, 74 | ul, 75 | dl { 76 | margin-top: 0; 77 | margin-bottom: 1rem; 78 | } 79 | 80 | ol ol, 81 | ul ul, 82 | ol ul, 83 | ul ol { 84 | margin-bottom: 0; 85 | } 86 | 87 | dt { 88 | font-weight: 700; 89 | } 90 | 91 | dd { 92 | margin-bottom: .5rem; 93 | margin-left: 0; 94 | } 95 | 96 | blockquote { 97 | margin: 0 0 1rem; 98 | } 99 | 100 | b, 101 | strong { 102 | font-weight: bolder; 103 | } 104 | 105 | small { 106 | font-size: 80%; 107 | } 108 | 109 | sub, 110 | sup { 111 | position: relative; 112 | font-size: 75%; 113 | line-height: 0; 114 | vertical-align: baseline; 115 | } 116 | 117 | sub { 118 | bottom: -.25em; 119 | } 120 | 121 | sup { 122 | top: -.5em; 123 | } 124 | 125 | a { 126 | color: #007bff; 127 | text-decoration: none; 128 | background-color: transparent; 129 | } 130 | 131 | a:hover { 132 | color: #0056b3; 133 | text-decoration: underline; 134 | } 135 | 136 | a:not([href]) { 137 | color: inherit; 138 | text-decoration: none; 139 | } 140 | 141 | a:not([href]):hover { 142 | color: inherit; 143 | text-decoration: none; 144 | } 145 | 146 | pre, 147 | code, 148 | kbd, 149 | samp { 150 | font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; 151 | font-size: 1em; 152 | } 153 | 154 | pre { 155 | margin-top: 0; 156 | margin-bottom: 1rem; 157 | overflow: auto; 158 | } 159 | 160 | figure { 161 | margin: 0 0 1rem; 162 | } 163 | 164 | img { 165 | vertical-align: middle; 166 | border-style: none; 167 | } 168 | 169 | svg { 170 | overflow: hidden; 171 | vertical-align: middle; 172 | } 173 | 174 | table { 175 | border-collapse: collapse; 176 | } 177 | 178 | caption { 179 | padding-top: 0.75rem; 180 | padding-bottom: 0.75rem; 181 | color: #6c757d; 182 | text-align: left; 183 | caption-side: bottom; 184 | } 185 | 186 | th { 187 | text-align: inherit; 188 | } 189 | 190 | label { 191 | 
display: inline-block; 192 | margin-bottom: 0.5rem; 193 | } 194 | 195 | button { 196 | border-radius: 0; 197 | } 198 | 199 | button:focus { 200 | outline: 1px dotted; 201 | outline: 5px auto -webkit-focus-ring-color; 202 | } 203 | 204 | input, 205 | button, 206 | select, 207 | optgroup, 208 | textarea { 209 | margin: 0; 210 | font-family: inherit; 211 | font-size: inherit; 212 | line-height: inherit; 213 | } 214 | 215 | button, 216 | input { 217 | overflow: visible; 218 | } 219 | 220 | button, 221 | select { 222 | text-transform: none; 223 | } 224 | 225 | select { 226 | word-wrap: normal; 227 | } 228 | 229 | button, 230 | [type="button"], 231 | [type="reset"], 232 | [type="submit"] { 233 | -webkit-appearance: button; 234 | } 235 | 236 | button:not(:disabled), 237 | [type="button"]:not(:disabled), 238 | [type="reset"]:not(:disabled), 239 | [type="submit"]:not(:disabled) { 240 | cursor: pointer; 241 | } 242 | 243 | button::-moz-focus-inner, 244 | [type="button"]::-moz-focus-inner, 245 | [type="reset"]::-moz-focus-inner, 246 | [type="submit"]::-moz-focus-inner { 247 | padding: 0; 248 | border-style: none; 249 | } 250 | 251 | input[type="radio"], 252 | input[type="checkbox"] { 253 | box-sizing: border-box; 254 | padding: 0; 255 | } 256 | 257 | input[type="date"], 258 | input[type="time"], 259 | input[type="datetime-local"], 260 | input[type="month"] { 261 | -webkit-appearance: listbox; 262 | } 263 | 264 | textarea { 265 | overflow: auto; 266 | resize: vertical; 267 | } 268 | 269 | fieldset { 270 | min-width: 0; 271 | padding: 0; 272 | margin: 0; 273 | border: 0; 274 | } 275 | 276 | legend { 277 | display: block; 278 | width: 100%; 279 | max-width: 100%; 280 | padding: 0; 281 | margin-bottom: .5rem; 282 | font-size: 1.5rem; 283 | line-height: inherit; 284 | color: inherit; 285 | white-space: normal; 286 | } 287 | 288 | progress { 289 | vertical-align: baseline; 290 | } 291 | 292 | [type="number"]::-webkit-inner-spin-button, 293 | [type="number"]::-webkit-outer-spin-button { 294 | height: auto; 295 | } 296 | 297 | [type="search"] { 298 | outline-offset: -2px; 299 | -webkit-appearance: none; 300 | } 301 | 302 | [type="search"]::-webkit-search-decoration { 303 | -webkit-appearance: none; 304 | } 305 | 306 | ::-webkit-file-upload-button { 307 | font: inherit; 308 | -webkit-appearance: button; 309 | } 310 | 311 | output { 312 | display: inline-block; 313 | } 314 | 315 | summary { 316 | display: list-item; 317 | cursor: pointer; 318 | } 319 | 320 | template { 321 | display: none; 322 | } 323 | 324 | [hidden] { 325 | display: none !important; 326 | } 327 | /*# sourceMappingURL=bootstrap-reboot.css.map */ -------------------------------------------------------------------------------- /FlaskProject/static/vendor/bootstrap/css/bootstrap-reboot.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Reboot v4.4.1 (https://getbootstrap.com/) 3 | * Copyright 2011-2019 The Bootstrap Authors 4 | * Copyright 2011-2019 Twitter, Inc. 
5 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 6 | * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md) 7 | */*,::after,::before{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus:not(:focus-visible){outline:0!important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]){color:inherit;text-decoration:none}a:not([href]):hover{color:inherit;text-decoration:none}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto 
-webkit-focus-ring-color}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{padding:0;border-style:none}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=date],input[type=datetime-local],input[type=month],input[type=time]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none!important} 8 | /*# sourceMappingURL=bootstrap-reboot.min.css.map */ -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/css/animations.css: -------------------------------------------------------------------------------- 1 | @-webkit-keyframes spin 2 | { 3 | 0% 4 | { 5 | -webkit-transform: rotate(0); 6 | transform: rotate(0); 7 | } 8 | 100% 9 | { 10 | -webkit-transform: rotate(359deg); 11 | transform: rotate(359deg); 12 | } 13 | } 14 | @keyframes spin 15 | { 16 | 0% 17 | { 18 | -webkit-transform: rotate(0); 19 | transform: rotate(0); 20 | } 21 | 100% 22 | { 23 | -webkit-transform: rotate(359deg); 24 | transform: rotate(359deg); 25 | } 26 | } 27 | @-webkit-keyframes burst 28 | { 29 | 0% 30 | { 31 | -webkit-transform: scale(1); 32 | transform: scale(1); 33 | 34 | opacity: 1; 35 | } 36 | 90% 37 | { 38 | -webkit-transform: scale(1.5); 39 | transform: scale(1.5); 40 | 41 | opacity: 0; 42 | } 43 | } 44 | @keyframes burst 45 | { 46 | 0% 47 | { 48 | -webkit-transform: scale(1); 49 | transform: scale(1); 50 | 51 | opacity: 1; 52 | } 53 | 90% 54 | { 55 | -webkit-transform: scale(1.5); 56 | transform: scale(1.5); 57 | 58 | opacity: 0; 59 | } 60 | } 61 | @-webkit-keyframes flashing 62 | { 63 | 0% 64 | { 65 | opacity: 1; 66 | } 67 | 45% 68 | { 69 | opacity: 0; 70 | } 71 | 90% 72 | { 73 | opacity: 1; 74 | } 75 | } 76 | @keyframes flashing 77 | { 78 | 0% 79 | { 80 | opacity: 1; 81 | } 82 | 45% 83 | { 84 | opacity: 0; 85 | } 86 | 90% 87 | { 88 | opacity: 1; 89 | } 90 | } 91 | @-webkit-keyframes fade-left 92 | { 93 | 0% 94 | { 95 | -webkit-transform: translateX(0); 96 | transform: translateX(0); 97 | 98 | opacity: 1; 99 | } 100 | 75% 101 | { 102 | -webkit-transform: translateX(-20px); 103 | transform: translateX(-20px); 104 | 105 | opacity: 0; 106 | } 107 | } 108 | @keyframes fade-left 109 | { 110 | 0% 111 | { 112 | -webkit-transform: translateX(0); 113 | transform: translateX(0); 114 | 115 | opacity: 1; 116 | } 117 | 75% 118 | { 119 | -webkit-transform: translateX(-20px); 120 | transform: translateX(-20px); 121 | 122 | opacity: 0; 123 | } 124 | } 125 | 
@-webkit-keyframes fade-right 126 | { 127 | 0% 128 | { 129 | -webkit-transform: translateX(0); 130 | transform: translateX(0); 131 | 132 | opacity: 1; 133 | } 134 | 75% 135 | { 136 | -webkit-transform: translateX(20px); 137 | transform: translateX(20px); 138 | 139 | opacity: 0; 140 | } 141 | } 142 | @keyframes fade-right 143 | { 144 | 0% 145 | { 146 | -webkit-transform: translateX(0); 147 | transform: translateX(0); 148 | 149 | opacity: 1; 150 | } 151 | 75% 152 | { 153 | -webkit-transform: translateX(20px); 154 | transform: translateX(20px); 155 | 156 | opacity: 0; 157 | } 158 | } 159 | @-webkit-keyframes fade-up 160 | { 161 | 0% 162 | { 163 | -webkit-transform: translateY(0); 164 | transform: translateY(0); 165 | 166 | opacity: 1; 167 | } 168 | 75% 169 | { 170 | -webkit-transform: translateY(-20px); 171 | transform: translateY(-20px); 172 | 173 | opacity: 0; 174 | } 175 | } 176 | @keyframes fade-up 177 | { 178 | 0% 179 | { 180 | -webkit-transform: translateY(0); 181 | transform: translateY(0); 182 | 183 | opacity: 1; 184 | } 185 | 75% 186 | { 187 | -webkit-transform: translateY(-20px); 188 | transform: translateY(-20px); 189 | 190 | opacity: 0; 191 | } 192 | } 193 | @-webkit-keyframes fade-down 194 | { 195 | 0% 196 | { 197 | -webkit-transform: translateY(0); 198 | transform: translateY(0); 199 | 200 | opacity: 1; 201 | } 202 | 75% 203 | { 204 | -webkit-transform: translateY(20px); 205 | transform: translateY(20px); 206 | 207 | opacity: 0; 208 | } 209 | } 210 | @keyframes fade-down 211 | { 212 | 0% 213 | { 214 | -webkit-transform: translateY(0); 215 | transform: translateY(0); 216 | 217 | opacity: 1; 218 | } 219 | 75% 220 | { 221 | -webkit-transform: translateY(20px); 222 | transform: translateY(20px); 223 | 224 | opacity: 0; 225 | } 226 | } 227 | @-webkit-keyframes tada 228 | { 229 | from 230 | { 231 | -webkit-transform: scale3d(1, 1, 1); 232 | transform: scale3d(1, 1, 1); 233 | } 234 | 235 | 10%, 236 | 20% 237 | { 238 | -webkit-transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 239 | transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 240 | } 241 | 242 | 30%, 243 | 50%, 244 | 70%, 245 | 90% 246 | { 247 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 248 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 249 | } 250 | 251 | 40%, 252 | 60%, 253 | 80% 254 | { 255 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, -10deg); 256 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, -10deg); 257 | } 258 | 259 | to 260 | { 261 | -webkit-transform: scale3d(1, 1, 1); 262 | transform: scale3d(1, 1, 1); 263 | } 264 | } 265 | 266 | @keyframes tada 267 | { 268 | from 269 | { 270 | -webkit-transform: scale3d(1, 1, 1); 271 | transform: scale3d(1, 1, 1); 272 | } 273 | 274 | 10%, 275 | 20% 276 | { 277 | -webkit-transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 278 | transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 279 | } 280 | 281 | 30%, 282 | 50%, 283 | 70%, 284 | 90% 285 | { 286 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 287 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 288 | } 289 | 290 | 40%, 291 | 60%, 292 | 80% 293 | { 294 | -webkit-transform: rotate3d(0, 0, 1, -10deg); 295 | transform: rotate3d(0, 0, 1, -10deg); 296 | } 297 | 298 | to 299 | { 300 | -webkit-transform: scale3d(1, 1, 1); 301 | transform: scale3d(1, 1, 1); 302 | } 303 | } 304 | .bx-spin 305 | { 306 | -webkit-animation: spin 2s linear infinite; 307 | animation: spin 2s linear infinite; 308 | } 309 | .bx-spin-hover:hover 310 | { 311 | 
-webkit-animation: spin 2s linear infinite; 312 | animation: spin 2s linear infinite; 313 | } 314 | 315 | .bx-tada 316 | { 317 | -webkit-animation: tada 1.5s ease infinite; 318 | animation: tada 1.5s ease infinite; 319 | } 320 | .bx-tada-hover:hover 321 | { 322 | -webkit-animation: tada 1.5s ease infinite; 323 | animation: tada 1.5s ease infinite; 324 | } 325 | 326 | .bx-flashing 327 | { 328 | -webkit-animation: flashing 1.5s infinite linear; 329 | animation: flashing 1.5s infinite linear; 330 | } 331 | .bx-flashing-hover:hover 332 | { 333 | -webkit-animation: flashing 1.5s infinite linear; 334 | animation: flashing 1.5s infinite linear; 335 | } 336 | 337 | .bx-burst 338 | { 339 | -webkit-animation: burst 1.5s infinite linear; 340 | animation: burst 1.5s infinite linear; 341 | } 342 | .bx-burst-hover:hover 343 | { 344 | -webkit-animation: burst 1.5s infinite linear; 345 | animation: burst 1.5s infinite linear; 346 | } 347 | .bx-fade-up 348 | { 349 | -webkit-animation: fade-up 1.5s infinite linear; 350 | animation: fade-up 1.5s infinite linear; 351 | } 352 | .bx-fade-up-hover:hover 353 | { 354 | -webkit-animation: fade-up 1.5s infinite linear; 355 | animation: fade-up 1.5s infinite linear; 356 | } 357 | .bx-fade-down 358 | { 359 | -webkit-animation: fade-down 1.5s infinite linear; 360 | animation: fade-down 1.5s infinite linear; 361 | } 362 | .bx-fade-down-hover:hover 363 | { 364 | -webkit-animation: fade-down 1.5s infinite linear; 365 | animation: fade-down 1.5s infinite linear; 366 | } 367 | .bx-fade-left 368 | { 369 | -webkit-animation: fade-left 1.5s infinite linear; 370 | animation: fade-left 1.5s infinite linear; 371 | } 372 | .bx-fade-left-hover:hover 373 | { 374 | -webkit-animation: fade-left 1.5s infinite linear; 375 | animation: fade-left 1.5s infinite linear; 376 | } 377 | .bx-fade-right 378 | { 379 | -webkit-animation: fade-right 1.5s infinite linear; 380 | animation: fade-right 1.5s infinite linear; 381 | } 382 | .bx-fade-right-hover:hover 383 | { 384 | -webkit-animation: fade-right 1.5s infinite linear; 385 | animation: fade-right 1.5s infinite linear; 386 | } -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/css/transformations.css: -------------------------------------------------------------------------------- 1 | .bx-rotate-90 2 | { 3 | transform: rotate(90deg); 4 | 5 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=1)'; 6 | } 7 | .bx-rotate-180 8 | { 9 | transform: rotate(180deg); 10 | 11 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=2)'; 12 | } 13 | .bx-rotate-270 14 | { 15 | transform: rotate(270deg); 16 | 17 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=3)'; 18 | } 19 | .bx-flip-horizontal 20 | { 21 | transform: scaleX(-1); 22 | 23 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)'; 24 | } 25 | .bx-flip-vertical 26 | { 27 | transform: scaleY(-1); 28 | 29 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)'; 30 | } 31 | -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/fonts/boxicons.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.eot -------------------------------------------------------------------------------- 
/FlaskProject/static/vendor/boxicons/fonts/boxicons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.ttf -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/fonts/boxicons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/fonts/boxicons.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff2 -------------------------------------------------------------------------------- /FlaskProject/static/vendor/counterup/counterup.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * jquery.counterup.js 2.1.0 3 | * 4 | * Copyright 2013, Benjamin Intal http://gambit.ph @bfintal 5 | * Released under the GPL v2 License 6 | * 7 | * Amended by Jeremy Paris, Ciro Mattia Gonano and others 8 | * 9 | * Date: Feb 24, 2017 10 | */ 11 | (function($){"use strict";$.fn.counterUp=function(options){var settings=$.extend({time:400,delay:10,offset:100,beginAt:0,formatter:false,context:"window",callback:function(){}},options),s;return this.each(function(){var $this=$(this),counter={time:$(this).data("counterup-time")||settings.time,delay:$(this).data("counterup-delay")||settings.delay,offset:$(this).data("counterup-offset")||settings.offset,beginAt:$(this).data("counterup-beginat")||settings.beginAt,context:$(this).data("counterup-context")||settings.context};var counterUpper=function(){var nums=[];var divisions=counter.time/counter.delay;var num=$(this).attr("data-num")?$(this).attr("data-num"):$this.text();var isComma=/[0-9]+,[0-9]+/.test(num);num=num.replace(/,/g,"");var decimalPlaces=(num.split(".")[1]||[]).length;if(counter.beginAt>num)counter.beginAt=num;var isTime=/[0-9]+:[0-9]+:[0-9]+/.test(num);if(isTime){var times=num.split(":"),m=1;s=0;while(times.length>0){s+=m*parseInt(times.pop(),10);m*=60}}for(var i=divisions;i>=counter.beginAt/num*divisions;i--){var newNum=parseFloat(num/divisions*i).toFixed(decimalPlaces);if(isTime){newNum=parseInt(s/divisions*i);var hours=parseInt(newNum/3600)%24;var minutes=parseInt(newNum/60)%60;var seconds=parseInt(newNum%60,10);newNum=(hours<10?"0"+hours:hours)+":"+(minutes<10?"0"+minutes:minutes)+":"+(seconds<10?"0"+seconds:seconds)}if(isComma){while(/(\d+)(\d{3})/.test(newNum.toString())){newNum=newNum.toString().replace(/(\d+)(\d{3})/,"$1"+","+"$2")}}if(settings.formatter){newNum=settings.formatter.call(this,newNum)}nums.unshift(newNum)}$this.data("counterup-nums",nums);$this.text(counter.beginAt);var 
f=function(){if(!$this.data("counterup-nums")){settings.callback.call(this);return}$this.html($this.data("counterup-nums").shift());if($this.data("counterup-nums").length){setTimeout($this.data("counterup-func"),counter.delay)}else{$this.data("counterup-nums",null);$this.data("counterup-func",null);settings.callback.call(this)}};$this.data("counterup-func",f);setTimeout($this.data("counterup-func"),counter.delay)};$this.waypoint(function(direction){counterUpper();this.destroy()},{offset:counter.offset+"%",context:counter.context})})}})(jQuery); 12 | -------------------------------------------------------------------------------- /FlaskProject/static/vendor/icofont/fonts/icofont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/icofont/fonts/icofont.woff -------------------------------------------------------------------------------- /FlaskProject/static/vendor/icofont/fonts/icofont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/icofont/fonts/icofont.woff2 -------------------------------------------------------------------------------- /FlaskProject/static/vendor/jquery.easing/jquery.easing.min.js: -------------------------------------------------------------------------------- 1 | (function(factory){if(typeof define==="function"&&define.amd){define(["jquery"],function($){return factory($)})}else if(typeof module==="object"&&typeof module.exports==="object"){exports=factory(require("jquery"))}else{factory(jQuery)}})(function($){$.easing.jswing=$.easing.swing;var pow=Math.pow,sqrt=Math.sqrt,sin=Math.sin,cos=Math.cos,PI=Math.PI,c1=1.70158,c2=c1*1.525,c3=c1+1,c4=2*PI/3,c5=2*PI/4.5;function bounceOut(x){var n1=7.5625,d1=2.75;if(x<1/d1){return n1*x*x}else if(x<2/d1){return n1*(x-=1.5/d1)*x+.75}else if(x<2.5/d1){return n1*(x-=2.25/d1)*x+.9375}else{return n1*(x-=2.625/d1)*x+.984375}}$.extend($.easing,{def:"easeOutQuad",swing:function(x){return $.easing[$.easing.def](x)},easeInQuad:function(x){return x*x},easeOutQuad:function(x){return 1-(1-x)*(1-x)},easeInOutQuad:function(x){return x<.5?2*x*x:1-pow(-2*x+2,2)/2},easeInCubic:function(x){return x*x*x},easeOutCubic:function(x){return 1-pow(1-x,3)},easeInOutCubic:function(x){return x<.5?4*x*x*x:1-pow(-2*x+2,3)/2},easeInQuart:function(x){return x*x*x*x},easeOutQuart:function(x){return 1-pow(1-x,4)},easeInOutQuart:function(x){return x<.5?8*x*x*x*x:1-pow(-2*x+2,4)/2},easeInQuint:function(x){return x*x*x*x*x},easeOutQuint:function(x){return 1-pow(1-x,5)},easeInOutQuint:function(x){return x<.5?16*x*x*x*x*x:1-pow(-2*x+2,5)/2},easeInSine:function(x){return 1-cos(x*PI/2)},easeOutSine:function(x){return sin(x*PI/2)},easeInOutSine:function(x){return-(cos(PI*x)-1)/2},easeInExpo:function(x){return x===0?0:pow(2,10*x-10)},easeOutExpo:function(x){return x===1?1:1-pow(2,-10*x)},easeInOutExpo:function(x){return x===0?0:x===1?1:x<.5?pow(2,20*x-10)/2:(2-pow(2,-20*x+10))/2},easeInCirc:function(x){return 1-sqrt(1-pow(x,2))},easeOutCirc:function(x){return sqrt(1-pow(x-1,2))},easeInOutCirc:function(x){return x<.5?(1-sqrt(1-pow(2*x,2)))/2:(sqrt(1-pow(-2*x+2,2))+1)/2},easeInElastic:function(x){return x===0?0:x===1?1:-pow(2,10*x-10)*sin((x*10-10.75)*c4)},easeOutElastic:function(x){return 
x===0?0:x===1?1:pow(2,-10*x)*sin((x*10-.75)*c4)+1},easeInOutElastic:function(x){return x===0?0:x===1?1:x<.5?-(pow(2,20*x-10)*sin((20*x-11.125)*c5))/2:pow(2,-20*x+10)*sin((20*x-11.125)*c5)/2+1},easeInBack:function(x){return c3*x*x*x-c1*x*x},easeOutBack:function(x){return 1+c3*pow(x-1,3)+c1*pow(x-1,2)},easeInOutBack:function(x){return x<.5?pow(2*x,2)*((c2+1)*2*x-c2)/2:(pow(2*x-2,2)*((c2+1)*(x*2-2)+c2)+2)/2},easeInBounce:function(x){return 1-bounceOut(1-x)},easeOutBounce:bounceOut,easeInOutBounce:function(x){return x<.5?(1-bounceOut(1-2*x))/2:(1+bounceOut(2*x-1))/2}})}); -------------------------------------------------------------------------------- /FlaskProject/static/vendor/php-email-form/validate.js: -------------------------------------------------------------------------------- 1 | jQuery(document).ready(function($) { 2 | "use strict"; 3 | 4 | //Contact 5 | $('form.php-email-form').submit(function() { 6 | 7 | var f = $(this).find('.form-group'), 8 | ferror = false, 9 | emailExp = /^[^\s()<>@,;:\/]+@\w[\w\.-]+\.[a-z]{2,}$/i; 10 | 11 | f.children('input').each(function() { // run all inputs 12 | 13 | var i = $(this); // current input 14 | var rule = i.attr('data-rule'); 15 | 16 | if (rule !== undefined) { 17 | var ierror = false; // error flag for current input 18 | var pos = rule.indexOf(':', 0); 19 | if (pos >= 0) { 20 | var exp = rule.substr(pos + 1, rule.length); 21 | rule = rule.substr(0, pos); 22 | } else { 23 | rule = rule.substr(pos + 1, rule.length); 24 | } 25 | 26 | switch (rule) { 27 | case 'required': 28 | if (i.val() === '') { 29 | ferror = ierror = true; 30 | } 31 | break; 32 | 33 | case 'minlen': 34 | if (i.val().length < parseInt(exp)) { 35 | ferror = ierror = true; 36 | } 37 | break; 38 | 39 | case 'email': 40 | if (!emailExp.test(i.val())) { 41 | ferror = ierror = true; 42 | } 43 | break; 44 | 45 | case 'checked': 46 | if (! i.is(':checked')) { 47 | ferror = ierror = true; 48 | } 49 | break; 50 | 51 | case 'regexp': 52 | exp = new RegExp(exp); 53 | if (!exp.test(i.val())) { 54 | ferror = ierror = true; 55 | } 56 | break; 57 | } 58 | i.next('.validate').html((ierror ? (i.attr('data-msg') !== undefined ? i.attr('data-msg') : 'wrong Input') : '')).show('blind'); 59 | } 60 | }); 61 | f.children('textarea').each(function() { // run all inputs 62 | 63 | var i = $(this); // current input 64 | var rule = i.attr('data-rule'); 65 | 66 | if (rule !== undefined) { 67 | var ierror = false; // error flag for current input 68 | var pos = rule.indexOf(':', 0); 69 | if (pos >= 0) { 70 | var exp = rule.substr(pos + 1, rule.length); 71 | rule = rule.substr(0, pos); 72 | } else { 73 | rule = rule.substr(pos + 1, rule.length); 74 | } 75 | 76 | switch (rule) { 77 | case 'required': 78 | if (i.val() === '') { 79 | ferror = ierror = true; 80 | } 81 | break; 82 | 83 | case 'minlen': 84 | if (i.val().length < parseInt(exp)) { 85 | ferror = ierror = true; 86 | } 87 | break; 88 | } 89 | i.next('.validate').html((ierror ? (i.attr('data-msg') != undefined ? i.attr('data-msg') : 'wrong Input') : '')).show('blind'); 90 | } 91 | }); 92 | if (ferror) return false; 93 | else var str = $(this).serialize(); 94 | 95 | var this_form = $(this); 96 | var action = $(this).attr('action'); 97 | 98 | if( ! 
action ) { 99 | this_form.find('.loading').slideUp(); 100 | this_form.find('.error-message').slideDown().html('The form action property is not set!'); 101 | return false; 102 | } 103 | 104 | this_form.find('.sent-message').slideUp(); 105 | this_form.find('.error-message').slideUp(); 106 | this_form.find('.loading').slideDown(); 107 | 108 | $.ajax({ 109 | type: "POST", 110 | url: action, 111 | data: str, 112 | success: function(msg) { 113 | if (msg == 'OK') { 114 | this_form.find('.loading').slideUp(); 115 | this_form.find('.sent-message').slideDown(); 116 | this_form.find("input:not(input[type=submit]), textarea").val(''); 117 | } else { 118 | this_form.find('.loading').slideUp(); 119 | this_form.find('.error-message').slideDown().html(msg); 120 | } 121 | } 122 | }); 123 | return false; 124 | }); 125 | 126 | }); 127 | -------------------------------------------------------------------------------- /FlaskProject/static/vendor/waypoints/jquery.waypoints.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | Waypoints - 4.0.1 3 | Copyright © 2011-2016 Caleb Troughton 4 | Licensed under the MIT license. 5 | https://github.com/imakewebthings/waypoints/blob/master/licenses.txt 6 | */ 7 | !function(){"use strict";function t(o){if(!o)throw new Error("No options passed to Waypoint constructor");if(!o.element)throw new Error("No element option passed to Waypoint constructor");if(!o.handler)throw new Error("No handler option passed to Waypoint constructor");this.key="waypoint-"+e,this.options=t.Adapter.extend({},t.defaults,o),this.element=this.options.element,this.adapter=new t.Adapter(this.element),this.callback=o.handler,this.axis=this.options.horizontal?"horizontal":"vertical",this.enabled=this.options.enabled,this.triggerPoint=null,this.group=t.Group.findOrCreate({name:this.options.group,axis:this.axis}),this.context=t.Context.findOrCreateByElement(this.options.context),t.offsetAliases[this.options.offset]&&(this.options.offset=t.offsetAliases[this.options.offset]),this.group.add(this),this.context.add(this),i[this.key]=this,e+=1}var e=0,i={};t.prototype.queueTrigger=function(t){this.group.queueTrigger(this,t)},t.prototype.trigger=function(t){this.enabled&&this.callback&&this.callback.apply(this,t)},t.prototype.destroy=function(){this.context.remove(this),this.group.remove(this),delete i[this.key]},t.prototype.disable=function(){return this.enabled=!1,this},t.prototype.enable=function(){return this.context.refresh(),this.enabled=!0,this},t.prototype.next=function(){return this.group.next(this)},t.prototype.previous=function(){return this.group.previous(this)},t.invokeAll=function(t){var e=[];for(var o in i)e.push(i[o]);for(var n=0,r=e.length;r>n;n++)e[n][t]()},t.destroyAll=function(){t.invokeAll("destroy")},t.disableAll=function(){t.invokeAll("disable")},t.enableAll=function(){t.Context.refreshAll();for(var e in i)i[e].enabled=!0;return this},t.refreshAll=function(){t.Context.refreshAll()},t.viewportHeight=function(){return window.innerHeight||document.documentElement.clientHeight},t.viewportWidth=function(){return document.documentElement.clientWidth},t.adapters=[],t.defaults={context:window,continuous:!0,enabled:!0,group:"default",horizontal:!1,offset:0},t.offsetAliases={"bottom-in-view":function(){return this.context.innerHeight()-this.adapter.outerHeight()},"right-in-view":function(){return this.context.innerWidth()-this.adapter.outerWidth()}},window.Waypoint=t}(),function(){"use strict";function t(t){window.setTimeout(t,1e3/60)}function 
e(t){this.element=t,this.Adapter=n.Adapter,this.adapter=new this.Adapter(t),this.key="waypoint-context-"+i,this.didScroll=!1,this.didResize=!1,this.oldScroll={x:this.adapter.scrollLeft(),y:this.adapter.scrollTop()},this.waypoints={vertical:{},horizontal:{}},t.waypointContextKey=this.key,o[t.waypointContextKey]=this,i+=1,n.windowContext||(n.windowContext=!0,n.windowContext=new e(window)),this.createThrottledScrollHandler(),this.createThrottledResizeHandler()}var i=0,o={},n=window.Waypoint,r=window.onload;e.prototype.add=function(t){var e=t.options.horizontal?"horizontal":"vertical";this.waypoints[e][t.key]=t,this.refresh()},e.prototype.checkEmpty=function(){var t=this.Adapter.isEmptyObject(this.waypoints.horizontal),e=this.Adapter.isEmptyObject(this.waypoints.vertical),i=this.element==this.element.window;t&&e&&!i&&(this.adapter.off(".waypoints"),delete o[this.key])},e.prototype.createThrottledResizeHandler=function(){function t(){e.handleResize(),e.didResize=!1}var e=this;this.adapter.on("resize.waypoints",function(){e.didResize||(e.didResize=!0,n.requestAnimationFrame(t))})},e.prototype.createThrottledScrollHandler=function(){function t(){e.handleScroll(),e.didScroll=!1}var e=this;this.adapter.on("scroll.waypoints",function(){(!e.didScroll||n.isTouch)&&(e.didScroll=!0,n.requestAnimationFrame(t))})},e.prototype.handleResize=function(){n.Context.refreshAll()},e.prototype.handleScroll=function(){var t={},e={horizontal:{newScroll:this.adapter.scrollLeft(),oldScroll:this.oldScroll.x,forward:"right",backward:"left"},vertical:{newScroll:this.adapter.scrollTop(),oldScroll:this.oldScroll.y,forward:"down",backward:"up"}};for(var i in e){var o=e[i],n=o.newScroll>o.oldScroll,r=n?o.forward:o.backward;for(var s in this.waypoints[i]){var a=this.waypoints[i][s];if(null!==a.triggerPoint){var l=o.oldScroll=a.triggerPoint,p=l&&h,u=!l&&!h;(p||u)&&(a.queueTrigger(r),t[a.group.id]=a.group)}}}for(var c in t)t[c].flushTriggers();this.oldScroll={x:e.horizontal.newScroll,y:e.vertical.newScroll}},e.prototype.innerHeight=function(){return this.element==this.element.window?n.viewportHeight():this.adapter.innerHeight()},e.prototype.remove=function(t){delete this.waypoints[t.axis][t.key],this.checkEmpty()},e.prototype.innerWidth=function(){return this.element==this.element.window?n.viewportWidth():this.adapter.innerWidth()},e.prototype.destroy=function(){var t=[];for(var e in this.waypoints)for(var i in this.waypoints[e])t.push(this.waypoints[e][i]);for(var o=0,n=t.length;n>o;o++)t[o].destroy()},e.prototype.refresh=function(){var t,e=this.element==this.element.window,i=e?void 0:this.adapter.offset(),o={};this.handleScroll(),t={horizontal:{contextOffset:e?0:i.left,contextScroll:e?0:this.oldScroll.x,contextDimension:this.innerWidth(),oldScroll:this.oldScroll.x,forward:"right",backward:"left",offsetProp:"left"},vertical:{contextOffset:e?0:i.top,contextScroll:e?0:this.oldScroll.y,contextDimension:this.innerHeight(),oldScroll:this.oldScroll.y,forward:"down",backward:"up",offsetProp:"top"}};for(var r in t){var s=t[r];for(var a in this.waypoints[r]){var l,h,p,u,c,d=this.waypoints[r][a],f=d.options.offset,w=d.triggerPoint,y=0,g=null==w;d.element!==d.element.window&&(y=d.adapter.offset()[s.offsetProp]),"function"==typeof f?f=f.apply(d):"string"==typeof 
f&&(f=parseFloat(f),d.options.offset.indexOf("%")>-1&&(f=Math.ceil(s.contextDimension*f/100))),l=s.contextScroll-s.contextOffset,d.triggerPoint=Math.floor(y+l-f),h=w=s.oldScroll,u=h&&p,c=!h&&!p,!g&&u?(d.queueTrigger(s.backward),o[d.group.id]=d.group):!g&&c?(d.queueTrigger(s.forward),o[d.group.id]=d.group):g&&s.oldScroll>=d.triggerPoint&&(d.queueTrigger(s.forward),o[d.group.id]=d.group)}}return n.requestAnimationFrame(function(){for(var t in o)o[t].flushTriggers()}),this},e.findOrCreateByElement=function(t){return e.findByElement(t)||new e(t)},e.refreshAll=function(){for(var t in o)o[t].refresh()},e.findByElement=function(t){return o[t.waypointContextKey]},window.onload=function(){r&&r(),e.refreshAll()},n.requestAnimationFrame=function(e){var i=window.requestAnimationFrame||window.mozRequestAnimationFrame||window.webkitRequestAnimationFrame||t;i.call(window,e)},n.Context=e}(),function(){"use strict";function t(t,e){return t.triggerPoint-e.triggerPoint}function e(t,e){return e.triggerPoint-t.triggerPoint}function i(t){this.name=t.name,this.axis=t.axis,this.id=this.name+"-"+this.axis,this.waypoints=[],this.clearTriggerQueues(),o[this.axis][this.name]=this}var o={vertical:{},horizontal:{}},n=window.Waypoint;i.prototype.add=function(t){this.waypoints.push(t)},i.prototype.clearTriggerQueues=function(){this.triggerQueues={up:[],down:[],left:[],right:[]}},i.prototype.flushTriggers=function(){for(var i in this.triggerQueues){var o=this.triggerQueues[i],n="up"===i||"left"===i;o.sort(n?e:t);for(var r=0,s=o.length;s>r;r+=1){var a=o[r];(a.options.continuous||r===o.length-1)&&a.trigger([i])}}this.clearTriggerQueues()},i.prototype.next=function(e){this.waypoints.sort(t);var i=n.Adapter.inArray(e,this.waypoints),o=i===this.waypoints.length-1;return o?null:this.waypoints[i+1]},i.prototype.previous=function(e){this.waypoints.sort(t);var i=n.Adapter.inArray(e,this.waypoints);return i?this.waypoints[i-1]:null},i.prototype.queueTrigger=function(t,e){this.triggerQueues[e].push(t)},i.prototype.remove=function(t){var e=n.Adapter.inArray(t,this.waypoints);e>-1&&this.waypoints.splice(e,1)},i.prototype.first=function(){return this.waypoints[0]},i.prototype.last=function(){return this.waypoints[this.waypoints.length-1]},i.findOrCreate=function(t){return o[t.axis][t.name]||new i(t)},n.Group=i}(),function(){"use strict";function t(t){this.$element=e(t)}var e=window.jQuery,i=window.Waypoint;e.each(["innerHeight","innerWidth","off","offset","on","outerHeight","outerWidth","scrollLeft","scrollTop"],function(e,i){t.prototype[i]=function(){var t=Array.prototype.slice.call(arguments);return this.$element[i].apply(this.$element,t)}}),e.each(["extend","inArray","isEmptyObject"],function(i,o){t[o]=e[o]}),i.adapters.push({name:"jquery",Adapter:t}),i.Adapter=t}(),function(){"use strict";function t(t){return function(){var i=[],o=arguments[0];return t.isFunction(arguments[0])&&(o=t.extend({},arguments[1]),o.handler=arguments[0]),this.each(function(){var n=t.extend({},o,{element:this});"string"==typeof n.context&&(n.context=t(this).closest(n.context)[0]),i.push(new e(n))}),i}}var e=window.Waypoint;window.jQuery&&(window.jQuery.fn.waypoint=t(window.jQuery)),window.Zepto&&(window.Zepto.fn.waypoint=t(window.Zepto))}(); -------------------------------------------------------------------------------- /FlaskProject/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 豆瓣Top250数据分析 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 
25 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 61 | 62 | 63 |
    64 |
    65 | 66 |
    67 |

    Douban Top250 Data Analysis

    68 |

    Built with Python, Scrapy, Scrapy-Redis, MongoDB, Flask, ECharts, WordCloud and related technologies

    69 |
    70 | 71 | 72 |
    73 | 120 |
    121 | 122 |
    123 |
    124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /FlaskProject/templates/movie.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 豆瓣Top250数据分析 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 61 | 62 | 63 |
    64 |
    65 | 66 |
    67 |

    豆瓣Top250电影排行

    68 |
    69 | 70 | 71 |
    72 |
    73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | {% for movie in movies %} 86 | 87 | 88 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | {% endfor %} 101 |
    排名电影名称评分评价人数导演精彩短评上映时间(/年)
    {{movie['rank']}} 89 | 90 | {{ movie['title'] }} 91 | 92 | {{movie['score']}}{{movie['comment_num']}}{{movie['directed_by']}}{{movie['comment']}}{{movie['year']}}
    102 | 103 | 104 | 105 |
    106 |
    107 | 108 |
    109 |
    110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /FlaskProject/templates/score.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 豆瓣Top250评分分布图 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 62 | 63 | 64 |
    65 |
    66 | 67 |
    68 |

    豆瓣Top250数据分析

    69 |
    70 | 71 | 72 |
    73 |
    74 | 75 |
    76 | 77 | 78 | 79 |
    80 |
    81 | 82 |
    83 |
    84 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /FlaskProject/templates/word.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 豆瓣Top250数据分析 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 63 | 64 | 65 |
    66 |
    67 | 68 |
    69 |
    70 | 71 | 72 |
    73 | 74 |
    75 | 76 |
    77 |

    词频统计

    78 |

    根据250部电影提取出的词云块增强人们对经典电影的领悟

    79 |
    80 | 81 |
    82 |
    83 |

    关于250部电影

    84 |

    从电影中看百味人生

    85 |
    86 | 87 | 88 | 89 |
    90 |
    91 | 92 |
    93 |
    94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /FlaskProject/wordCloud.py: -------------------------------------------------------------------------------- 1 | from wordcloud import WordCloud #词云 2 | import jieba #分词 3 | from matplotlib import pyplot as plt #绘图 数据可视化 4 | from PIL import Image #图片处理 5 | import numpy as np #矩阵运算 6 | import pymongo #数据库 7 | 8 | client = pymongo.MongoClient(host='localhost', port=27017) 9 | db = client.movies 10 | collection = db.douban 11 | query = {} 12 | projection = {} 13 | 14 | projection["title"] = u"$title" 15 | projection["movie_type"] = u"$movie_type" 16 | projection["directedBy"] = u"$directedBy" 17 | projection["_id"] = 0 18 | 19 | cursor = collection.find(query, projection = projection) 20 | text = "" 21 | for doc in cursor: 22 | for content in doc.values(): 23 | content.replace('/',' ') 24 | text = text + content 25 | 26 | cut = jieba.cut(text) 27 | string = ' '.join(cut) 28 | print(len(string)) 29 | 30 | img = Image.open(r'./static/img/tree.jpg') 31 | img_array = np.array(img) #将图片转换为数组 32 | wc = WordCloud( 33 | background_color='white', 34 | mask=img_array, 35 | font_path="msyh.ttc" #字体所在位置C:\Windows\Fonts 36 | ) 37 | wc.generate_from_text(string) 38 | 39 | #绘制图片 40 | 41 | fig = plt.figure(1) 42 | plt.imshow(wc) 43 | plt.axis('off') #是否显示坐标轴 44 | 45 | # plt.show() #显示生成的词云图片 46 | plt.savefig('./static/img/generated_tree.jpg',dpi=500) 47 | -------------------------------------------------------------------------------- /Master/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /Master/.idea/Master.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Master/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /Master/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Master/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Master/main.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import pymongo 3 | import json 4 | 5 | def main(): 6 | r = redis.Redis(host='localhost',port=6379,db=0) 7 | client = pymongo.MongoClient(host='localhost', port=27017) 8 | db = client.movies 9 | collection = db.douban 10 | while True: 11 | source, data = r.blpop(["douban_redis:items"]) 12 | item = json.loads(data) 13 | print(item) 14 | collection.replace_one(filter={"rank":item["rank"]},replacement=item,upsert=True) 15 | 16 | if __name__ == '__main__': 17 | main() 
-------------------------------------------------------------------------------- /Pic/index.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/index.jpg -------------------------------------------------------------------------------- /Pic/mongoDB_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/mongoDB_data.jpg -------------------------------------------------------------------------------- /Pic/movies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/movies.jpg -------------------------------------------------------------------------------- /Pic/proxy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/proxy.jpg -------------------------------------------------------------------------------- /Pic/redis_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/redis_data.jpg -------------------------------------------------------------------------------- /Pic/score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/score.jpg -------------------------------------------------------------------------------- /Pic/slave.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/slave.jpg -------------------------------------------------------------------------------- /Pic/words.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/words.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1. 环境配置 2 | ## 1.1 爬虫部分软件包版本 3 | - Python 3.8.13 4 | - Scrapy 2.6.2 5 | - Scrapy-redis 0.7.3 6 | - pymongo 4.2.0 7 | - redis 4.3.4 8 | ## 1.2 数据库 9 | - MongoDB 10 | - Redis 11 | ## 1.3 前后端交互 12 | ### 前端 13 | - jinja2 14 | - Echarts 15 | ### 后端 16 | - flask 2.2.2 17 | ## 1.4 IP代理池 18 |   参考:[https://github.com/jhao104/proxy_pool](https://github.com/jhao104/proxy_pool)进行配置。 19 | # 2. 项目文件目录 20 | **---FlaskProject(数据可视化代码)**
21 | ------static (static assets used by the front end)
22 | ------templates (front-end page templates)
23 | ------app.py (back-end code)
24 | ------data.txt (sample of the crawled data)
25 | ------wordCloud.py (word-cloud generation code)
26 | **---Master (master-node code)**
27 | ------main.py (moves the data out of Redis and into MongoDB)
28 | **---Pic (screenshots of the running project)**
29 | **---proxy_pool (IP proxy pool)**
30 | **---Slave (slave-node code)**
31 | ------movies
32 | ---------spiders
33 | ------------douban_redis.py (main crawling code)
34 | ---------middlewares.py (middlewares: IP proxying, random User-Agent, etc.)
35 | ---------settings.py (crawler configuration)
36 | # 3. Project setup 37 |   The project follows a distributed design and is split into **master-node code** and **slave-node code**. 38 | ## 3.1 Slave 39 |   A slave runs the crawler: it scrapes data from the site and stores it in the master's Redis database. Because Redis keeps track of which URLs have already been crawled, the crawler can be paused and resumed, any number of slaves can be used, and every slave runs exactly the same code.
40 |   For testing, you can run virtual machines on a single computer to get the effect of a distributed setup. [CentOS 7](http://isoredirect.centos.org/centos/7/isos/x86_64/) is recommended for the virtual machines because it is lightweight and uses few resources. Each slave must be pointed at the master by setting the master's IP and port in its `settings.py`.
41 |   Once a slave's environment is configured, cd into the `spiders` folder and run `scrapy runspider douban_redis.py` to start the slave; it then waits for the master to hand out the start URL.
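The scrapy-redis settings that make this distributed setup work live in `Slave/movies/settings.py` (the full file appears later in this dump). As a minimal sketch of the relevant lines — the address `192.168.108.1` is just this project's example LAN address and must be replaced with your own master's Redis host and port:

```python
# Slave/movies/settings.py -- scrapy-redis essentials (sketch; values must match your master)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # de-duplicate requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # shared request queue kept in Redis
SCHEDULER_PERSIST = True                                    # keep the queue so a crawl can resume

ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,            # push scraped items back into Redis
}

REDIS_HOST = "192.168.108.1"  # IP of the master machine running Redis
REDIS_PORT = "6379"           # Redis port on the master
```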
42 | ## 3.2 Master 43 |   The master maintains the Redis database and persists the items collected in Redis into MongoDB.
44 |   After starting the Redis service, run `lpush douban:start_urls https://movie.douban.com/top250` in `redis-cli.exe` to insert the start URL into Redis; once it is in place, the slaves start crawling automatically.
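If you would rather seed the queue from Python than from `redis-cli.exe`, the same thing can be done with the `redis` package the master already depends on. A small sketch, assuming Redis is running locally on the default port:

```python
import redis  # redis 4.3.4 is listed in the environment section above

# Connect to the master's Redis instance (same connection Master/main.py uses)
r = redis.Redis(host="localhost", port=6379, db=0)

# Push the start URL onto the list the slaves listen on (redis_key = "douban:start_urls")
r.lpush("douban:start_urls", "https://movie.douban.com/top250")
```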
45 |   Running the `proxy_pool` code on the master fetches free proxy IPs and stores them in the `redis` database.
46 |   The master's main.py implements the data-transfer step: it takes the scraped items out of Redis and writes them into MongoDB. 47 | ## 3.3 Visualization 48 |   After installing Flask, open the `FlaskProject` folder on the master and run `app.py` to start the back-end service. Once it is running, visit the default local address and port, `http://127.0.0.1:5000/`, in a browser to see the visualizations.
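Before starting the Flask service you can check that the transfer worked by querying MongoDB directly. A small verification sketch, assuming the default local MongoDB instance and the `movies.douban` collection used throughout this project (note that the spider stores `rank` as a string):

```python
import pymongo  # pymongo 4.2.0 is listed in the environment section above

client = pymongo.MongoClient(host="localhost", port=27017)
collection = client.movies.douban  # database "movies", collection "douban"

# How many of the Top250 movies have been transferred so far
print("movies stored:", collection.count_documents({}))

# Peek at the top-ranked entry; fields such as rank/title/score are set by the spider
print(collection.find_one({"rank": "1"}, {"_id": 0, "title": 1, "score": 1}))
```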
49 | ## 3.4 IP proxy pool 50 |   Based on: [https://github.com/jhao104/proxy_pool](https://github.com/jhao104/proxy_pool)
51 |   After setting up the environment as described in that project, run the following commands to start the proxy-pool program.
    52 | ``` 53 | # 启动调度程序 54 | python proxyPool.py schedule 55 | 56 | # 启动webApi服务 57 | python proxyPool.py server 58 | ``` 59 | # 4. 运行截图 60 | ## 4.1 从机运行 61 | ### 从机爬虫程序 62 | ![从机爬虫程序](https://github.com/CoderDon/Crawler/raw/main/Pic/slave.jpg) 63 | ## 4.2 主机数据库 64 | ### Redis数据库缓存URL 65 | ![Redis数据库缓存URL](https://github.com/CoderDon/Crawler/raw/main/Pic/redis_data.jpg) 66 | ### Redis缓存代理IPs 67 | ![Redis缓存代理IPs](https://github.com/CoderDon/Crawler/raw/main/Pic/proxy.jpg) 68 | ### MongoDB数据库 69 | ![MongoDB数据库](https://github.com/CoderDon/Crawler/raw/main/Pic/mongoDB_data.jpg) 70 | ## 4.3 可视化 71 | ### 首页 72 | ![首页](https://github.com/CoderDon/Crawler/raw/main/Pic/index.jpg) 73 | ### 电影 74 | ![电影](https://github.com/CoderDon/Crawler/raw/main/Pic/movies.jpg) 75 | ### 评分 76 | ![评分](https://github.com/CoderDon/Crawler/raw/main/Pic/score.jpg) 77 | ### 词云 78 | ![词云](https://github.com/CoderDon/Crawler/raw/main/Pic/words.jpg) -------------------------------------------------------------------------------- /Slave/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /Slave/.idea/Slave.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Slave/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /Slave/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Slave/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Slave/movies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__init__.py -------------------------------------------------------------------------------- /Slave/movies/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/__pycache__/middlewares.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/middlewares.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/__pycache__/pipelines.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/pipelines.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/__pycache__/settings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/settings.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class MoviesItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | pass 12 | # page_url = scrapy.Field() 13 | # title = scrapy.Field() 14 | # year = scrapy.Field() 15 | # score = scrapy.Field() 16 | # directedBy = scrapy.Field() 17 | # actors = scrapy.Field() 18 | # movie_type = scrapy.Field() 19 | # comment = scrapy.Field() 20 | # introduc = scrapy.Field() 21 | # image_urls = scrapy.Field() 22 | # image_name = scrapy.Field() 23 | -------------------------------------------------------------------------------- /Slave/movies/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | from fake_useragent import UserAgent 8 | import redis 9 | import random 10 | from scrapy.exceptions import NotConfigured 11 | from twisted.internet.error import ConnectError, TimeoutError 12 | import json 13 | # useful for handling different item types with a single interface 14 | from itemadapter import is_item, ItemAdapter 15 | 16 | 17 | class RandomProxyMiddleWare(object): 18 | def __init__(self, settings): 19 | # 2.初始化配置及相关变量 20 | self.r = redis.Redis(host='localhost', port=6379, db=0) 21 | self.proxy_key = settings.get('PROXY_REDIS_KEY') 22 | self.max_failed = 1 23 | 24 | @property 25 | def proxies(self): 26 | # return [i.decode('utf-8') for i in self.r.hkeys('use_proxy')] 27 | # return [i.decode('utf-8') for i in self.r.hkeys('use_proxy') 28 | # if json.loads(self.r.hget('use_proxy', i.decode('utf-8')).decode('utf-8'))['https'] == True] 29 | return [] 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | # 1. 创建中间件对象 34 | # 默认代理是启用的 35 | if not crawler.settings.getbool('HTTPPROXY_ENABLED'): 36 | raise NotConfigured 37 | return cls(crawler.settings) 38 | 39 | def process_request(self, request, spider): 40 | # 3. 为每个request对象分配随机的ip代理 41 | if self.proxies and not request.meta.get('proxy'): 42 | proxies_list = self.proxies 43 | if proxies_list: 44 | request.meta['proxy'] = 'https://' + random.choice(proxies_list) 45 | 46 | def process_response(self, request, response, spider): 47 | # 4. 
请求成功 48 | # 如果proxy为空则直接返回 49 | if not request.meta.get('proxy'): 50 | return response 51 | cur_proxy = request.meta.get('proxy').replace('https://', '') 52 | # 判断ip是否被对方封禁 53 | if response.status in (400, 401, 403): 54 | # 先拿到当前ip:port对应的value 55 | value = json.loads(self.r.hget(self.proxy_key, cur_proxy).decode('utf-8')) 56 | value['fail_count'] += 1 57 | self.r.hset(self.proxy_key, cur_proxy, 58 | str(value).replace("'", '"').replace('False', 'false').replace('True', 'true')) 59 | # 当某个IP的失败次数累积到一定的数量 60 | filed_times = json.loads(self.r.hget(self.proxy_key, cur_proxy).decode('utf-8'))['fail_count'] or 0 61 | if int(filed_times) >= self.max_failed: 62 | print('got wrong http code (%s) when use %s' % (response.status, cur_proxy)) 63 | # 可以认为该IP被对方封禁。从代理池中将该IP删除 64 | self.remove_proxy(cur_proxy) 65 | del request.meta['proxy'] 66 | # 返回request 将该请求重新->调度器 67 | return request 68 | return response 69 | 70 | def process_exception(self, request, exception, spider): 71 | # 4.1 请求失败 72 | cur_proxy = request.meta.get('proxy') 73 | # 请求使用代理,并且网络请求报错,认为该IP出错,删除,并重新->调度器 74 | if cur_proxy and isinstance(cur_proxy, (ConnectError, TimeoutError)): 75 | print('error (%s) occur when use proxy %s' % (exception, cur_proxy)) 76 | self.remove_proxy(cur_proxy) 77 | del request.meta['proxy'] 78 | return request 79 | 80 | def remove_proxy(self, proxy): 81 | if proxy in self.proxies: 82 | self.r.hdel(self.proxy_key, proxy) 83 | 84 | 85 | class UserAgentMiddleware(object): 86 | def process_request(self, request, spider): 87 | request.headers.setdefault(b'User-Agent', UserAgent().random) 88 | 89 | class MoviesSpiderMiddleware: 90 | # Not all methods need to be defined. If a method is not defined, 91 | # scrapy acts as if the spider middleware does not modify the 92 | # passed objects. 93 | 94 | @classmethod 95 | def from_crawler(cls, crawler): 96 | # This method is used by Scrapy to create your spiders. 97 | s = cls() 98 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 99 | return s 100 | 101 | def process_spider_input(self, response, spider): 102 | # Called for each response that goes through the spider 103 | # middleware and into the spider. 104 | 105 | # Should return None or raise an exception. 106 | return None 107 | 108 | def process_spider_output(self, response, result, spider): 109 | # Called with the results returned from the Spider, after 110 | # it has processed the response. 111 | 112 | # Must return an iterable of Request, or item objects. 113 | for i in result: 114 | yield i 115 | 116 | def process_spider_exception(self, response, exception, spider): 117 | # Called when a spider or process_spider_input() method 118 | # (from other spider middleware) raises an exception. 119 | 120 | # Should return either None or an iterable of Request or item objects. 121 | pass 122 | 123 | def process_start_requests(self, start_requests, spider): 124 | # Called with the start requests of the spider, and works 125 | # similarly to the process_spider_output() method, except 126 | # that it doesn’t have a response associated. 127 | 128 | # Must return only requests (not items). 129 | for r in start_requests: 130 | yield r 131 | 132 | def spider_opened(self, spider): 133 | spider.logger.info('Spider opened: %s' % spider.name) 134 | 135 | 136 | class MoviesDownloaderMiddleware: 137 | # Not all methods need to be defined. If a method is not defined, 138 | # scrapy acts as if the downloader middleware does not modify the 139 | # passed objects. 
140 | 141 | @classmethod 142 | def from_crawler(cls, crawler): 143 | # This method is used by Scrapy to create your spiders. 144 | s = cls() 145 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 146 | return s 147 | 148 | def process_request(self, request, spider): 149 | # Called for each request that goes through the downloader 150 | # middleware. 151 | 152 | # Must either: 153 | # - return None: continue processing this request 154 | # - or return a Response object 155 | # - or return a Request object 156 | # - or raise IgnoreRequest: process_exception() methods of 157 | # installed downloader middleware will be called 158 | 159 | # 在请求页面时伪装成站内请求,用以反 反爬虫 160 | referer = request.url 161 | if referer: 162 | request.headers['referer'] = referer 163 | 164 | return None 165 | 166 | def process_response(self, request, response, spider): 167 | # Called with the response returned from the downloader. 168 | 169 | # Must either; 170 | # - return a Response object 171 | # - return a Request object 172 | # - or raise IgnoreRequest 173 | return response 174 | 175 | def process_exception(self, request, exception, spider): 176 | # Called when a download handler or a process_request() 177 | # (from other downloader middleware) raises an exception. 178 | 179 | # Must either: 180 | # - return None: continue processing this exception 181 | # - return a Response object: stops process_exception() chain 182 | # - return a Request object: stops process_exception() chain 183 | pass 184 | 185 | def spider_opened(self, spider): 186 | spider.logger.info('Spider opened: %s' % spider.name) 187 | -------------------------------------------------------------------------------- /Slave/movies/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | import pymongo 10 | from pymysql import connect 11 | from scrapy.pipelines.images import ImagesPipeline 12 | import scrapy 13 | from scrapy.exceptions import DropItem 14 | 15 | class MongoMoviesPipeline: 16 | def open_spider(self,spider): 17 | self.client = pymongo.MongoClient() 18 | 19 | def process_item(self, item, spider): 20 | # self.client.movies.douban.replace_one(filter={"page_url":item["page_url"]},replacement=item,upsert=True) 21 | return item 22 | 23 | def close_spider(self,spider): 24 | self.client.close() 25 | 26 | class ImagePipeline(ImagesPipeline): 27 | def get_media_requests(self, item, info): 28 | yield scrapy.Request(item['image_urls'], meta={"image_name": item['image_name']}) 29 | 30 | def file_path(self, request, response=None, info=None, *, item=None): 31 | file_name = request.meta['image_name'] + ".jpg" 32 | return file_name 33 | 34 | def item_completed(self, results, item, info): 35 | image_paths = [x['path'] for ok, x in results if ok] 36 | if not image_paths: 37 | raise DropItem("Item contains no images") 38 | return item 39 | 40 | # class MysqlMoviesPipeline: 41 | # def open_spider(self,spider): 42 | # self.client = connect(host='localhost',port='3306',user='root',password='123456',db='movies',charset='utf8') 43 | # self.cursor = self.client.cursor() 44 | # 45 | # def process_item(self, item, spider): 46 | # self.client.movies.douban.insert_one(item) 47 | # return item 48 | # 49 | # def 
close_spider(self,spider): 50 | # self.cursor.close() 51 | # self.client.close() 52 | -------------------------------------------------------------------------------- /Slave/movies/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for movies project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'movies' 11 | 12 | SPIDER_MODULES = ['movies.spiders'] 13 | NEWSPIDER_MODULE = 'movies.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | LOG_LEVEL = 'DEBUG' 22 | 23 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 24 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 25 | SCHEDULER_PERSIST = True 26 | 27 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue" 28 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue" 29 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack" 30 | 31 | ITEM_PIPELINES = { 32 | 'scrapy_redis.pipelines.RedisPipeline': 300, 33 | } 34 | REDIS_HOST = '192.168.108.1' 35 | REDIS_PORT = '6379' 36 | 37 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 38 | #CONCURRENT_REQUESTS = 32 39 | 40 | # Configure a delay for requests for the same website (default: 0) 41 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 42 | # See also autothrottle settings and docs 43 | DOWNLOAD_DELAY = 5 44 | # The download delay setting will honor only one of: 45 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 46 | #CONCURRENT_REQUESTS_PER_IP = 16 47 | 48 | # Disable cookies (enabled by default) 49 | #COOKIES_ENABLED = False 50 | 51 | # Disable Telnet Console (enabled by default) 52 | #TELNETCONSOLE_ENABLED = False 53 | 54 | # Override the default request headers: 55 | #DEFAULT_REQUEST_HEADERS = { 56 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 57 | # 'Accept-Language': 'en', 58 | #} 59 | 60 | # Enable or disable spider middlewares 61 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 62 | #SPIDER_MIDDLEWARES = { 63 | # 'movies.middlewares.MoviesSpiderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable downloader middlewares 67 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 68 | DOWNLOADER_MIDDLEWARES = { 69 | 'movies.middlewares.RandomProxyMiddleWare': 241, 70 | 'movies.middlewares.UserAgentMiddleware': 242, 71 | 'movies.middlewares.MoviesDownloaderMiddleware': 243, 72 | } 73 | 74 | PROXY_REDIS_KEY = 'use_proxy' 75 | HTTPPROXY_ENABLED = True 76 | 77 | # Enable or disable extensions 78 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 79 | #EXTENSIONS = { 80 | # 'scrapy.extensions.telnet.TelnetConsole': None, 81 | #} 82 | 83 | # Configure item pipelines 84 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 85 | 86 | # ITEM_PIPELINES = { 87 | # 'movies.pipelines.MongoMoviesPipeline': 300, 88 | # # 'movies.pipelines.ImagePipeline': 301, 89 | # } 90 | # 
IMAGES_STORE ='../images/' 91 | # IMAGES_URLS_FIELD = 'image_urls' #对应item里面设定的字段,取到图片的url 92 | 93 | # Enable and configure the AutoThrottle extension (disabled by default) 94 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 95 | #AUTOTHROTTLE_ENABLED = True 96 | # The initial download delay 97 | #AUTOTHROTTLE_START_DELAY = 5 98 | # The maximum download delay to be set in case of high latencies 99 | #AUTOTHROTTLE_MAX_DELAY = 60 100 | # The average number of requests Scrapy should be sending in parallel to 101 | # each remote server 102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 103 | # Enable showing throttling stats for every response received: 104 | #AUTOTHROTTLE_DEBUG = False 105 | 106 | # Enable and configure HTTP caching (disabled by default) 107 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 108 | #HTTPCACHE_ENABLED = True 109 | #HTTPCACHE_EXPIRATION_SECS = 0 110 | #HTTPCACHE_DIR = 'httpcache' 111 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 113 | -------------------------------------------------------------------------------- /Slave/movies/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Slave/movies/spiders/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/spiders/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/spiders/__pycache__/douban_redis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/spiders/__pycache__/douban_redis.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/spiders/douban_redis.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy_redis.spiders import RedisCrawlSpider 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import Rule 5 | import re 6 | 7 | class DoubanSpider(RedisCrawlSpider): 8 | name = 'douban_redis' 9 | allowed_domains = ['douban.com'] 10 | # start_urls = ['https://movie.douban.com/top250?start=0&filter='] 11 | # start_urls = ['https://movie.douban.com/top250?start={}&filter='.format(num) for num in range(0, 226, 25)] 12 | redis_key = 'douban:start_urls' 13 | 14 | rules = ( 15 | Rule(LinkExtractor(restrict_xpaths=r'//div[@class="hd"]/a'), callback='parse_info'), 16 | Rule(LinkExtractor(restrict_xpaths=r'//div[@class="paginator"]/a'), follow=True), 17 | ) 18 | 19 | def parse_info(self, response): 20 | page_url = response.url 21 | title = response.xpath("//h1/span[@property='v:itemreviewed']/text()").extract_first() 22 | year = response.xpath("//h1/span[@class='year']/text()").extract_first() 23 | score = response.xpath("//strong[@class='ll rating_num']/text()").extract_first() 24 | directedBy = 
response.xpath("//span[@class='attrs']/a[@rel='v:directedBy']/text()").extract_first() 25 | actors = response.xpath("string(//span[@class='actor']/span[@class='attrs']/span)").extract_first() 26 | if actors == '': 27 | actors = response.xpath("string(//span[@class='actor']/span[@class='attrs'])").extract_first() 28 | movie_type = '/'.join(response.xpath("//span[@property='v:genre']/text()").extract()) 29 | rank = re.findall(r"\d+",response.xpath("//span[@class='top250-no']/text()").extract_first())[0] 30 | comment_num = response.xpath("//span[@property='v:votes']/text()").extract_first() 31 | comments = response.xpath("//p/span[@class='short']/text()").extract() 32 | comment = '' 33 | # 任意选一条长度小于100的短评 34 | for cmt in comments: 35 | if len(cmt) < 100: 36 | comment = cmt 37 | # 没有长度小于100的短评 读取长文 38 | if comment == '': 39 | comments = response.xpath("//p/span[@class='full']/text()").extract_first() 40 | introduc = response.xpath("string(//div[@class='indent']/span[@class='all hidden'])").extract_first() 41 | if introduc == '': 42 | introduc = response.xpath("string(//div[@class='indent']/span[@property='v:summary'])").extract_first() 43 | image_url = response.xpath("//img[@title='点击看更多海报']/@src").extract_first() 44 | image_name = page_url.split('/')[-2] 45 | print(title) 46 | yield { 47 | "page_url":page_url, 48 | "title":title, 49 | "year":year, 50 | "score":score, 51 | "directedBy":directedBy, 52 | "actors":actors, 53 | "movie_type":movie_type, 54 | "rank":rank, 55 | "comment":comment, 56 | "comment_num":comment_num, 57 | "introduc":introduc, 58 | "image_urls": image_url, 59 | "image_name": image_name 60 | } 61 | 62 | -------------------------------------------------------------------------------- /Slave/movies/start.py: -------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | 3 | # execute('scrapy crawl douban'.split()) 4 | execute('scrapy crawl douban_redis'.split()) 5 | -------------------------------------------------------------------------------- /Slave/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = movies.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = movies 12 | -------------------------------------------------------------------------------- /proxy_pool/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /proxy_pool/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /proxy_pool/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /proxy_pool/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
-------------------------------------------------------------------------------- /proxy_pool/.idea/proxy_pool.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /proxy_pool/__pycache__/setting.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/__pycache__/setting.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | -------------------------------------------------------------------------------- /proxy_pool/api/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/api/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/api/__pycache__/proxyApi.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/api/__pycache__/proxyApi.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/api/proxyApi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ProxyApi.py 6 | Description : WebApi 7 | Author : JHao 8 | date: 2016/12/4 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/04: WebApi 12 | 2019/08/14: 集成Gunicorn启动方式 13 | 2020/06/23: 新增pop接口 14 | 2022/07/21: 更新count接口 15 | ------------------------------------------------- 16 | """ 17 | __author__ = 'JHao' 18 | 19 | import platform 20 | from werkzeug.wrappers import Response 21 | from flask import Flask, jsonify, request 22 | 23 | from util.six import iteritems 24 | from helper.proxy import Proxy 25 | from handler.proxyHandler import ProxyHandler 26 | from handler.configHandler import ConfigHandler 27 | 28 | app = Flask(__name__) 29 | conf = ConfigHandler() 30 | proxy_handler = ProxyHandler() 31 | 32 | 33 | class JsonResponse(Response): 34 | @classmethod 35 | def force_type(cls, response, environ=None): 36 | if isinstance(response, (dict, list)): 37 | response = jsonify(response) 38 | 39 | return super(JsonResponse, cls).force_type(response, environ) 40 | 41 | 42 | app.response_class = JsonResponse 43 | 44 | api_list = [ 45 | {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, 46 | {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, 47 | {"url": "/delete", "params": "proxy: 'e.g. 
127.0.0.1:8080'", "desc": "delete an unable proxy"}, 48 | {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, 49 | {"url": "/count", "params": "", "desc": "return proxy count"} 50 | # 'refresh': 'refresh proxy pool', 51 | ] 52 | 53 | 54 | @app.route('/') 55 | def index(): 56 | return {'url': api_list} 57 | 58 | 59 | @app.route('/get/') 60 | def get(): 61 | https = request.args.get("type", "").lower() == 'https' 62 | proxy = proxy_handler.get(https) 63 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 64 | 65 | 66 | @app.route('/pop/') 67 | def pop(): 68 | https = request.args.get("type", "").lower() == 'https' 69 | proxy = proxy_handler.pop(https) 70 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 71 | 72 | 73 | @app.route('/refresh/') 74 | def refresh(): 75 | # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 76 | return 'success' 77 | 78 | 79 | @app.route('/all/') 80 | def getAll(): 81 | https = request.args.get("type", "").lower() == 'https' 82 | proxies = proxy_handler.getAll(https) 83 | return jsonify([_.to_dict for _ in proxies]) 84 | 85 | 86 | @app.route('/delete/', methods=['GET']) 87 | def delete(): 88 | proxy = request.args.get('proxy') 89 | status = proxy_handler.delete(Proxy(proxy)) 90 | return {"code": 0, "src": status} 91 | 92 | 93 | @app.route('/count/') 94 | def getCount(): 95 | proxies = proxy_handler.getAll() 96 | http_type_dict = {} 97 | source_dict = {} 98 | for proxy in proxies: 99 | http_type = 'https' if proxy.https else 'http' 100 | http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1 101 | for source in proxy.source.split('/'): 102 | source_dict[source] = source_dict.get(source, 0) + 1 103 | return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)} 104 | 105 | 106 | def runFlask(): 107 | if platform.system() == "Windows": 108 | app.run(host=conf.serverHost, port=conf.serverPort) 109 | else: 110 | import gunicorn.app.base 111 | 112 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 113 | 114 | def __init__(self, app, options=None): 115 | self.options = options or {} 116 | self.application = app 117 | super(StandaloneApplication, self).__init__() 118 | 119 | def load_config(self): 120 | _config = dict([(key, value) for key, value in iteritems(self.options) 121 | if key in self.cfg.settings and value is not None]) 122 | for key, value in iteritems(_config): 123 | self.cfg.set(key.lower(), value) 124 | 125 | def load(self): 126 | return self.application 127 | 128 | _options = { 129 | 'bind': '%s:%s' % (conf.serverHost, conf.serverPort), 130 | 'workers': 4, 131 | 'accesslog': '-', # log to stdout 132 | 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"' 133 | } 134 | StandaloneApplication(app, _options).run() 135 | 136 | 137 | if __name__ == '__main__': 138 | runFlask() 139 | -------------------------------------------------------------------------------- /proxy_pool/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/2: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /proxy_pool/db/__pycache__/__init__.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/db/__pycache__/dbClient.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/dbClient.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/db/__pycache__/redisClient.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/redisClient.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/db/dbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: DbClient.py 6 | Description : DB工厂类 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/02: DB工厂类 12 | 2020/07/03: 取消raw_proxy储存 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | import os 18 | import sys 19 | 20 | from util.six import urlparse, withMetaclass 21 | from util.singleton import Singleton 22 | 23 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 24 | 25 | 26 | class DbClient(withMetaclass(Singleton)): 27 | """ 28 | DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法 29 | 30 | 31 | 抽象方法定义: 32 | get(): 随机返回一个proxy; 33 | put(proxy): 存入一个proxy; 34 | pop(): 顺序返回并删除一个proxy; 35 | update(proxy): 更新指定proxy信息; 36 | delete(proxy): 删除指定proxy; 37 | exists(proxy): 判断指定proxy是否存在; 38 | getAll(): 返回所有代理; 39 | clean(): 清除所有proxy信息; 40 | getCount(): 返回proxy统计信息; 41 | changeTable(name): 切换操作对象 42 | 43 | 44 | 所有方法需要相应类去具体实现: 45 | ssdb: ssdbClient.py 46 | redis: redisClient.py 47 | mongodb: mongodbClient.py 48 | 49 | """ 50 | 51 | def __init__(self, db_conn): 52 | """ 53 | init 54 | :return: 55 | """ 56 | self.parseDbConn(db_conn) 57 | self.__initDbClient() 58 | 59 | @classmethod 60 | def parseDbConn(cls, db_conn): 61 | db_conf = urlparse(db_conn) 62 | cls.db_type = db_conf.scheme.upper().strip() 63 | cls.db_host = db_conf.hostname 64 | cls.db_port = db_conf.port 65 | cls.db_user = db_conf.username 66 | cls.db_pwd = db_conf.password 67 | cls.db_name = db_conf.path[1:] 68 | return cls 69 | 70 | def __initDbClient(self): 71 | """ 72 | init DB Client 73 | :return: 74 | """ 75 | __type = None 76 | if "SSDB" == self.db_type: 77 | __type = "ssdbClient" 78 | elif "REDIS" == self.db_type: 79 | __type = "redisClient" 80 | else: 81 | pass 82 | assert __type, 'type error, Not support DB type: {}'.format(self.db_type) 83 | self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host, 84 | port=self.db_port, 85 | username=self.db_user, 86 | password=self.db_pwd, 87 | db=self.db_name) 88 | 89 | def get(self, https, **kwargs): 90 | return self.client.get(https, **kwargs) 91 | 92 | def put(self, key, **kwargs): 93 | return self.client.put(key, **kwargs) 94 | 95 | def 
update(self, key, value, **kwargs): 96 | return self.client.update(key, value, **kwargs) 97 | 98 | def delete(self, key, **kwargs): 99 | return self.client.delete(key, **kwargs) 100 | 101 | def exists(self, key, **kwargs): 102 | return self.client.exists(key, **kwargs) 103 | 104 | def pop(self, https, **kwargs): 105 | return self.client.pop(https, **kwargs) 106 | 107 | def getAll(self, https): 108 | return self.client.getAll(https) 109 | 110 | def clear(self): 111 | return self.client.clear() 112 | 113 | def changeTable(self, name): 114 | self.client.changeTable(name) 115 | 116 | def getCount(self): 117 | return self.client.getCount() 118 | 119 | def test(self): 120 | return self.client.test() 121 | -------------------------------------------------------------------------------- /proxy_pool/db/redisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ----------------------------------------------------- 4 | File Name: redisClient.py 5 | Description : 封装Redis相关操作 6 | Author : JHao 7 | date: 2019/8/9 8 | ------------------------------------------------------ 9 | Change Activity: 10 | 2019/08/09: 封装Redis相关操作 11 | 2020/06/23: 优化pop方法, 改用hscan命令 12 | 2021/05/26: 区别http/https代理 13 | ------------------------------------------------------ 14 | """ 15 | __author__ = 'JHao' 16 | 17 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 18 | from redis.connection import BlockingConnectionPool 19 | from handler.logHandler import LogHandler 20 | from random import choice 21 | from redis import Redis 22 | import json 23 | 24 | 25 | class RedisClient(object): 26 | """ 27 | Redis client 28 | 29 | Redis中代理存放的结构为hash: 30 | key为ip:port, value为代理属性的字典; 31 | 32 | """ 33 | 34 | def __init__(self, **kwargs): 35 | """ 36 | init 37 | :param host: host 38 | :param port: port 39 | :param password: password 40 | :param db: db 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items = self.__conn.hvals(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items)) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash, 使用changeTable指定hash name 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return data 72 | 73 | def pop(self, https): 74 | """ 75 | 弹出一个代理 76 | :return: dict {proxy: value} 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | return self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | 
""" 105 | return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | items = self.__conn.hvals(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), items)) 115 | else: 116 | return items 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('redis_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('redis connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('redis connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('redis connection error: %s' % str(e), exc_info=True) 153 | return e 154 | 155 | 156 | -------------------------------------------------------------------------------- /proxy_pool/db/ssdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ssdbClient.py 6 | Description : 封装SSDB操作 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/2: 12 | 2017/09/22: PY3中 redis-py返回的数据是bytes型 13 | 2017/09/27: 修改pop()方法 返回{proxy:value}字典 14 | 2020/07/03: 2.1.0 优化代码结构 15 | 2021/05/26: 区分http和https代理 16 | ------------------------------------------------- 17 | """ 18 | __author__ = 'JHao' 19 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 20 | from redis.connection import BlockingConnectionPool 21 | from handler.logHandler import LogHandler 22 | from random import choice 23 | from redis import Redis 24 | import json 25 | 26 | 27 | class SsdbClient(object): 28 | """ 29 | SSDB client 30 | 31 | SSDB中代理存放的结构为hash: 32 | key为代理的ip:por, value为代理属性的字典; 33 | """ 34 | 35 | def __init__(self, **kwargs): 36 | """ 37 | init 38 | :param host: host 39 | :param port: port 40 | :param password: password 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 从hash中随机返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items_dict = self.__conn.hgetall(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values())) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return result 72 | 73 | def pop(self, https): 74 | """ 75 | 顺序弹出一个代理 76 | 
:return: proxy 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | """ 105 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | item_dict = self.__conn.hgetall(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), item_dict.values())) 115 | else: 116 | return item_dict.values() 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('ssdb_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('ssdb connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 153 | return e 154 | -------------------------------------------------------------------------------- /proxy_pool/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /proxy_pool/docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changelog: 2 | 3 | ChangeLog 4 | ========== 5 | 6 | 2.4.1 (2022-07-17) 7 | ------------------ 8 | 9 | 1. 新增代理源 **FreeProxyList**; (2022-07-21) 10 | 2. 新增代理源 **FateZero**; (2022-08-01) 11 | 3. 新增代理属性 ``region``; (2022-08-16) 12 | 13 | 2.4.0 (2021-11-17) 14 | ------------------ 15 | 16 | 1. 移除无效代理源 **神鸡代理**; (2021-11-16) 17 | 2. 移除无效代理源 **极速代理**; (2021-11-16) 18 | 3. 移除代理源 **西拉代理**; (2021-11-16) 19 | 4. 
新增代理源 **蝶鸟IP**; (2021-11-16) 20 | 5. 新增代理源 **PROXY11**; (2021-11-16) 21 | 6. 多线程采集代理; (2021-11-17) 22 | 23 | 2.3.0 (2021-05-27) 24 | ------------------ 25 | 26 | 1. 修复Dockerfile时区问题; (2021-04-12) 27 | 2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13) 28 | 3. 新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27) 29 | 30 | 2.2.0 (2021-04-08) 31 | ------------------ 32 | 33 | 1. 启动时检查数据库连通性; 34 | 2. 新增免费代理源 **米扑代理**; 35 | 3. 新增免费代理源 **Pzzqz**; 36 | 4. 新增免费代理源 **神鸡代理**; 37 | 5. 新增免费代理源 **极速代理**; 38 | 6. 新增免费代理源 **小幻代理**; 39 | 40 | 2.1.1 (2021-02-23) 41 | ------------------ 42 | 43 | 1. Fix Bug `#493`_, 新增时区配置; (2020-08-12) 44 | 2. 修复 **66代理** 采集; (2020-11-04) 45 | 3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) 46 | 4. 新增 **代理盒子** 免费源; (2020-11-04) 47 | 5. 新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) 48 | 49 | .. _#493: https://github.com/jhao104/proxy_pool/issues/493 50 | 51 | 2.1.0 (2020.07) 52 | ------------------ 53 | 54 | 1. 新增免费代理源 **西拉代理** (2020-03-30) 55 | 2. Fix Bug `#356`_ `#401`_ 56 | 3. 优化Docker镜像体积; (2020-06-19) 57 | 4. 优化配置方式; 58 | 5. 优化代码结构; 59 | 6. 不再储存raw_proxy, 抓取后直接验证入库; 60 | 61 | .. _#401: https://github.com/jhao104/proxy_pool/issues/401 62 | .. _#356: https://github.com/jhao104/proxy_pool/issues/356 63 | 64 | 2.0.1 (2019.10) 65 | ----------------- 66 | 67 | 1. 新增免费代理源 **89免费代理**; 68 | #. 新增免费代理源 **齐云代理** 69 | 70 | 2.0.0 (2019.08) 71 | ------------------ 72 | 73 | 1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; 74 | #. 优化Proxy调度程序; 75 | #. 扩展Proxy属性; 76 | #. 新增cli工具, 更加方便启动proxyPool 77 | 78 | 1.14 (2019.07) 79 | ----------------- 80 | 81 | 1. 修复 Queue阻塞导致的 ``ProxyValidSchedule`` 假死bug; 82 | #. 修改代理源 **云代理** 抓取; 83 | #. 修改代理源 **码农代理** 抓取; 84 | #. 修改代理源 **代理66** 抓取, 引入 ``PyExecJS`` 模块破解加速乐动态Cookies加密; 85 | 86 | 1.13 (2019.02) 87 | ----------------- 88 | 89 | 1. 使用.py文件替换.ini作为配置文件; 90 | 91 | #. 优化代理采集部分; 92 | 93 | 1.12 (2018.04) 94 | ----------------- 95 | 96 | 1. 优化代理格式检查; 97 | 98 | #. 增加代理源; 99 | 100 | #. fix bug `#122`_ `#126`_ 101 | 102 | .. _#122: https://github.com/jhao104/proxy_pool/issues/122 103 | .. _#126: https://github.com/jhao104/proxy_pool/issues/126 104 | 105 | 1.11 (2017.08) 106 | ----------------- 107 | 108 | 1. 使用多线程验证useful_pool; 109 | 110 | 1.10 (2016.11) 111 | ----------------- 112 | 113 | 1. 第一版; 114 | 115 | #. 支持PY2/PY3; 116 | 117 | #. 代理池基本功能; 118 | -------------------------------------------------------------------------------- /proxy_pool/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import sphinx_rtd_theme 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ProxyPool' 21 | copyright = '2020, jhao104' 22 | author = 'jhao104' 23 | 24 | master_doc = 'index' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '2.1.0' 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | ] 36 | 37 | # If true, sectionauthor and moduleauthor directives will be shown in the 38 | # output. They are ignored by default. 39 | show_authors = False 40 | 41 | # The name of the Pygments (syntax highlighting) style to use. 42 | pygments_style = "sphinx" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The language for content autogenerated by Sphinx. Refer to documentation 48 | # for a list of supported languages. 49 | # 50 | # This is also used if you do content translation via gettext catalogs. 51 | # Usually you set "language" from the command line for these cases. 52 | language = 'zh_CN' 53 | 54 | # List of patterns, relative to source directory, that match files and 55 | # directories to ignore when looking for source files. 56 | # This pattern also affects html_static_path and html_extra_path. 57 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ['_static'] 72 | -------------------------------------------------------------------------------- /proxy_pool/docs/dev/ext_fetcher.rst: -------------------------------------------------------------------------------- 1 | .. ext_fetcher 2 | 3 | 扩展代理源 4 | ----------- 5 | 6 | 项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 7 | 8 | 如果要添加一个新的代理获取方法, 过程如下: 9 | 10 | 1. 首先在 `ProxyFetcher`_ 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 ``host:ip`` 格式的代理字符串, 例如: 11 | 12 | .. code-block:: python 13 | 14 | class ProxyFetcher(object): 15 | # .... 16 | # 自定义代理源获取方法 17 | @staticmethod 18 | def freeProxyCustom01(): # 命名不和已有重复即可 19 | # 通过某网站或者某接口或某数据库获取代理 20 | # 假设你已经拿到了一个代理列表 21 | proxies = ["x.x.x.x:3128", "x.x.x.x:80"] 22 | for proxy in proxies: 23 | yield proxy 24 | # 确保每个proxy都是 host:ip正确的格式返回 25 | 26 | 2. 添加好方法后,修改配置文件 `setting.py`_ 中的 ``PROXY_FETCHER`` 项, 加入刚才添加的自定义方法的名字: 27 | 28 | .. code-block:: python 29 | 30 | PROXY_FETCHER = [ 31 | # .... 32 | "freeProxyCustom01" # # 确保名字和你添加方法名字一致 33 | ] 34 | 35 | .. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20 36 | .. 
_setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47 -------------------------------------------------------------------------------- /proxy_pool/docs/dev/ext_validator.rst: -------------------------------------------------------------------------------- 1 | .. ext_validator 2 | 3 | 代理校验 4 | ----------- 5 | 6 | 内置校验 7 | >>>>>>>>> 8 | 9 | 项目中使用的代理校验方法全部定义在 `validator.py`_ 中, 通过 `ProxyValidator`_ 类中提供的装饰器来区分。校验方法返回 ``True`` 表示 10 | 校验通过, 返回 ``False`` 表示校验不通过。 11 | 12 | * 代理校验方法分为三类: ``preValidator`` 、 ``httpValidator`` 、 ``httpsValidator``: 13 | 14 | * **preValidator**: 预校验,在代理抓取后验证前调用,目前实现了 `formatValidator`_ 校验代理IP格式是否合法; 15 | * **httpValidator**: 代理可用性校验,通过则认为代理可用, 目前实现了 `httpTimeOutValidator`_ 校验; 16 | * **httpsValidator**: 校验代理是否支持https,目前实现了 `httpsTimeOutValidator`_ 校验。 17 | 18 | 19 | .. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py 20 | .. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29 21 | .. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51 22 | .. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58 23 | .. _httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71 24 | 25 | 每种校验可以定义多个方法,只有 **所有** 方法都返回 ``True`` 的情况下才视为该校验通过,校验方法执行顺序为: 先执行 **httpValidator** , 前者通过后再执行 **httpsValidator** 。 26 | 只有 `preValidator` 校验通过的代理才会进入可用性校验, `httpValidator` 校验通过后认为代理可用准备更新入代理池, `httpValidator` 校验通过后视为代理支持https更新代理的 `https` 属性为 `True` 。 27 | 28 | 扩展校验 29 | >>>>>>>>> 30 | 31 | 在 `validator.py`_ 已有自定义校验的示例,自定义函数需返回True或者False,使用 `ProxyValidator`_ 中提供的装饰器来区分校验类型。 下面是两个例子: 32 | 33 | * 1. 自定义一个代理可用性的校验(``addHttpValidator``): 34 | 35 | .. code-block:: python 36 | 37 | @ProxyValidator.addHttpValidator 38 | def customValidatorExample01(proxy): 39 | """自定义代理可用性校验函数""" 40 | proxies = {"http": "http://{proxy}".format(proxy=proxy)} 41 | try: 42 | r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) 43 | return True if r.status_code == 200 and len(r.content) > 200 else False 44 | except Exception as e: 45 | return False 46 | 47 | * 2. 自定义一个代理是否支持https的校验(``addHttpsValidator``): 48 | 49 | .. code-block:: python 50 | 51 | @ProxyValidator.addHttpsValidator 52 | def customValidatorExample02(proxy): 53 | """自定义代理是否支持https校验函数""" 54 | proxies = {"https": "https://{proxy}".format(proxy=proxy)} 55 | try: 56 | r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) 57 | return True if r.status_code == 200 and len(r.content) > 200 else False 58 | except Exception as e: 59 | return False 60 | 61 | 注意,比如在运行代理可用性校验时,所有被 ``ProxyValidator.addHttpValidator`` 装饰的函数会被依次按定义顺序执行,只有当所有函数都返回True时才会判断代理可用。 ``HttpsValidator`` 运行机制也是如此。 62 | -------------------------------------------------------------------------------- /proxy_pool/docs/dev/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 开发指南 3 | ========= 4 | 5 | .. module:: dev 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | ext_fetcher 11 | ext_validator 12 | -------------------------------------------------------------------------------- /proxy_pool/docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
ProxyPool documentation master file, created by 2 | sphinx-quickstart on Wed Jul 8 16:13:42 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ProxyPool 7 | ===================================== 8 | 9 | :: 10 | 11 | **************************************************************** 12 | *** ______ ********************* ______ *********** _ ******** 13 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 14 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 15 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 16 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 17 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 18 | **** __ / / ***** 19 | ************************* /___ / ******************************* 20 | ************************* ******************************** 21 | **************************************************************** 22 | 23 | Python爬虫代理IP池 24 | 25 | 安装 26 | ----- 27 | 28 | * 下载代码 29 | 30 | .. code-block:: console 31 | 32 | $ git clone git@github.com:jhao104/proxy_pool.git 33 | 34 | * 安装依赖 35 | 36 | .. code-block:: console 37 | 38 | $ pip install -r requirements.txt 39 | 40 | * 更新配置 41 | 42 | .. code-block:: python 43 | 44 | HOST = "0.0.0.0" 45 | PORT = 5000 46 | 47 | DB_CONN = 'redis://@127.0.0.1:8888' 48 | 49 | PROXY_FETCHER = [ 50 | "freeProxy01", 51 | "freeProxy02", 52 | # .... 53 | ] 54 | 55 | * 启动项目 56 | 57 | .. code-block:: console 58 | 59 | $ python proxyPool.py schedule 60 | $ python proxyPool.py server 61 | 62 | 使用 63 | ______ 64 | 65 | * API 66 | 67 | ============ ======== ================ ============== 68 | Api Method Description Params 69 | ============ ======== ================ ============== 70 | / GET API介绍 无 71 | /get GET 返回一个代理 可选参数: `?type=https` 过滤支持https的代理 72 | /pop GET 返回并删除一个代理 可选参数: `?type=https` 过滤支持https的代理 73 | /all GET 返回所有代理 可选参数: `?type=https` 过滤支持https的代理 74 | /count GET 返回代理数量 无 75 | /delete GET 删除指定代理 `?proxy=host:ip` 76 | ============ ======== ================ ============== 77 | 78 | 79 | * 爬虫 80 | 81 | .. code-block:: python 82 | 83 | import requests 84 | 85 | def get_proxy(): 86 | return requests.get("http://127.0.0.1:5010/get?type=https").json() 87 | 88 | def delete_proxy(proxy): 89 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 90 | 91 | # your spider code 92 | 93 | def getHtml(): 94 | # .... 95 | retry_count = 5 96 | proxy = get_proxy().get("proxy") 97 | while retry_count > 0: 98 | try: 99 | html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)}) 100 | # 使用代理访问 101 | return html 102 | except Exception: 103 | retry_count -= 1 104 | # 删除代理池中代理 105 | delete_proxy(proxy) 106 | return None 107 | 108 | Contents 109 | -------- 110 | 111 | .. toctree:: 112 | :maxdepth: 2 113 | 114 | user/index 115 | dev/index 116 | changelog 117 | -------------------------------------------------------------------------------- /proxy_pool/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /proxy_pool/docs/user/how_to_config.rst: -------------------------------------------------------------------------------- 1 | .. how_to_config 2 | 3 | 配置参考 4 | --------- 5 | 6 | 配置文件 ``setting.py`` 位于项目的主目录下, 配置主要分为四类: **服务配置** 、 **数据库配置** 、 **采集配置** 、 **校验配置**. 7 | 8 | 服务配置 9 | >>>>>>>>> 10 | 11 | * ``HOST`` 12 | 13 | API服务监听的IP, 本机访问设置为 ``127.0.0.1``, 开启远程访问设置为: ``0.0.0.0``. 14 | 15 | * ``PORT`` 16 | 17 | API服务监听的端口. 18 | 19 | 数据库配置 20 | >>>>>>>>>>> 21 | 22 | * ``DB_CONN`` 23 | 24 | 用户存放代理IP的数据库URI, 配置格式为: ``db_type://[[user]:[pwd]]@ip:port/[db]``. 25 | 26 | 目前支持的db_type有: ``ssdb`` 、 ``redis``. 27 | 28 | 配置示例: 29 | 30 | .. code-block:: python 31 | 32 | # SSDB IP: 127.0.0.1 Port: 8888 33 | DB_CONN = 'ssdb://@127.0.0.1:8888' 34 | # SSDB IP: 127.0.0.1 Port: 8899 Password: 123456 35 | DB_CONN = 'ssdb://:123456@127.0.0.1:8888' 36 | 37 | # Redis IP: 127.0.0.1 Port: 6379 38 | DB_CONN = 'redis://@127.0.0.1:6379' 39 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 40 | DB_CONN = 'redis://:123456@127.0.0.1:6379' 41 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15 42 | DB_CONN = 'redis://:123456@127.0.0.1:6379/15' 43 | 44 | 45 | * ``TABLE_NAME`` 46 | 47 | 存放代理的数据载体名称, ssdb和redis的存放结构为hash. 48 | 49 | 采集配置 50 | >>>>>>>>> 51 | 52 | * ``PROXY_FETCHER`` 53 | 54 | 启用的代理采集方法名, 代理采集方法位于 ``fetcher/proxyFetcher.py`` 类中. 55 | 56 | 由于各个代理源的稳定性不容易掌握, 当某个代理采集方法失效时, 可以该配置中注释掉其名称. 57 | 58 | 如果有增加某些代理采集方法, 也请在该配置中添加其方法名, 具体请参考 :doc:`/dev/extend_fetcher`. 59 | 60 | 调度程序每次执行采集任务时都会再次加载该配置, 保证每次运行的采集方法都是有效的. 61 | 62 | 校验配置 63 | >>>>>>>>> 64 | 65 | * ``HTTP_URL`` 66 | 67 | 用于检验代理是否可用的地址, 默认为 ``http://httpbin.org``, 可根据使用场景修改为其他地址. 68 | 69 | * ``HTTPS_URL`` 70 | 71 | 用于检验代理是否支持HTTPS的地址, 默认为 ``https://www.qq.com``, 可根据使用场景修改为其他地址. 72 | 73 | * ``VERIFY_TIMEOUT`` 74 | 75 | 检验代理的超时时间, 默认为 ``10`` , 单位秒. 使用代理访问 ``HTTP(S)_URL`` 耗时超过 ``VERIFY_TIMEOUT`` 时, 视为代理不可用. 76 | 77 | * ``MAX_FAIL_COUNT`` 78 | 79 | 检验代理允许最大失败次数, 默认为 ``0``, 即出错一次即删除. 80 | 81 | * ``POOL_SIZE_MIN`` 82 | 83 | 代理检测定时任务运行前若代理数量小于 `POOL_SIZE_MIN`, 则先运行抓取程序. -------------------------------------------------------------------------------- /proxy_pool/docs/user/how_to_run.rst: -------------------------------------------------------------------------------- 1 | .. how_to_run 2 | 3 | 4 | 如何运行 5 | --------- 6 | 7 | 下载代码 8 | >>>>>>>>> 9 | 10 | 本项目需要下载代码到本地运行, 通过 ``git`` 下载: 11 | 12 | .. code-block:: console 13 | 14 | $ git clone git@github.com:jhao104/proxy_pool.git 15 | 16 | 或者下载特定的 ``release`` 版本: 17 | 18 | .. code-block:: console 19 | 20 | https://github.com/jhao104/proxy_pool/releases 21 | 22 | 安装依赖 23 | >>>>>>>>> 24 | 25 | 到项目目录下使用 ``pip`` 安装依赖库: 26 | 27 | .. code-block:: console 28 | 29 | $ pip install -r requirements.txt 30 | 31 | 32 | 更新配置 33 | >>>>>>>>> 34 | 35 | 配置文件 ``setting.py`` 位于项目的主目录下: 36 | 37 | .. 
code-block:: python 38 | 39 | # 配置API服务 40 | 41 | HOST = "0.0.0.0" # IP 42 | PORT = 5000 # 监听端口 43 | 44 | # 配置数据库 45 | 46 | DB_CONN = 'redis://@127.0.0.1:8888/0' 47 | 48 | # 配置 ProxyFetcher 49 | 50 | PROXY_FETCHER = [ 51 | "freeProxy01", # 这里是启用的代理抓取方法,所有fetch方法位于fetcher/proxyFetcher.py 52 | "freeProxy02", 53 | # .... 54 | ] 55 | 56 | 更多配置请参考 :doc:`/user/how_to_config` 57 | 58 | 启动项目 59 | >>>>>>>>> 60 | 61 | 如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. 62 | 完整程序包含两部份: ``schedule`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. 63 | 64 | 通过命令行程序分别启动调度程序和API服务: 65 | 66 | .. code-block:: console 67 | 68 | # 启动调度程序 69 | $ python proxyPool.py schedule 70 | 71 | # 启动webApi服务 72 | $ python proxyPool.py server 73 | 74 | -------------------------------------------------------------------------------- /proxy_pool/docs/user/how_to_use.rst: -------------------------------------------------------------------------------- 1 | .. how_to_use 2 | 3 | 如何使用 4 | ---------- 5 | 6 | 爬虫代码要对接代理池目前有两种方式: 一是通过调用API接口使用, 二是直接读取数据库. 7 | 8 | 调用API 9 | >>>>>>>>> 10 | 11 | 启动ProxyPool的 ``server`` 后会提供如下几个http接口: 12 | 13 | ============ ======== ================ ============== 14 | Api Method Description Arg 15 | ============ ======== ================ ============== 16 | / GET API介绍 无 17 | /get GET 随机返回一个代理 无 18 | /get_all GET 返回所有代理 无 19 | /get_status GET 返回代理数量 无 20 | /delete GET 删除指定代理 proxy=host:ip 21 | ============ ======== ================ ============== 22 | 23 | 在代码中可以通过封装上面的API接口来使用代理, 例子: 24 | 25 | .. code-block:: python 26 | 27 | import requests 28 | 29 | def get_proxy(): 30 | return requests.get("http://127.0.0.1:5010/get/").json() 31 | 32 | def delete_proxy(proxy): 33 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 34 | 35 | # your spider code 36 | 37 | def getHtml(): 38 | # .... 39 | retry_count = 5 40 | proxy = get_proxy().get("proxy") 41 | while retry_count > 0: 42 | try: 43 | # 使用代理访问 44 | html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) 45 | return html 46 | except Exception: 47 | retry_count -= 1 48 | # 删除代理池中代理 49 | delete_proxy(proxy) 50 | return None 51 | 52 | 本例中我们在本地 ``127.0.0.1`` 启动端口为 ``5010`` 的 ``server``, 使用 ``/get`` 接口获取代理, ``/delete`` 删除代理. 53 | 54 | 读数据库 55 | >>>>>>>>> 56 | 57 | 目前支持配置两种数据库: ``REDIS`` 、 ``SSDB``. 58 | 59 | * **REDIS** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 60 | 61 | * **SSDB** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 62 | 63 | 可以在代码中自行读取. 64 | -------------------------------------------------------------------------------- /proxy_pool/docs/user/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 用户指南 3 | ========= 4 | 5 | .. module:: user 6 | 7 | .. 
toctree:: 8 | :maxdepth: 2 9 | 10 | how_to_run 11 | how_to_use 12 | how_to_config 13 | -------------------------------------------------------------------------------- /proxy_pool/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /proxy_pool/fetcher/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/fetcher/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/fetcher/__pycache__/proxyFetcher.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/fetcher/__pycache__/proxyFetcher.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/fetcher/proxyFetcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxyFetcher 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: proxyFetcher 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import re 16 | import json 17 | from time import sleep 18 | 19 | from util.webRequest import WebRequest 20 | 21 | 22 | class ProxyFetcher(object): 23 | """ 24 | proxy getter 25 | """ 26 | 27 | @staticmethod 28 | def freeProxy01(): 29 | """ 30 | 站大爷 https://www.zdaye.com/dayProxy.html 31 | """ 32 | start_url = "https://www.zdaye.com/dayProxy.html" 33 | html_tree = WebRequest().get(start_url).tree 34 | latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() 35 | from datetime import datetime 36 | interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") 37 | if interval.seconds < 300: # 只采集5分钟内的更新 38 | target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() 39 | while target_url: 40 | _tree = WebRequest().get(target_url).tree 41 | for tr in _tree.xpath("//table//tr"): 42 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 43 | port = "".join(tr.xpath("./td[2]/text()")).strip() 44 | yield "%s:%s" % (ip, port) 45 | next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") 46 | target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False 47 | sleep(5) 48 | 49 | @staticmethod 50 | def freeProxy02(): 51 | """ 52 | 代理66 http://www.66ip.cn/ 53 | """ 54 | url = "http://www.66ip.cn/" 55 | resp = WebRequest().get(url, timeout=10).tree 56 | for i, tr in enumerate(resp.xpath("(//table)[3]//tr")): 57 | if i > 0: 58 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 59 | port = "".join(tr.xpath("./td[2]/text()")).strip() 60 | yield "%s:%s" % (ip, port) 61 | 62 | 
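    # Note: every fetcher in this class follows the same contract - a static
    # generator that yields proxies as "ip:port" strings. Malformed entries are
    # dropped later by the formatValidator pre-check before they reach the pool.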
@staticmethod 63 | def freeProxy03(): 64 | """ 开心代理 """ 65 | target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"] 66 | for url in target_urls: 67 | tree = WebRequest().get(url).tree 68 | for tr in tree.xpath("//table[@class='active']//tr")[1:]: 69 | ip = "".join(tr.xpath('./td[1]/text()')).strip() 70 | port = "".join(tr.xpath('./td[2]/text()')).strip() 71 | yield "%s:%s" % (ip, port) 72 | 73 | @staticmethod 74 | def freeProxy04(): 75 | """ FreeProxyList https://www.freeproxylists.net/zh/ """ 76 | url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" 77 | tree = WebRequest().get(url, verify=False).tree 78 | from urllib import parse 79 | 80 | def parse_ip(input_str): 81 | html_str = parse.unquote(input_str) 82 | ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) 83 | return ips[0] if ips else None 84 | 85 | for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): 86 | ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) 87 | port = "".join(tr.xpath('./td[2]/text()')).strip() 88 | if ip: 89 | yield "%s:%s" % (ip, port) 90 | 91 | @staticmethod 92 | def freeProxy05(page_count=1): 93 | """ 快代理 https://www.kuaidaili.com """ 94 | url_pattern = [ 95 | 'https://www.kuaidaili.com/free/inha/{}/', 96 | 'https://www.kuaidaili.com/free/intr/{}/' 97 | ] 98 | url_list = [] 99 | for page_index in range(1, page_count + 1): 100 | for pattern in url_pattern: 101 | url_list.append(pattern.format(page_index)) 102 | 103 | for url in url_list: 104 | tree = WebRequest().get(url).tree 105 | proxy_list = tree.xpath('.//table//tr') 106 | sleep(1) # 必须sleep 不然第二条请求不到数据 107 | for tr in proxy_list[1:]: 108 | yield ':'.join(tr.xpath('./td/text()')[0:2]) 109 | 110 | @staticmethod 111 | def freeProxy06(): 112 | """ FateZero http://proxylist.fatezero.org/ """ 113 | url = "http://proxylist.fatezero.org/proxy.list" 114 | try: 115 | resp_text = WebRequest().get(url).text 116 | for each in resp_text.split("\n"): 117 | json_info = json.loads(each) 118 | if json_info.get("country") == "CN": 119 | yield "%s:%s" % (json_info.get("host", ""), json_info.get("port", "")) 120 | except Exception as e: 121 | print(e) 122 | 123 | @staticmethod 124 | def freeProxy07(): 125 | """ 云代理 """ 126 | urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"] 127 | for url in urls: 128 | r = WebRequest().get(url, timeout=10) 129 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 130 | for proxy in proxies: 131 | yield ":".join(proxy) 132 | 133 | @staticmethod 134 | def freeProxy08(): 135 | """ 小幻代理 """ 136 | urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html'] 137 | for url in urls: 138 | r = WebRequest().get(url, timeout=10) 139 | proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text) 140 | for proxy in proxies: 141 | yield ":".join(proxy) 142 | 143 | @staticmethod 144 | def freeProxy09(page_count=1): 145 | """ 免费代理库 """ 146 | for i in range(1, page_count + 1): 147 | url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) 148 | html_tree = WebRequest().get(url).tree 149 | for index, tr in enumerate(html_tree.xpath("//table//tr")): 150 | if index == 0: 151 | continue 152 | yield ":".join(tr.xpath("./td/text()")[0:2]).strip() 153 | 154 | @staticmethod 155 | def freeProxy10(): 156 | """ 89免费代理 """ 157 | r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10) 158 | proxies = re.findall( 159 | 
r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', 160 | r.text) 161 | for proxy in proxies: 162 | yield ':'.join(proxy) 163 | 164 | # @staticmethod 165 | # def wallProxy01(): 166 | # """ 167 | # PzzQz https://pzzqz.com/ 168 | # """ 169 | # from requests import Session 170 | # from lxml import etree 171 | # session = Session() 172 | # try: 173 | # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text 174 | # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) 175 | # if x_csrf_token: 176 | # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} 177 | # proxy_resp = session.post("https://pzzqz.com/", verify=False, 178 | # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() 179 | # tree = etree.HTML(proxy_resp["proxy_html"]) 180 | # for tr in tree.xpath("//tr"): 181 | # ip = "".join(tr.xpath("./td[1]/text()")) 182 | # port = "".join(tr.xpath("./td[2]/text()")) 183 | # yield "%s:%s" % (ip, port) 184 | # except Exception as e: 185 | # print(e) 186 | 187 | # @staticmethod 188 | # def freeProxy10(): 189 | # """ 190 | # 墙外网站 cn-proxy 191 | # :return: 192 | # """ 193 | # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] 194 | # request = WebRequest() 195 | # for url in urls: 196 | # r = request.get(url, timeout=10) 197 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) 198 | # for proxy in proxies: 199 | # yield ':'.join(proxy) 200 | 201 | # @staticmethod 202 | # def freeProxy11(): 203 | # """ 204 | # https://proxy-list.org/english/index.php 205 | # :return: 206 | # """ 207 | # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] 208 | # request = WebRequest() 209 | # import base64 210 | # for url in urls: 211 | # r = request.get(url, timeout=10) 212 | # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) 213 | # for proxy in proxies: 214 | # yield base64.b64decode(proxy).decode() 215 | 216 | # @staticmethod 217 | # def freeProxy12(): 218 | # urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] 219 | # request = WebRequest() 220 | # for url in urls: 221 | # r = request.get(url, timeout=10) 222 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 223 | # for proxy in proxies: 224 | # yield ':'.join(proxy) 225 | 226 | 227 | if __name__ == '__main__': 228 | p = ProxyFetcher() 229 | for _ in p.freeProxy06(): 230 | print(_) 231 | 232 | # http://nntime.com/proxy-list-01.htm 233 | -------------------------------------------------------------------------------- /proxy_pool/handler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | # from handler.ProxyManager import ProxyManager 16 | -------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/configHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/configHandler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/logHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/logHandler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/proxyHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/proxyHandler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/handler/configHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: configHandler 5 | Description : 6 | Author : JHao 7 | date: 2020/6/22 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/22: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import os 16 | import setting 17 | from util.singleton import Singleton 18 | from util.lazyProperty import LazyProperty 19 | from util.six import reload_six, withMetaclass 20 | 21 | 22 | class ConfigHandler(withMetaclass(Singleton)): 23 | 24 | def __init__(self): 25 | pass 26 | 27 | @LazyProperty 28 | def serverHost(self): 29 | return os.environ.get("HOST", setting.HOST) 30 | 31 | @LazyProperty 32 | def serverPort(self): 33 | return os.environ.get("PORT", setting.PORT) 34 | 35 | @LazyProperty 36 | def dbConn(self): 37 | return os.getenv("DB_CONN", setting.DB_CONN) 38 | 39 | @LazyProperty 40 | def tableName(self): 41 | return os.getenv("TABLE_NAME", setting.TABLE_NAME) 42 | 43 | @property 44 | def fetchers(self): 45 | reload_six(setting) 46 | return setting.PROXY_FETCHER 47 | 48 | @LazyProperty 49 | def httpUrl(self): 50 | return os.getenv("HTTP_URL", setting.HTTP_URL) 51 | 52 | @LazyProperty 53 | def httpsUrl(self): 54 | return os.getenv("HTTPS_URL", setting.HTTPS_URL) 55 | 56 | @LazyProperty 57 | def verifyTimeout(self): 58 | return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT)) 59 | 60 | # @LazyProperty 61 | # def proxyCheckCount(self): 62 | # return int(os.getenv("PROXY_CHECK_COUNT", setting.PROXY_CHECK_COUNT)) 63 | 64 | @LazyProperty 65 | def maxFailCount(self): 66 | return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT)) 67 | 68 | # @LazyProperty 69 | # def maxFailRate(self): 70 | # return int(os.getenv("MAX_FAIL_RATE", setting.MAX_FAIL_RATE)) 71 | 72 | @LazyProperty 73 | def poolSizeMin(self): 74 | return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN)) 75 | 76 | @LazyProperty 77 | def proxyRegion(self): 78 | return bool(os.getenv("PROXY_REGION", setting.PROXY_REGION)) 79 | 80 | @LazyProperty 81 | def timezone(self): 82 | return os.getenv("TIMEZONE", setting.TIMEZONE) 83 | 84 | 
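# Usage sketch (illustrative, not part of the original module): each option is
# resolved from an environment variable first and falls back to setting.py, and
# LazyProperty caches the value after the first read, so any override must be
# set before that first read.
if __name__ == '__main__':
    os.environ["PORT"] = "5020"   # overrides setting.PORT for this process only
    conf = ConfigHandler()        # Singleton: repeated calls return the same instance
    print(conf.serverPort)        # -> "5020" (environment values arrive as strings)
    print(conf.fetchers)          # PROXY_FETCHER is re-read from setting.py on every access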
-------------------------------------------------------------------------------- /proxy_pool/handler/logHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: LogHandler.py 5 | Description : 日志操作模块 6 | Author : JHao 7 | date: 2017/3/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/03/06: log handler 11 | 2017/09/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) 12 | 2020/07/13: Windows下TimedRotatingFileHandler线程不安全, 不再使用 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | import os 18 | import logging 19 | import platform 20 | 21 | from logging.handlers import TimedRotatingFileHandler 22 | 23 | # 日志级别 24 | CRITICAL = 50 25 | FATAL = CRITICAL 26 | ERROR = 40 27 | WARNING = 30 28 | WARN = WARNING 29 | INFO = 20 30 | DEBUG = 10 31 | NOTSET = 0 32 | 33 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 34 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir) 35 | LOG_PATH = os.path.join(ROOT_PATH, 'log') 36 | 37 | if not os.path.exists(LOG_PATH): 38 | try: 39 | os.mkdir(LOG_PATH) 40 | except FileExistsError: 41 | pass 42 | 43 | 44 | class LogHandler(logging.Logger): 45 | """ 46 | LogHandler 47 | """ 48 | 49 | def __init__(self, name, level=DEBUG, stream=True, file=True): 50 | self.name = name 51 | self.level = level 52 | logging.Logger.__init__(self, self.name, level=level) 53 | if stream: 54 | self.__setStreamHandler__() 55 | if file: 56 | if platform.system() != "Windows": 57 | self.__setFileHandler__() 58 | 59 | def __setFileHandler__(self, level=None): 60 | """ 61 | set file handler 62 | :param level: 63 | :return: 64 | """ 65 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) 66 | # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 67 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) 68 | file_handler.suffix = '%Y%m%d.log' 69 | if not level: 70 | file_handler.setLevel(self.level) 71 | else: 72 | file_handler.setLevel(level) 73 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 74 | 75 | file_handler.setFormatter(formatter) 76 | self.file_handler = file_handler 77 | self.addHandler(file_handler) 78 | 79 | def __setStreamHandler__(self, level=None): 80 | """ 81 | set stream handler 82 | :param level: 83 | :return: 84 | """ 85 | stream_handler = logging.StreamHandler() 86 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 87 | stream_handler.setFormatter(formatter) 88 | if not level: 89 | stream_handler.setLevel(self.level) 90 | else: 91 | stream_handler.setLevel(level) 92 | self.addHandler(stream_handler) 93 | 94 | 95 | if __name__ == '__main__': 96 | log = LogHandler('test') 97 | log.info('this is a test msg') 98 | -------------------------------------------------------------------------------- /proxy_pool/handler/proxyHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ProxyHandler.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/03: 11 | 2020/05/26: 区分http和https 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | from helper.proxy 
import Proxy 17 | from db.dbClient import DbClient 18 | from handler.configHandler import ConfigHandler 19 | 20 | 21 | class ProxyHandler(object): 22 | """ Proxy CRUD operator""" 23 | 24 | def __init__(self): 25 | self.conf = ConfigHandler() 26 | self.db = DbClient(self.conf.dbConn) 27 | self.db.changeTable(self.conf.tableName) 28 | 29 | def get(self, https=False): 30 | """ 31 | return a proxy 32 | Args: 33 | https: True/False 34 | Returns: 35 | """ 36 | proxy = self.db.get(https) 37 | return Proxy.createFromJson(proxy) if proxy else None 38 | 39 | def pop(self, https): 40 | """ 41 | return and delete a useful proxy 42 | :return: 43 | """ 44 | proxy = self.db.pop(https) 45 | if proxy: 46 | return Proxy.createFromJson(proxy) 47 | return None 48 | 49 | def put(self, proxy): 50 | """ 51 | put proxy into use proxy 52 | :return: 53 | """ 54 | self.db.put(proxy) 55 | 56 | def delete(self, proxy): 57 | """ 58 | delete useful proxy 59 | :param proxy: 60 | :return: 61 | """ 62 | return self.db.delete(proxy.proxy) 63 | 64 | def getAll(self, https=False): 65 | """ 66 | get all proxy from pool as Proxy list 67 | :return: 68 | """ 69 | proxies = self.db.getAll(https) 70 | return [Proxy.createFromJson(_) for _ in proxies] 71 | 72 | def exists(self, proxy): 73 | """ 74 | check proxy exists 75 | :param proxy: 76 | :return: 77 | """ 78 | return self.db.exists(proxy.proxy) 79 | 80 | def getCount(self): 81 | """ 82 | return raw_proxy and use_proxy count 83 | :return: 84 | """ 85 | total_use_proxy = self.db.getCount() 86 | return {'count': total_use_proxy} 87 | -------------------------------------------------------------------------------- /proxy_pool/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__init__.py -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/check.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/check.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/fetch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/fetch.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/launcher.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/launcher.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/proxy.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/proxy.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/scheduler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/scheduler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/validator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/validator.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: check 5 | Description : 执行代理校验 6 | Author : JHao 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/08/06: 执行代理校验 11 | 2021/05/25: 分别校验http和https 12 | 2022/08/16: 获取代理Region信息 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | from util.six import Empty 18 | from threading import Thread 19 | from datetime import datetime 20 | from util.webRequest import WebRequest 21 | from handler.logHandler import LogHandler 22 | from helper.validator import ProxyValidator 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | 27 | class DoValidator(object): 28 | """ 执行校验 """ 29 | 30 | conf = ConfigHandler() 31 | 32 | @classmethod 33 | def validator(cls, proxy, work_type): 34 | """ 35 | 校验入口 36 | Args: 37 | proxy: Proxy Object 38 | work_type: raw/use 39 | Returns: 40 | Proxy Object 41 | """ 42 | http_r = cls.httpValidator(proxy) 43 | https_r = False if not http_r else cls.httpsValidator(proxy) 44 | 45 | proxy.check_count += 1 46 | proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 47 | proxy.last_status = True if http_r else False 48 | if http_r: 49 | if proxy.fail_count > 0: 50 | proxy.fail_count -= 1 51 | proxy.https = True if https_r else False 52 | if work_type == "raw": 53 | proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else "" 54 | else: 55 | proxy.fail_count += 1 56 | return proxy 57 | 58 | @classmethod 59 | def httpValidator(cls, proxy): 60 | for func in ProxyValidator.http_validator: 61 | if not func(proxy.proxy): 62 | return False 63 | return True 64 | 65 | @classmethod 66 | def httpsValidator(cls, proxy): 67 | for func in ProxyValidator.https_validator: 68 | if not func(proxy.proxy): 69 | return False 70 | return True 71 | 72 | @classmethod 73 | def preValidator(cls, proxy): 74 | for func in ProxyValidator.pre_validator: 75 | if not func(proxy): 76 | return False 77 | return True 78 | 79 | @classmethod 80 | def regionGetter(cls, proxy): 81 | try: 82 | url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0] 83 | r = WebRequest().get(url=url, retry_time=1, timeout=2).json 84 | return r['data']['address'] 85 | except: 86 | return 'error' 87 | 88 | 89 | class _ThreadChecker(Thread): 90 | """ 多线程检测 """ 91 | 92 | def __init__(self, 
work_type, target_queue, thread_name): 93 | Thread.__init__(self, name=thread_name) 94 | self.work_type = work_type 95 | self.log = LogHandler("checker") 96 | self.proxy_handler = ProxyHandler() 97 | self.target_queue = target_queue 98 | self.conf = ConfigHandler() 99 | 100 | def run(self): 101 | self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name)) 102 | while True: 103 | try: 104 | proxy = self.target_queue.get(block=False) 105 | except Empty: 106 | self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name)) 107 | break 108 | proxy = DoValidator.validator(proxy, self.work_type) 109 | if self.work_type == "raw": 110 | self.__ifRaw(proxy) 111 | else: 112 | self.__ifUse(proxy) 113 | self.target_queue.task_done() 114 | 115 | def __ifRaw(self, proxy): 116 | if proxy.last_status: 117 | if self.proxy_handler.exists(proxy): 118 | self.log.info('RawProxyCheck - {}: {} exist'.format(self.name, proxy.proxy.ljust(23))) 119 | else: 120 | self.log.info('RawProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23))) 121 | self.proxy_handler.put(proxy) 122 | else: 123 | self.log.info('RawProxyCheck - {}: {} fail'.format(self.name, proxy.proxy.ljust(23))) 124 | 125 | def __ifUse(self, proxy): 126 | if proxy.last_status: 127 | self.log.info('UseProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23))) 128 | self.proxy_handler.put(proxy) 129 | else: 130 | if proxy.fail_count > self.conf.maxFailCount: 131 | self.log.info('UseProxyCheck - {}: {} fail, count {} delete'.format(self.name, 132 | proxy.proxy.ljust(23), 133 | proxy.fail_count)) 134 | self.proxy_handler.delete(proxy) 135 | else: 136 | self.log.info('UseProxyCheck - {}: {} fail, count {} keep'.format(self.name, 137 | proxy.proxy.ljust(23), 138 | proxy.fail_count)) 139 | self.proxy_handler.put(proxy) 140 | 141 | 142 | def Checker(tp, queue): 143 | """ 144 | run Proxy ThreadChecker 145 | :param tp: raw/use 146 | :param queue: Proxy Queue 147 | :return: 148 | """ 149 | thread_list = list() 150 | for index in range(20): 151 | thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2))) 152 | 153 | for thread in thread_list: 154 | thread.setDaemon(True) 155 | thread.start() 156 | 157 | for thread in thread_list: 158 | thread.join() 159 | -------------------------------------------------------------------------------- /proxy_pool/helper/fetch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: fetchScheduler 5 | Description : 6 | Author : JHao 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/11/18: 多线程采集 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from threading import Thread 16 | from helper.proxy import Proxy 17 | from helper.check import DoValidator 18 | from handler.logHandler import LogHandler 19 | from handler.proxyHandler import ProxyHandler 20 | from fetcher.proxyFetcher import ProxyFetcher 21 | from handler.configHandler import ConfigHandler 22 | 23 | 24 | class _ThreadFetcher(Thread): 25 | 26 | def __init__(self, fetch_source, proxy_dict): 27 | Thread.__init__(self) 28 | self.fetch_source = fetch_source 29 | self.proxy_dict = proxy_dict 30 | self.fetcher = getattr(ProxyFetcher, fetch_source, None) 31 | self.log = LogHandler("fetcher") 32 | self.conf = ConfigHandler() 33 | self.proxy_handler = ProxyHandler() 
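    # run() below drains the configured fetcher generator into the shared proxy_dict;
    # the dict is keyed by the stripped "ip:port" string, so a proxy reported by
    # several sources is stored once and its source list is merged via add_source().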
34 | 35 | def run(self): 36 | self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source)) 37 | try: 38 | for proxy in self.fetcher(): 39 | self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23))) 40 | proxy = proxy.strip() 41 | if proxy in self.proxy_dict: 42 | self.proxy_dict[proxy].add_source(self.fetch_source) 43 | else: 44 | self.proxy_dict[proxy] = Proxy( 45 | proxy, source=self.fetch_source) 46 | except Exception as e: 47 | self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source)) 48 | self.log.error(str(e)) 49 | 50 | 51 | class Fetcher(object): 52 | name = "fetcher" 53 | 54 | def __init__(self): 55 | self.log = LogHandler(self.name) 56 | self.conf = ConfigHandler() 57 | 58 | def run(self): 59 | """ 60 | fetch proxy with proxyFetcher 61 | :return: 62 | """ 63 | proxy_dict = dict() 64 | thread_list = list() 65 | self.log.info("ProxyFetch : start") 66 | 67 | for fetch_source in self.conf.fetchers: 68 | self.log.info("ProxyFetch - {func}: start".format(func=fetch_source)) 69 | fetcher = getattr(ProxyFetcher, fetch_source, None) 70 | if not fetcher: 71 | self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source)) 72 | continue 73 | if not callable(fetcher): 74 | self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source)) 75 | continue 76 | thread_list.append(_ThreadFetcher(fetch_source, proxy_dict)) 77 | 78 | for thread in thread_list: 79 | thread.setDaemon(True) 80 | thread.start() 81 | 82 | for thread in thread_list: 83 | thread.join() 84 | 85 | self.log.info("ProxyFetch - all complete!") 86 | for _ in proxy_dict.values(): 87 | if DoValidator.preValidator(_.proxy): 88 | yield _ 89 | -------------------------------------------------------------------------------- /proxy_pool/helper/launcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: launcher 5 | Description : 启动器 6 | Author : JHao 7 | date: 2021/3/26 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/3/26: 启动器 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | from db.dbClient import DbClient 17 | from handler.logHandler import LogHandler 18 | from handler.configHandler import ConfigHandler 19 | 20 | log = LogHandler('launcher') 21 | 22 | 23 | def startServer(): 24 | __beforeStart() 25 | from api.proxyApi import runFlask 26 | runFlask() 27 | 28 | 29 | def startScheduler(): 30 | __beforeStart() 31 | from helper.scheduler import runScheduler 32 | runScheduler() 33 | 34 | 35 | def __beforeStart(): 36 | __showVersion() 37 | __showConfigure() 38 | if __checkDBConfig(): 39 | log.info('exit!') 40 | sys.exit() 41 | 42 | 43 | def __showVersion(): 44 | from setting import VERSION 45 | log.info("ProxyPool Version: %s" % VERSION) 46 | 47 | 48 | def __showConfigure(): 49 | conf = ConfigHandler() 50 | log.info("ProxyPool configure HOST: %s" % conf.serverHost) 51 | log.info("ProxyPool configure PORT: %s" % conf.serverPort) 52 | log.info("ProxyPool configure PROXY_FETCHER: %s" % conf.fetchers) 53 | 54 | 55 | def __checkDBConfig(): 56 | conf = ConfigHandler() 57 | db = DbClient(conf.dbConn) 58 | log.info("============ DATABASE CONFIGURE ================") 59 | log.info("DB_TYPE: %s" % db.db_type) 60 | log.info("DB_HOST: %s" % db.db_host) 61 | log.info("DB_PORT: %s" % db.db_port) 62 | log.info("DB_NAME: 
%s" % db.db_name) 63 | log.info("DB_USER: %s" % db.db_user) 64 | log.info("=================================================") 65 | return db.test() 66 | -------------------------------------------------------------------------------- /proxy_pool/helper/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: Proxy 5 | Description : 代理对象类型封装 6 | Author : JHao 7 | date: 2019/7/11 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/7/11: 代理对象类型封装 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import json 16 | 17 | 18 | class Proxy(object): 19 | 20 | def __init__(self, proxy, fail_count=0, region="", anonymous="", 21 | source="", check_count=0, last_status="", last_time="", https=False): 22 | self._proxy = proxy 23 | self._fail_count = fail_count 24 | self._region = region 25 | self._anonymous = anonymous 26 | self._source = source.split('/') 27 | self._check_count = check_count 28 | self._last_status = last_status 29 | self._last_time = last_time 30 | self._https = https 31 | 32 | @classmethod 33 | def createFromJson(cls, proxy_json): 34 | _dict = json.loads(proxy_json) 35 | return cls(proxy=_dict.get("proxy", ""), 36 | fail_count=_dict.get("fail_count", 0), 37 | region=_dict.get("region", ""), 38 | anonymous=_dict.get("anonymous", ""), 39 | source=_dict.get("source", ""), 40 | check_count=_dict.get("check_count", 0), 41 | last_status=_dict.get("last_status", ""), 42 | last_time=_dict.get("last_time", ""), 43 | https=_dict.get("https", False) 44 | ) 45 | 46 | @property 47 | def proxy(self): 48 | """ 代理 ip:port """ 49 | return self._proxy 50 | 51 | @property 52 | def fail_count(self): 53 | """ 检测失败次数 """ 54 | return self._fail_count 55 | 56 | @property 57 | def region(self): 58 | """ 地理位置(国家/城市) """ 59 | return self._region 60 | 61 | @property 62 | def anonymous(self): 63 | """ 匿名 """ 64 | return self._anonymous 65 | 66 | @property 67 | def source(self): 68 | """ 代理来源 """ 69 | return '/'.join(self._source) 70 | 71 | @property 72 | def check_count(self): 73 | """ 代理检测次数 """ 74 | return self._check_count 75 | 76 | @property 77 | def last_status(self): 78 | """ 最后一次检测结果 True -> 可用; False -> 不可用""" 79 | return self._last_status 80 | 81 | @property 82 | def last_time(self): 83 | """ 最后一次检测时间 """ 84 | return self._last_time 85 | 86 | @property 87 | def https(self): 88 | """ 是否支持https """ 89 | return self._https 90 | 91 | @property 92 | def to_dict(self): 93 | """ 属性字典 """ 94 | return {"proxy": self.proxy, 95 | "https": self.https, 96 | "fail_count": self.fail_count, 97 | "region": self.region, 98 | "anonymous": self.anonymous, 99 | "source": self.source, 100 | "check_count": self.check_count, 101 | "last_status": self.last_status, 102 | "last_time": self.last_time} 103 | 104 | @property 105 | def to_json(self): 106 | """ 属性json格式 """ 107 | return json.dumps(self.to_dict, ensure_ascii=False) 108 | 109 | @fail_count.setter 110 | def fail_count(self, value): 111 | self._fail_count = value 112 | 113 | @check_count.setter 114 | def check_count(self, value): 115 | self._check_count = value 116 | 117 | @last_status.setter 118 | def last_status(self, value): 119 | self._last_status = value 120 | 121 | @last_time.setter 122 | def last_time(self, value): 123 | self._last_time = value 124 | 125 | @https.setter 126 | def https(self, value): 127 | self._https = value 128 | 129 | 
@region.setter 130 | def region(self, value): 131 | self._region = value 132 | 133 | def add_source(self, source_str): 134 | if source_str: 135 | self._source.append(source_str) 136 | self._source = list(set(self._source)) 137 | -------------------------------------------------------------------------------- /proxy_pool/helper/scheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxyScheduler 5 | Description : 6 | Author : JHao 7 | date: 2019/8/5 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/08/05: proxyScheduler 11 | 2021/02/23: runProxyCheck时,剩余代理少于POOL_SIZE_MIN时执行抓取 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | from apscheduler.schedulers.blocking import BlockingScheduler 17 | from apscheduler.executors.pool import ProcessPoolExecutor 18 | 19 | from util.six import Queue 20 | from helper.fetch import Fetcher 21 | from helper.check import Checker 22 | from handler.logHandler import LogHandler 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | 27 | def __runProxyFetch(): 28 | proxy_queue = Queue() 29 | proxy_fetcher = Fetcher() 30 | 31 | for proxy in proxy_fetcher.run(): 32 | proxy_queue.put(proxy) 33 | 34 | Checker("raw", proxy_queue) 35 | 36 | 37 | def __runProxyCheck(): 38 | proxy_handler = ProxyHandler() 39 | proxy_queue = Queue() 40 | if proxy_handler.db.getCount().get("total", 0) < proxy_handler.conf.poolSizeMin: 41 | __runProxyFetch() 42 | for proxy in proxy_handler.getAll(): 43 | proxy_queue.put(proxy) 44 | Checker("use", proxy_queue) 45 | 46 | 47 | def runScheduler(): 48 | __runProxyFetch() 49 | 50 | timezone = ConfigHandler().timezone 51 | scheduler_log = LogHandler("scheduler") 52 | scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone) 53 | 54 | scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集") 55 | scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查") 56 | executors = { 57 | 'default': {'type': 'threadpool', 'max_workers': 20}, 58 | 'processpool': ProcessPoolExecutor(max_workers=5) 59 | } 60 | job_defaults = { 61 | 'coalesce': False, 62 | 'max_instances': 10 63 | } 64 | 65 | scheduler.configure(executors=executors, job_defaults=job_defaults, timezone=timezone) 66 | 67 | scheduler.start() 68 | 69 | 70 | if __name__ == '__main__': 71 | runScheduler() 72 | -------------------------------------------------------------------------------- /proxy_pool/helper/validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: _validators 5 | Description : 定义proxy验证方法 6 | Author : JHao 7 | date: 2021/5/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/5/25: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from re import findall 16 | from requests import head 17 | from util.six import withMetaclass 18 | from util.singleton import Singleton 19 | from handler.configHandler import ConfigHandler 20 | 21 | conf = ConfigHandler() 22 | 23 | HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 24 | 'Accept': '*/*', 25 | 'Connection': 
'keep-alive', 26 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 27 | 28 | 29 | class ProxyValidator(withMetaclass(Singleton)): 30 | pre_validator = [] 31 | http_validator = [] 32 | https_validator = [] 33 | 34 | @classmethod 35 | def addPreValidator(cls, func): 36 | cls.pre_validator.append(func) 37 | return func 38 | 39 | @classmethod 40 | def addHttpValidator(cls, func): 41 | cls.http_validator.append(func) 42 | return func 43 | 44 | @classmethod 45 | def addHttpsValidator(cls, func): 46 | cls.https_validator.append(func) 47 | return func 48 | 49 | 50 | @ProxyValidator.addPreValidator 51 | def formatValidator(proxy): 52 | """检查代理格式""" 53 | verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" 54 | _proxy = findall(verify_regex, proxy) 55 | return True if len(_proxy) == 1 and _proxy[0] == proxy else False 56 | 57 | 58 | @ProxyValidator.addHttpValidator 59 | def httpTimeOutValidator(proxy): 60 | """ http检测超时 """ 61 | 62 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)} 63 | 64 | try: 65 | r = head(conf.httpUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout) 66 | return True if r.status_code == 200 else False 67 | except Exception as e: 68 | return False 69 | 70 | 71 | @ProxyValidator.addHttpsValidator 72 | def httpsTimeOutValidator(proxy): 73 | """https检测超时""" 74 | 75 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)} 76 | try: 77 | r = head(conf.httpsUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout, verify=False) 78 | return True if r.status_code == 200 else False 79 | except Exception as e: 80 | return False 81 | 82 | 83 | @ProxyValidator.addHttpValidator 84 | def customValidatorExample(proxy): 85 | """自定义validator函数,校验代理是否可用, 返回True/False""" 86 | return True 87 | -------------------------------------------------------------------------------- /proxy_pool/proxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxy_pool 5 | Description : proxy pool 启动入口 6 | Author : JHao 7 | date: 2020/6/19 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/19: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import click 16 | from helper.launcher import startServer, startScheduler 17 | from setting import BANNER, VERSION 18 | 19 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) 20 | 21 | 22 | @click.group(context_settings=CONTEXT_SETTINGS) 23 | @click.version_option(version=VERSION) 24 | def cli(): 25 | """ProxyPool cli工具""" 26 | 27 | @cli.command(name="schedule") 28 | def schedule(): 29 | """ 启动调度程序 """ 30 | click.echo(BANNER) 31 | startScheduler() 32 | 33 | 34 | @cli.command(name="server") 35 | def server(): 36 | """ 启动api服务 """ 37 | click.echo(BANNER) 38 | startServer() 39 | 40 | 41 | if __name__ == '__main__': 42 | cli() 43 | -------------------------------------------------------------------------------- /proxy_pool/requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler==3.2.0 2 | werkzeug==0.15.5 3 | Flask==2.2.2 4 | requests==2.20.0 5 | click==7.0 6 | gunicorn==19.9.0 7 | lxml 8 | redis 9 | -------------------------------------------------------------------------------- /proxy_pool/setting.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: setting.py 5 | Description : configuration file 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | 14 | BANNER = r""" 15 | **************************************************************** 16 | *** ______ ********************* ______ *********** _ ******** 17 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 18 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 19 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 20 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 21 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 22 | **** __ / / ***** 23 | ************************* /___ / ******************************* 24 | ************************* ******************************** 25 | **************************************************************** 26 | """ 27 | 28 | VERSION = "2.4.0" 29 | 30 | # ############### server config ############### 31 | HOST = "0.0.0.0" 32 | 33 | PORT = 5010 34 | 35 | # ############### database config ################### 36 | # db connection uri 37 | # example: 38 | # Redis: redis://:password@ip:port/db 39 | # Ssdb: ssdb://:password@ip:port 40 | DB_CONN = 'redis://:@127.0.0.1:6379/0' 41 | 42 | # proxy table name 43 | TABLE_NAME = 'use_proxy' 44 | 45 | 46 | # ###### config the proxy fetch function ###### 47 | PROXY_FETCHER = [ 48 | "freeProxy01", 49 | "freeProxy02", 50 | "freeProxy03", 51 | "freeProxy04", 52 | "freeProxy05", 53 | "freeProxy06", 54 | "freeProxy07", 55 | "freeProxy08", 56 | "freeProxy09", 57 | "freeProxy10" 58 | ] 59 | 60 | # ############# proxy validator ################# 61 | # target sites used to validate proxies 62 | HTTP_URL = "http://httpbin.org" 63 | 64 | HTTPS_URL = "https://www.qq.com" 65 | 66 | # timeout (seconds) for proxy validation 67 | VERIFY_TIMEOUT = 10 68 | 69 | # maximum number of failures allowed in the last PROXY_CHECK_COUNT checks; proxies exceeding it are removed 70 | MAX_FAIL_COUNT = 0 71 | 72 | # maximum failure rate allowed in the last PROXY_CHECK_COUNT checks; proxies exceeding it are removed 73 | # MAX_FAIL_RATE = 0.1 74 | 75 | # a fetch is triggered during proxyCheck when the pool size drops below POOL_SIZE_MIN 76 | POOL_SIZE_MIN = 20 77 | 78 | # ############# proxy attributes ################# 79 | # whether to enable the proxy region attribute 80 | PROXY_REGION = True 81 | 82 | # ############# scheduler config ################# 83 | 84 | # Force a timezone for the scheduler (optional). 85 | # If this runs on a VM and 86 | # "ValueError: Timezone offset does not match system offset" 87 | # is raised during scheduling, 88 | # set a timezone for the scheduler on the following line. 89 | # Otherwise it will detect the timezone from the system automatically.
90 | 91 | TIMEZONE = "Asia/Shanghai" 92 | -------------------------------------------------------------------------------- /proxy_pool/test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | -------------------------------------------------------------------------------- /proxy_pool/test/testConfigHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testGetConfig 5 | Description : testGetConfig 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from handler.configHandler import ConfigHandler 16 | from time import sleep 17 | 18 | 19 | def testConfig(): 20 | """ 21 | :return: 22 | """ 23 | conf = ConfigHandler() 24 | print(conf.dbConn) 25 | print(conf.serverPort) 26 | print(conf.serverHost) 27 | print(conf.tableName) 28 | assert isinstance(conf.fetchers, list) 29 | print(conf.fetchers) 30 | 31 | for _ in range(2): 32 | print(conf.fetchers) 33 | sleep(5) 34 | 35 | 36 | if __name__ == '__main__': 37 | testConfig() 38 | 39 | -------------------------------------------------------------------------------- /proxy_pool/test/testDbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testDbClient 5 | Description : 6 | Author : JHao 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from db.dbClient import DbClient 16 | 17 | 18 | def testDbClient(): 19 | # ############### ssdb ############### 20 | ssdb_uri = "ssdb://:password@127.0.0.1:8888" 21 | s = DbClient.parseDbConn(ssdb_uri) 22 | assert s.db_type == "SSDB" 23 | assert s.db_pwd == "password" 24 | assert s.db_host == "127.0.0.1" 25 | assert s.db_port == 8888 26 | 27 | # ############### redis ############### 28 | redis_uri = "redis://:password@127.0.0.1:6379/1" 29 | r = DbClient.parseDbConn(redis_uri) 30 | assert r.db_type == "REDIS" 31 | assert r.db_pwd == "password" 32 | assert r.db_host == "127.0.0.1" 33 | assert r.db_port == 6379 34 | assert r.db_name == "1" 35 | print("DbClient ok!") 36 | 37 | 38 | if __name__ == '__main__': 39 | testDbClient() 40 | -------------------------------------------------------------------------------- /proxy_pool/test/testLogHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testLogHandler 5 | Description : 6 | Author : J_hao 7 | date: 2017/8/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/8/2: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from handler.logHandler import 
LogHandler 16 | 17 | 18 | def testLogHandler(): 19 | log = LogHandler('test') 20 | log.info('this is info') 21 | log.error('this is error') 22 | 23 | 24 | if __name__ == '__main__': 25 | testLogHandler() 26 | -------------------------------------------------------------------------------- /proxy_pool/test/testProxyClass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyClass 5 | Description : 6 | Author : JHao 7 | date: 2019/8/8 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/8: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import json 16 | from helper.proxy import Proxy 17 | 18 | 19 | def testProxyClass(): 20 | proxy = Proxy("127.0.0.1:8080") 21 | 22 | print(proxy.to_json) 23 | 24 | proxy.source = "test" 25 | 26 | proxy_str = json.dumps(proxy.to_dict, ensure_ascii=False) 27 | 28 | print(proxy_str) 29 | 30 | print(Proxy.createFromJson(proxy_str).to_dict) 31 | 32 | 33 | if __name__ == '__main__': 34 | testProxyClass() 35 | -------------------------------------------------------------------------------- /proxy_pool/test/testProxyFetcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyFetcher 5 | Description : 6 | Author : JHao 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from fetcher.proxyFetcher import ProxyFetcher 16 | from handler.configHandler import ConfigHandler 17 | 18 | 19 | def testProxyFetcher(): 20 | conf = ConfigHandler() 21 | proxy_getter_functions = conf.fetchers 22 | proxy_counter = {_: 0 for _ in proxy_getter_functions} 23 | for proxyGetter in proxy_getter_functions: 24 | for proxy in getattr(ProxyFetcher, proxyGetter.strip())(): 25 | if proxy: 26 | print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) 27 | proxy_counter[proxyGetter] = proxy_counter.get(proxyGetter) + 1 28 | for key, value in proxy_counter.items(): 29 | print(key, value) 30 | 31 | 32 | if __name__ == '__main__': 33 | testProxyFetcher() 34 | -------------------------------------------------------------------------------- /proxy_pool/test/testProxyValidator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyValidator 5 | Description : 6 | Author : JHao 7 | date: 2021/5/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/5/25: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from helper.validator import ProxyValidator 16 | 17 | 18 | def testProxyValidator(): 19 | for _ in ProxyValidator.pre_validator: 20 | print(_) 21 | for _ in ProxyValidator.http_validator: 22 | print(_) 23 | for _ in ProxyValidator.https_validator: 24 | print(_) 25 | 26 | 27 | if __name__ == '__main__': 28 | testProxyValidator() 29 | -------------------------------------------------------------------------------- /proxy_pool/test/testRedisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: 
utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testRedisClient 5 | Description : 6 | Author : JHao 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | def testRedisClient(): 17 | from db.dbClient import DbClient 18 | from helper.proxy import Proxy 19 | 20 | uri = "redis://:pwd@127.0.0.1:6379" 21 | db = DbClient(uri) 22 | db.changeTable("use_proxy") 23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}') 24 | 25 | print("put: ", db.put(proxy)) 26 | 27 | print("get: ", db.get(https=None)) 28 | 29 | print("exists: ", db.exists("27.38.96.101:9797")) 30 | 31 | print("exists: ", db.exists("27.38.96.101:8888")) 32 | 33 | print("pop: ", db.pop(https=None)) 34 | 35 | print("getAll: ", db.getAll(https=None)) 36 | 37 | print("getCount", db.getCount()) 38 | 39 | 40 | if __name__ == '__main__': 41 | testRedisClient() 42 | -------------------------------------------------------------------------------- /proxy_pool/test/testSsdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testSsdbClient 5 | Description : 6 | Author : JHao 7 | date: 2020/7/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/7/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | def testSsdbClient(): 17 | from db.dbClient import DbClient 18 | from helper.proxy import Proxy 19 | 20 | uri = "ssdb://@127.0.0.1:8888" 21 | db = DbClient(uri) 22 | db.changeTable("use_proxy") 23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}') 24 | 25 | print("put: ", db.put(proxy)) 26 | 27 | print("get: ", db.get(https=None)) 28 | 29 | print("exists: ", db.exists("27.38.96.101:9797")) 30 | 31 | print("exists: ", db.exists("27.38.96.101:8888")) 32 | 33 | print("getAll: ", db.getAll(https=None)) 34 | 35 | # print("pop: ", db.pop(https=None)) 36 | 37 | print("clear: ", db.clear()) 38 | 39 | print("getCount", db.getCount()) 40 | 41 | 42 | if __name__ == '__main__': 43 | testSsdbClient() 44 | -------------------------------------------------------------------------------- /proxy_pool/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2020/7/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/7/6: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/lazyProperty.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/lazyProperty.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/singleton.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/singleton.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/six.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/six.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/webRequest.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/webRequest.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/lazyProperty.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: lazyProperty 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | class LazyProperty(object): 17 | """ 18 | LazyProperty 19 | explain: http://www.spiderpy.cn/blog/5/ 20 | """ 21 | 22 | def __init__(self, func): 23 | self.func = func 24 | 25 | def __get__(self, instance, owner): 26 | if instance is None: 27 | return self 28 | else: 29 | value = self.func(instance) 30 | setattr(instance, self.func.__name__, value) 31 | return value 32 | -------------------------------------------------------------------------------- /proxy_pool/util/singleton.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: singleton 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | class Singleton(type): 17 | """ 18 | Singleton Metaclass 19 | """ 20 | 21 | _inst = {} 22 | 23 | def __call__(cls, *args, **kwargs): 24 | if cls not in cls._inst: 25 | cls._inst[cls] = super(Singleton, cls).__call__(*args) 26 | return cls._inst[cls] 27 | -------------------------------------------------------------------------------- /proxy_pool/util/six.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: six 5 | Description : 6 | Author : JHao 7 | date: 2020/6/22 8 | 
------------------------------------------------- 9 | Change Activity: 10 | 2020/6/22: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | 17 | PY2 = sys.version_info[0] == 2 18 | PY3 = sys.version_info[0] == 3 19 | 20 | if PY3: 21 | def iteritems(d, **kw): 22 | return iter(d.items(**kw)) 23 | else: 24 | def iteritems(d, **kw): 25 | return d.iteritems(**kw) 26 | 27 | if PY3: 28 | from urllib.parse import urlparse 29 | else: 30 | from urlparse import urlparse 31 | 32 | if PY3: 33 | from imp import reload as reload_six 34 | else: 35 | reload_six = reload 36 | 37 | if PY3: 38 | from queue import Empty, Queue 39 | else: 40 | from Queue import Empty, Queue 41 | 42 | 43 | def withMetaclass(meta, *bases): 44 | """Create a base class with a metaclass.""" 45 | 46 | # This requires a bit of explanation: the basic idea is to make a dummy 47 | # metaclass for one level of class instantiation that replaces itself with 48 | # the actual metaclass. 49 | class MetaClass(meta): 50 | 51 | def __new__(cls, name, this_bases, d): 52 | return meta(name, bases, d) 53 | 54 | return type.__new__(MetaClass, 'temporary_class', (), {}) 55 | -------------------------------------------------------------------------------- /proxy_pool/util/webRequest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: WebRequest 5 | Description : Network Requests Class 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from requests.models import Response 16 | from lxml import etree 17 | import requests 18 | import random 19 | import time 20 | 21 | from handler.logHandler import LogHandler 22 | 23 | requests.packages.urllib3.disable_warnings() 24 | 25 | 26 | class WebRequest(object): 27 | name = "web_request" 28 | 29 | def __init__(self, *args, **kwargs): 30 | self.log = LogHandler(self.name, file=False) 31 | self.response = Response() 32 | 33 | @property 34 | def user_agent(self): 35 | """ 36 | return an User-Agent at random 37 | :return: 38 | """ 39 | ua_list = [ 40 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', 41 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122', 42 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', 43 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', 44 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71', 45 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 46 | 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 47 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 48 | ] 49 | return random.choice(ua_list) 50 | 51 | @property 52 | def header(self): 53 | """ 54 | basic header 55 | :return: 56 | """ 57 | return {'User-Agent': self.user_agent, 58 | 'Accept': '*/*', 59 | 'Connection': 'keep-alive', 60 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 61 | 62 | def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs): 63 | """ 64 | get method 65 | :param url: target url 
66 | :param header: headers 67 | :param retry_time: retry time 68 | :param retry_interval: retry interval 69 | :param timeout: network timeout 70 | :return: 71 | """ 72 | headers = self.header 73 | if header and isinstance(header, dict): 74 | headers.update(header) 75 | while True: 76 | try: 77 | self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs) 78 | return self 79 | except Exception as e: 80 | self.log.error("requests: %s error: %s" % (url, str(e))) 81 | retry_time -= 1 82 | if retry_time <= 0: 83 | self.response = Response() 84 | self.response.status_code = 200 85 | return self 86 | self.log.info("retry after %s seconds" % retry_interval) 87 | time.sleep(retry_interval) 88 | 89 | @property 90 | def tree(self): 91 | return etree.HTML(self.response.content) 92 | 93 | @property 94 | def text(self): 95 | return self.response.text 96 | 97 | @property 98 | def json(self): 99 | try: 100 | return self.response.json() 101 | except Exception as e: 102 | self.log.error(str(e)) 103 | return {} 104 | 105 | --------------------------------------------------------------------------------
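A minimal usage sketch of WebRequest, the helper that the proxy fetchers build on. It assumes the interpreter is started from the proxy_pool directory (so util.webRequest is importable and the packages in requirements.txt are installed); the target URL and the parameter values are illustrative only.

from util.webRequest import WebRequest

# fetch a page with two retries and a 5-second timeout
r = WebRequest().get("http://httpbin.org/ip", retry_time=2, timeout=5)
print(r.json)    # parsed JSON body, or {} if the response is not valid JSON
tree = r.tree    # lxml element tree of the body, used by the XPath-based fetchers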