├── FlaskProject ├── .idea │ ├── .gitignore │ ├── flaskProject.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml ├── __pycache__ │ └── app.cpython-38.pyc ├── app.py ├── data.txt ├── static │ ├── css │ │ └── style.css │ ├── img │ │ ├── generated_leaf.jpg │ │ ├── generated_tree.jpg │ │ ├── icon.png │ │ ├── leaf.jpg │ │ └── tree.jpg │ ├── js │ │ ├── echarts.min.js │ │ └── main.js │ └── vendor │ │ ├── animate.css │ │ ├── animate.css │ │ └── animate.min.css │ │ ├── aos │ │ ├── aos.css │ │ └── aos.js │ │ ├── bootstrap │ │ ├── css │ │ │ ├── bootstrap-grid.css │ │ │ ├── bootstrap-grid.css.map │ │ │ ├── bootstrap-grid.min.css │ │ │ ├── bootstrap-grid.min.css.map │ │ │ ├── bootstrap-reboot.css │ │ │ ├── bootstrap-reboot.css.map │ │ │ ├── bootstrap-reboot.min.css │ │ │ ├── bootstrap-reboot.min.css.map │ │ │ ├── bootstrap.css │ │ │ ├── bootstrap.css.map │ │ │ ├── bootstrap.min.css │ │ │ └── bootstrap.min.css.map │ │ └── js │ │ │ ├── bootstrap.bundle.js │ │ │ ├── bootstrap.bundle.js.map │ │ │ ├── bootstrap.bundle.min.js │ │ │ ├── bootstrap.bundle.min.js.map │ │ │ ├── bootstrap.js │ │ │ ├── bootstrap.js.map │ │ │ ├── bootstrap.min.js │ │ │ └── bootstrap.min.js.map │ │ ├── boxicons │ │ ├── css │ │ │ ├── animations.css │ │ │ ├── boxicons.css │ │ │ ├── boxicons.min.css │ │ │ └── transformations.css │ │ └── fonts │ │ │ ├── boxicons.eot │ │ │ ├── boxicons.svg │ │ │ ├── boxicons.ttf │ │ │ ├── boxicons.woff │ │ │ └── boxicons.woff2 │ │ ├── counterup │ │ └── counterup.min.js │ │ ├── icofont │ │ ├── fonts │ │ │ ├── icofont.woff │ │ │ └── icofont.woff2 │ │ └── icofont.min.css │ │ ├── isotope-layout │ │ ├── isotope.pkgd.js │ │ └── isotope.pkgd.min.js │ │ ├── jquery-sticky │ │ └── jquery.sticky.js │ │ ├── jquery.easing │ │ └── jquery.easing.min.js │ │ ├── jquery │ │ ├── jquery.min.js │ │ └── jquery.min.map │ │ ├── php-email-form │ │ └── validate.js │ │ ├── venobox │ │ ├── venobox.css │ │ ├── venobox.js │ │ └── venobox.min.js │ │ └── waypoints │ │ └── jquery.waypoints.min.js ├── templates │ ├── index.html │ ├── movie.html │ ├── score.html │ └── word.html └── wordCloud.py ├── Master ├── .idea │ ├── .gitignore │ ├── Master.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml └── main.py ├── Pic ├── index.jpg ├── mongoDB_data.jpg ├── movies.jpg ├── proxy.jpg ├── redis_data.jpg ├── score.jpg ├── slave.jpg └── words.jpg ├── README.md ├── Slave ├── .idea │ ├── .gitignore │ ├── Slave.iml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ └── modules.xml ├── movies │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-38.pyc │ │ ├── middlewares.cpython-38.pyc │ │ ├── pipelines.cpython-38.pyc │ │ └── settings.cpython-38.pyc │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ ├── spiders │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-38.pyc │ │ │ └── douban_redis.cpython-38.pyc │ │ └── douban_redis.py │ └── start.py └── scrapy.cfg └── proxy_pool ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── proxy_pool.iml ├── __pycache__ └── setting.cpython-38.pyc ├── api ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── proxyApi.cpython-38.pyc └── proxyApi.py ├── db ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── dbClient.cpython-38.pyc │ └── redisClient.cpython-38.pyc ├── dbClient.py ├── redisClient.py └── ssdbClient.py ├── docs ├── Makefile ├── changelog.rst ├── conf.py ├── dev │ ├── 
ext_fetcher.rst │ ├── ext_validator.rst │ └── index.rst ├── index.rst ├── make.bat └── user │ ├── how_to_config.rst │ ├── how_to_run.rst │ ├── how_to_use.rst │ └── index.rst ├── fetcher ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ └── proxyFetcher.cpython-38.pyc └── proxyFetcher.py ├── handler ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── configHandler.cpython-38.pyc │ ├── logHandler.cpython-38.pyc │ └── proxyHandler.cpython-38.pyc ├── configHandler.py ├── logHandler.py └── proxyHandler.py ├── helper ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-38.pyc │ ├── check.cpython-38.pyc │ ├── fetch.cpython-38.pyc │ ├── launcher.cpython-38.pyc │ ├── proxy.cpython-38.pyc │ ├── scheduler.cpython-38.pyc │ └── validator.cpython-38.pyc ├── check.py ├── fetch.py ├── launcher.py ├── proxy.py ├── scheduler.py └── validator.py ├── proxyPool.py ├── requirements.txt ├── setting.py ├── test ├── __init__.py ├── testConfigHandler.py ├── testDbClient.py ├── testLogHandler.py ├── testProxyClass.py ├── testProxyFetcher.py ├── testProxyValidator.py ├── testRedisClient.py └── testSsdbClient.py └── util ├── __init__.py ├── __pycache__ ├── __init__.cpython-38.pyc ├── lazyProperty.cpython-38.pyc ├── singleton.cpython-38.pyc ├── six.cpython-38.pyc └── webRequest.cpython-38.pyc ├── lazyProperty.py ├── singleton.py ├── six.py └── webRequest.py /FlaskProject/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /FlaskProject/.idea/flaskProject.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 18 | 19 | -------------------------------------------------------------------------------- /FlaskProject/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /FlaskProject/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /FlaskProject/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /FlaskProject/__pycache__/app.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/__pycache__/app.cpython-38.pyc -------------------------------------------------------------------------------- /FlaskProject/app.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, make_response,jsonify 2 | import pymongo 3 | 4 | app = Flask(__name__) 5 | 6 | def myCollection(): 7 | client = pymongo.MongoClient(host='localhost', port=27017) 8 | db = client.movies 9 | collection = db.douban 10 | return collection 11 | 12 | @app.route('/') 13 | def home(): 14 | return index() 15 | 16 | @app.route('/index') 17 | def 
index(): 18 | # movies, ratings, words, team members 19 | movies_num = 0 20 | votes_num = 0 21 | words_num = 11655 22 | team_num = 8 23 | for item in myCollection().find(): 24 | movies_num += 1 25 | votes_num += int(item['comment_num']) 26 | votes_num = int(votes_num / 10000) 27 | return render_template("index.html",movies_num=movies_num,votes_num=votes_num,words_num=words_num,team_num=team_num) 28 | 29 | @app.route('/movie') 30 | def movie(): 31 | query = {} 32 | projection = {} 33 | 34 | projection["rank"] = u"$rank" 35 | projection["page_url"] = u"$page_url" 36 | projection["title"] = u"$title" 37 | projection["score"] = u"$score" 38 | projection["comment_num"] = u"$comment_num" 39 | projection["directedBy"] = u"$directedBy" 40 | projection["actors"] = u"$actors" 41 | projection["comment"] = u"$comment" 42 | projection["year"] = u"$year" 43 | projection["_id"] = 0 44 | 45 | cursor = myCollection().find(query, projection=projection) 46 | movies = [] 47 | for doc in cursor: 48 | movies.append({ 49 | 'rank': int(doc['rank']), 50 | 'link': doc['page_url'], 51 | 'title': doc['title'], 52 | 'score': doc['score'], 53 | 'comment_num': doc['comment_num'], 54 | 'directed_by': doc['directedBy'], 55 | # 'actors': doc['actors'], 56 | 'comment': doc['comment'], 57 | 'year': doc['year'], 58 | }) 59 | movies.sort(key=lambda x: x['rank'], reverse=False) 60 | 61 | return render_template("movie.html",movies = movies) 62 | 63 | 64 | @app.route('/word') 65 | def word(): 66 | return render_template("word.html") 67 | 68 | @app.route('/score') 69 | def score(): 70 | # sql = "select score,count(score) from movie250 group by score" 71 | pipeline = [ 72 | { 73 | u"$group": { 74 | u"_id": { 75 | u"score": u"$score" 76 | }, 77 | u"COUNT(score)": { 78 | u"$sum": 1 79 | } 80 | } 81 | }, 82 | { 83 | u"$project": { 84 | u"score": u"$_id.score", 85 | u"COUNT(score)": u"$COUNT(score)", 86 | u"_id": 0 87 | } 88 | } 89 | ] 90 | cursor = myCollection().aggregate(pipeline, allowDiskUse=True) 91 | score = [] # rating values 92 | num = [] # number of movies for each rating 93 | score_num = {} 94 | for doc in cursor: 95 | score.append(doc['score']) 96 | score_num[doc['score']] = doc['COUNT(score)'] 97 | score.sort() 98 | for count in range(len(score_num)): 99 | num.append(score_num[score[count]]) 100 | count += 1 101 | 102 | return render_template("score.html",score=score,num=num) 103 | 104 | if __name__ == '__main__': 105 | app.run(debug=True) -------------------------------------------------------------------------------- /FlaskProject/static/img/generated_leaf.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/generated_leaf.jpg -------------------------------------------------------------------------------- /FlaskProject/static/img/generated_tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/generated_tree.jpg -------------------------------------------------------------------------------- /FlaskProject/static/img/icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/icon.png -------------------------------------------------------------------------------- /FlaskProject/static/img/leaf.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/leaf.jpg -------------------------------------------------------------------------------- /FlaskProject/static/img/tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/img/tree.jpg -------------------------------------------------------------------------------- /FlaskProject/static/js/main.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Template Name: Mamba - v2.0.1 3 | * Template URL: https://bootstrapmade.com/mamba-one-page-bootstrap-template-free/ 4 | * Author: BootstrapMade.com 5 | * License: https://bootstrapmade.com/license/ 6 | */ 7 | !(function($) { 8 | "use strict"; 9 | 10 | // Toggle .header-scrolled class to #header when page is scrolled 11 | $(window).scroll(function() { 12 | if ($(this).scrollTop() > 100) { 13 | $('#header').addClass('header-scrolled'); 14 | } else { 15 | $('#header').removeClass('header-scrolled'); 16 | } 17 | }); 18 | 19 | if ($(window).scrollTop() > 100) { 20 | $('#header').addClass('header-scrolled'); 21 | } 22 | 23 | // Stick the header at top on scroll 24 | $("#header").sticky({ 25 | topSpacing: 0, 26 | zIndex: '50' 27 | }); 28 | 29 | // Smooth scroll for the navigation menu and links with .scrollto classes 30 | $(document).on('click', '.nav-menu a, .mobile-nav a, .scrollto', function(e) { 31 | if (location.pathname.replace(/^\//, '') == this.pathname.replace(/^\//, '') && location.hostname == this.hostname) { 32 | e.preventDefault(); 33 | var target = $(this.hash); 34 | if (target.length) { 35 | 36 | var scrollto = target.offset().top; 37 | var scrolled = 2; 38 | 39 | if ($('#header-sticky-wrapper').length) { 40 | scrollto -= $('#header-sticky-wrapper').outerHeight() - scrolled; 41 | } 42 | 43 | if ($(this).attr("href") == '#header') { 44 | scrollto = 0; 45 | } 46 | 47 | $('html, body').animate({ 48 | scrollTop: scrollto 49 | }, 1500, 'easeInOutExpo'); 50 | 51 | if ($(this).parents('.nav-menu, .mobile-nav').length) { 52 | $('.nav-menu .active, .mobile-nav .active').removeClass('active'); 53 | $(this).closest('li').addClass('active'); 54 | } 55 | 56 | if ($('body').hasClass('mobile-nav-active')) { 57 | $('body').removeClass('mobile-nav-active'); 58 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close'); 59 | $('.mobile-nav-overly').fadeOut(); 60 | } 61 | return false; 62 | } 63 | } 64 | }); 65 | 66 | // Mobile Navigation 67 | if ($('.nav-menu').length) { 68 | var $mobile_nav = $('.nav-menu').clone().prop({ 69 | class: 'mobile-nav d-lg-none' 70 | }); 71 | $('body').append($mobile_nav); 72 | $('body').prepend(''); 73 | $('body').append('
'); 74 | 75 | $(document).on('click', '.mobile-nav-toggle', function(e) { 76 | $('body').toggleClass('mobile-nav-active'); 77 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close'); 78 | $('.mobile-nav-overly').toggle(); 79 | }); 80 | 81 | $(document).on('click', '.mobile-nav .drop-down > a', function(e) { 82 | e.preventDefault(); 83 | $(this).next().slideToggle(300); 84 | $(this).parent().toggleClass('active'); 85 | }); 86 | 87 | $(document).click(function(e) { 88 | var container = $(".mobile-nav, .mobile-nav-toggle"); 89 | if (!container.is(e.target) && container.has(e.target).length === 0) { 90 | if ($('body').hasClass('mobile-nav-active')) { 91 | $('body').removeClass('mobile-nav-active'); 92 | $('.mobile-nav-toggle i').toggleClass('icofont-navigation-menu icofont-close'); 93 | $('.mobile-nav-overly').fadeOut(); 94 | } 95 | } 96 | }); 97 | } else if ($(".mobile-nav, .mobile-nav-toggle").length) { 98 | $(".mobile-nav, .mobile-nav-toggle").hide(); 99 | } 100 | 101 | // Intro carousel 102 | var heroCarousel = $("#heroCarousel"); 103 | var heroCarouselIndicators = $("#hero-carousel-indicators"); 104 | heroCarousel.find(".carousel-inner").children(".carousel-item").each(function(index) { 105 | (index === 0) ? 106 | heroCarouselIndicators.append("
  • "): 107 | heroCarouselIndicators.append("
  • "); 108 | }); 109 | 110 | heroCarousel.on('slid.bs.carousel', function(e) { 111 | $(this).find('h2').addClass('animated fadeInDown'); 112 | $(this).find('p').addClass('animated fadeInUp'); 113 | $(this).find('.btn-get-started').addClass('animated fadeInUp'); 114 | }); 115 | 116 | // Back to top button 117 | $(window).scroll(function() { 118 | if ($(this).scrollTop() > 100) { 119 | $('.back-to-top').fadeIn('slow'); 120 | } else { 121 | $('.back-to-top').fadeOut('slow'); 122 | } 123 | }); 124 | 125 | $('.back-to-top').click(function() { 126 | $('html, body').animate({ 127 | scrollTop: 0 128 | }, 1500, 'easeInOutExpo'); 129 | return false; 130 | }); 131 | 132 | // Initiate the venobox plugin 133 | $(window).on('load', function() { 134 | $('.venobox').venobox(); 135 | }); 136 | 137 | // jQuery counterUp 138 | $('[data-toggle="counter-up"]').counterUp({ 139 | delay: 10, 140 | time: 1000 141 | }); 142 | 143 | // Porfolio isotope and filter 144 | $(window).on('load', function() { 145 | var portfolioIsotope = $('.portfolio-container').isotope({ 146 | itemSelector: '.portfolio-item', 147 | layoutMode: 'fitRows' 148 | }); 149 | 150 | $('#portfolio-flters li').on('click', function() { 151 | $("#portfolio-flters li").removeClass('filter-active'); 152 | $(this).addClass('filter-active'); 153 | 154 | portfolioIsotope.isotope({ 155 | filter: $(this).data('filter') 156 | }); 157 | }); 158 | 159 | // Initiate venobox (lightbox feature used in portofilo) 160 | $(document).ready(function() { 161 | $('.venobox').venobox(); 162 | }); 163 | }); 164 | 165 | // Initi AOS 166 | AOS.init({ 167 | duration: 1000, 168 | easing: "ease-in-out-back" 169 | }); 170 | 171 | })(jQuery); -------------------------------------------------------------------------------- /FlaskProject/static/vendor/bootstrap/css/bootstrap-reboot.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Reboot v4.4.1 (https://getbootstrap.com/) 3 | * Copyright 2011-2019 The Bootstrap Authors 4 | * Copyright 2011-2019 Twitter, Inc. 
5 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 6 | * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md) 7 | */ 8 | *, 9 | *::before, 10 | *::after { 11 | box-sizing: border-box; 12 | } 13 | 14 | html { 15 | font-family: sans-serif; 16 | line-height: 1.15; 17 | -webkit-text-size-adjust: 100%; 18 | -webkit-tap-highlight-color: rgba(0, 0, 0, 0); 19 | } 20 | 21 | article, aside, figcaption, figure, footer, header, hgroup, main, nav, section { 22 | display: block; 23 | } 24 | 25 | body { 26 | margin: 0; 27 | font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, "Helvetica Neue", Arial, "Noto Sans", sans-serif, "Apple Color Emoji", "Segoe UI Emoji", "Segoe UI Symbol", "Noto Color Emoji"; 28 | font-size: 1rem; 29 | font-weight: 400; 30 | line-height: 1.5; 31 | color: #212529; 32 | text-align: left; 33 | background-color: #fff; 34 | } 35 | 36 | [tabindex="-1"]:focus:not(:focus-visible) { 37 | outline: 0 !important; 38 | } 39 | 40 | hr { 41 | box-sizing: content-box; 42 | height: 0; 43 | overflow: visible; 44 | } 45 | 46 | h1, h2, h3, h4, h5, h6 { 47 | margin-top: 0; 48 | margin-bottom: 0.5rem; 49 | } 50 | 51 | p { 52 | margin-top: 0; 53 | margin-bottom: 1rem; 54 | } 55 | 56 | abbr[title], 57 | abbr[data-original-title] { 58 | text-decoration: underline; 59 | -webkit-text-decoration: underline dotted; 60 | text-decoration: underline dotted; 61 | cursor: help; 62 | border-bottom: 0; 63 | -webkit-text-decoration-skip-ink: none; 64 | text-decoration-skip-ink: none; 65 | } 66 | 67 | address { 68 | margin-bottom: 1rem; 69 | font-style: normal; 70 | line-height: inherit; 71 | } 72 | 73 | ol, 74 | ul, 75 | dl { 76 | margin-top: 0; 77 | margin-bottom: 1rem; 78 | } 79 | 80 | ol ol, 81 | ul ul, 82 | ol ul, 83 | ul ol { 84 | margin-bottom: 0; 85 | } 86 | 87 | dt { 88 | font-weight: 700; 89 | } 90 | 91 | dd { 92 | margin-bottom: .5rem; 93 | margin-left: 0; 94 | } 95 | 96 | blockquote { 97 | margin: 0 0 1rem; 98 | } 99 | 100 | b, 101 | strong { 102 | font-weight: bolder; 103 | } 104 | 105 | small { 106 | font-size: 80%; 107 | } 108 | 109 | sub, 110 | sup { 111 | position: relative; 112 | font-size: 75%; 113 | line-height: 0; 114 | vertical-align: baseline; 115 | } 116 | 117 | sub { 118 | bottom: -.25em; 119 | } 120 | 121 | sup { 122 | top: -.5em; 123 | } 124 | 125 | a { 126 | color: #007bff; 127 | text-decoration: none; 128 | background-color: transparent; 129 | } 130 | 131 | a:hover { 132 | color: #0056b3; 133 | text-decoration: underline; 134 | } 135 | 136 | a:not([href]) { 137 | color: inherit; 138 | text-decoration: none; 139 | } 140 | 141 | a:not([href]):hover { 142 | color: inherit; 143 | text-decoration: none; 144 | } 145 | 146 | pre, 147 | code, 148 | kbd, 149 | samp { 150 | font-family: SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace; 151 | font-size: 1em; 152 | } 153 | 154 | pre { 155 | margin-top: 0; 156 | margin-bottom: 1rem; 157 | overflow: auto; 158 | } 159 | 160 | figure { 161 | margin: 0 0 1rem; 162 | } 163 | 164 | img { 165 | vertical-align: middle; 166 | border-style: none; 167 | } 168 | 169 | svg { 170 | overflow: hidden; 171 | vertical-align: middle; 172 | } 173 | 174 | table { 175 | border-collapse: collapse; 176 | } 177 | 178 | caption { 179 | padding-top: 0.75rem; 180 | padding-bottom: 0.75rem; 181 | color: #6c757d; 182 | text-align: left; 183 | caption-side: bottom; 184 | } 185 | 186 | th { 187 | text-align: inherit; 188 | } 189 | 190 | label { 191 | 
display: inline-block; 192 | margin-bottom: 0.5rem; 193 | } 194 | 195 | button { 196 | border-radius: 0; 197 | } 198 | 199 | button:focus { 200 | outline: 1px dotted; 201 | outline: 5px auto -webkit-focus-ring-color; 202 | } 203 | 204 | input, 205 | button, 206 | select, 207 | optgroup, 208 | textarea { 209 | margin: 0; 210 | font-family: inherit; 211 | font-size: inherit; 212 | line-height: inherit; 213 | } 214 | 215 | button, 216 | input { 217 | overflow: visible; 218 | } 219 | 220 | button, 221 | select { 222 | text-transform: none; 223 | } 224 | 225 | select { 226 | word-wrap: normal; 227 | } 228 | 229 | button, 230 | [type="button"], 231 | [type="reset"], 232 | [type="submit"] { 233 | -webkit-appearance: button; 234 | } 235 | 236 | button:not(:disabled), 237 | [type="button"]:not(:disabled), 238 | [type="reset"]:not(:disabled), 239 | [type="submit"]:not(:disabled) { 240 | cursor: pointer; 241 | } 242 | 243 | button::-moz-focus-inner, 244 | [type="button"]::-moz-focus-inner, 245 | [type="reset"]::-moz-focus-inner, 246 | [type="submit"]::-moz-focus-inner { 247 | padding: 0; 248 | border-style: none; 249 | } 250 | 251 | input[type="radio"], 252 | input[type="checkbox"] { 253 | box-sizing: border-box; 254 | padding: 0; 255 | } 256 | 257 | input[type="date"], 258 | input[type="time"], 259 | input[type="datetime-local"], 260 | input[type="month"] { 261 | -webkit-appearance: listbox; 262 | } 263 | 264 | textarea { 265 | overflow: auto; 266 | resize: vertical; 267 | } 268 | 269 | fieldset { 270 | min-width: 0; 271 | padding: 0; 272 | margin: 0; 273 | border: 0; 274 | } 275 | 276 | legend { 277 | display: block; 278 | width: 100%; 279 | max-width: 100%; 280 | padding: 0; 281 | margin-bottom: .5rem; 282 | font-size: 1.5rem; 283 | line-height: inherit; 284 | color: inherit; 285 | white-space: normal; 286 | } 287 | 288 | progress { 289 | vertical-align: baseline; 290 | } 291 | 292 | [type="number"]::-webkit-inner-spin-button, 293 | [type="number"]::-webkit-outer-spin-button { 294 | height: auto; 295 | } 296 | 297 | [type="search"] { 298 | outline-offset: -2px; 299 | -webkit-appearance: none; 300 | } 301 | 302 | [type="search"]::-webkit-search-decoration { 303 | -webkit-appearance: none; 304 | } 305 | 306 | ::-webkit-file-upload-button { 307 | font: inherit; 308 | -webkit-appearance: button; 309 | } 310 | 311 | output { 312 | display: inline-block; 313 | } 314 | 315 | summary { 316 | display: list-item; 317 | cursor: pointer; 318 | } 319 | 320 | template { 321 | display: none; 322 | } 323 | 324 | [hidden] { 325 | display: none !important; 326 | } 327 | /*# sourceMappingURL=bootstrap-reboot.css.map */ -------------------------------------------------------------------------------- /FlaskProject/static/vendor/bootstrap/css/bootstrap-reboot.min.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * Bootstrap Reboot v4.4.1 (https://getbootstrap.com/) 3 | * Copyright 2011-2019 The Bootstrap Authors 4 | * Copyright 2011-2019 Twitter, Inc. 
5 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 6 | * Forked from Normalize.css, licensed MIT (https://github.com/necolas/normalize.css/blob/master/LICENSE.md) 7 | */*,::after,::before{box-sizing:border-box}html{font-family:sans-serif;line-height:1.15;-webkit-text-size-adjust:100%;-webkit-tap-highlight-color:transparent}article,aside,figcaption,figure,footer,header,hgroup,main,nav,section{display:block}body{margin:0;font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Roboto,"Helvetica Neue",Arial,"Noto Sans",sans-serif,"Apple Color Emoji","Segoe UI Emoji","Segoe UI Symbol","Noto Color Emoji";font-size:1rem;font-weight:400;line-height:1.5;color:#212529;text-align:left;background-color:#fff}[tabindex="-1"]:focus:not(:focus-visible){outline:0!important}hr{box-sizing:content-box;height:0;overflow:visible}h1,h2,h3,h4,h5,h6{margin-top:0;margin-bottom:.5rem}p{margin-top:0;margin-bottom:1rem}abbr[data-original-title],abbr[title]{text-decoration:underline;-webkit-text-decoration:underline dotted;text-decoration:underline dotted;cursor:help;border-bottom:0;-webkit-text-decoration-skip-ink:none;text-decoration-skip-ink:none}address{margin-bottom:1rem;font-style:normal;line-height:inherit}dl,ol,ul{margin-top:0;margin-bottom:1rem}ol ol,ol ul,ul ol,ul ul{margin-bottom:0}dt{font-weight:700}dd{margin-bottom:.5rem;margin-left:0}blockquote{margin:0 0 1rem}b,strong{font-weight:bolder}small{font-size:80%}sub,sup{position:relative;font-size:75%;line-height:0;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}a{color:#007bff;text-decoration:none;background-color:transparent}a:hover{color:#0056b3;text-decoration:underline}a:not([href]){color:inherit;text-decoration:none}a:not([href]):hover{color:inherit;text-decoration:none}code,kbd,pre,samp{font-family:SFMono-Regular,Menlo,Monaco,Consolas,"Liberation Mono","Courier New",monospace;font-size:1em}pre{margin-top:0;margin-bottom:1rem;overflow:auto}figure{margin:0 0 1rem}img{vertical-align:middle;border-style:none}svg{overflow:hidden;vertical-align:middle}table{border-collapse:collapse}caption{padding-top:.75rem;padding-bottom:.75rem;color:#6c757d;text-align:left;caption-side:bottom}th{text-align:inherit}label{display:inline-block;margin-bottom:.5rem}button{border-radius:0}button:focus{outline:1px dotted;outline:5px auto 
-webkit-focus-ring-color}button,input,optgroup,select,textarea{margin:0;font-family:inherit;font-size:inherit;line-height:inherit}button,input{overflow:visible}button,select{text-transform:none}select{word-wrap:normal}[type=button],[type=reset],[type=submit],button{-webkit-appearance:button}[type=button]:not(:disabled),[type=reset]:not(:disabled),[type=submit]:not(:disabled),button:not(:disabled){cursor:pointer}[type=button]::-moz-focus-inner,[type=reset]::-moz-focus-inner,[type=submit]::-moz-focus-inner,button::-moz-focus-inner{padding:0;border-style:none}input[type=checkbox],input[type=radio]{box-sizing:border-box;padding:0}input[type=date],input[type=datetime-local],input[type=month],input[type=time]{-webkit-appearance:listbox}textarea{overflow:auto;resize:vertical}fieldset{min-width:0;padding:0;margin:0;border:0}legend{display:block;width:100%;max-width:100%;padding:0;margin-bottom:.5rem;font-size:1.5rem;line-height:inherit;color:inherit;white-space:normal}progress{vertical-align:baseline}[type=number]::-webkit-inner-spin-button,[type=number]::-webkit-outer-spin-button{height:auto}[type=search]{outline-offset:-2px;-webkit-appearance:none}[type=search]::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{font:inherit;-webkit-appearance:button}output{display:inline-block}summary{display:list-item;cursor:pointer}template{display:none}[hidden]{display:none!important} 8 | /*# sourceMappingURL=bootstrap-reboot.min.css.map */ -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/css/animations.css: -------------------------------------------------------------------------------- 1 | @-webkit-keyframes spin 2 | { 3 | 0% 4 | { 5 | -webkit-transform: rotate(0); 6 | transform: rotate(0); 7 | } 8 | 100% 9 | { 10 | -webkit-transform: rotate(359deg); 11 | transform: rotate(359deg); 12 | } 13 | } 14 | @keyframes spin 15 | { 16 | 0% 17 | { 18 | -webkit-transform: rotate(0); 19 | transform: rotate(0); 20 | } 21 | 100% 22 | { 23 | -webkit-transform: rotate(359deg); 24 | transform: rotate(359deg); 25 | } 26 | } 27 | @-webkit-keyframes burst 28 | { 29 | 0% 30 | { 31 | -webkit-transform: scale(1); 32 | transform: scale(1); 33 | 34 | opacity: 1; 35 | } 36 | 90% 37 | { 38 | -webkit-transform: scale(1.5); 39 | transform: scale(1.5); 40 | 41 | opacity: 0; 42 | } 43 | } 44 | @keyframes burst 45 | { 46 | 0% 47 | { 48 | -webkit-transform: scale(1); 49 | transform: scale(1); 50 | 51 | opacity: 1; 52 | } 53 | 90% 54 | { 55 | -webkit-transform: scale(1.5); 56 | transform: scale(1.5); 57 | 58 | opacity: 0; 59 | } 60 | } 61 | @-webkit-keyframes flashing 62 | { 63 | 0% 64 | { 65 | opacity: 1; 66 | } 67 | 45% 68 | { 69 | opacity: 0; 70 | } 71 | 90% 72 | { 73 | opacity: 1; 74 | } 75 | } 76 | @keyframes flashing 77 | { 78 | 0% 79 | { 80 | opacity: 1; 81 | } 82 | 45% 83 | { 84 | opacity: 0; 85 | } 86 | 90% 87 | { 88 | opacity: 1; 89 | } 90 | } 91 | @-webkit-keyframes fade-left 92 | { 93 | 0% 94 | { 95 | -webkit-transform: translateX(0); 96 | transform: translateX(0); 97 | 98 | opacity: 1; 99 | } 100 | 75% 101 | { 102 | -webkit-transform: translateX(-20px); 103 | transform: translateX(-20px); 104 | 105 | opacity: 0; 106 | } 107 | } 108 | @keyframes fade-left 109 | { 110 | 0% 111 | { 112 | -webkit-transform: translateX(0); 113 | transform: translateX(0); 114 | 115 | opacity: 1; 116 | } 117 | 75% 118 | { 119 | -webkit-transform: translateX(-20px); 120 | transform: translateX(-20px); 121 | 122 | opacity: 0; 123 | } 124 | } 125 | 
@-webkit-keyframes fade-right 126 | { 127 | 0% 128 | { 129 | -webkit-transform: translateX(0); 130 | transform: translateX(0); 131 | 132 | opacity: 1; 133 | } 134 | 75% 135 | { 136 | -webkit-transform: translateX(20px); 137 | transform: translateX(20px); 138 | 139 | opacity: 0; 140 | } 141 | } 142 | @keyframes fade-right 143 | { 144 | 0% 145 | { 146 | -webkit-transform: translateX(0); 147 | transform: translateX(0); 148 | 149 | opacity: 1; 150 | } 151 | 75% 152 | { 153 | -webkit-transform: translateX(20px); 154 | transform: translateX(20px); 155 | 156 | opacity: 0; 157 | } 158 | } 159 | @-webkit-keyframes fade-up 160 | { 161 | 0% 162 | { 163 | -webkit-transform: translateY(0); 164 | transform: translateY(0); 165 | 166 | opacity: 1; 167 | } 168 | 75% 169 | { 170 | -webkit-transform: translateY(-20px); 171 | transform: translateY(-20px); 172 | 173 | opacity: 0; 174 | } 175 | } 176 | @keyframes fade-up 177 | { 178 | 0% 179 | { 180 | -webkit-transform: translateY(0); 181 | transform: translateY(0); 182 | 183 | opacity: 1; 184 | } 185 | 75% 186 | { 187 | -webkit-transform: translateY(-20px); 188 | transform: translateY(-20px); 189 | 190 | opacity: 0; 191 | } 192 | } 193 | @-webkit-keyframes fade-down 194 | { 195 | 0% 196 | { 197 | -webkit-transform: translateY(0); 198 | transform: translateY(0); 199 | 200 | opacity: 1; 201 | } 202 | 75% 203 | { 204 | -webkit-transform: translateY(20px); 205 | transform: translateY(20px); 206 | 207 | opacity: 0; 208 | } 209 | } 210 | @keyframes fade-down 211 | { 212 | 0% 213 | { 214 | -webkit-transform: translateY(0); 215 | transform: translateY(0); 216 | 217 | opacity: 1; 218 | } 219 | 75% 220 | { 221 | -webkit-transform: translateY(20px); 222 | transform: translateY(20px); 223 | 224 | opacity: 0; 225 | } 226 | } 227 | @-webkit-keyframes tada 228 | { 229 | from 230 | { 231 | -webkit-transform: scale3d(1, 1, 1); 232 | transform: scale3d(1, 1, 1); 233 | } 234 | 235 | 10%, 236 | 20% 237 | { 238 | -webkit-transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 239 | transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 240 | } 241 | 242 | 30%, 243 | 50%, 244 | 70%, 245 | 90% 246 | { 247 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 248 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 249 | } 250 | 251 | 40%, 252 | 60%, 253 | 80% 254 | { 255 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, -10deg); 256 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, -10deg); 257 | } 258 | 259 | to 260 | { 261 | -webkit-transform: scale3d(1, 1, 1); 262 | transform: scale3d(1, 1, 1); 263 | } 264 | } 265 | 266 | @keyframes tada 267 | { 268 | from 269 | { 270 | -webkit-transform: scale3d(1, 1, 1); 271 | transform: scale3d(1, 1, 1); 272 | } 273 | 274 | 10%, 275 | 20% 276 | { 277 | -webkit-transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 278 | transform: scale3d(.95, .95, .95) rotate3d(0, 0, 1, -10deg); 279 | } 280 | 281 | 30%, 282 | 50%, 283 | 70%, 284 | 90% 285 | { 286 | -webkit-transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 287 | transform: scale3d(1, 1, 1) rotate3d(0, 0, 1, 10deg); 288 | } 289 | 290 | 40%, 291 | 60%, 292 | 80% 293 | { 294 | -webkit-transform: rotate3d(0, 0, 1, -10deg); 295 | transform: rotate3d(0, 0, 1, -10deg); 296 | } 297 | 298 | to 299 | { 300 | -webkit-transform: scale3d(1, 1, 1); 301 | transform: scale3d(1, 1, 1); 302 | } 303 | } 304 | .bx-spin 305 | { 306 | -webkit-animation: spin 2s linear infinite; 307 | animation: spin 2s linear infinite; 308 | } 309 | .bx-spin-hover:hover 310 | { 311 | 
-webkit-animation: spin 2s linear infinite; 312 | animation: spin 2s linear infinite; 313 | } 314 | 315 | .bx-tada 316 | { 317 | -webkit-animation: tada 1.5s ease infinite; 318 | animation: tada 1.5s ease infinite; 319 | } 320 | .bx-tada-hover:hover 321 | { 322 | -webkit-animation: tada 1.5s ease infinite; 323 | animation: tada 1.5s ease infinite; 324 | } 325 | 326 | .bx-flashing 327 | { 328 | -webkit-animation: flashing 1.5s infinite linear; 329 | animation: flashing 1.5s infinite linear; 330 | } 331 | .bx-flashing-hover:hover 332 | { 333 | -webkit-animation: flashing 1.5s infinite linear; 334 | animation: flashing 1.5s infinite linear; 335 | } 336 | 337 | .bx-burst 338 | { 339 | -webkit-animation: burst 1.5s infinite linear; 340 | animation: burst 1.5s infinite linear; 341 | } 342 | .bx-burst-hover:hover 343 | { 344 | -webkit-animation: burst 1.5s infinite linear; 345 | animation: burst 1.5s infinite linear; 346 | } 347 | .bx-fade-up 348 | { 349 | -webkit-animation: fade-up 1.5s infinite linear; 350 | animation: fade-up 1.5s infinite linear; 351 | } 352 | .bx-fade-up-hover:hover 353 | { 354 | -webkit-animation: fade-up 1.5s infinite linear; 355 | animation: fade-up 1.5s infinite linear; 356 | } 357 | .bx-fade-down 358 | { 359 | -webkit-animation: fade-down 1.5s infinite linear; 360 | animation: fade-down 1.5s infinite linear; 361 | } 362 | .bx-fade-down-hover:hover 363 | { 364 | -webkit-animation: fade-down 1.5s infinite linear; 365 | animation: fade-down 1.5s infinite linear; 366 | } 367 | .bx-fade-left 368 | { 369 | -webkit-animation: fade-left 1.5s infinite linear; 370 | animation: fade-left 1.5s infinite linear; 371 | } 372 | .bx-fade-left-hover:hover 373 | { 374 | -webkit-animation: fade-left 1.5s infinite linear; 375 | animation: fade-left 1.5s infinite linear; 376 | } 377 | .bx-fade-right 378 | { 379 | -webkit-animation: fade-right 1.5s infinite linear; 380 | animation: fade-right 1.5s infinite linear; 381 | } 382 | .bx-fade-right-hover:hover 383 | { 384 | -webkit-animation: fade-right 1.5s infinite linear; 385 | animation: fade-right 1.5s infinite linear; 386 | } -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/css/transformations.css: -------------------------------------------------------------------------------- 1 | .bx-rotate-90 2 | { 3 | transform: rotate(90deg); 4 | 5 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=1)'; 6 | } 7 | .bx-rotate-180 8 | { 9 | transform: rotate(180deg); 10 | 11 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=2)'; 12 | } 13 | .bx-rotate-270 14 | { 15 | transform: rotate(270deg); 16 | 17 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=3)'; 18 | } 19 | .bx-flip-horizontal 20 | { 21 | transform: scaleX(-1); 22 | 23 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=0, mirror=1)'; 24 | } 25 | .bx-flip-vertical 26 | { 27 | transform: scaleY(-1); 28 | 29 | -ms-filter: 'progid:DXImageTransform.Microsoft.BasicImage(rotation=2, mirror=1)'; 30 | } 31 | -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/fonts/boxicons.eot: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.eot -------------------------------------------------------------------------------- 
/FlaskProject/static/vendor/boxicons/fonts/boxicons.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.ttf -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/fonts/boxicons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff -------------------------------------------------------------------------------- /FlaskProject/static/vendor/boxicons/fonts/boxicons.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/boxicons/fonts/boxicons.woff2 -------------------------------------------------------------------------------- /FlaskProject/static/vendor/counterup/counterup.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | * jquery.counterup.js 2.1.0 3 | * 4 | * Copyright 2013, Benjamin Intal http://gambit.ph @bfintal 5 | * Released under the GPL v2 License 6 | * 7 | * Amended by Jeremy Paris, Ciro Mattia Gonano and others 8 | * 9 | * Date: Feb 24, 2017 10 | */ 11 | (function($){"use strict";$.fn.counterUp=function(options){var settings=$.extend({time:400,delay:10,offset:100,beginAt:0,formatter:false,context:"window",callback:function(){}},options),s;return this.each(function(){var $this=$(this),counter={time:$(this).data("counterup-time")||settings.time,delay:$(this).data("counterup-delay")||settings.delay,offset:$(this).data("counterup-offset")||settings.offset,beginAt:$(this).data("counterup-beginat")||settings.beginAt,context:$(this).data("counterup-context")||settings.context};var counterUpper=function(){var nums=[];var divisions=counter.time/counter.delay;var num=$(this).attr("data-num")?$(this).attr("data-num"):$this.text();var isComma=/[0-9]+,[0-9]+/.test(num);num=num.replace(/,/g,"");var decimalPlaces=(num.split(".")[1]||[]).length;if(counter.beginAt>num)counter.beginAt=num;var isTime=/[0-9]+:[0-9]+:[0-9]+/.test(num);if(isTime){var times=num.split(":"),m=1;s=0;while(times.length>0){s+=m*parseInt(times.pop(),10);m*=60}}for(var i=divisions;i>=counter.beginAt/num*divisions;i--){var newNum=parseFloat(num/divisions*i).toFixed(decimalPlaces);if(isTime){newNum=parseInt(s/divisions*i);var hours=parseInt(newNum/3600)%24;var minutes=parseInt(newNum/60)%60;var seconds=parseInt(newNum%60,10);newNum=(hours<10?"0"+hours:hours)+":"+(minutes<10?"0"+minutes:minutes)+":"+(seconds<10?"0"+seconds:seconds)}if(isComma){while(/(\d+)(\d{3})/.test(newNum.toString())){newNum=newNum.toString().replace(/(\d+)(\d{3})/,"$1"+","+"$2")}}if(settings.formatter){newNum=settings.formatter.call(this,newNum)}nums.unshift(newNum)}$this.data("counterup-nums",nums);$this.text(counter.beginAt);var 
f=function(){if(!$this.data("counterup-nums")){settings.callback.call(this);return}$this.html($this.data("counterup-nums").shift());if($this.data("counterup-nums").length){setTimeout($this.data("counterup-func"),counter.delay)}else{$this.data("counterup-nums",null);$this.data("counterup-func",null);settings.callback.call(this)}};$this.data("counterup-func",f);setTimeout($this.data("counterup-func"),counter.delay)};$this.waypoint(function(direction){counterUpper();this.destroy()},{offset:counter.offset+"%",context:counter.context})})}})(jQuery); 12 | -------------------------------------------------------------------------------- /FlaskProject/static/vendor/icofont/fonts/icofont.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/icofont/fonts/icofont.woff -------------------------------------------------------------------------------- /FlaskProject/static/vendor/icofont/fonts/icofont.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/FlaskProject/static/vendor/icofont/fonts/icofont.woff2 -------------------------------------------------------------------------------- /FlaskProject/static/vendor/jquery.easing/jquery.easing.min.js: -------------------------------------------------------------------------------- 1 | (function(factory){if(typeof define==="function"&&define.amd){define(["jquery"],function($){return factory($)})}else if(typeof module==="object"&&typeof module.exports==="object"){exports=factory(require("jquery"))}else{factory(jQuery)}})(function($){$.easing.jswing=$.easing.swing;var pow=Math.pow,sqrt=Math.sqrt,sin=Math.sin,cos=Math.cos,PI=Math.PI,c1=1.70158,c2=c1*1.525,c3=c1+1,c4=2*PI/3,c5=2*PI/4.5;function bounceOut(x){var n1=7.5625,d1=2.75;if(x<1/d1){return n1*x*x}else if(x<2/d1){return n1*(x-=1.5/d1)*x+.75}else if(x<2.5/d1){return n1*(x-=2.25/d1)*x+.9375}else{return n1*(x-=2.625/d1)*x+.984375}}$.extend($.easing,{def:"easeOutQuad",swing:function(x){return $.easing[$.easing.def](x)},easeInQuad:function(x){return x*x},easeOutQuad:function(x){return 1-(1-x)*(1-x)},easeInOutQuad:function(x){return x<.5?2*x*x:1-pow(-2*x+2,2)/2},easeInCubic:function(x){return x*x*x},easeOutCubic:function(x){return 1-pow(1-x,3)},easeInOutCubic:function(x){return x<.5?4*x*x*x:1-pow(-2*x+2,3)/2},easeInQuart:function(x){return x*x*x*x},easeOutQuart:function(x){return 1-pow(1-x,4)},easeInOutQuart:function(x){return x<.5?8*x*x*x*x:1-pow(-2*x+2,4)/2},easeInQuint:function(x){return x*x*x*x*x},easeOutQuint:function(x){return 1-pow(1-x,5)},easeInOutQuint:function(x){return x<.5?16*x*x*x*x*x:1-pow(-2*x+2,5)/2},easeInSine:function(x){return 1-cos(x*PI/2)},easeOutSine:function(x){return sin(x*PI/2)},easeInOutSine:function(x){return-(cos(PI*x)-1)/2},easeInExpo:function(x){return x===0?0:pow(2,10*x-10)},easeOutExpo:function(x){return x===1?1:1-pow(2,-10*x)},easeInOutExpo:function(x){return x===0?0:x===1?1:x<.5?pow(2,20*x-10)/2:(2-pow(2,-20*x+10))/2},easeInCirc:function(x){return 1-sqrt(1-pow(x,2))},easeOutCirc:function(x){return sqrt(1-pow(x-1,2))},easeInOutCirc:function(x){return x<.5?(1-sqrt(1-pow(2*x,2)))/2:(sqrt(1-pow(-2*x+2,2))+1)/2},easeInElastic:function(x){return x===0?0:x===1?1:-pow(2,10*x-10)*sin((x*10-10.75)*c4)},easeOutElastic:function(x){return 
x===0?0:x===1?1:pow(2,-10*x)*sin((x*10-.75)*c4)+1},easeInOutElastic:function(x){return x===0?0:x===1?1:x<.5?-(pow(2,20*x-10)*sin((20*x-11.125)*c5))/2:pow(2,-20*x+10)*sin((20*x-11.125)*c5)/2+1},easeInBack:function(x){return c3*x*x*x-c1*x*x},easeOutBack:function(x){return 1+c3*pow(x-1,3)+c1*pow(x-1,2)},easeInOutBack:function(x){return x<.5?pow(2*x,2)*((c2+1)*2*x-c2)/2:(pow(2*x-2,2)*((c2+1)*(x*2-2)+c2)+2)/2},easeInBounce:function(x){return 1-bounceOut(1-x)},easeOutBounce:bounceOut,easeInOutBounce:function(x){return x<.5?(1-bounceOut(1-2*x))/2:(1+bounceOut(2*x-1))/2}})}); -------------------------------------------------------------------------------- /FlaskProject/static/vendor/php-email-form/validate.js: -------------------------------------------------------------------------------- 1 | jQuery(document).ready(function($) { 2 | "use strict"; 3 | 4 | //Contact 5 | $('form.php-email-form').submit(function() { 6 | 7 | var f = $(this).find('.form-group'), 8 | ferror = false, 9 | emailExp = /^[^\s()<>@,;:\/]+@\w[\w\.-]+\.[a-z]{2,}$/i; 10 | 11 | f.children('input').each(function() { // run all inputs 12 | 13 | var i = $(this); // current input 14 | var rule = i.attr('data-rule'); 15 | 16 | if (rule !== undefined) { 17 | var ierror = false; // error flag for current input 18 | var pos = rule.indexOf(':', 0); 19 | if (pos >= 0) { 20 | var exp = rule.substr(pos + 1, rule.length); 21 | rule = rule.substr(0, pos); 22 | } else { 23 | rule = rule.substr(pos + 1, rule.length); 24 | } 25 | 26 | switch (rule) { 27 | case 'required': 28 | if (i.val() === '') { 29 | ferror = ierror = true; 30 | } 31 | break; 32 | 33 | case 'minlen': 34 | if (i.val().length < parseInt(exp)) { 35 | ferror = ierror = true; 36 | } 37 | break; 38 | 39 | case 'email': 40 | if (!emailExp.test(i.val())) { 41 | ferror = ierror = true; 42 | } 43 | break; 44 | 45 | case 'checked': 46 | if (! i.is(':checked')) { 47 | ferror = ierror = true; 48 | } 49 | break; 50 | 51 | case 'regexp': 52 | exp = new RegExp(exp); 53 | if (!exp.test(i.val())) { 54 | ferror = ierror = true; 55 | } 56 | break; 57 | } 58 | i.next('.validate').html((ierror ? (i.attr('data-msg') !== undefined ? i.attr('data-msg') : 'wrong Input') : '')).show('blind'); 59 | } 60 | }); 61 | f.children('textarea').each(function() { // run all inputs 62 | 63 | var i = $(this); // current input 64 | var rule = i.attr('data-rule'); 65 | 66 | if (rule !== undefined) { 67 | var ierror = false; // error flag for current input 68 | var pos = rule.indexOf(':', 0); 69 | if (pos >= 0) { 70 | var exp = rule.substr(pos + 1, rule.length); 71 | rule = rule.substr(0, pos); 72 | } else { 73 | rule = rule.substr(pos + 1, rule.length); 74 | } 75 | 76 | switch (rule) { 77 | case 'required': 78 | if (i.val() === '') { 79 | ferror = ierror = true; 80 | } 81 | break; 82 | 83 | case 'minlen': 84 | if (i.val().length < parseInt(exp)) { 85 | ferror = ierror = true; 86 | } 87 | break; 88 | } 89 | i.next('.validate').html((ierror ? (i.attr('data-msg') != undefined ? i.attr('data-msg') : 'wrong Input') : '')).show('blind'); 90 | } 91 | }); 92 | if (ferror) return false; 93 | else var str = $(this).serialize(); 94 | 95 | var this_form = $(this); 96 | var action = $(this).attr('action'); 97 | 98 | if( ! 
action ) { 99 | this_form.find('.loading').slideUp(); 100 | this_form.find('.error-message').slideDown().html('The form action property is not set!'); 101 | return false; 102 | } 103 | 104 | this_form.find('.sent-message').slideUp(); 105 | this_form.find('.error-message').slideUp(); 106 | this_form.find('.loading').slideDown(); 107 | 108 | $.ajax({ 109 | type: "POST", 110 | url: action, 111 | data: str, 112 | success: function(msg) { 113 | if (msg == 'OK') { 114 | this_form.find('.loading').slideUp(); 115 | this_form.find('.sent-message').slideDown(); 116 | this_form.find("input:not(input[type=submit]), textarea").val(''); 117 | } else { 118 | this_form.find('.loading').slideUp(); 119 | this_form.find('.error-message').slideDown().html(msg); 120 | } 121 | } 122 | }); 123 | return false; 124 | }); 125 | 126 | }); 127 | -------------------------------------------------------------------------------- /FlaskProject/static/vendor/waypoints/jquery.waypoints.min.js: -------------------------------------------------------------------------------- 1 | /*! 2 | Waypoints - 4.0.1 3 | Copyright © 2011-2016 Caleb Troughton 4 | Licensed under the MIT license. 5 | https://github.com/imakewebthings/waypoints/blob/master/licenses.txt 6 | */ 7 | !function(){"use strict";function t(o){if(!o)throw new Error("No options passed to Waypoint constructor");if(!o.element)throw new Error("No element option passed to Waypoint constructor");if(!o.handler)throw new Error("No handler option passed to Waypoint constructor");this.key="waypoint-"+e,this.options=t.Adapter.extend({},t.defaults,o),this.element=this.options.element,this.adapter=new t.Adapter(this.element),this.callback=o.handler,this.axis=this.options.horizontal?"horizontal":"vertical",this.enabled=this.options.enabled,this.triggerPoint=null,this.group=t.Group.findOrCreate({name:this.options.group,axis:this.axis}),this.context=t.Context.findOrCreateByElement(this.options.context),t.offsetAliases[this.options.offset]&&(this.options.offset=t.offsetAliases[this.options.offset]),this.group.add(this),this.context.add(this),i[this.key]=this,e+=1}var e=0,i={};t.prototype.queueTrigger=function(t){this.group.queueTrigger(this,t)},t.prototype.trigger=function(t){this.enabled&&this.callback&&this.callback.apply(this,t)},t.prototype.destroy=function(){this.context.remove(this),this.group.remove(this),delete i[this.key]},t.prototype.disable=function(){return this.enabled=!1,this},t.prototype.enable=function(){return this.context.refresh(),this.enabled=!0,this},t.prototype.next=function(){return this.group.next(this)},t.prototype.previous=function(){return this.group.previous(this)},t.invokeAll=function(t){var e=[];for(var o in i)e.push(i[o]);for(var n=0,r=e.length;r>n;n++)e[n][t]()},t.destroyAll=function(){t.invokeAll("destroy")},t.disableAll=function(){t.invokeAll("disable")},t.enableAll=function(){t.Context.refreshAll();for(var e in i)i[e].enabled=!0;return this},t.refreshAll=function(){t.Context.refreshAll()},t.viewportHeight=function(){return window.innerHeight||document.documentElement.clientHeight},t.viewportWidth=function(){return document.documentElement.clientWidth},t.adapters=[],t.defaults={context:window,continuous:!0,enabled:!0,group:"default",horizontal:!1,offset:0},t.offsetAliases={"bottom-in-view":function(){return this.context.innerHeight()-this.adapter.outerHeight()},"right-in-view":function(){return this.context.innerWidth()-this.adapter.outerWidth()}},window.Waypoint=t}(),function(){"use strict";function t(t){window.setTimeout(t,1e3/60)}function 
e(t){this.element=t,this.Adapter=n.Adapter,this.adapter=new this.Adapter(t),this.key="waypoint-context-"+i,this.didScroll=!1,this.didResize=!1,this.oldScroll={x:this.adapter.scrollLeft(),y:this.adapter.scrollTop()},this.waypoints={vertical:{},horizontal:{}},t.waypointContextKey=this.key,o[t.waypointContextKey]=this,i+=1,n.windowContext||(n.windowContext=!0,n.windowContext=new e(window)),this.createThrottledScrollHandler(),this.createThrottledResizeHandler()}var i=0,o={},n=window.Waypoint,r=window.onload;e.prototype.add=function(t){var e=t.options.horizontal?"horizontal":"vertical";this.waypoints[e][t.key]=t,this.refresh()},e.prototype.checkEmpty=function(){var t=this.Adapter.isEmptyObject(this.waypoints.horizontal),e=this.Adapter.isEmptyObject(this.waypoints.vertical),i=this.element==this.element.window;t&&e&&!i&&(this.adapter.off(".waypoints"),delete o[this.key])},e.prototype.createThrottledResizeHandler=function(){function t(){e.handleResize(),e.didResize=!1}var e=this;this.adapter.on("resize.waypoints",function(){e.didResize||(e.didResize=!0,n.requestAnimationFrame(t))})},e.prototype.createThrottledScrollHandler=function(){function t(){e.handleScroll(),e.didScroll=!1}var e=this;this.adapter.on("scroll.waypoints",function(){(!e.didScroll||n.isTouch)&&(e.didScroll=!0,n.requestAnimationFrame(t))})},e.prototype.handleResize=function(){n.Context.refreshAll()},e.prototype.handleScroll=function(){var t={},e={horizontal:{newScroll:this.adapter.scrollLeft(),oldScroll:this.oldScroll.x,forward:"right",backward:"left"},vertical:{newScroll:this.adapter.scrollTop(),oldScroll:this.oldScroll.y,forward:"down",backward:"up"}};for(var i in e){var o=e[i],n=o.newScroll>o.oldScroll,r=n?o.forward:o.backward;for(var s in this.waypoints[i]){var a=this.waypoints[i][s];if(null!==a.triggerPoint){var l=o.oldScroll=a.triggerPoint,p=l&&h,u=!l&&!h;(p||u)&&(a.queueTrigger(r),t[a.group.id]=a.group)}}}for(var c in t)t[c].flushTriggers();this.oldScroll={x:e.horizontal.newScroll,y:e.vertical.newScroll}},e.prototype.innerHeight=function(){return this.element==this.element.window?n.viewportHeight():this.adapter.innerHeight()},e.prototype.remove=function(t){delete this.waypoints[t.axis][t.key],this.checkEmpty()},e.prototype.innerWidth=function(){return this.element==this.element.window?n.viewportWidth():this.adapter.innerWidth()},e.prototype.destroy=function(){var t=[];for(var e in this.waypoints)for(var i in this.waypoints[e])t.push(this.waypoints[e][i]);for(var o=0,n=t.length;n>o;o++)t[o].destroy()},e.prototype.refresh=function(){var t,e=this.element==this.element.window,i=e?void 0:this.adapter.offset(),o={};this.handleScroll(),t={horizontal:{contextOffset:e?0:i.left,contextScroll:e?0:this.oldScroll.x,contextDimension:this.innerWidth(),oldScroll:this.oldScroll.x,forward:"right",backward:"left",offsetProp:"left"},vertical:{contextOffset:e?0:i.top,contextScroll:e?0:this.oldScroll.y,contextDimension:this.innerHeight(),oldScroll:this.oldScroll.y,forward:"down",backward:"up",offsetProp:"top"}};for(var r in t){var s=t[r];for(var a in this.waypoints[r]){var l,h,p,u,c,d=this.waypoints[r][a],f=d.options.offset,w=d.triggerPoint,y=0,g=null==w;d.element!==d.element.window&&(y=d.adapter.offset()[s.offsetProp]),"function"==typeof f?f=f.apply(d):"string"==typeof 
f&&(f=parseFloat(f),d.options.offset.indexOf("%")>-1&&(f=Math.ceil(s.contextDimension*f/100))),l=s.contextScroll-s.contextOffset,d.triggerPoint=Math.floor(y+l-f),h=w=s.oldScroll,u=h&&p,c=!h&&!p,!g&&u?(d.queueTrigger(s.backward),o[d.group.id]=d.group):!g&&c?(d.queueTrigger(s.forward),o[d.group.id]=d.group):g&&s.oldScroll>=d.triggerPoint&&(d.queueTrigger(s.forward),o[d.group.id]=d.group)}}return n.requestAnimationFrame(function(){for(var t in o)o[t].flushTriggers()}),this},e.findOrCreateByElement=function(t){return e.findByElement(t)||new e(t)},e.refreshAll=function(){for(var t in o)o[t].refresh()},e.findByElement=function(t){return o[t.waypointContextKey]},window.onload=function(){r&&r(),e.refreshAll()},n.requestAnimationFrame=function(e){var i=window.requestAnimationFrame||window.mozRequestAnimationFrame||window.webkitRequestAnimationFrame||t;i.call(window,e)},n.Context=e}(),function(){"use strict";function t(t,e){return t.triggerPoint-e.triggerPoint}function e(t,e){return e.triggerPoint-t.triggerPoint}function i(t){this.name=t.name,this.axis=t.axis,this.id=this.name+"-"+this.axis,this.waypoints=[],this.clearTriggerQueues(),o[this.axis][this.name]=this}var o={vertical:{},horizontal:{}},n=window.Waypoint;i.prototype.add=function(t){this.waypoints.push(t)},i.prototype.clearTriggerQueues=function(){this.triggerQueues={up:[],down:[],left:[],right:[]}},i.prototype.flushTriggers=function(){for(var i in this.triggerQueues){var o=this.triggerQueues[i],n="up"===i||"left"===i;o.sort(n?e:t);for(var r=0,s=o.length;s>r;r+=1){var a=o[r];(a.options.continuous||r===o.length-1)&&a.trigger([i])}}this.clearTriggerQueues()},i.prototype.next=function(e){this.waypoints.sort(t);var i=n.Adapter.inArray(e,this.waypoints),o=i===this.waypoints.length-1;return o?null:this.waypoints[i+1]},i.prototype.previous=function(e){this.waypoints.sort(t);var i=n.Adapter.inArray(e,this.waypoints);return i?this.waypoints[i-1]:null},i.prototype.queueTrigger=function(t,e){this.triggerQueues[e].push(t)},i.prototype.remove=function(t){var e=n.Adapter.inArray(t,this.waypoints);e>-1&&this.waypoints.splice(e,1)},i.prototype.first=function(){return this.waypoints[0]},i.prototype.last=function(){return this.waypoints[this.waypoints.length-1]},i.findOrCreate=function(t){return o[t.axis][t.name]||new i(t)},n.Group=i}(),function(){"use strict";function t(t){this.$element=e(t)}var e=window.jQuery,i=window.Waypoint;e.each(["innerHeight","innerWidth","off","offset","on","outerHeight","outerWidth","scrollLeft","scrollTop"],function(e,i){t.prototype[i]=function(){var t=Array.prototype.slice.call(arguments);return this.$element[i].apply(this.$element,t)}}),e.each(["extend","inArray","isEmptyObject"],function(i,o){t[o]=e[o]}),i.adapters.push({name:"jquery",Adapter:t}),i.Adapter=t}(),function(){"use strict";function t(t){return function(){var i=[],o=arguments[0];return t.isFunction(arguments[0])&&(o=t.extend({},arguments[1]),o.handler=arguments[0]),this.each(function(){var n=t.extend({},o,{element:this});"string"==typeof n.context&&(n.context=t(this).closest(n.context)[0]),i.push(new e(n))}),i}}var e=window.Waypoint;window.jQuery&&(window.jQuery.fn.waypoint=t(window.jQuery)),window.Zepto&&(window.Zepto.fn.waypoint=t(window.Zepto))}(); -------------------------------------------------------------------------------- /FlaskProject/templates/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 豆瓣Top250数据分析 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 
25 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 61 | 62 | 63 |
    64 |
    65 | 66 |
    67 |

    Douban Top250 Data Analysis

    68 |

    Built with Python, Scrapy, Scrapy-Redis, MongoDB, Flask, ECharts, WordCloud and related technologies

    69 |
    70 | 71 | 72 |
    73 | 120 |
    121 | 122 |
    123 |
    124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /FlaskProject/templates/movie.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 豆瓣Top250数据分析 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 61 | 62 | 63 |
    64 |
    65 | 66 |
    67 |

    豆瓣Top250电影排行

    68 |
    69 | 70 | 71 |
    72 |
    73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | {% for movie in movies %} 86 | 87 | 88 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | {% endfor %} 101 |
    排名电影名称评分评价人数导演精彩短评上映时间(/年)
    {{movie['rank']}} 89 | 90 | {{ movie['title'] }} 91 | 92 | {{movie['score']}}{{movie['comment_num']}}{{movie['directed_by']}}{{movie['comment']}}{{movie['year']}}
    102 | 103 | 104 | 105 |
    106 |
    107 | 108 |
    109 |
    110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | -------------------------------------------------------------------------------- /FlaskProject/templates/score.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 豆瓣Top250评分分布图 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 62 | 63 | 64 |
    65 |
    66 | 67 |
    68 |

    豆瓣Top250数据分析

    69 |
    70 | 71 | 72 |
    73 |
    74 | 75 |
    76 | 77 | 78 | 79 |
    80 |
    81 | 82 |
    83 |
    84 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | -------------------------------------------------------------------------------- /FlaskProject/templates/word.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 豆瓣Top250数据分析 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 63 | 64 | 65 |
    66 |
    67 | 68 |
    69 |
    70 | 71 | 72 |
    73 | 74 |
    75 | 76 |
    77 |

    词频统计

    78 |

    根据250部电影提取出的词云块增强人们对经典电影的领悟

    79 |
    80 | 81 |
    82 |
    83 |

    关于250部电影

    84 |

    从电影中看百味人生

    85 |
    86 | 87 | 88 | 89 |
    90 |
    91 | 92 |
    93 |
    94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /FlaskProject/wordCloud.py: -------------------------------------------------------------------------------- 1 | from wordcloud import WordCloud #词云 2 | import jieba #分词 3 | from matplotlib import pyplot as plt #绘图 数据可视化 4 | from PIL import Image #图片处理 5 | import numpy as np #矩阵运算 6 | import pymongo #数据库 7 | 8 | client = pymongo.MongoClient(host='localhost', port=27017) 9 | db = client.movies 10 | collection = db.douban 11 | query = {} 12 | projection = {} 13 | 14 | projection["title"] = u"$title" 15 | projection["movie_type"] = u"$movie_type" 16 | projection["directedBy"] = u"$directedBy" 17 | projection["_id"] = 0 18 | 19 | cursor = collection.find(query, projection = projection) 20 | text = "" 21 | for doc in cursor: 22 | for content in doc.values(): 23 | content.replace('/',' ') 24 | text = text + content 25 | 26 | cut = jieba.cut(text) 27 | string = ' '.join(cut) 28 | print(len(string)) 29 | 30 | img = Image.open(r'./static/img/tree.jpg') 31 | img_array = np.array(img) #将图片转换为数组 32 | wc = WordCloud( 33 | background_color='white', 34 | mask=img_array, 35 | font_path="msyh.ttc" #字体所在位置C:\Windows\Fonts 36 | ) 37 | wc.generate_from_text(string) 38 | 39 | #绘制图片 40 | 41 | fig = plt.figure(1) 42 | plt.imshow(wc) 43 | plt.axis('off') #是否显示坐标轴 44 | 45 | # plt.show() #显示生成的词云图片 46 | plt.savefig('./static/img/generated_tree.jpg',dpi=500) 47 | -------------------------------------------------------------------------------- /Master/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /Master/.idea/Master.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Master/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /Master/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Master/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Master/main.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import pymongo 3 | import json 4 | 5 | def main(): 6 | r = redis.Redis(host='localhost',port=6379,db=0) 7 | client = pymongo.MongoClient(host='localhost', port=27017) 8 | db = client.movies 9 | collection = db.douban 10 | while True: 11 | source, data = r.blpop(["douban_redis:items"]) 12 | item = json.loads(data) 13 | print(item) 14 | collection.replace_one(filter={"rank":item["rank"]},replacement=item,upsert=True) 15 | 16 | if __name__ == '__main__': 17 | main() 
-------------------------------------------------------------------------------- /Pic/index.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/index.jpg -------------------------------------------------------------------------------- /Pic/mongoDB_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/mongoDB_data.jpg -------------------------------------------------------------------------------- /Pic/movies.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/movies.jpg -------------------------------------------------------------------------------- /Pic/proxy.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/proxy.jpg -------------------------------------------------------------------------------- /Pic/redis_data.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/redis_data.jpg -------------------------------------------------------------------------------- /Pic/score.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/score.jpg -------------------------------------------------------------------------------- /Pic/slave.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/slave.jpg -------------------------------------------------------------------------------- /Pic/words.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Pic/words.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 1. 环境配置 2 | ## 1.1 爬虫部分软件包版本 3 | - Python 3.8.13 4 | - Scrapy 2.6.2 5 | - Scrapy-redis 0.7.3 6 | - pymongo 4.2.0 7 | - redis 4.3.4 8 | ## 1.2 数据库 9 | - MongoDB 10 | - Redis 11 | ## 1.3 前后端交互 12 | ### 前端 13 | - jinja2 14 | - Echarts 15 | ### 后端 16 | - flask 2.2.2 17 | ## 1.4 IP代理池 18 |   参考:[https://github.com/jhao104/proxy_pool](https://github.com/jhao104/proxy_pool)进行配置。 19 | # 2. 项目文件目录 20 | **---FlaskProject(数据可视化代码)**
21 | ------static (static assets used by the front end)
22 | ------templates (front-end page templates)
23 | ------app.py (back-end code)
24 | ------data.txt (sample of the crawled data)
25 | ------wordCloud.py (word-cloud generation code)
26 | **---Master (master-node code)**
27 | ------main.py (moves the data out of Redis and into MongoDB)
28 | **---Pic (screenshots of the running project)**
29 | **---proxy_pool (IP proxy pool)**
30 | **---Slave (slave-node code)**
31 | ------movies
32 | ---------spiders
33 | ------------douban_redis.py (main crawling code)
34 | ---------middlewares.py (middlewares: IP proxying, random User-Agent, etc.)
35 | ---------settings.py (crawler configuration)
36 | # 3. Project setup 37 |   The project follows a distributed design and is split into **master-node code** and **slave-node code**. 38 | ## 3.1 Slave 39 |   A slave runs the crawler: it scrapes data from the site and stores it in the master's Redis database. Because Redis keeps track of which URLs have already been crawled, the crawler can be paused and resumed, any number of slaves can be used, and every slave runs exactly the same code.
40 |   For testing, you can run virtual machines on a single computer to get the effect of a distributed setup. [CentOS 7](http://isoredirect.centos.org/centos/7/isos/x86_64/) is recommended for the virtual machines because it is lightweight and uses few resources. Each slave must be pointed at the master by setting the master's IP and port in its `settings.py`.
41 |   Once a slave's environment is configured, cd into the `spiders` folder and run `scrapy runspider douban_redis.py` to start the slave; it then waits for the master to hand out the start URL.
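The scrapy-redis settings that make this distributed setup work live in `Slave/movies/settings.py` (the full file appears later in this dump). As a minimal sketch of the relevant lines — the address `192.168.108.1` is just this project's example LAN address and must be replaced with your own master's Redis host and port:

```python
# Slave/movies/settings.py -- scrapy-redis essentials (sketch; values must match your master)
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"  # de-duplicate requests through Redis
SCHEDULER = "scrapy_redis.scheduler.Scheduler"              # shared request queue kept in Redis
SCHEDULER_PERSIST = True                                    # keep the queue so a crawl can resume

ITEM_PIPELINES = {
    "scrapy_redis.pipelines.RedisPipeline": 300,            # push scraped items back into Redis
}

REDIS_HOST = "192.168.108.1"  # IP of the master machine running Redis
REDIS_PORT = "6379"           # Redis port on the master
```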
42 | ## 3.2 Master 43 |   The master maintains the Redis database and persists the items collected in Redis into MongoDB.
44 |   After starting the Redis service, run `lpush douban:start_urls https://movie.douban.com/top250` in `redis-cli.exe` to insert the start URL into Redis; once it is in place, the slaves start crawling automatically.
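If you would rather seed the queue from Python than from `redis-cli.exe`, the same thing can be done with the `redis` package the master already depends on. A small sketch, assuming Redis is running locally on the default port:

```python
import redis  # redis 4.3.4 is listed in the environment section above

# Connect to the master's Redis instance (same connection Master/main.py uses)
r = redis.Redis(host="localhost", port=6379, db=0)

# Push the start URL onto the list the slaves listen on (redis_key = "douban:start_urls")
r.lpush("douban:start_urls", "https://movie.douban.com/top250")
```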
45 |   Running the `proxy_pool` code on the master fetches free proxy IPs and stores them in the `redis` database.
46 |   The master's main.py implements the data-transfer step: it takes the scraped items out of Redis and writes them into MongoDB. 47 | ## 3.3 Visualization 48 |   After installing Flask, open the `FlaskProject` folder on the master and run `app.py` to start the back-end service. Once it is running, visit the default local address and port, `http://127.0.0.1:5000/`, in a browser to see the visualizations.
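Before starting the Flask service you can check that the transfer worked by querying MongoDB directly. A small verification sketch, assuming the default local MongoDB instance and the `movies.douban` collection used throughout this project (note that the spider stores `rank` as a string):

```python
import pymongo  # pymongo 4.2.0 is listed in the environment section above

client = pymongo.MongoClient(host="localhost", port=27017)
collection = client.movies.douban  # database "movies", collection "douban"

# How many of the Top250 movies have been transferred so far
print("movies stored:", collection.count_documents({}))

# Peek at the top-ranked entry; fields such as rank/title/score are set by the spider
print(collection.find_one({"rank": "1"}, {"_id": 0, "title": 1, "score": 1}))
```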
49 | ## 3.4 IP proxy pool 50 |   Based on: [https://github.com/jhao104/proxy_pool](https://github.com/jhao104/proxy_pool)
51 |   After setting up the environment as described in that project, run the following commands to start the proxy-pool program.
    52 | ``` 53 | # 启动调度程序 54 | python proxyPool.py schedule 55 | 56 | # 启动webApi服务 57 | python proxyPool.py server 58 | ``` 59 | # 4. 运行截图 60 | ## 4.1 从机运行 61 | ### 从机爬虫程序 62 | ![从机爬虫程序](https://github.com/CoderDon/Crawler/raw/main/Pic/slave.jpg) 63 | ## 4.2 主机数据库 64 | ### Redis数据库缓存URL 65 | ![Redis数据库缓存URL](https://github.com/CoderDon/Crawler/raw/main/Pic/redis_data.jpg) 66 | ### Redis缓存代理IPs 67 | ![Redis缓存代理IPs](https://github.com/CoderDon/Crawler/raw/main/Pic/proxy.jpg) 68 | ### MongoDB数据库 69 | ![MongoDB数据库](https://github.com/CoderDon/Crawler/raw/main/Pic/mongoDB_data.jpg) 70 | ## 4.3 可视化 71 | ### 首页 72 | ![首页](https://github.com/CoderDon/Crawler/raw/main/Pic/index.jpg) 73 | ### 电影 74 | ![电影](https://github.com/CoderDon/Crawler/raw/main/Pic/movies.jpg) 75 | ### 评分 76 | ![评分](https://github.com/CoderDon/Crawler/raw/main/Pic/score.jpg) 77 | ### 词云 78 | ![词云](https://github.com/CoderDon/Crawler/raw/main/Pic/words.jpg) -------------------------------------------------------------------------------- /Slave/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Editor-based HTTP Client requests 5 | /httpRequests/ 6 | # Datasource local storage ignored files 7 | /dataSources/ 8 | /dataSources.local.xml 9 | -------------------------------------------------------------------------------- /Slave/.idea/Slave.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Slave/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /Slave/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Slave/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Slave/movies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__init__.py -------------------------------------------------------------------------------- /Slave/movies/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/__pycache__/middlewares.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/middlewares.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/__pycache__/pipelines.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/pipelines.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/__pycache__/settings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/__pycache__/settings.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class MoviesItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | pass 12 | # page_url = scrapy.Field() 13 | # title = scrapy.Field() 14 | # year = scrapy.Field() 15 | # score = scrapy.Field() 16 | # directedBy = scrapy.Field() 17 | # actors = scrapy.Field() 18 | # movie_type = scrapy.Field() 19 | # comment = scrapy.Field() 20 | # introduc = scrapy.Field() 21 | # image_urls = scrapy.Field() 22 | # image_name = scrapy.Field() 23 | -------------------------------------------------------------------------------- /Slave/movies/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | from fake_useragent import UserAgent 8 | import redis 9 | import random 10 | from scrapy.exceptions import NotConfigured 11 | from twisted.internet.error import ConnectError, TimeoutError 12 | import json 13 | # useful for handling different item types with a single interface 14 | from itemadapter import is_item, ItemAdapter 15 | 16 | 17 | class RandomProxyMiddleWare(object): 18 | def __init__(self, settings): 19 | # 2.初始化配置及相关变量 20 | self.r = redis.Redis(host='localhost', port=6379, db=0) 21 | self.proxy_key = settings.get('PROXY_REDIS_KEY') 22 | self.max_failed = 1 23 | 24 | @property 25 | def proxies(self): 26 | # return [i.decode('utf-8') for i in self.r.hkeys('use_proxy')] 27 | # return [i.decode('utf-8') for i in self.r.hkeys('use_proxy') 28 | # if json.loads(self.r.hget('use_proxy', i.decode('utf-8')).decode('utf-8'))['https'] == True] 29 | return [] 30 | 31 | @classmethod 32 | def from_crawler(cls, crawler): 33 | # 1. 创建中间件对象 34 | # 默认代理是启用的 35 | if not crawler.settings.getbool('HTTPPROXY_ENABLED'): 36 | raise NotConfigured 37 | return cls(crawler.settings) 38 | 39 | def process_request(self, request, spider): 40 | # 3. 为每个request对象分配随机的ip代理 41 | if self.proxies and not request.meta.get('proxy'): 42 | proxies_list = self.proxies 43 | if proxies_list: 44 | request.meta['proxy'] = 'https://' + random.choice(proxies_list) 45 | 46 | def process_response(self, request, response, spider): 47 | # 4. 
请求成功 48 | # 如果proxy为空则直接返回 49 | if not request.meta.get('proxy'): 50 | return response 51 | cur_proxy = request.meta.get('proxy').replace('https://', '') 52 | # 判断ip是否被对方封禁 53 | if response.status in (400, 401, 403): 54 | # 先拿到当前ip:port对应的value 55 | value = json.loads(self.r.hget(self.proxy_key, cur_proxy).decode('utf-8')) 56 | value['fail_count'] += 1 57 | self.r.hset(self.proxy_key, cur_proxy, 58 | str(value).replace("'", '"').replace('False', 'false').replace('True', 'true')) 59 | # 当某个IP的失败次数累积到一定的数量 60 | filed_times = json.loads(self.r.hget(self.proxy_key, cur_proxy).decode('utf-8'))['fail_count'] or 0 61 | if int(filed_times) >= self.max_failed: 62 | print('got wrong http code (%s) when use %s' % (response.status, cur_proxy)) 63 | # 可以认为该IP被对方封禁。从代理池中将该IP删除 64 | self.remove_proxy(cur_proxy) 65 | del request.meta['proxy'] 66 | # 返回request 将该请求重新->调度器 67 | return request 68 | return response 69 | 70 | def process_exception(self, request, exception, spider): 71 | # 4.1 请求失败 72 | cur_proxy = request.meta.get('proxy') 73 | # 请求使用代理,并且网络请求报错,认为该IP出错,删除,并重新->调度器 74 | if cur_proxy and isinstance(cur_proxy, (ConnectError, TimeoutError)): 75 | print('error (%s) occur when use proxy %s' % (exception, cur_proxy)) 76 | self.remove_proxy(cur_proxy) 77 | del request.meta['proxy'] 78 | return request 79 | 80 | def remove_proxy(self, proxy): 81 | if proxy in self.proxies: 82 | self.r.hdel(self.proxy_key, proxy) 83 | 84 | 85 | class UserAgentMiddleware(object): 86 | def process_request(self, request, spider): 87 | request.headers.setdefault(b'User-Agent', UserAgent().random) 88 | 89 | class MoviesSpiderMiddleware: 90 | # Not all methods need to be defined. If a method is not defined, 91 | # scrapy acts as if the spider middleware does not modify the 92 | # passed objects. 93 | 94 | @classmethod 95 | def from_crawler(cls, crawler): 96 | # This method is used by Scrapy to create your spiders. 97 | s = cls() 98 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 99 | return s 100 | 101 | def process_spider_input(self, response, spider): 102 | # Called for each response that goes through the spider 103 | # middleware and into the spider. 104 | 105 | # Should return None or raise an exception. 106 | return None 107 | 108 | def process_spider_output(self, response, result, spider): 109 | # Called with the results returned from the Spider, after 110 | # it has processed the response. 111 | 112 | # Must return an iterable of Request, or item objects. 113 | for i in result: 114 | yield i 115 | 116 | def process_spider_exception(self, response, exception, spider): 117 | # Called when a spider or process_spider_input() method 118 | # (from other spider middleware) raises an exception. 119 | 120 | # Should return either None or an iterable of Request or item objects. 121 | pass 122 | 123 | def process_start_requests(self, start_requests, spider): 124 | # Called with the start requests of the spider, and works 125 | # similarly to the process_spider_output() method, except 126 | # that it doesn’t have a response associated. 127 | 128 | # Must return only requests (not items). 129 | for r in start_requests: 130 | yield r 131 | 132 | def spider_opened(self, spider): 133 | spider.logger.info('Spider opened: %s' % spider.name) 134 | 135 | 136 | class MoviesDownloaderMiddleware: 137 | # Not all methods need to be defined. If a method is not defined, 138 | # scrapy acts as if the downloader middleware does not modify the 139 | # passed objects. 
140 | 141 | @classmethod 142 | def from_crawler(cls, crawler): 143 | # This method is used by Scrapy to create your spiders. 144 | s = cls() 145 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 146 | return s 147 | 148 | def process_request(self, request, spider): 149 | # Called for each request that goes through the downloader 150 | # middleware. 151 | 152 | # Must either: 153 | # - return None: continue processing this request 154 | # - or return a Response object 155 | # - or return a Request object 156 | # - or raise IgnoreRequest: process_exception() methods of 157 | # installed downloader middleware will be called 158 | 159 | # 在请求页面时伪装成站内请求,用以反 反爬虫 160 | referer = request.url 161 | if referer: 162 | request.headers['referer'] = referer 163 | 164 | return None 165 | 166 | def process_response(self, request, response, spider): 167 | # Called with the response returned from the downloader. 168 | 169 | # Must either; 170 | # - return a Response object 171 | # - return a Request object 172 | # - or raise IgnoreRequest 173 | return response 174 | 175 | def process_exception(self, request, exception, spider): 176 | # Called when a download handler or a process_request() 177 | # (from other downloader middleware) raises an exception. 178 | 179 | # Must either: 180 | # - return None: continue processing this exception 181 | # - return a Response object: stops process_exception() chain 182 | # - return a Request object: stops process_exception() chain 183 | pass 184 | 185 | def spider_opened(self, spider): 186 | spider.logger.info('Spider opened: %s' % spider.name) 187 | -------------------------------------------------------------------------------- /Slave/movies/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | import pymongo 10 | from pymysql import connect 11 | from scrapy.pipelines.images import ImagesPipeline 12 | import scrapy 13 | from scrapy.exceptions import DropItem 14 | 15 | class MongoMoviesPipeline: 16 | def open_spider(self,spider): 17 | self.client = pymongo.MongoClient() 18 | 19 | def process_item(self, item, spider): 20 | # self.client.movies.douban.replace_one(filter={"page_url":item["page_url"]},replacement=item,upsert=True) 21 | return item 22 | 23 | def close_spider(self,spider): 24 | self.client.close() 25 | 26 | class ImagePipeline(ImagesPipeline): 27 | def get_media_requests(self, item, info): 28 | yield scrapy.Request(item['image_urls'], meta={"image_name": item['image_name']}) 29 | 30 | def file_path(self, request, response=None, info=None, *, item=None): 31 | file_name = request.meta['image_name'] + ".jpg" 32 | return file_name 33 | 34 | def item_completed(self, results, item, info): 35 | image_paths = [x['path'] for ok, x in results if ok] 36 | if not image_paths: 37 | raise DropItem("Item contains no images") 38 | return item 39 | 40 | # class MysqlMoviesPipeline: 41 | # def open_spider(self,spider): 42 | # self.client = connect(host='localhost',port='3306',user='root',password='123456',db='movies',charset='utf8') 43 | # self.cursor = self.client.cursor() 44 | # 45 | # def process_item(self, item, spider): 46 | # self.client.movies.douban.insert_one(item) 47 | # return item 48 | # 49 | # def 
close_spider(self,spider): 50 | # self.cursor.close() 51 | # self.client.close() 52 | -------------------------------------------------------------------------------- /Slave/movies/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for movies project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'movies' 11 | 12 | SPIDER_MODULES = ['movies.spiders'] 13 | NEWSPIDER_MODULE = 'movies.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | LOG_LEVEL = 'DEBUG' 22 | 23 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter" 24 | SCHEDULER = "scrapy_redis.scheduler.Scheduler" 25 | SCHEDULER_PERSIST = True 26 | 27 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderPriorityQueue" 28 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue" 29 | # SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderStack" 30 | 31 | ITEM_PIPELINES = { 32 | 'scrapy_redis.pipelines.RedisPipeline': 300, 33 | } 34 | REDIS_HOST = '192.168.108.1' 35 | REDIS_PORT = '6379' 36 | 37 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 38 | #CONCURRENT_REQUESTS = 32 39 | 40 | # Configure a delay for requests for the same website (default: 0) 41 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 42 | # See also autothrottle settings and docs 43 | DOWNLOAD_DELAY = 5 44 | # The download delay setting will honor only one of: 45 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 46 | #CONCURRENT_REQUESTS_PER_IP = 16 47 | 48 | # Disable cookies (enabled by default) 49 | #COOKIES_ENABLED = False 50 | 51 | # Disable Telnet Console (enabled by default) 52 | #TELNETCONSOLE_ENABLED = False 53 | 54 | # Override the default request headers: 55 | #DEFAULT_REQUEST_HEADERS = { 56 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 57 | # 'Accept-Language': 'en', 58 | #} 59 | 60 | # Enable or disable spider middlewares 61 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 62 | #SPIDER_MIDDLEWARES = { 63 | # 'movies.middlewares.MoviesSpiderMiddleware': 543, 64 | #} 65 | 66 | # Enable or disable downloader middlewares 67 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 68 | DOWNLOADER_MIDDLEWARES = { 69 | 'movies.middlewares.RandomProxyMiddleWare': 241, 70 | 'movies.middlewares.UserAgentMiddleware': 242, 71 | 'movies.middlewares.MoviesDownloaderMiddleware': 243, 72 | } 73 | 74 | PROXY_REDIS_KEY = 'use_proxy' 75 | HTTPPROXY_ENABLED = True 76 | 77 | # Enable or disable extensions 78 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 79 | #EXTENSIONS = { 80 | # 'scrapy.extensions.telnet.TelnetConsole': None, 81 | #} 82 | 83 | # Configure item pipelines 84 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 85 | 86 | # ITEM_PIPELINES = { 87 | # 'movies.pipelines.MongoMoviesPipeline': 300, 88 | # # 'movies.pipelines.ImagePipeline': 301, 89 | # } 90 | # 
IMAGES_STORE ='../images/' 91 | # IMAGES_URLS_FIELD = 'image_urls' #对应item里面设定的字段,取到图片的url 92 | 93 | # Enable and configure the AutoThrottle extension (disabled by default) 94 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 95 | #AUTOTHROTTLE_ENABLED = True 96 | # The initial download delay 97 | #AUTOTHROTTLE_START_DELAY = 5 98 | # The maximum download delay to be set in case of high latencies 99 | #AUTOTHROTTLE_MAX_DELAY = 60 100 | # The average number of requests Scrapy should be sending in parallel to 101 | # each remote server 102 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 103 | # Enable showing throttling stats for every response received: 104 | #AUTOTHROTTLE_DEBUG = False 105 | 106 | # Enable and configure HTTP caching (disabled by default) 107 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 108 | #HTTPCACHE_ENABLED = True 109 | #HTTPCACHE_EXPIRATION_SECS = 0 110 | #HTTPCACHE_DIR = 'httpcache' 111 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 112 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 113 | -------------------------------------------------------------------------------- /Slave/movies/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Slave/movies/spiders/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/spiders/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/spiders/__pycache__/douban_redis.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/Slave/movies/spiders/__pycache__/douban_redis.cpython-38.pyc -------------------------------------------------------------------------------- /Slave/movies/spiders/douban_redis.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scrapy_redis.spiders import RedisCrawlSpider 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import Rule 5 | import re 6 | 7 | class DoubanSpider(RedisCrawlSpider): 8 | name = 'douban_redis' 9 | allowed_domains = ['douban.com'] 10 | # start_urls = ['https://movie.douban.com/top250?start=0&filter='] 11 | # start_urls = ['https://movie.douban.com/top250?start={}&filter='.format(num) for num in range(0, 226, 25)] 12 | redis_key = 'douban:start_urls' 13 | 14 | rules = ( 15 | Rule(LinkExtractor(restrict_xpaths=r'//div[@class="hd"]/a'), callback='parse_info'), 16 | Rule(LinkExtractor(restrict_xpaths=r'//div[@class="paginator"]/a'), follow=True), 17 | ) 18 | 19 | def parse_info(self, response): 20 | page_url = response.url 21 | title = response.xpath("//h1/span[@property='v:itemreviewed']/text()").extract_first() 22 | year = response.xpath("//h1/span[@class='year']/text()").extract_first() 23 | score = response.xpath("//strong[@class='ll rating_num']/text()").extract_first() 24 | directedBy = 
response.xpath("//span[@class='attrs']/a[@rel='v:directedBy']/text()").extract_first() 25 | actors = response.xpath("string(//span[@class='actor']/span[@class='attrs']/span)").extract_first() 26 | if actors == '': 27 | actors = response.xpath("string(//span[@class='actor']/span[@class='attrs'])").extract_first() 28 | movie_type = '/'.join(response.xpath("//span[@property='v:genre']/text()").extract()) 29 | rank = re.findall(r"\d+",response.xpath("//span[@class='top250-no']/text()").extract_first())[0] 30 | comment_num = response.xpath("//span[@property='v:votes']/text()").extract_first() 31 | comments = response.xpath("//p/span[@class='short']/text()").extract() 32 | comment = '' 33 | # 任意选一条长度小于100的短评 34 | for cmt in comments: 35 | if len(cmt) < 100: 36 | comment = cmt 37 | # 没有长度小于100的短评 读取长文 38 | if comment == '': 39 | comments = response.xpath("//p/span[@class='full']/text()").extract_first() 40 | introduc = response.xpath("string(//div[@class='indent']/span[@class='all hidden'])").extract_first() 41 | if introduc == '': 42 | introduc = response.xpath("string(//div[@class='indent']/span[@property='v:summary'])").extract_first() 43 | image_url = response.xpath("//img[@title='点击看更多海报']/@src").extract_first() 44 | image_name = page_url.split('/')[-2] 45 | print(title) 46 | yield { 47 | "page_url":page_url, 48 | "title":title, 49 | "year":year, 50 | "score":score, 51 | "directedBy":directedBy, 52 | "actors":actors, 53 | "movie_type":movie_type, 54 | "rank":rank, 55 | "comment":comment, 56 | "comment_num":comment_num, 57 | "introduc":introduc, 58 | "image_urls": image_url, 59 | "image_name": image_name 60 | } 61 | 62 | -------------------------------------------------------------------------------- /Slave/movies/start.py: -------------------------------------------------------------------------------- 1 | from scrapy.cmdline import execute 2 | 3 | # execute('scrapy crawl douban'.split()) 4 | execute('scrapy crawl douban_redis'.split()) 5 | -------------------------------------------------------------------------------- /Slave/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = movies.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = movies 12 | -------------------------------------------------------------------------------- /proxy_pool/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /proxy_pool/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /proxy_pool/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /proxy_pool/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
-------------------------------------------------------------------------------- /proxy_pool/.idea/proxy_pool.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | 13 | 15 | -------------------------------------------------------------------------------- /proxy_pool/__pycache__/setting.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/__pycache__/setting.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/api/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | -------------------------------------------------------------------------------- /proxy_pool/api/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/api/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/api/__pycache__/proxyApi.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/api/__pycache__/proxyApi.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/api/proxyApi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ProxyApi.py 6 | Description : WebApi 7 | Author : JHao 8 | date: 2016/12/4 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/04: WebApi 12 | 2019/08/14: 集成Gunicorn启动方式 13 | 2020/06/23: 新增pop接口 14 | 2022/07/21: 更新count接口 15 | ------------------------------------------------- 16 | """ 17 | __author__ = 'JHao' 18 | 19 | import platform 20 | from werkzeug.wrappers import Response 21 | from flask import Flask, jsonify, request 22 | 23 | from util.six import iteritems 24 | from helper.proxy import Proxy 25 | from handler.proxyHandler import ProxyHandler 26 | from handler.configHandler import ConfigHandler 27 | 28 | app = Flask(__name__) 29 | conf = ConfigHandler() 30 | proxy_handler = ProxyHandler() 31 | 32 | 33 | class JsonResponse(Response): 34 | @classmethod 35 | def force_type(cls, response, environ=None): 36 | if isinstance(response, (dict, list)): 37 | response = jsonify(response) 38 | 39 | return super(JsonResponse, cls).force_type(response, environ) 40 | 41 | 42 | app.response_class = JsonResponse 43 | 44 | api_list = [ 45 | {"url": "/get", "params": "type: ''https'|''", "desc": "get a proxy"}, 46 | {"url": "/pop", "params": "", "desc": "get and delete a proxy"}, 47 | {"url": "/delete", "params": "proxy: 'e.g. 
127.0.0.1:8080'", "desc": "delete an unable proxy"}, 48 | {"url": "/all", "params": "type: ''https'|''", "desc": "get all proxy from proxy pool"}, 49 | {"url": "/count", "params": "", "desc": "return proxy count"} 50 | # 'refresh': 'refresh proxy pool', 51 | ] 52 | 53 | 54 | @app.route('/') 55 | def index(): 56 | return {'url': api_list} 57 | 58 | 59 | @app.route('/get/') 60 | def get(): 61 | https = request.args.get("type", "").lower() == 'https' 62 | proxy = proxy_handler.get(https) 63 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 64 | 65 | 66 | @app.route('/pop/') 67 | def pop(): 68 | https = request.args.get("type", "").lower() == 'https' 69 | proxy = proxy_handler.pop(https) 70 | return proxy.to_dict if proxy else {"code": 0, "src": "no proxy"} 71 | 72 | 73 | @app.route('/refresh/') 74 | def refresh(): 75 | # TODO refresh会有守护程序定时执行,由api直接调用性能较差,暂不使用 76 | return 'success' 77 | 78 | 79 | @app.route('/all/') 80 | def getAll(): 81 | https = request.args.get("type", "").lower() == 'https' 82 | proxies = proxy_handler.getAll(https) 83 | return jsonify([_.to_dict for _ in proxies]) 84 | 85 | 86 | @app.route('/delete/', methods=['GET']) 87 | def delete(): 88 | proxy = request.args.get('proxy') 89 | status = proxy_handler.delete(Proxy(proxy)) 90 | return {"code": 0, "src": status} 91 | 92 | 93 | @app.route('/count/') 94 | def getCount(): 95 | proxies = proxy_handler.getAll() 96 | http_type_dict = {} 97 | source_dict = {} 98 | for proxy in proxies: 99 | http_type = 'https' if proxy.https else 'http' 100 | http_type_dict[http_type] = http_type_dict.get(http_type, 0) + 1 101 | for source in proxy.source.split('/'): 102 | source_dict[source] = source_dict.get(source, 0) + 1 103 | return {"http_type": http_type_dict, "source": source_dict, "count": len(proxies)} 104 | 105 | 106 | def runFlask(): 107 | if platform.system() == "Windows": 108 | app.run(host=conf.serverHost, port=conf.serverPort) 109 | else: 110 | import gunicorn.app.base 111 | 112 | class StandaloneApplication(gunicorn.app.base.BaseApplication): 113 | 114 | def __init__(self, app, options=None): 115 | self.options = options or {} 116 | self.application = app 117 | super(StandaloneApplication, self).__init__() 118 | 119 | def load_config(self): 120 | _config = dict([(key, value) for key, value in iteritems(self.options) 121 | if key in self.cfg.settings and value is not None]) 122 | for key, value in iteritems(_config): 123 | self.cfg.set(key.lower(), value) 124 | 125 | def load(self): 126 | return self.application 127 | 128 | _options = { 129 | 'bind': '%s:%s' % (conf.serverHost, conf.serverPort), 130 | 'workers': 4, 131 | 'accesslog': '-', # log to stdout 132 | 'access_log_format': '%(h)s %(l)s %(t)s "%(r)s" %(s)s "%(a)s"' 133 | } 134 | StandaloneApplication(app, _options).run() 135 | 136 | 137 | if __name__ == '__main__': 138 | runFlask() 139 | -------------------------------------------------------------------------------- /proxy_pool/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/2: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /proxy_pool/db/__pycache__/__init__.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/db/__pycache__/dbClient.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/dbClient.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/db/__pycache__/redisClient.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/db/__pycache__/redisClient.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/db/dbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: DbClient.py 6 | Description : DB工厂类 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/02: DB工厂类 12 | 2020/07/03: 取消raw_proxy储存 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | import os 18 | import sys 19 | 20 | from util.six import urlparse, withMetaclass 21 | from util.singleton import Singleton 22 | 23 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 24 | 25 | 26 | class DbClient(withMetaclass(Singleton)): 27 | """ 28 | DbClient DB工厂类 提供get/put/update/pop/delete/exists/getAll/clean/getCount/changeTable方法 29 | 30 | 31 | 抽象方法定义: 32 | get(): 随机返回一个proxy; 33 | put(proxy): 存入一个proxy; 34 | pop(): 顺序返回并删除一个proxy; 35 | update(proxy): 更新指定proxy信息; 36 | delete(proxy): 删除指定proxy; 37 | exists(proxy): 判断指定proxy是否存在; 38 | getAll(): 返回所有代理; 39 | clean(): 清除所有proxy信息; 40 | getCount(): 返回proxy统计信息; 41 | changeTable(name): 切换操作对象 42 | 43 | 44 | 所有方法需要相应类去具体实现: 45 | ssdb: ssdbClient.py 46 | redis: redisClient.py 47 | mongodb: mongodbClient.py 48 | 49 | """ 50 | 51 | def __init__(self, db_conn): 52 | """ 53 | init 54 | :return: 55 | """ 56 | self.parseDbConn(db_conn) 57 | self.__initDbClient() 58 | 59 | @classmethod 60 | def parseDbConn(cls, db_conn): 61 | db_conf = urlparse(db_conn) 62 | cls.db_type = db_conf.scheme.upper().strip() 63 | cls.db_host = db_conf.hostname 64 | cls.db_port = db_conf.port 65 | cls.db_user = db_conf.username 66 | cls.db_pwd = db_conf.password 67 | cls.db_name = db_conf.path[1:] 68 | return cls 69 | 70 | def __initDbClient(self): 71 | """ 72 | init DB Client 73 | :return: 74 | """ 75 | __type = None 76 | if "SSDB" == self.db_type: 77 | __type = "ssdbClient" 78 | elif "REDIS" == self.db_type: 79 | __type = "redisClient" 80 | else: 81 | pass 82 | assert __type, 'type error, Not support DB type: {}'.format(self.db_type) 83 | self.client = getattr(__import__(__type), "%sClient" % self.db_type.title())(host=self.db_host, 84 | port=self.db_port, 85 | username=self.db_user, 86 | password=self.db_pwd, 87 | db=self.db_name) 88 | 89 | def get(self, https, **kwargs): 90 | return self.client.get(https, **kwargs) 91 | 92 | def put(self, key, **kwargs): 93 | return self.client.put(key, **kwargs) 94 | 95 | def 
update(self, key, value, **kwargs): 96 | return self.client.update(key, value, **kwargs) 97 | 98 | def delete(self, key, **kwargs): 99 | return self.client.delete(key, **kwargs) 100 | 101 | def exists(self, key, **kwargs): 102 | return self.client.exists(key, **kwargs) 103 | 104 | def pop(self, https, **kwargs): 105 | return self.client.pop(https, **kwargs) 106 | 107 | def getAll(self, https): 108 | return self.client.getAll(https) 109 | 110 | def clear(self): 111 | return self.client.clear() 112 | 113 | def changeTable(self, name): 114 | self.client.changeTable(name) 115 | 116 | def getCount(self): 117 | return self.client.getCount() 118 | 119 | def test(self): 120 | return self.client.test() 121 | -------------------------------------------------------------------------------- /proxy_pool/db/redisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ----------------------------------------------------- 4 | File Name: redisClient.py 5 | Description : 封装Redis相关操作 6 | Author : JHao 7 | date: 2019/8/9 8 | ------------------------------------------------------ 9 | Change Activity: 10 | 2019/08/09: 封装Redis相关操作 11 | 2020/06/23: 优化pop方法, 改用hscan命令 12 | 2021/05/26: 区别http/https代理 13 | ------------------------------------------------------ 14 | """ 15 | __author__ = 'JHao' 16 | 17 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 18 | from redis.connection import BlockingConnectionPool 19 | from handler.logHandler import LogHandler 20 | from random import choice 21 | from redis import Redis 22 | import json 23 | 24 | 25 | class RedisClient(object): 26 | """ 27 | Redis client 28 | 29 | Redis中代理存放的结构为hash: 30 | key为ip:port, value为代理属性的字典; 31 | 32 | """ 33 | 34 | def __init__(self, **kwargs): 35 | """ 36 | init 37 | :param host: host 38 | :param port: port 39 | :param password: password 40 | :param db: db 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items = self.__conn.hvals(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items)) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash, 使用changeTable指定hash name 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | data = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return data 72 | 73 | def pop(self, https): 74 | """ 75 | 弹出一个代理 76 | :return: dict {proxy: value} 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | return self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | 
""" 105 | return self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | items = self.__conn.hvals(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), items)) 115 | else: 116 | return items 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('redis_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('redis connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('redis connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('redis connection error: %s' % str(e), exc_info=True) 153 | return e 154 | 155 | 156 | -------------------------------------------------------------------------------- /proxy_pool/db/ssdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # !/usr/bin/env python 3 | """ 4 | ------------------------------------------------- 5 | File Name: ssdbClient.py 6 | Description : 封装SSDB操作 7 | Author : JHao 8 | date: 2016/12/2 9 | ------------------------------------------------- 10 | Change Activity: 11 | 2016/12/2: 12 | 2017/09/22: PY3中 redis-py返回的数据是bytes型 13 | 2017/09/27: 修改pop()方法 返回{proxy:value}字典 14 | 2020/07/03: 2.1.0 优化代码结构 15 | 2021/05/26: 区分http和https代理 16 | ------------------------------------------------- 17 | """ 18 | __author__ = 'JHao' 19 | from redis.exceptions import TimeoutError, ConnectionError, ResponseError 20 | from redis.connection import BlockingConnectionPool 21 | from handler.logHandler import LogHandler 22 | from random import choice 23 | from redis import Redis 24 | import json 25 | 26 | 27 | class SsdbClient(object): 28 | """ 29 | SSDB client 30 | 31 | SSDB中代理存放的结构为hash: 32 | key为代理的ip:por, value为代理属性的字典; 33 | """ 34 | 35 | def __init__(self, **kwargs): 36 | """ 37 | init 38 | :param host: host 39 | :param port: port 40 | :param password: password 41 | :return: 42 | """ 43 | self.name = "" 44 | kwargs.pop("username") 45 | self.__conn = Redis(connection_pool=BlockingConnectionPool(decode_responses=True, 46 | timeout=5, 47 | socket_timeout=5, 48 | **kwargs)) 49 | 50 | def get(self, https): 51 | """ 52 | 从hash中随机返回一个代理 53 | :return: 54 | """ 55 | if https: 56 | items_dict = self.__conn.hgetall(self.name) 57 | proxies = list(filter(lambda x: json.loads(x).get("https"), items_dict.values())) 58 | return choice(proxies) if proxies else None 59 | else: 60 | proxies = self.__conn.hkeys(self.name) 61 | proxy = choice(proxies) if proxies else None 62 | return self.__conn.hget(self.name, proxy) if proxy else None 63 | 64 | def put(self, proxy_obj): 65 | """ 66 | 将代理放入hash 67 | :param proxy_obj: Proxy obj 68 | :return: 69 | """ 70 | result = self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 71 | return result 72 | 73 | def pop(self, https): 74 | """ 75 | 顺序弹出一个代理 76 | 
:return: proxy 77 | """ 78 | proxy = self.get(https) 79 | if proxy: 80 | self.__conn.hdel(self.name, json.loads(proxy).get("proxy", "")) 81 | return proxy if proxy else None 82 | 83 | def delete(self, proxy_str): 84 | """ 85 | 移除指定代理, 使用changeTable指定hash name 86 | :param proxy_str: proxy str 87 | :return: 88 | """ 89 | self.__conn.hdel(self.name, proxy_str) 90 | 91 | def exists(self, proxy_str): 92 | """ 93 | 判断指定代理是否存在, 使用changeTable指定hash name 94 | :param proxy_str: proxy str 95 | :return: 96 | """ 97 | return self.__conn.hexists(self.name, proxy_str) 98 | 99 | def update(self, proxy_obj): 100 | """ 101 | 更新 proxy 属性 102 | :param proxy_obj: 103 | :return: 104 | """ 105 | self.__conn.hset(self.name, proxy_obj.proxy, proxy_obj.to_json) 106 | 107 | def getAll(self, https): 108 | """ 109 | 字典形式返回所有代理, 使用changeTable指定hash name 110 | :return: 111 | """ 112 | item_dict = self.__conn.hgetall(self.name) 113 | if https: 114 | return list(filter(lambda x: json.loads(x).get("https"), item_dict.values())) 115 | else: 116 | return item_dict.values() 117 | 118 | def clear(self): 119 | """ 120 | 清空所有代理, 使用changeTable指定hash name 121 | :return: 122 | """ 123 | return self.__conn.delete(self.name) 124 | 125 | def getCount(self): 126 | """ 127 | 返回代理数量 128 | :return: 129 | """ 130 | proxies = self.getAll(https=False) 131 | return {'total': len(proxies), 'https': len(list(filter(lambda x: json.loads(x).get("https"), proxies)))} 132 | 133 | def changeTable(self, name): 134 | """ 135 | 切换操作对象 136 | :param name: 137 | :return: 138 | """ 139 | self.name = name 140 | 141 | def test(self): 142 | log = LogHandler('ssdb_client') 143 | try: 144 | self.getCount() 145 | except TimeoutError as e: 146 | log.error('ssdb connection time out: %s' % str(e), exc_info=True) 147 | return e 148 | except ConnectionError as e: 149 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 150 | return e 151 | except ResponseError as e: 152 | log.error('ssdb connection error: %s' % str(e), exc_info=True) 153 | return e 154 | -------------------------------------------------------------------------------- /proxy_pool/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /proxy_pool/docs/changelog.rst: -------------------------------------------------------------------------------- 1 | .. _changelog: 2 | 3 | ChangeLog 4 | ========== 5 | 6 | 2.4.1 (2022-07-17) 7 | ------------------ 8 | 9 | 1. 新增代理源 **FreeProxyList**; (2022-07-21) 10 | 2. 新增代理源 **FateZero**; (2022-08-01) 11 | 3. 新增代理属性 ``region``; (2022-08-16) 12 | 13 | 2.4.0 (2021-11-17) 14 | ------------------ 15 | 16 | 1. 移除无效代理源 **神鸡代理**; (2021-11-16) 17 | 2. 移除无效代理源 **极速代理**; (2021-11-16) 18 | 3. 移除代理源 **西拉代理**; (2021-11-16) 19 | 4. 
新增代理源 **蝶鸟IP**; (2021-11-16) 20 | 5. 新增代理源 **PROXY11**; (2021-11-16) 21 | 6. 多线程采集代理; (2021-11-17) 22 | 23 | 2.3.0 (2021-05-27) 24 | ------------------ 25 | 26 | 1. 修复Dockerfile时区问题; (2021-04-12) 27 | 2. 新增Proxy属性 ``source``, 标记代理来源; (2021-04-13) 28 | 3. 新增Proxy属性 ``https``, 标记支持https的代理; (2021-05-27) 29 | 30 | 2.2.0 (2021-04-08) 31 | ------------------ 32 | 33 | 1. 启动时检查数据库连通性; 34 | 2. 新增免费代理源 **米扑代理**; 35 | 3. 新增免费代理源 **Pzzqz**; 36 | 4. 新增免费代理源 **神鸡代理**; 37 | 5. 新增免费代理源 **极速代理**; 38 | 6. 新增免费代理源 **小幻代理**; 39 | 40 | 2.1.1 (2021-02-23) 41 | ------------------ 42 | 43 | 1. Fix Bug `#493`_, 新增时区配置; (2020-08-12) 44 | 2. 修复 **66代理** 采集; (2020-11-04) 45 | 3. 修复 **全网代理** 采集, 解决HTML端口加密问题; (2020-11-04) 46 | 4. 新增 **代理盒子** 免费源; (2020-11-04) 47 | 5. 新增 ``POOL_SIZE_MIN`` 配置项, runProxyCheck时, 剩余代理少于POOL_SIZE_MIN触发抓取; (2021-02-23) 48 | 49 | .. _#493: https://github.com/jhao104/proxy_pool/issues/493 50 | 51 | 2.1.0 (2020.07) 52 | ------------------ 53 | 54 | 1. 新增免费代理源 **西拉代理** (2020-03-30) 55 | 2. Fix Bug `#356`_ `#401`_ 56 | 3. 优化Docker镜像体积; (2020-06-19) 57 | 4. 优化配置方式; 58 | 5. 优化代码结构; 59 | 6. 不再储存raw_proxy, 抓取后直接验证入库; 60 | 61 | .. _#401: https://github.com/jhao104/proxy_pool/issues/401 62 | .. _#356: https://github.com/jhao104/proxy_pool/issues/356 63 | 64 | 2.0.1 (2019.10) 65 | ----------------- 66 | 67 | 1. 新增免费代理源 **89免费代理**; 68 | #. 新增免费代理源 **齐云代理** 69 | 70 | 2.0.0 (2019.08) 71 | ------------------ 72 | 73 | 1. WebApi集成Gunicorn方式启动, Windows平台暂不支持; 74 | #. 优化Proxy调度程序; 75 | #. 扩展Proxy属性; 76 | #. 新增cli工具, 更加方便启动proxyPool 77 | 78 | 1.14 (2019.07) 79 | ----------------- 80 | 81 | 1. 修复 Queue阻塞导致的 ``ProxyValidSchedule`` 假死bug; 82 | #. 修改代理源 **云代理** 抓取; 83 | #. 修改代理源 **码农代理** 抓取; 84 | #. 修改代理源 **代理66** 抓取, 引入 ``PyExecJS`` 模块破解加速乐动态Cookies加密; 85 | 86 | 1.13 (2019.02) 87 | ----------------- 88 | 89 | 1. 使用.py文件替换.ini作为配置文件; 90 | 91 | #. 优化代理采集部分; 92 | 93 | 1.12 (2018.04) 94 | ----------------- 95 | 96 | 1. 优化代理格式检查; 97 | 98 | #. 增加代理源; 99 | 100 | #. fix bug `#122`_ `#126`_ 101 | 102 | .. _#122: https://github.com/jhao104/proxy_pool/issues/122 103 | .. _#126: https://github.com/jhao104/proxy_pool/issues/126 104 | 105 | 1.11 (2017.08) 106 | ----------------- 107 | 108 | 1. 使用多线程验证useful_pool; 109 | 110 | 1.10 (2016.11) 111 | ----------------- 112 | 113 | 1. 第一版; 114 | 115 | #. 支持PY2/PY3; 116 | 117 | #. 代理池基本功能; 118 | -------------------------------------------------------------------------------- /proxy_pool/docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 
12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | import sphinx_rtd_theme 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'ProxyPool' 21 | copyright = '2020, jhao104' 22 | author = 'jhao104' 23 | 24 | master_doc = 'index' 25 | 26 | # The full version, including alpha/beta/rc tags 27 | release = '2.1.0' 28 | 29 | # -- General configuration --------------------------------------------------- 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = [ 35 | ] 36 | 37 | # If true, sectionauthor and moduleauthor directives will be shown in the 38 | # output. They are ignored by default. 39 | show_authors = False 40 | 41 | # The name of the Pygments (syntax highlighting) style to use. 42 | pygments_style = "sphinx" 43 | 44 | # Add any paths that contain templates here, relative to this directory. 45 | templates_path = ['_templates'] 46 | 47 | # The language for content autogenerated by Sphinx. Refer to documentation 48 | # for a list of supported languages. 49 | # 50 | # This is also used if you do content translation via gettext catalogs. 51 | # Usually you set "language" from the command line for these cases. 52 | language = 'zh_CN' 53 | 54 | # List of patterns, relative to source directory, that match files and 55 | # directories to ignore when looking for source files. 56 | # This pattern also affects html_static_path and html_extra_path. 57 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 58 | 59 | # -- Options for HTML output ------------------------------------------------- 60 | 61 | # The theme to use for HTML and HTML Help pages. See the documentation for 62 | # a list of builtin themes. 63 | # 64 | html_theme = 'sphinx_rtd_theme' 65 | 66 | html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] 67 | 68 | # Add any paths that contain custom static files (such as style sheets) here, 69 | # relative to this directory. They are copied after the builtin static files, 70 | # so a file named "default.css" will overwrite the builtin "default.css". 71 | html_static_path = ['_static'] 72 | -------------------------------------------------------------------------------- /proxy_pool/docs/dev/ext_fetcher.rst: -------------------------------------------------------------------------------- 1 | .. ext_fetcher 2 | 3 | 扩展代理源 4 | ----------- 5 | 6 | 项目默认包含几个免费的代理获取源,但是免费的毕竟质量有限,如果直接运行可能拿到的代理质量不理想。因此提供了用户自定义扩展代理获取的方法。 7 | 8 | 如果要添加一个新的代理获取方法, 过程如下: 9 | 10 | 1. 首先在 `ProxyFetcher`_ 类中添加自定义的获取代理的静态方法,该方法需要以生成器(yield)形式返回 ``host:ip`` 格式的代理字符串, 例如: 11 | 12 | .. code-block:: python 13 | 14 | class ProxyFetcher(object): 15 | # .... 16 | # 自定义代理源获取方法 17 | @staticmethod 18 | def freeProxyCustom01(): # 命名不和已有重复即可 19 | # 通过某网站或者某接口或某数据库获取代理 20 | # 假设你已经拿到了一个代理列表 21 | proxies = ["x.x.x.x:3128", "x.x.x.x:80"] 22 | for proxy in proxies: 23 | yield proxy 24 | # 确保每个proxy都是 host:ip正确的格式返回 25 | 26 | 2. 添加好方法后,修改配置文件 `setting.py`_ 中的 ``PROXY_FETCHER`` 项, 加入刚才添加的自定义方法的名字: 27 | 28 | .. code-block:: python 29 | 30 | PROXY_FETCHER = [ 31 | # .... 32 | "freeProxyCustom01" # # 确保名字和你添加方法名字一致 33 | ] 34 | 35 | .. _ProxyFetcher: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/fetcher/proxyFetcher.py#L20 36 | .. 
_setting.py: https://github.com/jhao104/proxy_pool/blob/1a3666283806a22ef287fba1a8efab7b94e94bac/setting.py#L47 -------------------------------------------------------------------------------- /proxy_pool/docs/dev/ext_validator.rst: -------------------------------------------------------------------------------- 1 | .. ext_validator 2 | 3 | 代理校验 4 | ----------- 5 | 6 | 内置校验 7 | >>>>>>>>> 8 | 9 | 项目中使用的代理校验方法全部定义在 `validator.py`_ 中, 通过 `ProxyValidator`_ 类中提供的装饰器来区分。校验方法返回 ``True`` 表示 10 | 校验通过, 返回 ``False`` 表示校验不通过。 11 | 12 | * 代理校验方法分为三类: ``preValidator`` 、 ``httpValidator`` 、 ``httpsValidator``: 13 | 14 | * **preValidator**: 预校验,在代理抓取后验证前调用,目前实现了 `formatValidator`_ 校验代理IP格式是否合法; 15 | * **httpValidator**: 代理可用性校验,通过则认为代理可用, 目前实现了 `httpTimeOutValidator`_ 校验; 16 | * **httpsValidator**: 校验代理是否支持https,目前实现了 `httpsTimeOutValidator`_ 校验。 17 | 18 | 19 | .. _validator.py: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py 20 | .. _ProxyValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L29 21 | .. _formatValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L51 22 | .. _httpTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L58 23 | .. _httpsTimeOutValidator: https://github.com/jhao104/proxy_pool/blob/release-2.3.0/helper/validator.py#L71 24 | 25 | 每种校验可以定义多个方法,只有 **所有** 方法都返回 ``True`` 的情况下才视为该校验通过,校验方法执行顺序为: 先执行 **httpValidator** , 前者通过后再执行 **httpsValidator** 。 26 | 只有 `preValidator` 校验通过的代理才会进入可用性校验, `httpValidator` 校验通过后认为代理可用准备更新入代理池, `httpValidator` 校验通过后视为代理支持https更新代理的 `https` 属性为 `True` 。 27 | 28 | 扩展校验 29 | >>>>>>>>> 30 | 31 | 在 `validator.py`_ 已有自定义校验的示例,自定义函数需返回True或者False,使用 `ProxyValidator`_ 中提供的装饰器来区分校验类型。 下面是两个例子: 32 | 33 | * 1. 自定义一个代理可用性的校验(``addHttpValidator``): 34 | 35 | .. code-block:: python 36 | 37 | @ProxyValidator.addHttpValidator 38 | def customValidatorExample01(proxy): 39 | """自定义代理可用性校验函数""" 40 | proxies = {"http": "http://{proxy}".format(proxy=proxy)} 41 | try: 42 | r = requests.get("http://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5) 43 | return True if r.status_code == 200 and len(r.content) > 200 else False 44 | except Exception as e: 45 | return False 46 | 47 | * 2. 自定义一个代理是否支持https的校验(``addHttpsValidator``): 48 | 49 | .. code-block:: python 50 | 51 | @ProxyValidator.addHttpsValidator 52 | def customValidatorExample02(proxy): 53 | """自定义代理是否支持https校验函数""" 54 | proxies = {"https": "https://{proxy}".format(proxy=proxy)} 55 | try: 56 | r = requests.get("https://www.baidu.com/", headers=HEADER, proxies=proxies, timeout=5, verify=False) 57 | return True if r.status_code == 200 and len(r.content) > 200 else False 58 | except Exception as e: 59 | return False 60 | 61 | 注意,比如在运行代理可用性校验时,所有被 ``ProxyValidator.addHttpValidator`` 装饰的函数会被依次按定义顺序执行,只有当所有函数都返回True时才会判断代理可用。 ``HttpsValidator`` 运行机制也是如此。 62 | -------------------------------------------------------------------------------- /proxy_pool/docs/dev/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 开发指南 3 | ========= 4 | 5 | .. module:: dev 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | ext_fetcher 11 | ext_validator 12 | -------------------------------------------------------------------------------- /proxy_pool/docs/index.rst: -------------------------------------------------------------------------------- 1 | .. 
ProxyPool documentation master file, created by 2 | sphinx-quickstart on Wed Jul 8 16:13:42 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | ProxyPool 7 | ===================================== 8 | 9 | :: 10 | 11 | **************************************************************** 12 | *** ______ ********************* ______ *********** _ ******** 13 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 14 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 15 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 16 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 17 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 18 | **** __ / / ***** 19 | ************************* /___ / ******************************* 20 | ************************* ******************************** 21 | **************************************************************** 22 | 23 | Python爬虫代理IP池 24 | 25 | 安装 26 | ----- 27 | 28 | * 下载代码 29 | 30 | .. code-block:: console 31 | 32 | $ git clone git@github.com:jhao104/proxy_pool.git 33 | 34 | * 安装依赖 35 | 36 | .. code-block:: console 37 | 38 | $ pip install -r requirements.txt 39 | 40 | * 更新配置 41 | 42 | .. code-block:: python 43 | 44 | HOST = "0.0.0.0" 45 | PORT = 5000 46 | 47 | DB_CONN = 'redis://@127.0.0.1:8888' 48 | 49 | PROXY_FETCHER = [ 50 | "freeProxy01", 51 | "freeProxy02", 52 | # .... 53 | ] 54 | 55 | * 启动项目 56 | 57 | .. code-block:: console 58 | 59 | $ python proxyPool.py schedule 60 | $ python proxyPool.py server 61 | 62 | 使用 63 | ______ 64 | 65 | * API 66 | 67 | ============ ======== ================ ============== 68 | Api Method Description Params 69 | ============ ======== ================ ============== 70 | / GET API介绍 无 71 | /get GET 返回一个代理 可选参数: `?type=https` 过滤支持https的代理 72 | /pop GET 返回并删除一个代理 可选参数: `?type=https` 过滤支持https的代理 73 | /all GET 返回所有代理 可选参数: `?type=https` 过滤支持https的代理 74 | /count GET 返回代理数量 无 75 | /delete GET 删除指定代理 `?proxy=host:ip` 76 | ============ ======== ================ ============== 77 | 78 | 79 | * 爬虫 80 | 81 | .. code-block:: python 82 | 83 | import requests 84 | 85 | def get_proxy(): 86 | return requests.get("http://127.0.0.1:5010/get?type=https").json() 87 | 88 | def delete_proxy(proxy): 89 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 90 | 91 | # your spider code 92 | 93 | def getHtml(): 94 | # .... 95 | retry_count = 5 96 | proxy = get_proxy().get("proxy") 97 | while retry_count > 0: 98 | try: 99 | html = requests.get('https://www.example.com', proxies={"http": "http://{}".format(proxy), "https": "https://{}".format(proxy)}) 100 | # 使用代理访问 101 | return html 102 | except Exception: 103 | retry_count -= 1 104 | # 删除代理池中代理 105 | delete_proxy(proxy) 106 | return None 107 | 108 | Contents 109 | -------- 110 | 111 | .. toctree:: 112 | :maxdepth: 2 113 | 114 | user/index 115 | dev/index 116 | changelog 117 | -------------------------------------------------------------------------------- /proxy_pool/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /proxy_pool/docs/user/how_to_config.rst: -------------------------------------------------------------------------------- 1 | .. how_to_config 2 | 3 | 配置参考 4 | --------- 5 | 6 | 配置文件 ``setting.py`` 位于项目的主目录下, 配置主要分为四类: **服务配置** 、 **数据库配置** 、 **采集配置** 、 **校验配置**. 7 | 8 | 服务配置 9 | >>>>>>>>> 10 | 11 | * ``HOST`` 12 | 13 | API服务监听的IP, 本机访问设置为 ``127.0.0.1``, 开启远程访问设置为: ``0.0.0.0``. 14 | 15 | * ``PORT`` 16 | 17 | API服务监听的端口. 18 | 19 | 数据库配置 20 | >>>>>>>>>>> 21 | 22 | * ``DB_CONN`` 23 | 24 | 用户存放代理IP的数据库URI, 配置格式为: ``db_type://[[user]:[pwd]]@ip:port/[db]``. 25 | 26 | 目前支持的db_type有: ``ssdb`` 、 ``redis``. 27 | 28 | 配置示例: 29 | 30 | .. code-block:: python 31 | 32 | # SSDB IP: 127.0.0.1 Port: 8888 33 | DB_CONN = 'ssdb://@127.0.0.1:8888' 34 | # SSDB IP: 127.0.0.1 Port: 8899 Password: 123456 35 | DB_CONN = 'ssdb://:123456@127.0.0.1:8888' 36 | 37 | # Redis IP: 127.0.0.1 Port: 6379 38 | DB_CONN = 'redis://@127.0.0.1:6379' 39 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 40 | DB_CONN = 'redis://:123456@127.0.0.1:6379' 41 | # Redis IP: 127.0.0.1 Port: 6379 Password: 123456 DB: 15 42 | DB_CONN = 'redis://:123456@127.0.0.1:6379/15' 43 | 44 | 45 | * ``TABLE_NAME`` 46 | 47 | 存放代理的数据载体名称, ssdb和redis的存放结构为hash. 48 | 49 | 采集配置 50 | >>>>>>>>> 51 | 52 | * ``PROXY_FETCHER`` 53 | 54 | 启用的代理采集方法名, 代理采集方法位于 ``fetcher/proxyFetcher.py`` 类中. 55 | 56 | 由于各个代理源的稳定性不容易掌握, 当某个代理采集方法失效时, 可以该配置中注释掉其名称. 57 | 58 | 如果有增加某些代理采集方法, 也请在该配置中添加其方法名, 具体请参考 :doc:`/dev/extend_fetcher`. 59 | 60 | 调度程序每次执行采集任务时都会再次加载该配置, 保证每次运行的采集方法都是有效的. 61 | 62 | 校验配置 63 | >>>>>>>>> 64 | 65 | * ``HTTP_URL`` 66 | 67 | 用于检验代理是否可用的地址, 默认为 ``http://httpbin.org``, 可根据使用场景修改为其他地址. 68 | 69 | * ``HTTPS_URL`` 70 | 71 | 用于检验代理是否支持HTTPS的地址, 默认为 ``https://www.qq.com``, 可根据使用场景修改为其他地址. 72 | 73 | * ``VERIFY_TIMEOUT`` 74 | 75 | 检验代理的超时时间, 默认为 ``10`` , 单位秒. 使用代理访问 ``HTTP(S)_URL`` 耗时超过 ``VERIFY_TIMEOUT`` 时, 视为代理不可用. 76 | 77 | * ``MAX_FAIL_COUNT`` 78 | 79 | 检验代理允许最大失败次数, 默认为 ``0``, 即出错一次即删除. 80 | 81 | * ``POOL_SIZE_MIN`` 82 | 83 | 代理检测定时任务运行前若代理数量小于 `POOL_SIZE_MIN`, 则先运行抓取程序. -------------------------------------------------------------------------------- /proxy_pool/docs/user/how_to_run.rst: -------------------------------------------------------------------------------- 1 | .. how_to_run 2 | 3 | 4 | 如何运行 5 | --------- 6 | 7 | 下载代码 8 | >>>>>>>>> 9 | 10 | 本项目需要下载代码到本地运行, 通过 ``git`` 下载: 11 | 12 | .. code-block:: console 13 | 14 | $ git clone git@github.com:jhao104/proxy_pool.git 15 | 16 | 或者下载特定的 ``release`` 版本: 17 | 18 | .. code-block:: console 19 | 20 | https://github.com/jhao104/proxy_pool/releases 21 | 22 | 安装依赖 23 | >>>>>>>>> 24 | 25 | 到项目目录下使用 ``pip`` 安装依赖库: 26 | 27 | .. code-block:: console 28 | 29 | $ pip install -r requirements.txt 30 | 31 | 32 | 更新配置 33 | >>>>>>>>> 34 | 35 | 配置文件 ``setting.py`` 位于项目的主目录下: 36 | 37 | .. 
code-block:: python 38 | 39 | # 配置API服务 40 | 41 | HOST = "0.0.0.0" # IP 42 | PORT = 5000 # 监听端口 43 | 44 | # 配置数据库 45 | 46 | DB_CONN = 'redis://@127.0.0.1:8888/0' 47 | 48 | # 配置 ProxyFetcher 49 | 50 | PROXY_FETCHER = [ 51 | "freeProxy01", # 这里是启用的代理抓取方法,所有fetch方法位于fetcher/proxyFetcher.py 52 | "freeProxy02", 53 | # .... 54 | ] 55 | 56 | 更多配置请参考 :doc:`/user/how_to_config` 57 | 58 | 启动项目 59 | >>>>>>>>> 60 | 61 | 如果已配置好运行环境, 具备运行条件, 可以通过 ``proxyPool.py`` 启动. ``proxyPool.py`` 是项目的CLI入口. 62 | 完整程序包含两部份: ``schedule`` 调度程序和 ``server`` API服务, 调度程序负责采集和验证代理, API服务提供代理服务HTTP接口. 63 | 64 | 通过命令行程序分别启动调度程序和API服务: 65 | 66 | .. code-block:: console 67 | 68 | # 启动调度程序 69 | $ python proxyPool.py schedule 70 | 71 | # 启动webApi服务 72 | $ python proxyPool.py server 73 | 74 | -------------------------------------------------------------------------------- /proxy_pool/docs/user/how_to_use.rst: -------------------------------------------------------------------------------- 1 | .. how_to_use 2 | 3 | 如何使用 4 | ---------- 5 | 6 | 爬虫代码要对接代理池目前有两种方式: 一是通过调用API接口使用, 二是直接读取数据库. 7 | 8 | 调用API 9 | >>>>>>>>> 10 | 11 | 启动ProxyPool的 ``server`` 后会提供如下几个http接口: 12 | 13 | ============ ======== ================ ============== 14 | Api Method Description Arg 15 | ============ ======== ================ ============== 16 | / GET API介绍 无 17 | /get GET 随机返回一个代理 无 18 | /get_all GET 返回所有代理 无 19 | /get_status GET 返回代理数量 无 20 | /delete GET 删除指定代理 proxy=host:ip 21 | ============ ======== ================ ============== 22 | 23 | 在代码中可以通过封装上面的API接口来使用代理, 例子: 24 | 25 | .. code-block:: python 26 | 27 | import requests 28 | 29 | def get_proxy(): 30 | return requests.get("http://127.0.0.1:5010/get/").json() 31 | 32 | def delete_proxy(proxy): 33 | requests.get("http://127.0.0.1:5010/delete/?proxy={}".format(proxy)) 34 | 35 | # your spider code 36 | 37 | def getHtml(): 38 | # .... 39 | retry_count = 5 40 | proxy = get_proxy().get("proxy") 41 | while retry_count > 0: 42 | try: 43 | # 使用代理访问 44 | html = requests.get('http://www.example.com', proxies={"http": "http://{}".format(proxy)}) 45 | return html 46 | except Exception: 47 | retry_count -= 1 48 | # 删除代理池中代理 49 | delete_proxy(proxy) 50 | return None 51 | 52 | 本例中我们在本地 ``127.0.0.1`` 启动端口为 ``5010`` 的 ``server``, 使用 ``/get`` 接口获取代理, ``/delete`` 删除代理. 53 | 54 | 读数据库 55 | >>>>>>>>> 56 | 57 | 目前支持配置两种数据库: ``REDIS`` 、 ``SSDB``. 58 | 59 | * **REDIS** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 60 | 61 | * **SSDB** 储存结构为 ``hash``, hash name为配置项中的 **TABLE_NAME** 62 | 63 | 可以在代码中自行读取. 64 | -------------------------------------------------------------------------------- /proxy_pool/docs/user/index.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | 用户指南 3 | ========= 4 | 5 | .. module:: user 6 | 7 | .. 
toctree:: 8 | :maxdepth: 2 9 | 10 | how_to_run 11 | how_to_use 12 | how_to_config 13 | -------------------------------------------------------------------------------- /proxy_pool/fetcher/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: 11 | ------------------------------------------------- 12 | """ -------------------------------------------------------------------------------- /proxy_pool/fetcher/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/fetcher/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/fetcher/__pycache__/proxyFetcher.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/fetcher/__pycache__/proxyFetcher.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/fetcher/proxyFetcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxyFetcher 5 | Description : 6 | Author : JHao 7 | date: 2016/11/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/11/25: proxyFetcher 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import re 16 | import json 17 | from time import sleep 18 | 19 | from util.webRequest import WebRequest 20 | 21 | 22 | class ProxyFetcher(object): 23 | """ 24 | proxy getter 25 | """ 26 | 27 | @staticmethod 28 | def freeProxy01(): 29 | """ 30 | 站大爷 https://www.zdaye.com/dayProxy.html 31 | """ 32 | start_url = "https://www.zdaye.com/dayProxy.html" 33 | html_tree = WebRequest().get(start_url).tree 34 | latest_page_time = html_tree.xpath("//span[@class='thread_time_info']/text()")[0].strip() 35 | from datetime import datetime 36 | interval = datetime.now() - datetime.strptime(latest_page_time, "%Y/%m/%d %H:%M:%S") 37 | if interval.seconds < 300: # 只采集5分钟内的更新 38 | target_url = "https://www.zdaye.com/" + html_tree.xpath("//h3[@class='thread_title']/a/@href")[0].strip() 39 | while target_url: 40 | _tree = WebRequest().get(target_url).tree 41 | for tr in _tree.xpath("//table//tr"): 42 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 43 | port = "".join(tr.xpath("./td[2]/text()")).strip() 44 | yield "%s:%s" % (ip, port) 45 | next_page = _tree.xpath("//div[@class='page']/a[@title='下一页']/@href") 46 | target_url = "https://www.zdaye.com/" + next_page[0].strip() if next_page else False 47 | sleep(5) 48 | 49 | @staticmethod 50 | def freeProxy02(): 51 | """ 52 | 代理66 http://www.66ip.cn/ 53 | """ 54 | url = "http://www.66ip.cn/" 55 | resp = WebRequest().get(url, timeout=10).tree 56 | for i, tr in enumerate(resp.xpath("(//table)[3]//tr")): 57 | if i > 0: 58 | ip = "".join(tr.xpath("./td[1]/text()")).strip() 59 | port = "".join(tr.xpath("./td[2]/text()")).strip() 60 | yield "%s:%s" % (ip, port) 61 | 62 | 
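    # Note: every fetcher in this class follows the same contract - a static
    # generator that yields proxies as "ip:port" strings. Malformed entries are
    # dropped later by the formatValidator pre-check before they reach the pool.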
@staticmethod 63 | def freeProxy03(): 64 | """ 开心代理 """ 65 | target_urls = ["http://www.kxdaili.com/dailiip.html", "http://www.kxdaili.com/dailiip/2/1.html"] 66 | for url in target_urls: 67 | tree = WebRequest().get(url).tree 68 | for tr in tree.xpath("//table[@class='active']//tr")[1:]: 69 | ip = "".join(tr.xpath('./td[1]/text()')).strip() 70 | port = "".join(tr.xpath('./td[2]/text()')).strip() 71 | yield "%s:%s" % (ip, port) 72 | 73 | @staticmethod 74 | def freeProxy04(): 75 | """ FreeProxyList https://www.freeproxylists.net/zh/ """ 76 | url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50" 77 | tree = WebRequest().get(url, verify=False).tree 78 | from urllib import parse 79 | 80 | def parse_ip(input_str): 81 | html_str = parse.unquote(input_str) 82 | ips = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', html_str) 83 | return ips[0] if ips else None 84 | 85 | for tr in tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']"): 86 | ip = parse_ip("".join(tr.xpath('./td[1]/script/text()')).strip()) 87 | port = "".join(tr.xpath('./td[2]/text()')).strip() 88 | if ip: 89 | yield "%s:%s" % (ip, port) 90 | 91 | @staticmethod 92 | def freeProxy05(page_count=1): 93 | """ 快代理 https://www.kuaidaili.com """ 94 | url_pattern = [ 95 | 'https://www.kuaidaili.com/free/inha/{}/', 96 | 'https://www.kuaidaili.com/free/intr/{}/' 97 | ] 98 | url_list = [] 99 | for page_index in range(1, page_count + 1): 100 | for pattern in url_pattern: 101 | url_list.append(pattern.format(page_index)) 102 | 103 | for url in url_list: 104 | tree = WebRequest().get(url).tree 105 | proxy_list = tree.xpath('.//table//tr') 106 | sleep(1) # 必须sleep 不然第二条请求不到数据 107 | for tr in proxy_list[1:]: 108 | yield ':'.join(tr.xpath('./td/text()')[0:2]) 109 | 110 | @staticmethod 111 | def freeProxy06(): 112 | """ FateZero http://proxylist.fatezero.org/ """ 113 | url = "http://proxylist.fatezero.org/proxy.list" 114 | try: 115 | resp_text = WebRequest().get(url).text 116 | for each in resp_text.split("\n"): 117 | json_info = json.loads(each) 118 | if json_info.get("country") == "CN": 119 | yield "%s:%s" % (json_info.get("host", ""), json_info.get("port", "")) 120 | except Exception as e: 121 | print(e) 122 | 123 | @staticmethod 124 | def freeProxy07(): 125 | """ 云代理 """ 126 | urls = ['http://www.ip3366.net/free/?stype=1', "http://www.ip3366.net/free/?stype=2"] 127 | for url in urls: 128 | r = WebRequest().get(url, timeout=10) 129 | proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 130 | for proxy in proxies: 131 | yield ":".join(proxy) 132 | 133 | @staticmethod 134 | def freeProxy08(): 135 | """ 小幻代理 """ 136 | urls = ['https://ip.ihuan.me/address/5Lit5Zu9.html'] 137 | for url in urls: 138 | r = WebRequest().get(url, timeout=10) 139 | proxies = re.findall(r'>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?(\d+)', r.text) 140 | for proxy in proxies: 141 | yield ":".join(proxy) 142 | 143 | @staticmethod 144 | def freeProxy09(page_count=1): 145 | """ 免费代理库 """ 146 | for i in range(1, page_count + 1): 147 | url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(i) 148 | html_tree = WebRequest().get(url).tree 149 | for index, tr in enumerate(html_tree.xpath("//table//tr")): 150 | if index == 0: 151 | continue 152 | yield ":".join(tr.xpath("./td/text()")[0:2]).strip() 153 | 154 | @staticmethod 155 | def freeProxy10(): 156 | """ 89免费代理 """ 157 | r = WebRequest().get("https://www.89ip.cn/index_1.html", timeout=10) 158 | proxies = re.findall( 159 | 
r'[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?[\s\S]*?[\s\S]*?(\d+)[\s\S]*?', 160 | r.text) 161 | for proxy in proxies: 162 | yield ':'.join(proxy) 163 | 164 | # @staticmethod 165 | # def wallProxy01(): 166 | # """ 167 | # PzzQz https://pzzqz.com/ 168 | # """ 169 | # from requests import Session 170 | # from lxml import etree 171 | # session = Session() 172 | # try: 173 | # index_resp = session.get("https://pzzqz.com/", timeout=20, verify=False).text 174 | # x_csrf_token = re.findall('X-CSRFToken": "(.*?)"', index_resp) 175 | # if x_csrf_token: 176 | # data = {"http": "on", "ping": "3000", "country": "cn", "ports": ""} 177 | # proxy_resp = session.post("https://pzzqz.com/", verify=False, 178 | # headers={"X-CSRFToken": x_csrf_token[0]}, json=data).json() 179 | # tree = etree.HTML(proxy_resp["proxy_html"]) 180 | # for tr in tree.xpath("//tr"): 181 | # ip = "".join(tr.xpath("./td[1]/text()")) 182 | # port = "".join(tr.xpath("./td[2]/text()")) 183 | # yield "%s:%s" % (ip, port) 184 | # except Exception as e: 185 | # print(e) 186 | 187 | # @staticmethod 188 | # def freeProxy10(): 189 | # """ 190 | # 墙外网站 cn-proxy 191 | # :return: 192 | # """ 193 | # urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218'] 194 | # request = WebRequest() 195 | # for url in urls: 196 | # r = request.get(url, timeout=10) 197 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\w\W](\d+)', r.text) 198 | # for proxy in proxies: 199 | # yield ':'.join(proxy) 200 | 201 | # @staticmethod 202 | # def freeProxy11(): 203 | # """ 204 | # https://proxy-list.org/english/index.php 205 | # :return: 206 | # """ 207 | # urls = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)] 208 | # request = WebRequest() 209 | # import base64 210 | # for url in urls: 211 | # r = request.get(url, timeout=10) 212 | # proxies = re.findall(r"Proxy\('(.*?)'\)", r.text) 213 | # for proxy in proxies: 214 | # yield base64.b64decode(proxy).decode() 215 | 216 | # @staticmethod 217 | # def freeProxy12(): 218 | # urls = ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1'] 219 | # request = WebRequest() 220 | # for url in urls: 221 | # r = request.get(url, timeout=10) 222 | # proxies = re.findall(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?(\d+)', r.text) 223 | # for proxy in proxies: 224 | # yield ':'.join(proxy) 225 | 226 | 227 | if __name__ == '__main__': 228 | p = ProxyFetcher() 229 | for _ in p.freeProxy06(): 230 | print(_) 231 | 232 | # http://nntime.com/proxy-list-01.htm 233 | -------------------------------------------------------------------------------- /proxy_pool/handler/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | # from handler.ProxyManager import ProxyManager 16 | -------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/configHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/configHandler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/logHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/logHandler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/handler/__pycache__/proxyHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/handler/__pycache__/proxyHandler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/handler/configHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: configHandler 5 | Description : 6 | Author : JHao 7 | date: 2020/6/22 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/22: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import os 16 | import setting 17 | from util.singleton import Singleton 18 | from util.lazyProperty import LazyProperty 19 | from util.six import reload_six, withMetaclass 20 | 21 | 22 | class ConfigHandler(withMetaclass(Singleton)): 23 | 24 | def __init__(self): 25 | pass 26 | 27 | @LazyProperty 28 | def serverHost(self): 29 | return os.environ.get("HOST", setting.HOST) 30 | 31 | @LazyProperty 32 | def serverPort(self): 33 | return os.environ.get("PORT", setting.PORT) 34 | 35 | @LazyProperty 36 | def dbConn(self): 37 | return os.getenv("DB_CONN", setting.DB_CONN) 38 | 39 | @LazyProperty 40 | def tableName(self): 41 | return os.getenv("TABLE_NAME", setting.TABLE_NAME) 42 | 43 | @property 44 | def fetchers(self): 45 | reload_six(setting) 46 | return setting.PROXY_FETCHER 47 | 48 | @LazyProperty 49 | def httpUrl(self): 50 | return os.getenv("HTTP_URL", setting.HTTP_URL) 51 | 52 | @LazyProperty 53 | def httpsUrl(self): 54 | return os.getenv("HTTPS_URL", setting.HTTPS_URL) 55 | 56 | @LazyProperty 57 | def verifyTimeout(self): 58 | return int(os.getenv("VERIFY_TIMEOUT", setting.VERIFY_TIMEOUT)) 59 | 60 | # @LazyProperty 61 | # def proxyCheckCount(self): 62 | # return int(os.getenv("PROXY_CHECK_COUNT", setting.PROXY_CHECK_COUNT)) 63 | 64 | @LazyProperty 65 | def maxFailCount(self): 66 | return int(os.getenv("MAX_FAIL_COUNT", setting.MAX_FAIL_COUNT)) 67 | 68 | # @LazyProperty 69 | # def maxFailRate(self): 70 | # return int(os.getenv("MAX_FAIL_RATE", setting.MAX_FAIL_RATE)) 71 | 72 | @LazyProperty 73 | def poolSizeMin(self): 74 | return int(os.getenv("POOL_SIZE_MIN", setting.POOL_SIZE_MIN)) 75 | 76 | @LazyProperty 77 | def proxyRegion(self): 78 | return bool(os.getenv("PROXY_REGION", setting.PROXY_REGION)) 79 | 80 | @LazyProperty 81 | def timezone(self): 82 | return os.getenv("TIMEZONE", setting.TIMEZONE) 83 | 84 | 
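# Usage sketch (illustrative, not part of the original module): each option is
# resolved from an environment variable first and falls back to setting.py, and
# LazyProperty caches the value after the first read, so any override must be
# set before that first read.
if __name__ == '__main__':
    os.environ["PORT"] = "5020"   # overrides setting.PORT for this process only
    conf = ConfigHandler()        # Singleton: repeated calls return the same instance
    print(conf.serverPort)        # -> "5020" (environment values arrive as strings)
    print(conf.fetchers)          # PROXY_FETCHER is re-read from setting.py on every access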
-------------------------------------------------------------------------------- /proxy_pool/handler/logHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: LogHandler.py 5 | Description : 日志操作模块 6 | Author : JHao 7 | date: 2017/3/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/03/06: log handler 11 | 2017/09/21: 屏幕输出/文件输出 可选(默认屏幕和文件均输出) 12 | 2020/07/13: Windows下TimedRotatingFileHandler线程不安全, 不再使用 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | import os 18 | import logging 19 | import platform 20 | 21 | from logging.handlers import TimedRotatingFileHandler 22 | 23 | # 日志级别 24 | CRITICAL = 50 25 | FATAL = CRITICAL 26 | ERROR = 40 27 | WARNING = 30 28 | WARN = WARNING 29 | INFO = 20 30 | DEBUG = 10 31 | NOTSET = 0 32 | 33 | CURRENT_PATH = os.path.dirname(os.path.abspath(__file__)) 34 | ROOT_PATH = os.path.join(CURRENT_PATH, os.pardir) 35 | LOG_PATH = os.path.join(ROOT_PATH, 'log') 36 | 37 | if not os.path.exists(LOG_PATH): 38 | try: 39 | os.mkdir(LOG_PATH) 40 | except FileExistsError: 41 | pass 42 | 43 | 44 | class LogHandler(logging.Logger): 45 | """ 46 | LogHandler 47 | """ 48 | 49 | def __init__(self, name, level=DEBUG, stream=True, file=True): 50 | self.name = name 51 | self.level = level 52 | logging.Logger.__init__(self, self.name, level=level) 53 | if stream: 54 | self.__setStreamHandler__() 55 | if file: 56 | if platform.system() != "Windows": 57 | self.__setFileHandler__() 58 | 59 | def __setFileHandler__(self, level=None): 60 | """ 61 | set file handler 62 | :param level: 63 | :return: 64 | """ 65 | file_name = os.path.join(LOG_PATH, '{name}.log'.format(name=self.name)) 66 | # 设置日志回滚, 保存在log目录, 一天保存一个文件, 保留15天 67 | file_handler = TimedRotatingFileHandler(filename=file_name, when='D', interval=1, backupCount=15) 68 | file_handler.suffix = '%Y%m%d.log' 69 | if not level: 70 | file_handler.setLevel(self.level) 71 | else: 72 | file_handler.setLevel(level) 73 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 74 | 75 | file_handler.setFormatter(formatter) 76 | self.file_handler = file_handler 77 | self.addHandler(file_handler) 78 | 79 | def __setStreamHandler__(self, level=None): 80 | """ 81 | set stream handler 82 | :param level: 83 | :return: 84 | """ 85 | stream_handler = logging.StreamHandler() 86 | formatter = logging.Formatter('%(asctime)s %(filename)s[line:%(lineno)d] %(levelname)s %(message)s') 87 | stream_handler.setFormatter(formatter) 88 | if not level: 89 | stream_handler.setLevel(self.level) 90 | else: 91 | stream_handler.setLevel(level) 92 | self.addHandler(stream_handler) 93 | 94 | 95 | if __name__ == '__main__': 96 | log = LogHandler('test') 97 | log.info('this is a test msg') 98 | -------------------------------------------------------------------------------- /proxy_pool/handler/proxyHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: ProxyHandler.py 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/03: 11 | 2020/05/26: 区分http和https 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | from helper.proxy 
import Proxy 17 | from db.dbClient import DbClient 18 | from handler.configHandler import ConfigHandler 19 | 20 | 21 | class ProxyHandler(object): 22 | """ Proxy CRUD operator""" 23 | 24 | def __init__(self): 25 | self.conf = ConfigHandler() 26 | self.db = DbClient(self.conf.dbConn) 27 | self.db.changeTable(self.conf.tableName) 28 | 29 | def get(self, https=False): 30 | """ 31 | return a proxy 32 | Args: 33 | https: True/False 34 | Returns: 35 | """ 36 | proxy = self.db.get(https) 37 | return Proxy.createFromJson(proxy) if proxy else None 38 | 39 | def pop(self, https): 40 | """ 41 | return and delete a useful proxy 42 | :return: 43 | """ 44 | proxy = self.db.pop(https) 45 | if proxy: 46 | return Proxy.createFromJson(proxy) 47 | return None 48 | 49 | def put(self, proxy): 50 | """ 51 | put proxy into use proxy 52 | :return: 53 | """ 54 | self.db.put(proxy) 55 | 56 | def delete(self, proxy): 57 | """ 58 | delete useful proxy 59 | :param proxy: 60 | :return: 61 | """ 62 | return self.db.delete(proxy.proxy) 63 | 64 | def getAll(self, https=False): 65 | """ 66 | get all proxy from pool as Proxy list 67 | :return: 68 | """ 69 | proxies = self.db.getAll(https) 70 | return [Proxy.createFromJson(_) for _ in proxies] 71 | 72 | def exists(self, proxy): 73 | """ 74 | check proxy exists 75 | :param proxy: 76 | :return: 77 | """ 78 | return self.db.exists(proxy.proxy) 79 | 80 | def getCount(self): 81 | """ 82 | return raw_proxy and use_proxy count 83 | :return: 84 | """ 85 | total_use_proxy = self.db.getCount() 86 | return {'count': total_use_proxy} 87 | -------------------------------------------------------------------------------- /proxy_pool/helper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__init__.py -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/check.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/check.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/fetch.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/fetch.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/launcher.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/launcher.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/proxy.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/proxy.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/scheduler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/scheduler.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/__pycache__/validator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/helper/__pycache__/validator.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/helper/check.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: check 5 | Description : 执行代理校验 6 | Author : JHao 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/08/06: 执行代理校验 11 | 2021/05/25: 分别校验http和https 12 | 2022/08/16: 获取代理Region信息 13 | ------------------------------------------------- 14 | """ 15 | __author__ = 'JHao' 16 | 17 | from util.six import Empty 18 | from threading import Thread 19 | from datetime import datetime 20 | from util.webRequest import WebRequest 21 | from handler.logHandler import LogHandler 22 | from helper.validator import ProxyValidator 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | 27 | class DoValidator(object): 28 | """ 执行校验 """ 29 | 30 | conf = ConfigHandler() 31 | 32 | @classmethod 33 | def validator(cls, proxy, work_type): 34 | """ 35 | 校验入口 36 | Args: 37 | proxy: Proxy Object 38 | work_type: raw/use 39 | Returns: 40 | Proxy Object 41 | """ 42 | http_r = cls.httpValidator(proxy) 43 | https_r = False if not http_r else cls.httpsValidator(proxy) 44 | 45 | proxy.check_count += 1 46 | proxy.last_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 47 | proxy.last_status = True if http_r else False 48 | if http_r: 49 | if proxy.fail_count > 0: 50 | proxy.fail_count -= 1 51 | proxy.https = True if https_r else False 52 | if work_type == "raw": 53 | proxy.region = cls.regionGetter(proxy) if cls.conf.proxyRegion else "" 54 | else: 55 | proxy.fail_count += 1 56 | return proxy 57 | 58 | @classmethod 59 | def httpValidator(cls, proxy): 60 | for func in ProxyValidator.http_validator: 61 | if not func(proxy.proxy): 62 | return False 63 | return True 64 | 65 | @classmethod 66 | def httpsValidator(cls, proxy): 67 | for func in ProxyValidator.https_validator: 68 | if not func(proxy.proxy): 69 | return False 70 | return True 71 | 72 | @classmethod 73 | def preValidator(cls, proxy): 74 | for func in ProxyValidator.pre_validator: 75 | if not func(proxy): 76 | return False 77 | return True 78 | 79 | @classmethod 80 | def regionGetter(cls, proxy): 81 | try: 82 | url = 'https://searchplugin.csdn.net/api/v1/ip/get?ip=%s' % proxy.proxy.split(':')[0] 83 | r = WebRequest().get(url=url, retry_time=1, timeout=2).json 84 | return r['data']['address'] 85 | except: 86 | return 'error' 87 | 88 | 89 | class _ThreadChecker(Thread): 90 | """ 多线程检测 """ 91 | 92 | def __init__(self, 
work_type, target_queue, thread_name): 93 | Thread.__init__(self, name=thread_name) 94 | self.work_type = work_type 95 | self.log = LogHandler("checker") 96 | self.proxy_handler = ProxyHandler() 97 | self.target_queue = target_queue 98 | self.conf = ConfigHandler() 99 | 100 | def run(self): 101 | self.log.info("{}ProxyCheck - {}: start".format(self.work_type.title(), self.name)) 102 | while True: 103 | try: 104 | proxy = self.target_queue.get(block=False) 105 | except Empty: 106 | self.log.info("{}ProxyCheck - {}: complete".format(self.work_type.title(), self.name)) 107 | break 108 | proxy = DoValidator.validator(proxy, self.work_type) 109 | if self.work_type == "raw": 110 | self.__ifRaw(proxy) 111 | else: 112 | self.__ifUse(proxy) 113 | self.target_queue.task_done() 114 | 115 | def __ifRaw(self, proxy): 116 | if proxy.last_status: 117 | if self.proxy_handler.exists(proxy): 118 | self.log.info('RawProxyCheck - {}: {} exist'.format(self.name, proxy.proxy.ljust(23))) 119 | else: 120 | self.log.info('RawProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23))) 121 | self.proxy_handler.put(proxy) 122 | else: 123 | self.log.info('RawProxyCheck - {}: {} fail'.format(self.name, proxy.proxy.ljust(23))) 124 | 125 | def __ifUse(self, proxy): 126 | if proxy.last_status: 127 | self.log.info('UseProxyCheck - {}: {} pass'.format(self.name, proxy.proxy.ljust(23))) 128 | self.proxy_handler.put(proxy) 129 | else: 130 | if proxy.fail_count > self.conf.maxFailCount: 131 | self.log.info('UseProxyCheck - {}: {} fail, count {} delete'.format(self.name, 132 | proxy.proxy.ljust(23), 133 | proxy.fail_count)) 134 | self.proxy_handler.delete(proxy) 135 | else: 136 | self.log.info('UseProxyCheck - {}: {} fail, count {} keep'.format(self.name, 137 | proxy.proxy.ljust(23), 138 | proxy.fail_count)) 139 | self.proxy_handler.put(proxy) 140 | 141 | 142 | def Checker(tp, queue): 143 | """ 144 | run Proxy ThreadChecker 145 | :param tp: raw/use 146 | :param queue: Proxy Queue 147 | :return: 148 | """ 149 | thread_list = list() 150 | for index in range(20): 151 | thread_list.append(_ThreadChecker(tp, queue, "thread_%s" % str(index).zfill(2))) 152 | 153 | for thread in thread_list: 154 | thread.setDaemon(True) 155 | thread.start() 156 | 157 | for thread in thread_list: 158 | thread.join() 159 | -------------------------------------------------------------------------------- /proxy_pool/helper/fetch.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: fetchScheduler 5 | Description : 6 | Author : JHao 7 | date: 2019/8/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/11/18: 多线程采集 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from threading import Thread 16 | from helper.proxy import Proxy 17 | from helper.check import DoValidator 18 | from handler.logHandler import LogHandler 19 | from handler.proxyHandler import ProxyHandler 20 | from fetcher.proxyFetcher import ProxyFetcher 21 | from handler.configHandler import ConfigHandler 22 | 23 | 24 | class _ThreadFetcher(Thread): 25 | 26 | def __init__(self, fetch_source, proxy_dict): 27 | Thread.__init__(self) 28 | self.fetch_source = fetch_source 29 | self.proxy_dict = proxy_dict 30 | self.fetcher = getattr(ProxyFetcher, fetch_source, None) 31 | self.log = LogHandler("fetcher") 32 | self.conf = ConfigHandler() 33 | self.proxy_handler = ProxyHandler() 
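    # run() below drains the configured fetcher generator into the shared proxy_dict;
    # the dict is keyed by the stripped "ip:port" string, so a proxy reported by
    # several sources is stored once and its source list is merged via add_source().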
34 | 35 | def run(self): 36 | self.log.info("ProxyFetch - {func}: start".format(func=self.fetch_source)) 37 | try: 38 | for proxy in self.fetcher(): 39 | self.log.info('ProxyFetch - %s: %s ok' % (self.fetch_source, proxy.ljust(23))) 40 | proxy = proxy.strip() 41 | if proxy in self.proxy_dict: 42 | self.proxy_dict[proxy].add_source(self.fetch_source) 43 | else: 44 | self.proxy_dict[proxy] = Proxy( 45 | proxy, source=self.fetch_source) 46 | except Exception as e: 47 | self.log.error("ProxyFetch - {func}: error".format(func=self.fetch_source)) 48 | self.log.error(str(e)) 49 | 50 | 51 | class Fetcher(object): 52 | name = "fetcher" 53 | 54 | def __init__(self): 55 | self.log = LogHandler(self.name) 56 | self.conf = ConfigHandler() 57 | 58 | def run(self): 59 | """ 60 | fetch proxy with proxyFetcher 61 | :return: 62 | """ 63 | proxy_dict = dict() 64 | thread_list = list() 65 | self.log.info("ProxyFetch : start") 66 | 67 | for fetch_source in self.conf.fetchers: 68 | self.log.info("ProxyFetch - {func}: start".format(func=fetch_source)) 69 | fetcher = getattr(ProxyFetcher, fetch_source, None) 70 | if not fetcher: 71 | self.log.error("ProxyFetch - {func}: class method not exists!".format(func=fetch_source)) 72 | continue 73 | if not callable(fetcher): 74 | self.log.error("ProxyFetch - {func}: must be class method".format(func=fetch_source)) 75 | continue 76 | thread_list.append(_ThreadFetcher(fetch_source, proxy_dict)) 77 | 78 | for thread in thread_list: 79 | thread.setDaemon(True) 80 | thread.start() 81 | 82 | for thread in thread_list: 83 | thread.join() 84 | 85 | self.log.info("ProxyFetch - all complete!") 86 | for _ in proxy_dict.values(): 87 | if DoValidator.preValidator(_.proxy): 88 | yield _ 89 | -------------------------------------------------------------------------------- /proxy_pool/helper/launcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: launcher 5 | Description : 启动器 6 | Author : JHao 7 | date: 2021/3/26 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/3/26: 启动器 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | from db.dbClient import DbClient 17 | from handler.logHandler import LogHandler 18 | from handler.configHandler import ConfigHandler 19 | 20 | log = LogHandler('launcher') 21 | 22 | 23 | def startServer(): 24 | __beforeStart() 25 | from api.proxyApi import runFlask 26 | runFlask() 27 | 28 | 29 | def startScheduler(): 30 | __beforeStart() 31 | from helper.scheduler import runScheduler 32 | runScheduler() 33 | 34 | 35 | def __beforeStart(): 36 | __showVersion() 37 | __showConfigure() 38 | if __checkDBConfig(): 39 | log.info('exit!') 40 | sys.exit() 41 | 42 | 43 | def __showVersion(): 44 | from setting import VERSION 45 | log.info("ProxyPool Version: %s" % VERSION) 46 | 47 | 48 | def __showConfigure(): 49 | conf = ConfigHandler() 50 | log.info("ProxyPool configure HOST: %s" % conf.serverHost) 51 | log.info("ProxyPool configure PORT: %s" % conf.serverPort) 52 | log.info("ProxyPool configure PROXY_FETCHER: %s" % conf.fetchers) 53 | 54 | 55 | def __checkDBConfig(): 56 | conf = ConfigHandler() 57 | db = DbClient(conf.dbConn) 58 | log.info("============ DATABASE CONFIGURE ================") 59 | log.info("DB_TYPE: %s" % db.db_type) 60 | log.info("DB_HOST: %s" % db.db_host) 61 | log.info("DB_PORT: %s" % db.db_port) 62 | log.info("DB_NAME: 
%s" % db.db_name) 63 | log.info("DB_USER: %s" % db.db_user) 64 | log.info("=================================================") 65 | return db.test() 66 | -------------------------------------------------------------------------------- /proxy_pool/helper/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: Proxy 5 | Description : 代理对象类型封装 6 | Author : JHao 7 | date: 2019/7/11 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/7/11: 代理对象类型封装 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import json 16 | 17 | 18 | class Proxy(object): 19 | 20 | def __init__(self, proxy, fail_count=0, region="", anonymous="", 21 | source="", check_count=0, last_status="", last_time="", https=False): 22 | self._proxy = proxy 23 | self._fail_count = fail_count 24 | self._region = region 25 | self._anonymous = anonymous 26 | self._source = source.split('/') 27 | self._check_count = check_count 28 | self._last_status = last_status 29 | self._last_time = last_time 30 | self._https = https 31 | 32 | @classmethod 33 | def createFromJson(cls, proxy_json): 34 | _dict = json.loads(proxy_json) 35 | return cls(proxy=_dict.get("proxy", ""), 36 | fail_count=_dict.get("fail_count", 0), 37 | region=_dict.get("region", ""), 38 | anonymous=_dict.get("anonymous", ""), 39 | source=_dict.get("source", ""), 40 | check_count=_dict.get("check_count", 0), 41 | last_status=_dict.get("last_status", ""), 42 | last_time=_dict.get("last_time", ""), 43 | https=_dict.get("https", False) 44 | ) 45 | 46 | @property 47 | def proxy(self): 48 | """ 代理 ip:port """ 49 | return self._proxy 50 | 51 | @property 52 | def fail_count(self): 53 | """ 检测失败次数 """ 54 | return self._fail_count 55 | 56 | @property 57 | def region(self): 58 | """ 地理位置(国家/城市) """ 59 | return self._region 60 | 61 | @property 62 | def anonymous(self): 63 | """ 匿名 """ 64 | return self._anonymous 65 | 66 | @property 67 | def source(self): 68 | """ 代理来源 """ 69 | return '/'.join(self._source) 70 | 71 | @property 72 | def check_count(self): 73 | """ 代理检测次数 """ 74 | return self._check_count 75 | 76 | @property 77 | def last_status(self): 78 | """ 最后一次检测结果 True -> 可用; False -> 不可用""" 79 | return self._last_status 80 | 81 | @property 82 | def last_time(self): 83 | """ 最后一次检测时间 """ 84 | return self._last_time 85 | 86 | @property 87 | def https(self): 88 | """ 是否支持https """ 89 | return self._https 90 | 91 | @property 92 | def to_dict(self): 93 | """ 属性字典 """ 94 | return {"proxy": self.proxy, 95 | "https": self.https, 96 | "fail_count": self.fail_count, 97 | "region": self.region, 98 | "anonymous": self.anonymous, 99 | "source": self.source, 100 | "check_count": self.check_count, 101 | "last_status": self.last_status, 102 | "last_time": self.last_time} 103 | 104 | @property 105 | def to_json(self): 106 | """ 属性json格式 """ 107 | return json.dumps(self.to_dict, ensure_ascii=False) 108 | 109 | @fail_count.setter 110 | def fail_count(self, value): 111 | self._fail_count = value 112 | 113 | @check_count.setter 114 | def check_count(self, value): 115 | self._check_count = value 116 | 117 | @last_status.setter 118 | def last_status(self, value): 119 | self._last_status = value 120 | 121 | @last_time.setter 122 | def last_time(self, value): 123 | self._last_time = value 124 | 125 | @https.setter 126 | def https(self, value): 127 | self._https = value 128 | 129 | 
@region.setter 130 | def region(self, value): 131 | self._region = value 132 | 133 | def add_source(self, source_str): 134 | if source_str: 135 | self._source.append(source_str) 136 | self._source = list(set(self._source)) 137 | -------------------------------------------------------------------------------- /proxy_pool/helper/scheduler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxyScheduler 5 | Description : 6 | Author : JHao 7 | date: 2019/8/5 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/08/05: proxyScheduler 11 | 2021/02/23: runProxyCheck时,剩余代理少于POOL_SIZE_MIN时执行抓取 12 | ------------------------------------------------- 13 | """ 14 | __author__ = 'JHao' 15 | 16 | from apscheduler.schedulers.blocking import BlockingScheduler 17 | from apscheduler.executors.pool import ProcessPoolExecutor 18 | 19 | from util.six import Queue 20 | from helper.fetch import Fetcher 21 | from helper.check import Checker 22 | from handler.logHandler import LogHandler 23 | from handler.proxyHandler import ProxyHandler 24 | from handler.configHandler import ConfigHandler 25 | 26 | 27 | def __runProxyFetch(): 28 | proxy_queue = Queue() 29 | proxy_fetcher = Fetcher() 30 | 31 | for proxy in proxy_fetcher.run(): 32 | proxy_queue.put(proxy) 33 | 34 | Checker("raw", proxy_queue) 35 | 36 | 37 | def __runProxyCheck(): 38 | proxy_handler = ProxyHandler() 39 | proxy_queue = Queue() 40 | if proxy_handler.db.getCount().get("total", 0) < proxy_handler.conf.poolSizeMin: 41 | __runProxyFetch() 42 | for proxy in proxy_handler.getAll(): 43 | proxy_queue.put(proxy) 44 | Checker("use", proxy_queue) 45 | 46 | 47 | def runScheduler(): 48 | __runProxyFetch() 49 | 50 | timezone = ConfigHandler().timezone 51 | scheduler_log = LogHandler("scheduler") 52 | scheduler = BlockingScheduler(logger=scheduler_log, timezone=timezone) 53 | 54 | scheduler.add_job(__runProxyFetch, 'interval', minutes=4, id="proxy_fetch", name="proxy采集") 55 | scheduler.add_job(__runProxyCheck, 'interval', minutes=2, id="proxy_check", name="proxy检查") 56 | executors = { 57 | 'default': {'type': 'threadpool', 'max_workers': 20}, 58 | 'processpool': ProcessPoolExecutor(max_workers=5) 59 | } 60 | job_defaults = { 61 | 'coalesce': False, 62 | 'max_instances': 10 63 | } 64 | 65 | scheduler.configure(executors=executors, job_defaults=job_defaults, timezone=timezone) 66 | 67 | scheduler.start() 68 | 69 | 70 | if __name__ == '__main__': 71 | runScheduler() 72 | -------------------------------------------------------------------------------- /proxy_pool/helper/validator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: _validators 5 | Description : 定义proxy验证方法 6 | Author : JHao 7 | date: 2021/5/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/5/25: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from re import findall 16 | from requests import head 17 | from util.six import withMetaclass 18 | from util.singleton import Singleton 19 | from handler.configHandler import ConfigHandler 20 | 21 | conf = ConfigHandler() 22 | 23 | HEADER = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 24 | 'Accept': '*/*', 25 | 'Connection': 
'keep-alive', 26 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 27 | 28 | 29 | class ProxyValidator(withMetaclass(Singleton)): 30 | pre_validator = [] 31 | http_validator = [] 32 | https_validator = [] 33 | 34 | @classmethod 35 | def addPreValidator(cls, func): 36 | cls.pre_validator.append(func) 37 | return func 38 | 39 | @classmethod 40 | def addHttpValidator(cls, func): 41 | cls.http_validator.append(func) 42 | return func 43 | 44 | @classmethod 45 | def addHttpsValidator(cls, func): 46 | cls.https_validator.append(func) 47 | return func 48 | 49 | 50 | @ProxyValidator.addPreValidator 51 | def formatValidator(proxy): 52 | """检查代理格式""" 53 | verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}" 54 | _proxy = findall(verify_regex, proxy) 55 | return True if len(_proxy) == 1 and _proxy[0] == proxy else False 56 | 57 | 58 | @ProxyValidator.addHttpValidator 59 | def httpTimeOutValidator(proxy): 60 | """ http检测超时 """ 61 | 62 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)} 63 | 64 | try: 65 | r = head(conf.httpUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout) 66 | return True if r.status_code == 200 else False 67 | except Exception as e: 68 | return False 69 | 70 | 71 | @ProxyValidator.addHttpsValidator 72 | def httpsTimeOutValidator(proxy): 73 | """https检测超时""" 74 | 75 | proxies = {"http": "http://{proxy}".format(proxy=proxy), "https": "https://{proxy}".format(proxy=proxy)} 76 | try: 77 | r = head(conf.httpsUrl, headers=HEADER, proxies=proxies, timeout=conf.verifyTimeout, verify=False) 78 | return True if r.status_code == 200 else False 79 | except Exception as e: 80 | return False 81 | 82 | 83 | @ProxyValidator.addHttpValidator 84 | def customValidatorExample(proxy): 85 | """自定义validator函数,校验代理是否可用, 返回True/False""" 86 | return True 87 | -------------------------------------------------------------------------------- /proxy_pool/proxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: proxy_pool 5 | Description : proxy pool 启动入口 6 | Author : JHao 7 | date: 2020/6/19 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/19: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import click 16 | from helper.launcher import startServer, startScheduler 17 | from setting import BANNER, VERSION 18 | 19 | CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help']) 20 | 21 | 22 | @click.group(context_settings=CONTEXT_SETTINGS) 23 | @click.version_option(version=VERSION) 24 | def cli(): 25 | """ProxyPool cli工具""" 26 | 27 | @cli.command(name="schedule") 28 | def schedule(): 29 | """ 启动调度程序 """ 30 | click.echo(BANNER) 31 | startScheduler() 32 | 33 | 34 | @cli.command(name="server") 35 | def server(): 36 | """ 启动api服务 """ 37 | click.echo(BANNER) 38 | startServer() 39 | 40 | 41 | if __name__ == '__main__': 42 | cli() 43 | -------------------------------------------------------------------------------- /proxy_pool/requirements.txt: -------------------------------------------------------------------------------- 1 | APScheduler==3.2.0 2 | werkzeug==0.15.5 3 | Flask==2.2.2 4 | requests==2.20.0 5 | click==7.0 6 | gunicorn==19.9.0 7 | lxml 8 | redis 9 | -------------------------------------------------------------------------------- /proxy_pool/setting.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: setting.py 5 | Description : configuration file 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | 14 | BANNER = r""" 15 | **************************************************************** 16 | *** ______ ********************* ______ *********** _ ******** 17 | *** | ___ \_ ******************** | ___ \ ********* | | ******** 18 | *** | |_/ / \__ __ __ _ __ _ | |_/ /___ * ___ | | ******** 19 | *** | __/| _// _ \ \ \/ /| | | || __// _ \ / _ \ | | ******** 20 | *** | | | | | (_) | > < \ |_| || | | (_) | (_) || |___ **** 21 | *** \_| |_| \___/ /_/\_\ \__ |\_| \___/ \___/ \_____/ **** 22 | **** __ / / ***** 23 | ************************* /___ / ******************************* 24 | ************************* ******************************** 25 | **************************************************************** 26 | """ 27 | 28 | VERSION = "2.4.0" 29 | 30 | # ############### server config ############### 31 | HOST = "0.0.0.0" 32 | 33 | PORT = 5010 34 | 35 | # ############### database config ################### 36 | # db connection uri 37 | # example: 38 | # Redis: redis://:password@ip:port/db 39 | # Ssdb: ssdb://:password@ip:port 40 | DB_CONN = 'redis://:@127.0.0.1:6379/0' 41 | 42 | # proxy table name 43 | TABLE_NAME = 'use_proxy' 44 | 45 | 46 | # ###### config the proxy fetch function ###### 47 | PROXY_FETCHER = [ 48 | "freeProxy01", 49 | "freeProxy02", 50 | "freeProxy03", 51 | "freeProxy04", 52 | "freeProxy05", 53 | "freeProxy06", 54 | "freeProxy07", 55 | "freeProxy08", 56 | "freeProxy09", 57 | "freeProxy10" 58 | ] 59 | 60 | # ############# proxy validator ################# 61 | # target sites used to validate proxies 62 | HTTP_URL = "http://httpbin.org" 63 | 64 | HTTPS_URL = "https://www.qq.com" 65 | 66 | # timeout (seconds) for proxy validation 67 | VERIFY_TIMEOUT = 10 68 | 69 | # maximum number of failures allowed in the last PROXY_CHECK_COUNT checks; proxies exceeding it are removed 70 | MAX_FAIL_COUNT = 0 71 | 72 | # maximum failure rate allowed in the last PROXY_CHECK_COUNT checks; proxies exceeding it are removed 73 | # MAX_FAIL_RATE = 0.1 74 | 75 | # a fetch is triggered during proxyCheck when the pool size drops below POOL_SIZE_MIN 76 | POOL_SIZE_MIN = 20 77 | 78 | # ############# proxy attributes ################# 79 | # whether to enable the proxy region attribute 80 | PROXY_REGION = True 81 | 82 | # ############# scheduler config ################# 83 | 84 | # Force a timezone for the scheduler (optional). 85 | # If this runs on a VM and 86 | # "ValueError: Timezone offset does not match system offset" 87 | # is raised during scheduling, 88 | # set a timezone for the scheduler on the following line. 89 | # Otherwise it will detect the timezone from the system automatically.
90 | 91 | TIMEZONE = "Asia/Shanghai" 92 | -------------------------------------------------------------------------------- /proxy_pool/test/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2019/2/15 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/2/15: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | -------------------------------------------------------------------------------- /proxy_pool/test/testConfigHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testGetConfig 5 | Description : testGetConfig 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from handler.configHandler import ConfigHandler 16 | from time import sleep 17 | 18 | 19 | def testConfig(): 20 | """ 21 | :return: 22 | """ 23 | conf = ConfigHandler() 24 | print(conf.dbConn) 25 | print(conf.serverPort) 26 | print(conf.serverHost) 27 | print(conf.tableName) 28 | assert isinstance(conf.fetchers, list) 29 | print(conf.fetchers) 30 | 31 | for _ in range(2): 32 | print(conf.fetchers) 33 | sleep(5) 34 | 35 | 36 | if __name__ == '__main__': 37 | testConfig() 38 | 39 | -------------------------------------------------------------------------------- /proxy_pool/test/testDbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testDbClient 5 | Description : 6 | Author : JHao 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from db.dbClient import DbClient 16 | 17 | 18 | def testDbClient(): 19 | # ############### ssdb ############### 20 | ssdb_uri = "ssdb://:password@127.0.0.1:8888" 21 | s = DbClient.parseDbConn(ssdb_uri) 22 | assert s.db_type == "SSDB" 23 | assert s.db_pwd == "password" 24 | assert s.db_host == "127.0.0.1" 25 | assert s.db_port == 8888 26 | 27 | # ############### redis ############### 28 | redis_uri = "redis://:password@127.0.0.1:6379/1" 29 | r = DbClient.parseDbConn(redis_uri) 30 | assert r.db_type == "REDIS" 31 | assert r.db_pwd == "password" 32 | assert r.db_host == "127.0.0.1" 33 | assert r.db_port == 6379 34 | assert r.db_name == "1" 35 | print("DbClient ok!") 36 | 37 | 38 | if __name__ == '__main__': 39 | testDbClient() 40 | -------------------------------------------------------------------------------- /proxy_pool/test/testLogHandler.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testLogHandler 5 | Description : 6 | Author : J_hao 7 | date: 2017/8/2 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/8/2: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from handler.logHandler import 
LogHandler 16 | 17 | 18 | def testLogHandler(): 19 | log = LogHandler('test') 20 | log.info('this is info') 21 | log.error('this is error') 22 | 23 | 24 | if __name__ == '__main__': 25 | testLogHandler() 26 | -------------------------------------------------------------------------------- /proxy_pool/test/testProxyClass.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyClass 5 | Description : 6 | Author : JHao 7 | date: 2019/8/8 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2019/8/8: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import json 16 | from helper.proxy import Proxy 17 | 18 | 19 | def testProxyClass(): 20 | proxy = Proxy("127.0.0.1:8080") 21 | 22 | print(proxy.to_json) 23 | 24 | proxy.source = "test" 25 | 26 | proxy_str = json.dumps(proxy.to_dict, ensure_ascii=False) 27 | 28 | print(proxy_str) 29 | 30 | print(Proxy.createFromJson(proxy_str).to_dict) 31 | 32 | 33 | if __name__ == '__main__': 34 | testProxyClass() 35 | -------------------------------------------------------------------------------- /proxy_pool/test/testProxyFetcher.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyFetcher 5 | Description : 6 | Author : JHao 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from fetcher.proxyFetcher import ProxyFetcher 16 | from handler.configHandler import ConfigHandler 17 | 18 | 19 | def testProxyFetcher(): 20 | conf = ConfigHandler() 21 | proxy_getter_functions = conf.fetchers 22 | proxy_counter = {_: 0 for _ in proxy_getter_functions} 23 | for proxyGetter in proxy_getter_functions: 24 | for proxy in getattr(ProxyFetcher, proxyGetter.strip())(): 25 | if proxy: 26 | print('{func}: fetch proxy {proxy}'.format(func=proxyGetter, proxy=proxy)) 27 | proxy_counter[proxyGetter] = proxy_counter.get(proxyGetter) + 1 28 | for key, value in proxy_counter.items(): 29 | print(key, value) 30 | 31 | 32 | if __name__ == '__main__': 33 | testProxyFetcher() 34 | -------------------------------------------------------------------------------- /proxy_pool/test/testProxyValidator.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testProxyValidator 5 | Description : 6 | Author : JHao 7 | date: 2021/5/25 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2021/5/25: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | from helper.validator import ProxyValidator 16 | 17 | 18 | def testProxyValidator(): 19 | for _ in ProxyValidator.pre_validator: 20 | print(_) 21 | for _ in ProxyValidator.http_validator: 22 | print(_) 23 | for _ in ProxyValidator.https_validator: 24 | print(_) 25 | 26 | 27 | if __name__ == '__main__': 28 | testProxyValidator() 29 | -------------------------------------------------------------------------------- /proxy_pool/test/testRedisClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: 
utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testRedisClient 5 | Description : 6 | Author : JHao 7 | date: 2020/6/23 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/6/23: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | def testRedisClient(): 17 | from db.dbClient import DbClient 18 | from helper.proxy import Proxy 19 | 20 | uri = "redis://:pwd@127.0.0.1:6379" 21 | db = DbClient(uri) 22 | db.changeTable("use_proxy") 23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}') 24 | 25 | print("put: ", db.put(proxy)) 26 | 27 | print("get: ", db.get(https=None)) 28 | 29 | print("exists: ", db.exists("27.38.96.101:9797")) 30 | 31 | print("exists: ", db.exists("27.38.96.101:8888")) 32 | 33 | print("pop: ", db.pop(https=None)) 34 | 35 | print("getAll: ", db.getAll(https=None)) 36 | 37 | print("getCount", db.getCount()) 38 | 39 | 40 | if __name__ == '__main__': 41 | testRedisClient() 42 | -------------------------------------------------------------------------------- /proxy_pool/test/testSsdbClient.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: testSsdbClient 5 | Description : 6 | Author : JHao 7 | date: 2020/7/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/7/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | def testSsdbClient(): 17 | from db.dbClient import DbClient 18 | from helper.proxy import Proxy 19 | 20 | uri = "ssdb://@127.0.0.1:8888" 21 | db = DbClient(uri) 22 | db.changeTable("use_proxy") 23 | proxy = Proxy.createFromJson('{"proxy": "118.190.79.36:8090", "https": false, "fail_count": 0, "region": "", "anonymous": "", "source": "freeProxy14", "check_count": 4, "last_status": true, "last_time": "2021-05-26 10:58:04"}') 24 | 25 | print("put: ", db.put(proxy)) 26 | 27 | print("get: ", db.get(https=None)) 28 | 29 | print("exists: ", db.exists("27.38.96.101:9797")) 30 | 31 | print("exists: ", db.exists("27.38.96.101:8888")) 32 | 33 | print("getAll: ", db.getAll(https=None)) 34 | 35 | # print("pop: ", db.pop(https=None)) 36 | 37 | print("clear: ", db.clear()) 38 | 39 | print("getCount", db.getCount()) 40 | 41 | 42 | if __name__ == '__main__': 43 | testSsdbClient() 44 | -------------------------------------------------------------------------------- /proxy_pool/util/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: __init__ 5 | Description : 6 | Author : JHao 7 | date: 2020/7/6 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2020/7/6: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/lazyProperty.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/lazyProperty.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/singleton.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/singleton.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/six.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/six.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/__pycache__/webRequest.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CoderDon/Crawler/94073a4571faab832ac0002c7784783307686a28/proxy_pool/util/__pycache__/webRequest.cpython-38.pyc -------------------------------------------------------------------------------- /proxy_pool/util/lazyProperty.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: lazyProperty 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | class LazyProperty(object): 17 | """ 18 | LazyProperty 19 | explain: http://www.spiderpy.cn/blog/5/ 20 | """ 21 | 22 | def __init__(self, func): 23 | self.func = func 24 | 25 | def __get__(self, instance, owner): 26 | if instance is None: 27 | return self 28 | else: 29 | value = self.func(instance) 30 | setattr(instance, self.func.__name__, value) 31 | return value 32 | -------------------------------------------------------------------------------- /proxy_pool/util/singleton.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: singleton 5 | Description : 6 | Author : JHao 7 | date: 2016/12/3 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2016/12/3: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | 16 | class Singleton(type): 17 | """ 18 | Singleton Metaclass 19 | """ 20 | 21 | _inst = {} 22 | 23 | def __call__(cls, *args, **kwargs): 24 | if cls not in cls._inst: 25 | cls._inst[cls] = super(Singleton, cls).__call__(*args) 26 | return cls._inst[cls] 27 | -------------------------------------------------------------------------------- /proxy_pool/util/six.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: six 5 | Description : 6 | Author : JHao 7 | date: 2020/6/22 8 | 
------------------------------------------------- 9 | Change Activity: 10 | 2020/6/22: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'JHao' 14 | 15 | import sys 16 | 17 | PY2 = sys.version_info[0] == 2 18 | PY3 = sys.version_info[0] == 3 19 | 20 | if PY3: 21 | def iteritems(d, **kw): 22 | return iter(d.items(**kw)) 23 | else: 24 | def iteritems(d, **kw): 25 | return d.iteritems(**kw) 26 | 27 | if PY3: 28 | from urllib.parse import urlparse 29 | else: 30 | from urlparse import urlparse 31 | 32 | if PY3: 33 | from imp import reload as reload_six 34 | else: 35 | reload_six = reload 36 | 37 | if PY3: 38 | from queue import Empty, Queue 39 | else: 40 | from Queue import Empty, Queue 41 | 42 | 43 | def withMetaclass(meta, *bases): 44 | """Create a base class with a metaclass.""" 45 | 46 | # This requires a bit of explanation: the basic idea is to make a dummy 47 | # metaclass for one level of class instantiation that replaces itself with 48 | # the actual metaclass. 49 | class MetaClass(meta): 50 | 51 | def __new__(cls, name, this_bases, d): 52 | return meta(name, bases, d) 53 | 54 | return type.__new__(MetaClass, 'temporary_class', (), {}) 55 | -------------------------------------------------------------------------------- /proxy_pool/util/webRequest.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: WebRequest 5 | Description : Network Requests Class 6 | Author : J_hao 7 | date: 2017/7/31 8 | ------------------------------------------------- 9 | Change Activity: 10 | 2017/7/31: 11 | ------------------------------------------------- 12 | """ 13 | __author__ = 'J_hao' 14 | 15 | from requests.models import Response 16 | from lxml import etree 17 | import requests 18 | import random 19 | import time 20 | 21 | from handler.logHandler import LogHandler 22 | 23 | requests.packages.urllib3.disable_warnings() 24 | 25 | 26 | class WebRequest(object): 27 | name = "web_request" 28 | 29 | def __init__(self, *args, **kwargs): 30 | self.log = LogHandler(self.name, file=False) 31 | self.response = Response() 32 | 33 | @property 34 | def user_agent(self): 35 | """ 36 | return an User-Agent at random 37 | :return: 38 | """ 39 | ua_list = [ 40 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101', 41 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122', 42 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71', 43 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95', 44 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71', 45 | 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)', 46 | 'Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50', 47 | 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0', 48 | ] 49 | return random.choice(ua_list) 50 | 51 | @property 52 | def header(self): 53 | """ 54 | basic header 55 | :return: 56 | """ 57 | return {'User-Agent': self.user_agent, 58 | 'Accept': '*/*', 59 | 'Connection': 'keep-alive', 60 | 'Accept-Language': 'zh-CN,zh;q=0.8'} 61 | 62 | def get(self, url, header=None, retry_time=3, retry_interval=5, timeout=5, *args, **kwargs): 63 | """ 64 | get method 65 | :param url: target url 
66 | :param header: headers 67 | :param retry_time: retry time 68 | :param retry_interval: retry interval 69 | :param timeout: network timeout 70 | :return: 71 | """ 72 | headers = self.header 73 | if header and isinstance(header, dict): 74 | headers.update(header) 75 | while True: 76 | try: 77 | self.response = requests.get(url, headers=headers, timeout=timeout, *args, **kwargs) 78 | return self 79 | except Exception as e: 80 | self.log.error("requests: %s error: %s" % (url, str(e))) 81 | retry_time -= 1 82 | if retry_time <= 0: 83 | self.response = Response() 84 | self.response.status_code = 200 85 | return self 86 | self.log.info("retry after %s seconds" % retry_interval) 87 | time.sleep(retry_interval) 88 | 89 | @property 90 | def tree(self): 91 | return etree.HTML(self.response.content) 92 | 93 | @property 94 | def text(self): 95 | return self.response.text 96 | 97 | @property 98 | def json(self): 99 | try: 100 | return self.response.json() 101 | except Exception as e: 102 | self.log.error(str(e)) 103 | return {} 104 | 105 | --------------------------------------------------------------------------------
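A minimal usage sketch of WebRequest, the helper that the proxy fetchers build on. It assumes the interpreter is started from the proxy_pool directory (so util.webRequest is importable and the packages in requirements.txt are installed); the target URL and the parameter values are illustrative only.

from util.webRequest import WebRequest

# fetch a page with two retries and a 5-second timeout
r = WebRequest().get("http://httpbin.org/ip", retry_time=2, timeout=5)
print(r.json)    # parsed JSON body, or {} if the response is not valid JSON
tree = r.tree    # lxml element tree of the body, used by the XPath-based fetchers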