├── __init__.py ├── crawler_news ├── __init__.py ├── spiders │ ├── __init__.py │ ├── localhost.py │ ├── tvbs.py │ ├── udn.py │ ├── chinatimes.py │ ├── cna.py │ ├── nownews.py │ ├── setn.py │ ├── ebc.py │ ├── EtToday.py │ └── LibertyTimes.py ├── pipelines │ ├── pipelines.py │ ├── jieba.py │ ├── ckiptagger.py │ ├── postgresql.py │ ├── cassandra.py │ ├── line.py │ └── mysql.py ├── items.py ├── extensions │ ├── redis.py │ ├── CassandraDatabase.py │ └── MysqlDatabase.py ├── settings.py.example └── middlewares │ └── middlewares.py ├── log └── .gitignore ├── tmp └── .gitignore ├── requirements.txt ├── run.sh ├── scrapy.cfg ├── app.py ├── LICENSE ├── .circleci └── config.yml ├── .gitignore ├── unittest.py ├── README.md └── .pylintrc /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /crawler_news/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /log/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | !.gitignore 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Scrapy==2.7.1 2 | pylint==2.15.8 3 | redis==4.4.0 4 | requests==2.28.1 5 | -------------------------------------------------------------------------------- /crawler_news/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | # crontab -e 4 | # */5 * * * * /home/bentsou/Project/crawler-news/run.sh 5 | 6 | # set -x 7 | 8 | workdir=/home/bentsou/Project/crawler-news/ 9 | cd $workdir 10 | 11 | . 
$HOME/.profile; 12 | /usr/bin/python $workdir/app.py 13 | 14 | # set +x 15 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = crawler_news.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = crawler_news 12 | -------------------------------------------------------------------------------- /crawler_news/pipelines/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class CrawlerNewsPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /crawler_news/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | class CrawlerNewsItem(scrapy.Item): 9 | url = scrapy.Field() # str 10 | article_from = scrapy.Field() # str 11 | article_type = scrapy.Field() # str 12 | title = scrapy.Field() # str 13 | publish_date = scrapy.Field() # str 14 | authors = scrapy.Field() # list json 15 | tags = scrapy.Field() # list json 16 | text = scrapy.Field() # list json 17 | text_html = scrapy.Field() # str 18 | images = scrapy.Field() # list json 19 | video = scrapy.Field() # list json 20 | links = scrapy.Field() # list json 21 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | from time import sleep 2 | import scrapy 3 | from scrapy.crawler import CrawlerProcess 4 | from scrapy.utils.project import get_project_settings 5 | 6 | from crawler_news.spiders import LibertyTimes 7 | from crawler_news.spiders import ebc 8 | from crawler_news.spiders import udn 9 | from crawler_news.spiders import EtToday 10 | 11 | settings = get_project_settings() 12 | process = CrawlerProcess(settings) 13 | 14 | print('start') 15 | 16 | for spider_name in process.spiders.list(): 17 | if spider_name != 'localhost': 18 | print ("Running spider %s" % (spider_name)) 19 | process.crawl(spider_name) 20 | 21 | 22 | process.start() # the script will block here until all crawling jobs are finished 23 | 24 | print('done') 25 | -------------------------------------------------------------------------------- /crawler_news/spiders/localhost.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # mac shell example 4 | # scrapy crawl ettoday -a page=$(date +"%Y-%m-%d") 5 | 6 | import scrapy 7 | from crawler_news.items import CrawlerNewsItem 8 | 9 | import time 10 | import re 11 | 12 | class LocalhostSpider(scrapy.Spider): 13 | name = 'localhost' 14 | allowed_domains = ['localhost'] 15 | base_url = 'http://localhost' 16 | 17 | custom_settings = { 18 | 
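# ---------------------------------------------------------------------------
# Sketch (assumed workflow, not taken from the project docs): besides running
# every spider through app.py, a single spider can be run from the shell and
# its items exported to JSON for manual checking, e.g.
#
#   scrapy crawl udn -O tmp/udn.json
#
# The -O (overwrite output) flag is available in Scrapy >= 2.1, which matches
# the pinned Scrapy==2.7.1 in requirements.txt.
# ---------------------------------------------------------------------------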
'LOG_FILE': 'log/%s-%s.log' % (name, str(int(time.time()))), 19 | 'LOG_LEVEL': 'DEBUG', 20 | 'DEFAULT_REQUEST_HEADERS': { 21 | 'Accept': '*/*', 22 | 'Referer': 'https://www.nownews.com/', 23 | 'X-Requested-With': 'XMLHttpRequest' 24 | } 25 | } 26 | 27 | def start_requests(self): 28 | yield scrapy.Request(url=self.base_url, callback=self.parse) 29 | 30 | def parse(self, response): 31 | print("[*] OK!") 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 SecondDim 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /crawler_news/pipelines/jieba.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | import jieba.posseg as pseg 11 | import jieba 12 | # import paddle 13 | 14 | # paddle.enable_static() 15 | # jieba.enable_paddle() 16 | 17 | from ckiptagger import data_utils, construct_dictionary, WS, POS, NER 18 | 19 | class JiebaPipeline: 20 | def process_item(self, item, spider): 21 | text = item.get('text') 22 | for t in text: 23 | self._paddle_cut(t) 24 | # print(text) 25 | return item 26 | 27 | def _paddle_cut(self, test_sent): 28 | seg_list = jieba.cut(test_sent,use_paddle=True) 29 | print("Paddle Mode: " + '/'.join(list(seg_list))) 30 | # for word in list(seg_list): 31 | # print('%s' % (word)) 32 | 33 | def _default_cut(self, test_sent): 34 | seg_list = jieba.cut(test_sent, cut_all=False) 35 | print("Default Mode: " + "/ ".join(seg_list)) 36 | 37 | def _full_cut(self, test_sent): 38 | seg_list = jieba.cut(test_sent, cut_all=True) 39 | print("Full Mode: " + "/ ".join(seg_list)) 40 | -------------------------------------------------------------------------------- /crawler_news/pipelines/ckiptagger.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 
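# ---------------------------------------------------------------------------
# Sketch (assumption, not part of this project): the CkiptaggerPipeline below
# only *enqueues* items; the actual CKIP tagging is expected to happen in an
# external worker that pops from the same Redis list. A minimal consumer
# could look like this (host/port/db values are placeholders):
#
#   import json
#   import redis
#
#   r = redis.Redis(host='localhost', port=6379, db=0)
#   while True:
#       _key, raw = r.brpop('ckiptagger_worker_queue')  # blocking pop
#       doc = json.loads(raw)
#       for sentence in doc.get('text', []):
#           pass  # run WS/POS/NER on each sentence and store the result
# ---------------------------------------------------------------------------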
| # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | import json 11 | import time 12 | 13 | daily_sec = 60 * 60 * 24 14 | time_epoch_unit = 5 * 60 15 | 16 | class CkiptaggerPipeline: 17 | def _json_dumps_item(self, item, key): 18 | if item.get(key): 19 | return json.dumps(item.get(key), ensure_ascii=False) 20 | else: 21 | return None 22 | 23 | def process_item(self, item, spider): 24 | spider.logger.info('send to work queue for parse. %s' % item.get('url')) 25 | 26 | time_epoch = int((time.time() - time_epoch_unit) / time_epoch_unit) * time_epoch_unit 27 | 28 | # TODO 時間應該要來自網頁內容 29 | 30 | data_obj = json.dumps( { 31 | 'url': item.get('url'), 32 | 'title': item.get('title'), 33 | 'tags': item.get('tags'), 34 | 'text': item.get('text'), 35 | 'time_epoch': time_epoch 36 | }, ensure_ascii=False ) 37 | 38 | spider.redis_client.lpush('ckiptagger_worker_queue', data_obj) 39 | 40 | # spider.redis_client.set(time_epoch, data_obj, ex=daily_sec) 41 | 42 | return item 43 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build-and-test: 8 | docker: 9 | - image: circleci/python:3.7-stretch 10 | steps: 11 | - checkout 12 | - python/load-cache 13 | - python/install-deps 14 | - python/save-cache 15 | # - run: 16 | # command: python -m pylint crawler_news/spiders/* 17 | # name: lint 18 | - run: 19 | command: | 20 | mv crawler_news/settings.py.example crawler_news/settings.py 21 | python -m scrapy check 22 | name: Test scrapy 23 | 24 | deploy-to-gcp: 25 | docker: 26 | - image: alpine:latest 27 | steps: 28 | - run: 29 | command: | 30 | apk --no-cache add openssh-client bash 31 | echo ${SSH_KEY} | base64 -d > circleci && chmod 400 circleci 32 | ssh -o "StrictHostKeyChecking no" -i circleci ${SSH_HOST} "cd /home/ubuntu/ProjectPM/projectpm-cicd && ./circleci.sh crawler-news master" 33 | name: Deploy on gcloud 34 | 35 | workflows: 36 | build-test-deploy: 37 | jobs: 38 | - build-and-test: 39 | filters: 40 | branches: 41 | only: 42 | - develop 43 | - /feature.*/ 44 | - /hotfix.*/ 45 | - /circleci.*/ 46 | - deploy-to-gcp: 47 | filters: 48 | branches: 49 | only: 50 | - master 51 | -------------------------------------------------------------------------------- /crawler_news/extensions/redis.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | import redis 9 | 10 | class RedisClient(): 11 | @classmethod 12 | def from_crawler(cls, crawler): 13 | # first check if the extension should be enabled and raise 14 | # NotConfigured otherwise 15 | # if not crawler.settings.getbool('MYEXT_ENABLED'): 16 | # raise NotConfigured 17 | 18 | # get the number of items from settings 19 | # item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000) 20 | 21 | # instantiate the extension object 22 | ext = cls() 23 | 24 | # connect the extension object to signals 25 | crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened) 26 | # crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed) 27 | # crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped) 28 | 29 | # return the extension 
object 30 | return ext 31 | 32 | def spider_opened(self, spider): 33 | host = spider.settings.get('REDIS_HOST', 'localhost') 34 | port = spider.settings.get('REDIS_PORT', 6379) 35 | db = spider.settings.get('REDIS_DATABASE', 0) 36 | 37 | spider.logger.info(f"Connect redis host:{host}, port:{port}, db:{db}") 38 | 39 | spider.redis_client = redis.Redis(host, port, db) 40 | -------------------------------------------------------------------------------- /crawler_news/pipelines/postgresql.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | import psycopg2 11 | 12 | # # Connect to your postgres DB 13 | # conn = psycopg2.connect("dbname=test user=postgres") 14 | 15 | # # Open a cursor to perform database operations 16 | # cur = conn.cursor() 17 | 18 | # # Execute a query 19 | # cur.execute("SELECT * FROM my_data") 20 | 21 | # # Retrieve query results 22 | # records = cur.fetchall() 23 | 24 | class PostgresqlPipeline: 25 | def open_spider(self, spider): 26 | settings = spider.settings 27 | print('[pipelines] PostgresqlPipeline open_spider') 28 | # self.db = psycopg2.connect("dbname=crawler_news user=crawler_news") 29 | # self.db = MysqlDatabase(host=settings['MYSQL_HOST'], 30 | # port=settings['MYSQL_PORT'], 31 | # user=settings['MYSQL_USER'], 32 | # password=settings['MYSQL_PASSWORD'], 33 | # db=settings['MYSQL_DB'], 34 | # table=settings['MYSQL_TABLE'], 35 | # charset=settings['MYSQL_CHARSET']) 36 | 37 | def close_spider(self, spider): 38 | # self.db.close() 39 | print('[pipelines] PostgresqlPipeline close_spider') 40 | pass 41 | 42 | def process_item(self, item, spider): 43 | print('[pipelines] PostgresqlPipeline process_item') 44 | pass 45 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # system 107 | .DS_Store 108 | .vscode/ 109 | 110 | crawler_news/settings.py 111 | -------------------------------------------------------------------------------- /crawler_news/pipelines/cassandra.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exceptions import DropItem 9 | from src.CassandraDatabase import CassandraDatabase 10 | 11 | class CassandraPipeline(object): 12 | # TODO set database env in setting.py 13 | # def __init__(self, mongo_uri, mongo_db): 14 | # self.mongo_uri = mongo_uri 15 | # self.mongo_db = mongo_db 16 | 17 | # @classmethod 18 | # def from_crawler(cls, crawler): 19 | # return cls( 20 | # mongo_uri=crawler.settings.get('MONGO_URI'), 21 | # mongo_db=crawler.settings.get('MONGO_DATABASE', 'items') 22 | # ) 23 | 24 | def open_spider(self, spider): 25 | settings = spider.settings 26 | self.db = CassandraDatabase( 27 | keyspace=settings['CASSANDRA_KEYSPAC'], 28 | table=settings['CASSANDRA_TABLE'], 29 | host=settings['CASSANDRA_HOST'] 30 | ) 31 | self.db.create_table() 32 | 33 | def close_spider(self, spider): 34 | self.db.close() 35 | 36 | def process_item(self, item, spider): 37 | if item.get('url'): 38 | news = self.db.fetchOne(item['url']) 39 | if news == None: 40 | # TODO 塞進資料庫前,檢查資料格式 41 | 42 | try: 43 | self.db.insert(dict(item)) 44 | except Exception as e: 45 | spider.logger.error("---------- DB ERROR ----------") 46 | spider.logger.error(e) 47 | spider.logger.error(item) 48 | spider.logger.error("==============================") 49 | else: 50 | # TODO version2 版本判斷 51 | pass 52 | 53 | else: 54 | raise DropItem("Missing item.url in %s" % item) 55 | 56 | return item 57 | -------------------------------------------------------------------------------- /crawler_news/pipelines/line.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import requests 7 | import re 8 | import json 9 | 10 | from scrapy.exceptions import DropItem 11 | 12 | notify_key_words = 
'notify_key_words' 13 | 14 | class LineNotifyPipeline(object): 15 | 16 | def open_spider(self, spider): 17 | self.token = spider.settings.get('LINE_NOTIFY_TOKEN') 18 | 19 | def _re(self, targets, key_words): 20 | if type(targets) is str: 21 | targets = [targets] 22 | 23 | if type(targets) is list: 24 | for target in targets: 25 | for key_word in key_words: 26 | match = re.search(key_word, target) 27 | if match: 28 | return match.group(0) 29 | 30 | return False 31 | 32 | def line_notify_message(self, msg): 33 | headers = { 34 | "Authorization": "Bearer " + self.token, 35 | "Content-Type" : "application/x-www-form-urlencoded" 36 | } 37 | 38 | payload = {'message': msg} 39 | r = requests.post("https://notify-api.line.me/api/notify", headers=headers, params=payload) 40 | 41 | return r.status_code 42 | 43 | async def process_item(self, item, spider): 44 | if not item.get('url'): 45 | raise DropItem("Missing item.url in %s" % item) 46 | 47 | key_words = json.loads(spider.redis_client.get(notify_key_words)) 48 | 49 | conditions = '' 50 | if self._re(item['title'], key_words): 51 | conditions += '標題 包含關鍵字 [%s]\n' % self._re(item['title'], key_words) 52 | if self._re(item['tags'], key_words): 53 | conditions += '標籤 包含關鍵字 [%s]\n' % self._re(item['tags'], key_words) 54 | if self._re(item['text'], key_words): 55 | conditions += '內文 包含關鍵字 [%s]\n' % self._re(item['text'], key_words) 56 | 57 | if conditions != '': 58 | msg = "觸發條件:\n%s\n\n新聞標題:%s\n\n新聞網址:%s" % (conditions, item.get('title'), item.get('url')) 59 | self.line_notify_message(msg) 60 | spider.logger.info('Send line notify message. %s' % item.get('url')) 61 | 62 | return item 63 | -------------------------------------------------------------------------------- /crawler_news/pipelines/mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | import json 9 | from scrapy.exceptions import DropItem 10 | from src.MysqlDatabase import MysqlDatabase 11 | 12 | class MysqlPipeline(object): 13 | 14 | def open_spider(self, spider): 15 | settings = spider.settings 16 | self.db = MysqlDatabase(host=settings['MYSQL_HOST'], 17 | port=settings['MYSQL_PORT'], 18 | user=settings['MYSQL_USER'], 19 | password=settings['MYSQL_PASSWORD'], 20 | db=settings['MYSQL_DB'], 21 | table=settings['MYSQL_TABLE'], 22 | charset=settings['MYSQL_CHARSET']) 23 | 24 | def close_spider(self, spider): 25 | self.db.close() 26 | 27 | def _json_dumps_item(self, item, key): 28 | if item.get(key): 29 | return json.dumps(item.get(key), ensure_ascii=False) 30 | else: 31 | return None 32 | 33 | async def process_item(self, item, spider): 34 | if not item.get('url'): 35 | raise DropItem("Missing item.url in %s" % item) 36 | 37 | if not self.db.news_exist(item['url']): 38 | # TODO 塞進資料庫前,檢查資料格式 39 | 40 | item['authors'] = self._json_dumps_item(item, 'authors') 41 | item['tags'] = self._json_dumps_item(item, 'tags') 42 | item['text'] = self._json_dumps_item(item, 'text') 43 | item['images'] = self._json_dumps_item(item, 'images') 44 | item['video'] = self._json_dumps_item(item, 'video') 45 | item['links'] = self._json_dumps_item(item, 'links') 46 | 47 | try: 48 | self.db.insert(dict(item)) 49 | except Exception as e: 50 | spider.logger.error("---------- DB INSERT ERROR ----------") 51 | spider.logger.error(e) 52 | spider.logger.error(item) 
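# -------------------------------------------------------------------------
# Sketch (assumption, not in the original file): because the list fields are
# stored as JSON strings via _json_dumps_item above, anything reading the
# table back needs to decode them again, e.g.:
#
#   row = self.db.fetch_one(
#       "SELECT * FROM %s.%s WHERE url=%%s" % (self.db.database, self.db.table),
#       (some_url,))  # some_url is a placeholder
#   tags = json.loads(row['tags']) if row and row['tags'] else []
# -------------------------------------------------------------------------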
53 | spider.logger.error("==============================") 54 | else: 55 | # TODO version2 版本判斷 56 | pass 57 | 58 | return item 59 | -------------------------------------------------------------------------------- /unittest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import json 3 | 4 | import sys 5 | 6 | try: 7 | filename = sys.argv[1] 8 | with open('tmp/%s' % filename, 'r') as f: 9 | pass 10 | except FileNotFoundError: 11 | print('[X] file not exist. tmp/filename') 12 | sys.exit(0) 13 | except IndexError: 14 | print('[X] please enter filename.') 15 | sys.exit(0) 16 | 17 | class ResultTestCase(unittest.TestCase): 18 | def setUp(self): 19 | with open('tmp/%s' % filename, 'r') as f: 20 | self.data = json.loads( f.read() ) 21 | 22 | def tearDown(self): 23 | pass 24 | 25 | def test_url(self): 26 | e = [] 27 | for row in self.data: 28 | if not row['url']: 29 | e.append( (row) ) 30 | 31 | self.assertEqual(len(e), 0, e) 32 | 33 | def test_title(self): 34 | e = [] 35 | for row in self.data: 36 | if not row['title']: 37 | e.append( (row['url'], row['title']) ) 38 | 39 | self.assertEqual(len(e), 0, e) 40 | 41 | def test_publish_date(self): 42 | e = [] 43 | for row in self.data: 44 | if not row['publish_date']: 45 | e.append( (row['url'], row['publish_date']) ) 46 | 47 | self.assertEqual(len(e), 0, e) 48 | 49 | def test_authors(self): 50 | e = [] 51 | for row in self.data: 52 | if not row['authors']: 53 | e.append( (row['url'], row['authors']) ) 54 | 55 | self.assertEqual(len(e), 0, e) 56 | 57 | def test_tags(self): 58 | e = [] 59 | for row in self.data: 60 | if not row['tags']: 61 | e.append( (row['url'], row['tags']) ) 62 | 63 | self.assertEqual(len(e), 0, e) 64 | 65 | def test_text(self): 66 | e = [] 67 | for row in self.data: 68 | if not row['text']: 69 | e.append( (row['url'], row['text']) ) 70 | 71 | self.assertEqual(len(e), 0, e) 72 | 73 | def test_text_html(self): 74 | e = [] 75 | for row in self.data: 76 | if not row['text_html']: 77 | e.append( (row['url'], row['text_html']) ) 78 | 79 | self.assertEqual(len(e), 0, e) 80 | 81 | def test_images(self): 82 | e = [] 83 | for row in self.data: 84 | if not row['images']: 85 | e.append( (row['url'], row['images']) ) 86 | 87 | self.assertEqual(len(e), 0, e) 88 | 89 | def test_video(self): 90 | e = [] 91 | for row in self.data: 92 | if not row['video']: 93 | e.append( (row['url'], row['video']) ) 94 | 95 | self.assertEqual(len(e), 0, e) 96 | 97 | def test_links(self): 98 | e = [] 99 | for row in self.data: 100 | if len(row['links']) == 0: 101 | e.append( (row['url'], row['links']) ) 102 | 103 | self.assertEqual(len(e), 0, e) 104 | 105 | if __name__ == '__main__': 106 | unittest.main(argv = [sys.argv[0]]) 107 | 108 | -------------------------------------------------------------------------------- /crawler_news/spiders/tvbs.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | 6 | date_str = str(time.strftime("%F", time.localtime())) 7 | 8 | class TVBSSpider(scrapy.Spider): 9 | name = 'tvbs' 10 | allowed_domains = ['tvbs.com.tw'] 11 | base_url = 'https://news.tvbs.com.tw' 12 | 13 | custom_settings = { 14 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 15 | } 16 | 17 | def start_requests(self): 18 | list_url = '%s/realtime' % (self.base_url) 19 | yield scrapy.Request(url=list_url, callback=self.parse_list) 20 | 21 | def parse_list(self, response): 22 | for page_url 
in response.css('div.news_list>div.list>ul>li>a:first-child::attr(href)').getall(): 23 | page_url = self.base_url + page_url 24 | if not self.redis_client.exists(page_url): 25 | yield scrapy.Request(url=page_url, callback=self.parse_news,cb_kwargs=dict(req_url=page_url)) 26 | 27 | def parse_news(self, response, req_url): 28 | self.logger.info(f"request page: {req_url}") 29 | 30 | item = CrawlerNewsItem() 31 | 32 | item['url'] = req_url 33 | item['article_from'] = self.name 34 | item['article_type'] = 'news' 35 | 36 | item['title'] = self._parse_title(response) 37 | item['publish_date'] = self._parse_publish_date(response) 38 | item['authors'] = self._parse_authors(response) 39 | item['tags'] = self._parse_tags(response) 40 | item['text'] = self._parse_text(response) 41 | item['text_html'] = self._parse_text_html(response) 42 | item['images'] = self._parse_images(response) 43 | item['video'] = self._parse_video(response) 44 | item['links'] = self._parse_links(response) 45 | 46 | return item 47 | 48 | def _parse_title(self, response): 49 | return response.css('h1.title::text').get() 50 | 51 | def _parse_publish_date(self, response): 52 | return response.css('div.author::text').re_first(r'[0-9]+/[0-9]+/[0-9]+ [0-9]+:[0-9]+') 53 | 54 | def _parse_authors(self, response): 55 | return response.css('div.author>a::text').getall() 56 | 57 | def _parse_tags(self, response): 58 | tags = [] 59 | for t in response.css('div.article_keyword>a::text').getall(): 60 | tags.append(t.lstrip('#')) 61 | return tags 62 | 63 | def _parse_text(self, response): 64 | text = [] 65 | for t in response.css('#news_detail_div::text,#news_detail_div>p::text').getall(): 66 | if t.strip() != '': 67 | text.append(t.strip()) 68 | return text 69 | 70 | def _parse_text_html(self, response): 71 | return response.css('#news_detail_div').get() 72 | 73 | def _parse_images(self, response): 74 | return response.css('.article_new').css('img::attr(src)').getall() 75 | 76 | def _parse_video(self, response): 77 | return response.css('.article_new #ytframe iframe::attr(src)').getall() 78 | 79 | def _parse_links(self, response): 80 | return response.css('.article_new').css('a::attr(href)').getall() 81 | -------------------------------------------------------------------------------- /crawler_news/extensions/CassandraDatabase.py: -------------------------------------------------------------------------------- 1 | import cassandra 2 | from cassandra.cluster import Cluster 3 | 4 | class CassandraDatabase(): 5 | def __init__(self, keyspace, table, host=['localhost']): 6 | self.keyspace = keyspace 7 | self.host = host 8 | self.table = table 9 | 10 | self.connect() 11 | 12 | def __del__(self,): 13 | self.cluster.shutdown() 14 | 15 | def connect(self,): 16 | self.cluster = Cluster(self.host) 17 | self.session = self.cluster.connect() 18 | self.create_keyspace() 19 | self.session.set_keyspace(self.keyspace) 20 | 21 | def close(self,): 22 | self.cluster.shutdown() 23 | 24 | def create_keyspace(self): 25 | sql = """ 26 | SELECT keyspace_name 27 | FROM system_schema.keyspaces 28 | WHERE keyspace_name='%s' 29 | """ % self.keyspace 30 | 31 | if self.query(sql).one() == None: 32 | self.query(""" 33 | CREATE KEYSPACE %s 34 | WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '2' } 35 | """ % self.keyspace) 36 | 37 | def create_table(self,): 38 | sql = """ 39 | CREATE TABLE IF NOT EXISTS %s ( 40 | url varchar, article_from varchar, article_type varchar, 41 | title varchar, publish_date varchar, authors list, 42 | tags list, text_ list, 
text_html text, 43 | images list, video list, links list, 44 | PRIMARY KEY(url) 45 | ); 46 | """ % self.table 47 | self.query(sql) 48 | 49 | sql = """ 50 | CREATE INDEX IF NOT EXISTS ON %s(article_from); 51 | """ % self.table 52 | self.query(sql) 53 | 54 | sql = """ 55 | CREATE INDEX IF NOT EXISTS ON %s(article_type); 56 | """ % self.table 57 | self.query(sql) 58 | 59 | def query(self, sql, args=None): 60 | return self.session.execute(sql, args) 61 | 62 | def fetchOne(self, url): 63 | sql = """ 64 | SELECT * 65 | FROM %s 66 | WHERE url='%s' 67 | """ % (self.table, url) 68 | 69 | return self.query(sql).one() 70 | 71 | def insert(self, data={}): 72 | sql = "INSERT INTO %s "% (self.table) 73 | sql = sql + """ 74 | (url, article_from, article_type, 75 | title, publish_date, authors, tags, 76 | text_, text_html, images, video, links) 77 | VALUES 78 | (%(url)s, %(article_from)s, %(article_type)s, 79 | %(title)s, %(publish_date)s, %(authors)s, %(tags)s, 80 | %(text)s, %(text_html)s, %(images)s, %(video)s, %(links)s) 81 | """ 82 | 83 | return self.query(sql, data) 84 | 85 | 86 | def update(self,): 87 | raise 'Method not implemented.' 88 | 89 | def delete(self,): 90 | raise 'Method not implemented.' 91 | -------------------------------------------------------------------------------- /crawler_news/spiders/udn.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | import re 6 | 7 | date_str = str(time.strftime("%F", time.localtime())) 8 | 9 | class UdnSpider(scrapy.Spider): 10 | name = 'udn' 11 | allowed_domains = ['udn.com'] 12 | base_url = 'https://udn.com' 13 | 14 | custom_settings = { 15 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 16 | } 17 | 18 | def start_requests(self): 19 | list_url = '%s/news/breaknews/1/99' % (self.base_url) 20 | yield scrapy.Request(url=list_url, callback=self.parse_list) 21 | 22 | def parse_list(self, response): 23 | page_url_list = response.css('div.story-list__news h2>a::attr(href)').getall() 24 | 25 | self.logger.info(page_url_list) 26 | 27 | for page_url in page_url_list: 28 | page_url = self.base_url+page_url.split('?')[0] 29 | if not self.redis_client.exists(page_url): 30 | yield scrapy.Request(url=page_url, callback=self.parse_news) 31 | 32 | def parse_news(self, response): 33 | req_url = response.request.url 34 | 35 | self.logger.info(f"request page: {req_url}") 36 | 37 | item = CrawlerNewsItem() 38 | 39 | item['url'] = req_url 40 | item['article_from'] = self.name 41 | item['article_type'] = 'news' 42 | 43 | item['title'] = self._parse_title(response) 44 | item['publish_date'] = self._parse_publish_date(response) 45 | item['authors'] = self._parse_authors(response) 46 | item['tags'] = self._parse_tags(response) 47 | item['text'] = self._parse_text(response) 48 | item['text_html'] = self._parse_text_html(response) 49 | item['images'] = self._parse_images(response) 50 | item['video'] = self._parse_video(response) 51 | item['links'] = self._parse_links(response) 52 | 53 | return item 54 | 55 | def _parse_title(self, response): 56 | return response.css('h1::text').get() 57 | 58 | def _parse_publish_date(self, response): 59 | return response.css('time.article-content__time::text').get() 60 | 61 | def _parse_authors(self, response): 62 | return response.css('span.article-content__author a::text').get() 63 | 64 | def _parse_tags(self, response): 65 | return response.css('section.keyword>a::text').getall() 66 | 67 | def _parse_text(self, 
response): 68 | text = [] 69 | for t in response.css('section.article-content__editor>p *::text').getall(): 70 | if t.strip() != '': 71 | text.append(t.strip()) 72 | return text 73 | 74 | def _parse_text_html(self, response): 75 | return response.css('section.article-content__editor').getall() 76 | 77 | def _parse_images(self, response): 78 | return response.css('div#article_body').css('img::attr(src)').getall() 79 | 80 | def _parse_video(self, response): 81 | return response.css('div.video-container>iframe::attr(src)').getall() 82 | 83 | def _parse_links(self, response): 84 | return response.css('div#article_body').css('a::attr(href)').getall() 85 | 86 | -------------------------------------------------------------------------------- /crawler_news/spiders/chinatimes.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | import re 6 | 7 | class ChinatimesSpider(scrapy.Spider): 8 | name = 'chinatimes' 9 | allowed_domains = ['chinatimes.com'] 10 | base_url = 'https://www.chinatimes.com' 11 | 12 | date_str = str(time.strftime("%F", time.localtime())) 13 | 14 | custom_settings = { 15 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 16 | } 17 | 18 | def start_requests(self): 19 | list_url = '%s/realtimenews' % (self.base_url) 20 | yield scrapy.Request(url=list_url, callback=self.parse_list) 21 | 22 | def parse_list(self, response): 23 | for page_url in response.css('section.article-list>ul>li h3.title>a::attr(href)').getall(): 24 | page_url = self.base_url + page_url 25 | if not self.redis_client.exists(page_url): 26 | yield scrapy.Request(url=page_url, callback=self.parse_news) 27 | 28 | def parse_news(self, response): 29 | item = CrawlerNewsItem() 30 | 31 | item['url'] = response.url 32 | item['article_from'] = self.name 33 | item['article_type'] = 'news' 34 | 35 | item['title'] = self._parse_title(response) 36 | item['publish_date'] = self._parse_publish_date(response) 37 | item['authors'] = self._parse_authors(response) 38 | item['tags'] = self._parse_tags(response) 39 | item['text'] = self._parse_text(response) 40 | item['text_html'] = self._parse_text_html(response) 41 | item['images'] = self._parse_images(response) 42 | item['video'] = self._parse_video(response) 43 | item['links'] = self._parse_links(response) 44 | 45 | return item 46 | 47 | def _parse_title(self, response): 48 | return response.css('article.article-box h1.article-title::text').get() 49 | 50 | def _parse_publish_date(self, response): 51 | return response.css('article.article-box time::attr(datetime)').get() 52 | 53 | def _parse_authors(self, response): 54 | authors = response.css('article.article-box div.author>a::text').getall() 55 | if len(authors) == 0: 56 | authors = [response.css('article.article-box div.author::text').get(default='').strip()] 57 | return authors 58 | 59 | def _parse_tags(self, response): 60 | return response.css('article.article-box div.article-hash-tag a::text').getall() 61 | 62 | def _parse_text(self, response): 63 | return response.css('article.article-box div.article-body p::text').getall() 64 | 65 | def _parse_text_html(self, response): 66 | return response.css('article.article-box div.article-body').get() 67 | 68 | def _parse_images(self, response): 69 | images_list = [] 70 | images_list.extend(response.css('article.article-box div.main-figure').css('img::attr(src)').getall()) 71 | images_list.extend(response.css('article.article-box 
div.article-body').css('img::attr(src)').getall()) 72 | return images_list 73 | 74 | def _parse_video(self, response): 75 | return response.css('article.article-box div.article-body iframe::attr(src)').getall() 76 | 77 | def _parse_links(self, response): 78 | return response.css('article.article-box div.article-body').css('a::attr(href)').getall() 79 | -------------------------------------------------------------------------------- /crawler_news/spiders/cna.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | 6 | date_str = str(time.strftime("%F", time.localtime())) 7 | 8 | class CnaSpider(scrapy.Spider): 9 | name = 'cna' 10 | allowed_domains = ['cna.com.tw'] 11 | base_url = 'https://www.cna.com.tw' 12 | 13 | custom_settings = { 14 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 15 | } 16 | 17 | def start_requests(self): 18 | list_url = '%s/list/aall.aspx' % (self.base_url) 19 | yield scrapy.Request(url=list_url, callback=self.parse_list) 20 | 21 | def parse_list(self, response): 22 | for page_url in response.css('#jsMainList>li>a::attr(href)').getall(): 23 | if not self.redis_client.exists(page_url): 24 | yield scrapy.Request(url=page_url, callback=self.parse_news) 25 | 26 | def parse_news(self, response): 27 | req_url = response.request.url 28 | 29 | self.logger.info(f"request page: {req_url}") 30 | 31 | item = CrawlerNewsItem() 32 | 33 | item['url'] = req_url 34 | item['article_from'] = self.name 35 | item['article_type'] = 'news' 36 | 37 | item['title'] = self._parse_title(response) 38 | item['publish_date'] = self._parse_publish_date(response) 39 | item['authors'] = self._parse_authors(response) 40 | item['tags'] = self._parse_tags(response) 41 | item['text'] = self._parse_text(response) 42 | item['text_html'] = self._parse_text_html(response) 43 | item['images'] = self._parse_images(response) 44 | item['video'] = self._parse_video(response) 45 | item['links'] = self._parse_links(response) 46 | 47 | return item 48 | 49 | def _parse_title(self, response): 50 | return response.css('article.article h1 *::text').get() 51 | 52 | def _parse_publish_date(self, response): 53 | return response.css('article.article div.timeBox span::text').get() 54 | 55 | def _parse_authors(self, response): 56 | # inconsistent format 57 | pre_authors = response.css('article.article div.paragraph p::text').re(r'^([^)]*)|([^)]*)[0-9]*$') 58 | return list(map(lambda x: x[1:].split(')')[0], pre_authors)) 59 | 60 | def _parse_tags(self, response): 61 | tags = [] 62 | for t in response.css('.keywordTag a::text').getall(): 63 | tags.append(t.lstrip('#')) 64 | return tags 65 | 66 | def _parse_text(self, response): 67 | ret = [] 68 | for i in range(0,10): 69 | if len(response.css('article.article div.paragraph:nth-of-type(%s) p::text' % i).getall()) != 0: 70 | ret = response.css('article.article div.paragraph:nth-of-type(%s) p::text' % i).getall() 71 | break 72 | return ret 73 | 74 | def _parse_text_html(self, response): 75 | return response.css('article.article div.paragraph').get() 76 | 77 | def _parse_images(self, response): 78 | # parser error with div.fullPic 79 | return response.css('article.article').css('img::attr(src)').getall() 80 | 81 | def _parse_video(self, response): 82 | return response.css('article.article div.media iframe::attr(data-src)').getall() 83 | 84 | def _parse_links(self, response): 85 | return response.css('article.article div.paragraph 
p').css('a::attr(href)').getall() 86 | -------------------------------------------------------------------------------- /crawler_news/spiders/nownews.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | import re 6 | import json 7 | 8 | date_str = str(time.strftime("%F", time.localtime())) 9 | 10 | class NownewsSpider(scrapy.Spider): 11 | name = 'nownews' 12 | allowed_domains = ['nownews.com'] 13 | base_url = 'https://www.nownews.com' 14 | 15 | custom_settings = { 16 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 17 | # 'LOG_FILE': None, 18 | # 'DEFAULT_REQUEST_HEADERS': { 19 | # 'Accept': '*/*', 20 | # 'Referer': 'https://www.nownews.com/', 21 | # } 22 | } 23 | 24 | def start_requests(self): 25 | list_url = '%s/cat/breaking/' % (self.base_url) 26 | yield scrapy.Request(url=list_url, callback=self.parse_list) 27 | 28 | def parse_list(self, response): 29 | page_url_list = response.css('a::attr(href)').getall() 30 | for page_url in page_url_list: 31 | if re.match('https://www.nownews.com/news/*', page_url) and not self.redis_client.exists(page_url): 32 | yield scrapy.Request(url=page_url, callback=self.parse_news) 33 | 34 | def parse_news(self, response): 35 | req_url = response.request.url 36 | 37 | self.logger.info(f"request page: {req_url}") 38 | 39 | item = CrawlerNewsItem() 40 | 41 | item['url'] = req_url 42 | item['article_from'] = self.name 43 | item['article_type'] = 'news' 44 | 45 | item['title'] = self._parse_title(response) 46 | item['publish_date'] = self._parse_publish_date(response) 47 | item['authors'] = self._parse_authors(response) 48 | item['tags'] = self._parse_tags(response) 49 | item['text'] = self._parse_text(response) 50 | item['text_html'] = self._parse_text_html(response) 51 | item['images'] = self._parse_images(response) 52 | item['video'] = self._parse_video(response) 53 | item['links'] = self._parse_links(response) 54 | 55 | return item 56 | 57 | def _parse_title(self, response): 58 | return response.css('h1.article-title::text').get() 59 | 60 | def _parse_publish_date(self, response): 61 | return response.css('time span::text').get() 62 | 63 | def _parse_authors(self, response): 64 | return [response.css('div.infoBlk>div>p::text').get()] 65 | 66 | def _parse_tags(self, response): 67 | return response.css('div.keywordBlk ul.tag li>a::text').getall() 68 | 69 | def _parse_text(self, response): 70 | text = [] 71 | for t in response.css('article[itemprop=articleBody]::text').getall(): 72 | if t.strip() != '': 73 | text.append(t.strip()) 74 | return text 75 | 76 | def _parse_text_html(self, response): 77 | return response.css('article[itemprop=articleBody]').get() 78 | 79 | def _parse_images(self, response): 80 | return response.css('div.containerBlk').css('img::attr(src)').getall() 81 | 82 | def _parse_video(self, response): 83 | # TODO 84 | return response.css('article noscript>iframe::attr(src)').getall() 85 | 86 | def _parse_links(self, response): 87 | # TODO 88 | # links = response.css('article div.td-post-content').css('a::attr(href)').getall() 89 | # return list(filter(lambda x:x if not x == '#' else None , links)) 90 | return [] 91 | -------------------------------------------------------------------------------- /crawler_news/spiders/setn.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | import re 6 | 7 | 
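# ---------------------------------------------------------------------------
# Note (editorial assumption): every spider skips already-seen URLs with
# `if not self.redis_client.exists(page_url)`, but none of the files shown
# here ever writes those keys, so they are presumably marked as seen elsewhere
# (for example by the external worker), along the lines of:
#
#   spider.redis_client.set(item['url'], 1, ex=60 * 60 * 24)  # expire in a day
# ---------------------------------------------------------------------------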
date_str = str(time.strftime("%F", time.localtime())) 8 | 9 | class SetnSpider(scrapy.Spider): 10 | name = 'setn' 11 | allowed_domains = ['setn.com'] 12 | base_url = 'https://www.setn.com' 13 | 14 | custom_settings = { 15 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 16 | } 17 | 18 | def start_requests(self): 19 | list_url = 'https://www.setn.com/ViewAll.aspx?PageGroupID=1' 20 | yield scrapy.Request(url=list_url, callback=self.parse_list) 21 | 22 | def parse_list(self, response): 23 | for page_url in response.css('h3.view-li-title>a.gt ::attr(href)').getall(): 24 | page_url = self.base_url + page_url 25 | if not self.redis_client.exists(page_url): 26 | yield scrapy.Request(url=page_url, callback=self.parse_news) 27 | 28 | def parse_news(self, response): 29 | req_url = response.request.url 30 | 31 | self.logger.info(f"request page: {req_url}") 32 | 33 | item = CrawlerNewsItem() 34 | 35 | item['url'] = req_url 36 | item['article_from'] = self.name 37 | item['article_type'] = 'news' 38 | 39 | item['title'] = self._parse_title(response) 40 | item['publish_date'] = self._parse_publish_date(response) 41 | item['authors'] = self._parse_authors(response) 42 | item['tags'] = self._parse_tags(response) 43 | item['text'] = self._parse_text(response) 44 | item['text_html'] = self._parse_text_html(response) 45 | item['images'] = self._parse_images(response) 46 | item['video'] = self._parse_video(response) 47 | item['links'] = self._parse_links(response) 48 | 49 | return item 50 | 51 | def _parse_title(self, response): 52 | if re.match('https://www.setn.com/e', response.url): 53 | return response.css('h1#newsTitle::text').get() 54 | else: 55 | return response.css('h1.news-title-3::text').get() 56 | 57 | def _parse_publish_date(self, response): 58 | if re.match('https://www.setn.com/e', response.url): 59 | return response.css('div.titleBtnBlock>div.time::text').get() 60 | else: 61 | return response.css('time.page-date::text').get() 62 | 63 | def _parse_authors(self, response): 64 | if re.match('https://www.setn.com/e', response.url): 65 | return [response.css('div.Content2>p::text').get()] 66 | else: 67 | authors = response.css('div#Content1>p::text').get() 68 | 69 | if re.match(r'.+[/].+', authors) == None: 70 | return [response.css('div.page-title-text span::text').get()] 71 | else: 72 | return [authors] 73 | 74 | def _parse_tags(self, response): 75 | return response.css('div.page-keyword-area ul>li>a>strong::text').getall() 76 | 77 | def _parse_text(self, response): 78 | return response.css('article p *::text').getall() 79 | 80 | def _parse_text_html(self, response): 81 | return response.css('article').get() 82 | 83 | def _parse_images(self, response): 84 | return response.css('article').css('img::attr(src)').getall() 85 | 86 | def _parse_video(self, response): 87 | return response.css('article').css('iframe::attr(src)').getall() 88 | 89 | def _parse_links(self, response): 90 | return response.css('article').css('a::attr(href)').getall() 91 | -------------------------------------------------------------------------------- /crawler_news/spiders/ebc.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | import re 6 | 7 | date_str = str(time.strftime("%F", time.localtime())) 8 | 9 | class EBCSpider(scrapy.Spider): 10 | name = 'ebc' 11 | allowed_domains = ['news.ebc.net.tw'] 12 | base_url = 'https://news.ebc.net.tw' 13 | 14 | custom_settings = { 15 | 'LOG_FILE': 'log/%s-%s.log' % 
(name, date_str), 16 | } 17 | 18 | def start_requests(self): 19 | list_url = '%s/realtime' % self.base_url 20 | yield scrapy.Request(url=list_url, callback=self.parse_list) 21 | 22 | def parse_list(self, response): 23 | page_url_list = response.css('div.white-box>a::attr(href)').getall() 24 | 25 | self.logger.info(page_url_list) 26 | 27 | for page_url in page_url_list: 28 | page_url = self.base_url+page_url 29 | if not self.redis_client.exists(page_url): 30 | yield scrapy.Request(url=page_url, callback=self.parse_news) 31 | 32 | def parse_news(self, response): 33 | req_url = response.request.url 34 | 35 | self.logger.info(f"request page: {req_url}") 36 | 37 | item = CrawlerNewsItem() 38 | 39 | item['url'] = req_url 40 | item['article_from'] = self.name 41 | item['article_type'] = 'news' 42 | 43 | item['title'] = self._parse_title(response) 44 | item['publish_date'] = self._parse_publish_date(response) 45 | item['authors'] = self._parse_authors(response) 46 | item['tags'] = self._parse_tags(response) 47 | item['text'] = self._parse_text(response) 48 | item['text_html'] = self._parse_text_html(response) 49 | item['images'] = self._parse_images(response) 50 | item['video'] = self._parse_video(response) 51 | item['links'] = self._parse_links(response) 52 | 53 | return item 54 | 55 | def _parse_title(self, response): 56 | return response.css('div.fncnews-content>h1::text').get() 57 | 58 | def _parse_publish_date(self, response): 59 | pattern=r'(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})' #2019/12/22 13:53 60 | string=response.css('div.info>span.small-gray-text::text').get() 61 | return re.search(pattern,string).group(0) 62 | 63 | def _parse_authors(self, response): 64 | pattern=r'(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})' #2019/12/22 13:53 65 | string=response.css('div.info>span.small-gray-text::text').get() 66 | datetime=re.search(pattern,string).group(0) 67 | return [string.replace(datetime,'').strip()] #去掉日期時間 68 | 69 | def _parse_tags(self, response): 70 | return response.css('div.keyword>a::text').getall() 71 | 72 | def _parse_text(self, response): 73 | text = [] 74 | for t in response.css('content-ad p::text').getall(): 75 | if t.strip() != '': 76 | text.append(t.strip()) 77 | return text 78 | 79 | def _parse_text_html(self, response): 80 | return response.css('content-ad').get() 81 | 82 | def _parse_images(self, response): 83 | allImgList=response.css('content-ad img::attr(src)').getall() 84 | imgURLs=[] 85 | for imgurl in allImgList: 86 | if re.match(r'https://img.news.ebc.net.tw\S+',imgurl): 87 | imgURLs.append(imgurl) 88 | return imgURLs 89 | 90 | def _parse_video(self, response): 91 | fb_video=response.css('content-ad').css('iframe::attr(src)').getall() 92 | youtube=response.css('content-ad').css('div.fb-video::attr(data-href)').getall() 93 | return fb_video+youtube 94 | 95 | def _parse_links(self, response): 96 | return response.css('content-ad').css('a::attr(href)').getall() 97 | -------------------------------------------------------------------------------- /crawler_news/extensions/MysqlDatabase.py: -------------------------------------------------------------------------------- 1 | import pymysql.cursors 2 | 3 | class MysqlDatabase(): 4 | def __init__(self, host, port, user, password, db, table='test', charset='utf8mb4'): 5 | self.host = host 6 | self.port = int(port) 7 | self.user = user 8 | self.password = password 9 | self.database = db 10 | self.table = table 11 | self.charset = charset 12 | self.connect() 13 | 14 | def __del__(self,): 15 | if self.connection.open: 16 | 
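# -----------------------------------------------------------------------
# Sketch (placeholder credentials, not from the project): stand-alone use
# of this helper outside the pipeline looks roughly like:
#
#   db = MysqlDatabase(host='localhost', port=3306, user='crawler',
#                      password='secret', db='crawler_news', table='news')
#   if not db.news_exist('https://example.com/some-article'):
#       db.insert({'url': 'https://example.com/some-article',
#                  'article_from': 'example', 'article_type': 'news',
#                  'title': None, 'publish_date': None, 'authors': None,
#                  'tags': None, 'text': None, 'text_html': None,
#                  'images': None, 'video': None, 'links': None})
#   db.close()
# -----------------------------------------------------------------------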
self.connection.close() 17 | 18 | def connect(self,): 19 | self.connection = pymysql.connect( 20 | host=self.host, port=self.port, user=self.user, password=self.password, 21 | charset=self.charset, cursorclass=pymysql.cursors.DictCursor) 22 | 23 | self.create_db() 24 | self.connection.select_db(self.database) 25 | self.create_table() 26 | 27 | def close(self,): 28 | self.connection.close() 29 | 30 | def query(self, sql, params=()): 31 | with self.connection.cursor() as cursor: 32 | cursor.execute(sql, params) 33 | 34 | def execute(self, sql, params=()): 35 | with self.connection.cursor() as cursor: 36 | cursor.execute(sql, params) 37 | self.connection.commit() 38 | 39 | def fetch_one(self, sql, params=()): 40 | with self.connection.cursor() as cursor: 41 | cursor.execute(sql, params) 42 | result = cursor.fetchone() 43 | 44 | return result 45 | 46 | def create_db(self,): 47 | sql = "SHOW DATABASES LIKE '%s';" % self.database 48 | if self.fetch_one(sql) is None: 49 | sql = "CREATE DATABASE IF NOT EXISTS `%s`;" % self.database 50 | self.query(sql) 51 | 52 | def create_table(self,): 53 | sql = "SHOW TABLES LIKE '%s';" % self.table 54 | if self.fetch_one(sql) is None: 55 | sql = """ 56 | CREATE TABLE IF NOT EXISTS `%s`.`%s` ( 57 | `id` serial NOT NULL AUTO_INCREMENT, 58 | `url` varchar(255) NOT NULL, 59 | `article_from` varchar(255) NOT NULL DEFAULT 'UNKNOWN', 60 | `article_type` varchar(255) DEFAULT NULL, 61 | `title` varchar(255) DEFAULT NULL, 62 | `publish_date` varchar(255) DEFAULT NULL, 63 | `authors` json DEFAULT NULL, 64 | `tags` json DEFAULT NULL, 65 | `text` text DEFAULT NULL, 66 | `text_html` text DEFAULT NULL, 67 | `images` json DEFAULT NULL, 68 | `video` json DEFAULT NULL, 69 | `links` json DEFAULT NULL, 70 | `created_at` datetime NOT NULL DEFAULT NOW(), 71 | `updated_at` datetime NOT NULL DEFAULT NOW(), 72 | `deleted_at` datetime DEFAULT NULL, 73 | PRIMARY KEY (id), 74 | UNIQUE INDEX USING BTREE (url), 75 | INDEX USING BTREE (title), 76 | INDEX USING BTREE (article_from), 77 | INDEX USING BTREE (article_type), 78 | INDEX USING BTREE (created_at), 79 | INDEX USING BTREE (updated_at), 80 | INDEX USING BTREE (deleted_at) 81 | ) ENGINE=InnoDB; 82 | """ 83 | 84 | self.execute(sql % (self.database, self.table)) 85 | 86 | def news_exist(self, url): 87 | sql = "SELECT * FROM %s.%s WHERE url='%s';" 88 | if self.fetch_one(sql % (self.database, self.table, url)) is None: 89 | return False 90 | return True 91 | 92 | def insert(self, data={}): 93 | sql = "INSERT INTO `%s`.`%s` " % (self.database, self.table) 94 | sql = sql + """ 95 | (url, article_from, article_type, 96 | title, publish_date, authors, tags, 97 | text, text_html, images, video, links) 98 | VALUES 99 | (%(url)s, %(article_from)s, %(article_type)s, 100 | %(title)s, %(publish_date)s, %(authors)s, %(tags)s, 101 | %(text)s, %(text_html)s, %(images)s, %(video)s, %(links)s) 102 | """ 103 | 104 | return self.execute(sql, data) 105 | 106 | def update(self,): 107 | raise 'Method not implemented.' 108 | 109 | def delete(self,): 110 | raise 'Method not implemented.' 111 | -------------------------------------------------------------------------------- /crawler_news/settings.py.example: -------------------------------------------------------------------------------- 1 | # Scrapy settings for crawler_news project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'crawler_news' 11 | 12 | SPIDER_MODULES = ['crawler_news.spiders'] 13 | NEWSPIDER_MODULE = 'crawler_news.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | #CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | #DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | #CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | #COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | #TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | DEFAULT_REQUEST_HEADERS = { 41 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 42 | 'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7', 43 | } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | SPIDER_MIDDLEWARES = { 48 | # 'crawler_news.middlewares.CrawlerNewsSpiderMiddleware': 543, 49 | } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | DOWNLOADER_MIDDLEWARES = { 54 | # 'crawler_news.middlewares.CrawlerNewsDownloaderMiddleware': 543, 55 | } 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | EXTENSIONS = { 60 | 'crawler_news.extensions.redis.RedisClient': 10, 61 | # 'crawler_news.extensions.telnet.TelnetConsole': None, 62 | } 63 | 64 | # Configure item pipelines 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 66 | ITEM_PIPELINES = { 67 | 'crawler_news.pipelines.ckiptagger.CkiptaggerPipeline': 100, 68 | # 'crawler_news.pipelines.postgresql.PostgresqlPipeline': 100, 69 | # 'crawler_news.pipelines.jieba.JiebaPipeline': 300, 70 | # 'crawler_news.pipelines.line.LineNotifyPipeline': 100, 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | 
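# (Example sketch, not in the original file) To persist items, enable the
# corresponding pipeline class from crawler_news/pipelines/ in ITEM_PIPELINES
# above, e.g.:
#
#   ITEM_PIPELINES = {
#       'crawler_news.pipelines.mysql.MysqlPipeline': 100,
#       # 'crawler_news.pipelines.cassandra.CassandraPipeline': 100,
#   }
#
# and point the MYSQL_* / CASSANDRA_* values below at a running server.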
#HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 | 
94 | # Set settings whose default value is deprecated to a future-proof value
95 | REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
96 | # TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
97 | 
98 | # LOG_FILE = 'log/scrapy.log'
99 | LOG_LEVEL = 'WARNING' # INFO # DEBUG # WARNING # ERROR # CRITICAL
100 | 
101 | CASSANDRA_HOST = ['localhost'] # localhost
102 | CASSANDRA_KEYSPAC = ''
103 | CASSANDRA_TABLE = ''
104 | 
105 | MYSQL_HOST = 'localhost'
106 | MYSQL_PORT = 3306
107 | MYSQL_USER = ''
108 | MYSQL_PASSWORD = ''
109 | MYSQL_DB = ''
110 | MYSQL_TABLE = ''
111 | MYSQL_CHARSET = ''
112 | 
113 | REDIS_HOST = 'localhost'
114 | REDIS_PORT = 6379
115 | REDIS_DATABASE = 0
116 | 
117 | LINE_NOTIFY_TOKEN = ''
118 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python crawler for news
2 | 
3 | A crawler built with Python Scrapy that fetches real-time news from Taiwanese news websites.
4 | 
5 | 使用 python scrapy 建置抓取台灣新聞網站即時新聞的爬蟲
6 | 
7 | ## TODO LIST
8 | 
9 | - Clean up settings and CI/CD
10 | - Deploy LINE Notify
11 | 
12 | - Deploy with k8s to GKC, or would a plain VM suffice?
13 | - Run spiders concurrently (multi-threading) via a Python script
14 | - Consider providing a Docker install shell script
15 | - Keep fixing bugs
16 | - Implement a one-off full-site crawler (for production)
17 | - Eliminate the remaining TODOs
18 | - Write a database-cleanup crawler
19 | 
20 | ## TODO website
21 | 
22 | List from [Alexa Taiwan ranking](https://www.prlass.com/2992/%E5%8F%B0%E7%81%A3%E7%B6%B2%E8%B7%AF%E6%96%B0%E8%81%9E%E5%AA%92%E9%AB%94%E6%B5%81%E9%87%8F%E6%8E%92%E5%90%8D-2018-01/)
23 | 
24 | [update 2022/3] [Alexa Taiwan ranking](https://www.prlass.com/4941/%E5%8F%B0%E7%81%A3%E5%AA%92%E9%AB%94%E6%8E%92%E5%90%8D2022%E5%B9%B43%E6%9C%88/)
25 | 
26 | `! Alexa has shut down; a replacement ranking source will be picked later`
27 | 
28 | 1. [自由時報](https://www.ltn.com.tw/)
29 |    - [2022/12/30] Updated
30 | 1. [東森新聞](https://news.ebc.net.tw/)
31 |    - [2022/12/30] Updated
32 | 1. [聯合新聞網](https://udn.com/news/index)
33 |    - [2022/12/30] Updated
34 | 1. [今日新聞](https://www.nownews.com/)
35 |    - [2023/01/03] Updated
36 | 1. [ettoday](https://www.ettoday.net/)
37 |    - [2023/01/03] Updated
38 | 1. [NEW] [巴哈姆特電玩資訊站](https://gnn.gamer.com.tw/)
39 |    - TODO
40 | 1. [風傳媒](https://www.storm.mg/)
41 |    - TODO
42 | 1. [Is the company still around?] [蘋果新聞網](https://tw.appledaily.com/home)
43 |    - [2022/12] Not yet checked
44 |    - Requires JavaScript
45 |    - Cookies/sessions cannot be used
46 |    - Article layout is non-standard, e.g. the article timestamp
47 | 1. [中時電子報](https://www.chinatimes.com/)
48 |    - [2023/01/03] Updated
49 | 1. [今周刊](https://www.businesstoday.com.tw/)
50 |    - [2022/12] Not yet checked
51 |    - Maybe needs JavaScript
52 |    - Non-instant news
53 |    - Mostly for business news
54 | 1. [TVBS](https://news.tvbs.com.tw/)
55 |    - [2023/01/04] Updated
56 | 1. [商業週刊](https://www.businessweekly.com.tw/)
57 |    - [2022/12] Not yet checked
58 |    - Non-instant news
59 |    - Mostly for business news
60 | 1. [三立新聞網](https://www.setn.com/)
61 |    - [2023/01/03] Updated
62 | 1. [NEW] [民視新聞](https://www.ftvnews.com.tw/)
63 |    - [2022/12] Not yet checked
64 | 1. [中央通訊社](https://www.cna.com.tw/)
65 |    - [2023/01/04] Updated
66 | 1. [關鍵評論網](https://www.thenewslens.com/)
67 |    - [2022/12] Not yet checked
68 |    - Non-instant news
69 | 
70 | 
71 | ## Crawler steps
72 | 
73 | 1. Request the real-time news lists.
74 | 2. Request each news page found in the step 1 lists.
75 | 3. Parse the HTML and extract the target values. 
[items.py](crawler_news/items.py)
76 |    - url
77 |    - article_from
78 |    - article_type
79 |    - title
80 |    - publish_date
81 |    - authors
82 |    - tags
83 |    - text
84 |    - text_html
85 |    - images
86 |    - video
87 |    - links
88 | 4. Save the item into the database. [pipelines.py](crawler_news/pipelines/pipelines.py)
89 |    - Uses Cassandra by default
90 |    - [TODO][feature] Support MongoDB or MySQL
91 | 5. Done
92 | 
93 | ## Requirements and Installation
94 | 
95 | ### Development Environment
96 | 
97 | - python 3.7.6
98 | - scrapy >= 2.0.0
99 | - Cassandra 3.11.4
100 | - Developed mainly on macOS
101 | 
102 | ### Install Scrapy
103 | 
104 | ```bash
105 | pip install scrapy
106 | # or
107 | pip3 install scrapy
108 | ```
109 | 
110 | ### Install Cassandra Database
111 | 
112 | macOS:
113 | 
114 | ```bash
115 | brew install cassandra
116 | ```
117 | 
118 | Python driver:
119 | 
120 | ```bash
121 | pip install cassandra-driver
122 | # or
123 | pip3 install cassandra-driver
124 | ```
125 | 
126 | Start Cassandra:
127 | 
128 | ```bash
129 | # start in the foreground
130 | cassandra -f
131 | 
132 | # start in the background
133 | ```
134 | 
135 | ### Install MySQL Database
136 | 
137 | macOS:
138 | 
139 | ```bash
140 | brew install mysql
141 | ```
142 | 
143 | Python driver:
144 | 
145 | ```bash
146 | pip install PyMySQL
147 | # or
148 | pip3 install PyMySQL
149 | ```
150 | 
151 | ## Run the Project
152 | 
153 | ### Run all spiders in a local terminal
154 | 
155 | ```bash
156 | ./run.sh
157 | ```
158 | 
159 | ### Run in Docker with docker-compose.yml
160 | 
161 | 1. Build the Docker image
162 | 
163 | ```bash
164 | docker build . -t crawler_news
165 | ```
166 | 
167 | If you want to run the crawler without a database, modify docker/setting.py and rebuild.
168 | 
169 | ```bash
170 | # run without a database (Linux-style command)
171 | docker run --rm -it -v `pwd`/tmp:/src/tmp -v `pwd`/log:/src/log crawler_news
172 | ```
173 | 
174 | If you want to run a single crawler, modify the Dockerfile and rebuild.
175 | 
176 | ```Dockerfile
177 | CMD ["/bin/bash"]
178 | # or run a specific spider
179 | CMD ["scrapy", "crawl", "ettoday"]
180 | ```
181 | 
182 | 1. Run docker-compose
183 | 
184 | ```bash
185 | # start
186 | docker-compose up -d
187 | 
188 | # stop
189 | docker-compose down
190 | ```
191 | 
--------------------------------------------------------------------------------
/crawler_news/middlewares/middlewares.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your spider middleware
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
5 | 
6 | from scrapy import signals, Item
7 | 
8 | # useful for handling different item types with a single interface
9 | from itemadapter import is_item, ItemAdapter
10 | 
11 | import json
12 | 
13 | daily_sec = 60 * 60 * 24
14 | 
15 | class CrawlerNewsSpiderMiddleware:
16 |     # Not all methods need to be defined. If a method is not defined,
17 |     # scrapy acts as if the spider middleware does not modify the
18 |     # passed objects.
19 | 
20 |     @classmethod
21 |     def from_crawler(cls, crawler):
22 |         # print('[middleware] CrawlerNewsSpiderMiddleware from_crawler')
23 | 
24 |         # This method is used by Scrapy to create your spiders.
25 |         s = cls()
26 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
27 |         return s
28 | 
29 |     def process_spider_input(self, response, spider):
30 |         # print('[middleware] CrawlerNewsSpiderMiddleware process_spider_input')
31 | 
32 |         # Called for each response that goes through the spider
33 |         # middleware and into the spider. 
34 | 35 | # Should return None or raise an exception. 36 | return None 37 | 38 | def process_spider_output(self, response, result, spider): 39 | # print('[middleware] CrawlerNewsSpiderMiddleware process_spider_output') 40 | 41 | # Called with the results returned from the Spider, after 42 | # it has processed the response. 43 | 44 | # Must return an iterable of Request, or item objects. 45 | for i in result: 46 | if is_item(i) : 47 | spider.redis_client.set(i.get('url'), json.dumps( dict(i) ), ex=daily_sec) 48 | yield i 49 | 50 | def process_spider_exception(self, response, exception, spider): 51 | # print('[middleware] CrawlerNewsSpiderMiddleware process_spider_exception') 52 | 53 | # Called when a spider or process_spider_input() method 54 | # (from other spider middleware) raises an exception. 55 | 56 | # Should return either None or an iterable of Request or item objects. 57 | pass 58 | 59 | def process_start_requests(self, start_requests, spider): 60 | # print('[middleware] CrawlerNewsSpiderMiddleware process_start_requests') 61 | 62 | # Called with the start requests of the spider, and works 63 | # similarly to the process_spider_output() method, except 64 | # that it doesn’t have a response associated. 65 | 66 | # Must return only requests (not items). 67 | for r in start_requests: 68 | yield r 69 | 70 | def spider_opened(self, spider): 71 | # print('[middleware] CrawlerNewsSpiderMiddleware spider_opened') 72 | 73 | spider.logger.info('Spider opened: %s' % spider.name) 74 | 75 | 76 | class CrawlerNewsDownloaderMiddleware: 77 | # Not all methods need to be defined. If a method is not defined, 78 | # scrapy acts as if the downloader middleware does not modify the 79 | # passed objects. 80 | 81 | @classmethod 82 | def from_crawler(cls, crawler): 83 | # print('[middleware] CrawlerNewsDownloaderMiddleware from_crawler') 84 | 85 | # This method is used by Scrapy to create your spiders. 86 | s = cls() 87 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 88 | return s 89 | 90 | def process_request(self, request, spider): 91 | # print('[middleware] CrawlerNewsDownloaderMiddleware process_request') 92 | 93 | # Called for each request that goes through the downloader 94 | # middleware. 95 | 96 | # Must either: 97 | # - return None: continue processing this request 98 | # - or return a Response object 99 | # - or return a Request object 100 | # - or raise IgnoreRequest: process_exception() methods of 101 | # installed downloader middleware will be called 102 | return None 103 | 104 | def process_response(self, request, response, spider): 105 | # print('[middleware] CrawlerNewsDownloaderMiddleware process_response') 106 | 107 | # Called with the response returned from the downloader. 108 | 109 | # Must either; 110 | # - return a Response object 111 | # - return a Request object 112 | # - or raise IgnoreRequest 113 | return response 114 | 115 | def process_exception(self, request, exception, spider): 116 | # print('[middleware] CrawlerNewsDownloaderMiddleware process_exception') 117 | 118 | # Called when a download handler or a process_request() 119 | # (from other downloader middleware) raises an exception. 
120 | 121 | # Must either: 122 | # - return None: continue processing this exception 123 | # - return a Response object: stops process_exception() chain 124 | # - return a Request object: stops process_exception() chain 125 | pass 126 | 127 | def spider_opened(self, spider): 128 | # print('[middleware] CrawlerNewsDownloaderMiddleware spider_opened') 129 | 130 | spider.logger.info('Spider opened: %s' % spider.name) 131 | -------------------------------------------------------------------------------- /crawler_news/spiders/EtToday.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | from scrapy.exceptions import IgnoreRequest 4 | 5 | import time 6 | import re 7 | 8 | date_str = str(time.strftime("%F", time.localtime())) 9 | 10 | class EtTodaySpider(scrapy.Spider): 11 | name = 'ettoday' 12 | allowed_domains = ['ettoday.net'] 13 | base_url = 'https://www.ettoday.net' 14 | 15 | custom_settings = { 16 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 17 | 18 | # https://speed.ettoday.net/robots.txt 19 | 'ROBOTSTXT_OBEY': False 20 | } 21 | 22 | def start_requests(self): 23 | # TODO check date_page 1.exist 2.formet 3.default 2019-12-19 24 | date_page = getattr(self, 'page', time.strftime('%Y-%m-%d')) 25 | # * raise date_page.re('%Y-%m-%d') 26 | 27 | list_url = '%s/news/news-list.htm' % (self.base_url) 28 | yield scrapy.Request(url=list_url, callback=self.parse_list) 29 | 30 | def parse_list(self, response): 31 | # * raise 404 32 | for page_url in response.css('div.part_list_2>h3>a::attr(href)').getall(): 33 | page_url = self.base_url+page_url 34 | if not self.redis_client.exists(page_url): 35 | yield scrapy.Request(url=page_url,callback=self.parse_news,cb_kwargs=dict(req_url=page_url)) 36 | 37 | def parse_news(self, response, req_url): 38 | self.logger.info(f"request page: {req_url}") 39 | 40 | item = CrawlerNewsItem() 41 | 42 | item['url'] = req_url 43 | item['article_from'] = self.name 44 | item['article_type'] = 'news' 45 | 46 | item['title'] = self._parse_title(response) 47 | item['publish_date'] = self._parse_publish_date(response) 48 | item['authors'] = self._parse_authors(response) 49 | item['tags'] = self._parse_tags(response) 50 | item['text'] = self._parse_text(response) 51 | item['text_html'] = self._parse_text_html(response) 52 | item['images'] = self._parse_images(response) 53 | item['video'] = self._parse_video(response) 54 | item['links'] = self._parse_links(response) 55 | 56 | return item 57 | 58 | def _parse_title(self, response): 59 | if re.match('https://fashion.', response.url): 60 | return response.css('h1.title_article::text').get() 61 | else: 62 | return response.css('h1.title::text').get() 63 | 64 | def _parse_publish_date(self, response): 65 | if re.match('https://pets.', response.url): 66 | return response.css('time.news-time::text').get(default='').strip() 67 | if re.match('https://pets.', response.url): 68 | return response.css('.subject_article h1::text').get(default='').strip() 69 | else: 70 | return response.css('time.date::text').get(default='').strip() 71 | 72 | def _parse_authors(self, response): 73 | authors = response.css('div.story>p *::text') 74 | if authors.re_first(r'(^[^▲▼(\s]*/[^)\s]*)') != None: 75 | return [authors.re_first(r'(^[^▲▼(\s]*/[^)\s]*)')] 76 | elif authors.re_first(r'(^.+\/.+)') != None: 77 | return [authors.re_first(r'(^.+\/.+)')] 78 | 79 | def _parse_tags(self, response): 80 | news_tags = [] 81 | if re.match('https://www.', response.url): 
82 | news_tags = news_tags + response.css('div.part_menu_5>a::text').getall() 83 | news_tags = news_tags + response.css('div.part_tag_1>a::text').getall() 84 | elif re.match('https://star.', response.url): 85 | news_tags = response.css('div.menu_txt_2>a::text').getall() 86 | elif re.match('https://fashion.', response.url): 87 | news_tags = response.css('div.part_keyword>a::text').getall() 88 | elif re.match('https://pets.', response.url) \ 89 | or re.match('https://sports.', response.url)\ 90 | or re.match('https://house.', response.url)\ 91 | or re.match('https://travel.', response.url)\ 92 | or re.match('https://health.', response.url)\ 93 | or re.match('https://speed.', response.url)\ 94 | or re.match('https://discovery.', response.url): 95 | news_tags = response.css('div.tag>a::text').getall() 96 | elif re.match('https://forum.', response.url): 97 | news_tags = response.css('div.part_tag>a::text').getall() 98 | else: 99 | pass 100 | 101 | return news_tags 102 | 103 | def _parse_text(self, response): 104 | text = [] 105 | for t in response.css('div.story[itemprop=articleBody]>p *::text').getall(): 106 | if t.strip() != '': 107 | text.append(t.strip()) 108 | return text 109 | 110 | def _parse_text_html(self, response): 111 | return response.css('div.story[itemprop=articleBody]').get() 112 | 113 | def _parse_images(self, response): 114 | return response.css('div.story').css('img::attr(src)').getall() 115 | 116 | def _parse_video(self, response): 117 | return response.css('div.story iframe::attr(src)').getall() 118 | 119 | def _parse_links(self, response): 120 | return response.css('div.story').css('a::attr(href)').getall() 121 | -------------------------------------------------------------------------------- /crawler_news/spiders/LibertyTimes.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from crawler_news.items import CrawlerNewsItem 3 | 4 | import time 5 | import re 6 | 7 | date_str = str(time.strftime("%F", time.localtime())) 8 | 9 | class LibertyTimesSpider(scrapy.Spider): 10 | name = 'libertytimes' 11 | allowed_domains = ['ltn.com.tw'] 12 | base_url = 'https://news.ltn.com.tw' 13 | 14 | custom_settings = { 15 | 'LOG_FILE': 'log/%s-%s.log' % (name, date_str), 16 | } 17 | 18 | def start_requests(self): 19 | list_url = '%s/list/breakingnews' % self.base_url 20 | yield scrapy.Request(url=list_url, callback=self.parse_list) 21 | 22 | def parse_list(self, response): 23 | page_url_list = response.css('ul.list>li>a.tit::attr(href)').getall() 24 | 25 | self.logger.info(page_url_list) 26 | 27 | for page_url in page_url_list: 28 | if not self.redis_client.exists(page_url): 29 | yield scrapy.Request(url=page_url, callback=self.parse_news) 30 | 31 | def parse_news(self, response): 32 | req_url = response.request.url 33 | 34 | self.logger.info(f"request page: {req_url}") 35 | 36 | item = CrawlerNewsItem() 37 | 38 | item['url'] = req_url 39 | item['article_from'] = self.name 40 | item['article_type'] = 'news' 41 | 42 | item['title'] = self._parse_title(response) 43 | item['publish_date'] = self._parse_publish_date(response) 44 | item['authors'] = self._parse_authors(response) 45 | item['tags'] = self._parse_tags(response) 46 | item['text'] = self._parse_text(response) 47 | item['text_html'] = self._parse_text_html(response) 48 | item['images'] = self._parse_images(response) 49 | item['video'] = self._parse_video(response) 50 | item['links'] = self._parse_links(response) 51 | 52 | return item 53 | 54 | def _parse_title(self, response): 55 | 
return response.css('h1::text').get() 56 | 57 | def _parse_publish_date(self, response): 58 | publish_date = response.css('div.content *::text').re_first(r'[0-9/-]+[\s]+[0-9:]+', default='').strip() 59 | 60 | if re.match('https://news', response.url): 61 | publish_date = response.css('div.whitecon span.time::text').get(default='').strip() 62 | elif re.match('https://sports', response.url): 63 | publish_date = response.css('div.c_time::text').get(default='').strip() 64 | elif re.match('https://istyle', response.url): 65 | publish_date = response.css('div.label-date::text').get(default='').strip() 66 | elif re.match('https://ent', response.url): 67 | publish_date = response.css('div.content div.date::text').get(default='').strip() 68 | elif re.match('https://auto', response.url): 69 | publish_date = response.css('div.con_writer span.h1dt::text').get(default='').strip() 70 | 71 | return publish_date 72 | 73 | def _parse_authors(self, response): 74 | return [response.css('div.content *::text').re_first(r'[\[〔[].+[/].+[]〕\]]',default='')] 75 | 76 | def _parse_tags(self, response): 77 | # no tags 78 | return [] 79 | 80 | def _parse_text(self, response): 81 | text = [] 82 | reStr = '' 83 | if re.match('https://sports', response.url): 84 | reStr = 'div[itemprop="articleBody"] p::text' 85 | elif re.match('https://ent', response.url): 86 | reStr = 'div.news_content p:not([class]) *::text' 87 | else: 88 | reStr = 'div.text>p:not([class]) *::text' 89 | 90 | if re.match('https://health', response.url) or re.match('https://art', response.url): 91 | _text = [] 92 | for v in response.css(reStr).getall(): 93 | if v.strip().startswith('☆') or v.strip().startswith('自由健康網') or v.strip().startswith('自由藝文網'): 94 | continue 95 | _text.append(v.strip()) 96 | else: 97 | _text = response.css(reStr).getall() 98 | 99 | for t in _text: 100 | if t.strip() != '': 101 | text.append(t.strip()) 102 | 103 | return text 104 | 105 | def _parse_text_html(self, response): 106 | if re.match('https://sports', response.url): 107 | return response.css('div.news_p').get() 108 | elif re.match('https://ent', response.url): 109 | return response.css('div.news_content').get() 110 | else: 111 | return response.css('div.text').get() 112 | 113 | def _parse_images(self, response): 114 | if re.match('https://sports', response.url): 115 | return response.css('div.news_p').css('img::attr(src)').getall() 116 | elif re.match('https://ent', response.url): 117 | return response.css('div.news_content').css('img::attr(data-original)').getall() 118 | else: 119 | return response.css('div.text').css('img::attr(src)').getall() 120 | 121 | 122 | def _parse_video(self, response): 123 | if re.match('https://sports', response.url): 124 | return response.css('div.news_p').css('iframe::attr(src)').getall() 125 | elif re.match('https://ent', response.url): 126 | return response.css('div.news_content').css('iframe::attr(src)').getall() 127 | else: 128 | return response.css('div.text').css('iframe::attr(src)').getall() 129 | 130 | def _parse_links(self, response): 131 | if re.match('https://sports', response.url): 132 | return response.css('div.news_p').css('a::attr(href)').getall() 133 | elif re.match('https://ent', response.url): 134 | return response.css('div.news_content').css('a::attr(href)').getall() 135 | else: 136 | return response.css('div.text').css('a::attr(href)').getall() 137 | 138 | -------------------------------------------------------------------------------- /.pylintrc: 
-------------------------------------------------------------------------------- 1 | [MASTER] 2 | 3 | # A comma-separated list of package or module names from where C extensions may 4 | # be loaded. Extensions are loading into the active Python interpreter and may 5 | # run arbitrary code. 6 | extension-pkg-whitelist= 7 | 8 | # Add files or directories to the blacklist. They should be base names, not 9 | # paths. 10 | ignore=CVS 11 | 12 | # Add files or directories matching the regex patterns to the blacklist. The 13 | # regex matches against base names, not paths. 14 | ignore-patterns= 15 | 16 | # Python code to execute, usually for sys.path manipulation such as 17 | # pygtk.require(). 18 | #init-hook= 19 | 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the 21 | # number of processors available to use. 22 | jobs=1 23 | 24 | # Control the amount of potential inferred values when inferring a single 25 | # object. This can help the performance when dealing with large functions or 26 | # complex, nested conditions. 27 | limit-inference-results=100 28 | 29 | # List of plugins (as comma separated values of python module names) to load, 30 | # usually to register additional checkers. 31 | load-plugins= 32 | 33 | # Pickle collected data for later comparisons. 34 | persistent=yes 35 | 36 | # Specify a configuration file. 37 | #rcfile= 38 | 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit 40 | # user-friendly hints instead of false-positive error messages. 41 | suggestion-mode=yes 42 | 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the 44 | # active Python interpreter and may run arbitrary code. 45 | unsafe-load-any-extension=no 46 | 47 | 48 | [MESSAGES CONTROL] 49 | 50 | # Only show warnings with the listed confidence levels. Leave empty to show 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED. 52 | confidence= 53 | 54 | # Disable the message, report, category or checker with the given id(s). You 55 | # can either give multiple identifiers separated by comma (,) or put this 56 | # option multiple times (only on the command line, not in the configuration 57 | # file where it should appear only once). You can also use "--disable=all" to 58 | # disable everything first and then reenable specific checks. For example, if 59 | # you want to run only the similarities checker, you can use "--disable=all 60 | # --enable=similarities". If you want to run only the classes checker, but have 61 | # no Warning level messages displayed, use "--disable=all --enable=classes 62 | # --disable=W". 
63 | disable=print-statement, 64 | parameter-unpacking, 65 | unpacking-in-except, 66 | old-raise-syntax, 67 | backtick, 68 | long-suffix, 69 | old-ne-operator, 70 | old-octal-literal, 71 | import-star-module-level, 72 | non-ascii-bytes-literal, 73 | raw-checker-failed, 74 | bad-inline-option, 75 | locally-disabled, 76 | file-ignored, 77 | suppressed-message, 78 | useless-suppression, 79 | deprecated-pragma, 80 | use-symbolic-message-instead, 81 | apply-builtin, 82 | basestring-builtin, 83 | buffer-builtin, 84 | cmp-builtin, 85 | coerce-builtin, 86 | execfile-builtin, 87 | file-builtin, 88 | long-builtin, 89 | raw_input-builtin, 90 | reduce-builtin, 91 | standarderror-builtin, 92 | unicode-builtin, 93 | xrange-builtin, 94 | coerce-method, 95 | delslice-method, 96 | getslice-method, 97 | setslice-method, 98 | no-absolute-import, 99 | old-division, 100 | dict-iter-method, 101 | dict-view-method, 102 | next-method-called, 103 | metaclass-assignment, 104 | indexing-exception, 105 | raising-string, 106 | reload-builtin, 107 | oct-method, 108 | hex-method, 109 | nonzero-method, 110 | cmp-method, 111 | input-builtin, 112 | round-builtin, 113 | intern-builtin, 114 | unichr-builtin, 115 | map-builtin-not-iterating, 116 | zip-builtin-not-iterating, 117 | range-builtin-not-iterating, 118 | filter-builtin-not-iterating, 119 | using-cmp-argument, 120 | eq-without-hash, 121 | div-method, 122 | idiv-method, 123 | rdiv-method, 124 | exception-message-attribute, 125 | invalid-str-codec, 126 | sys-max-int, 127 | bad-python3-import, 128 | deprecated-string-function, 129 | deprecated-str-translate-call, 130 | deprecated-itertools-function, 131 | deprecated-types-field, 132 | next-method-defined, 133 | dict-items-not-iterating, 134 | dict-keys-not-iterating, 135 | dict-values-not-iterating, 136 | deprecated-operator-function, 137 | deprecated-urllib-function, 138 | xreadlines-attribute, 139 | deprecated-sys-function, 140 | exception-escape, 141 | comprehension-escape 142 | 143 | # Enable the message, report, category or checker with the given id(s). You can 144 | # either give multiple identifier separated by comma (,) or put this option 145 | # multiple time (only on the command line, not in the configuration file where 146 | # it should appear only once). See also the "--disable" option for examples. 147 | enable=c-extension-no-member 148 | 149 | 150 | [REPORTS] 151 | 152 | # Python expression which should return a score less than or equal to 10. You 153 | # have access to the variables 'error', 'warning', 'refactor', and 'convention' 154 | # which contain the number of messages in each category, as well as 'statement' 155 | # which is the total number of statements analyzed. This score is used by the 156 | # global evaluation report (RP0004). 157 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10) 158 | 159 | # Template used to display messages. This is a python new-style format string 160 | # used to format the message information. See doc for all details. 161 | #msg-template= 162 | 163 | # Set the output format. Available formats are text, parseable, colorized, json 164 | # and msvs (visual studio). You can also give a reporter class, e.g. 165 | # mypackage.mymodule.MyReporterClass. 166 | output-format=text 167 | 168 | # Tells whether to display a full report or only the messages. 169 | reports=no 170 | 171 | # Activate the evaluation score. 
172 | score=yes 173 | 174 | 175 | [REFACTORING] 176 | 177 | # Maximum number of nested blocks for function / method body 178 | max-nested-blocks=5 179 | 180 | # Complete name of functions that never returns. When checking for 181 | # inconsistent-return-statements if a never returning function is called then 182 | # it will be considered as an explicit return statement and no message will be 183 | # printed. 184 | never-returning-functions=sys.exit 185 | 186 | 187 | [LOGGING] 188 | 189 | # Format style used to check logging format string. `old` means using % 190 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings. 191 | logging-format-style=old 192 | 193 | # Logging modules to check that the string format arguments are in logging 194 | # function parameter format. 195 | logging-modules=logging 196 | 197 | 198 | [SPELLING] 199 | 200 | # Limits count of emitted suggestions for spelling mistakes. 201 | max-spelling-suggestions=4 202 | 203 | # Spelling dictionary name. Available dictionaries: none. To make it work, 204 | # install the python-enchant package. 205 | spelling-dict= 206 | 207 | # List of comma separated words that should not be checked. 208 | spelling-ignore-words= 209 | 210 | # A path to a file that contains the private dictionary; one word per line. 211 | spelling-private-dict-file= 212 | 213 | # Tells whether to store unknown words to the private dictionary (see the 214 | # --spelling-private-dict-file option) instead of raising a message. 215 | spelling-store-unknown-words=no 216 | 217 | 218 | [MISCELLANEOUS] 219 | 220 | # List of note tags to take in consideration, separated by a comma. 221 | notes=FIXME, 222 | XXX, 223 | TODO 224 | 225 | 226 | [TYPECHECK] 227 | 228 | # List of decorators that produce context managers, such as 229 | # contextlib.contextmanager. Add to this list to register other decorators that 230 | # produce valid context managers. 231 | contextmanager-decorators=contextlib.contextmanager 232 | 233 | # List of members which are set dynamically and missed by pylint inference 234 | # system, and so shouldn't trigger E1101 when accessed. Python regular 235 | # expressions are accepted. 236 | generated-members= 237 | 238 | # Tells whether missing members accessed in mixin class should be ignored. A 239 | # mixin class is detected if its name ends with "mixin" (case insensitive). 240 | ignore-mixin-members=yes 241 | 242 | # Tells whether to warn about missing members when the owner of the attribute 243 | # is inferred to be None. 244 | ignore-none=yes 245 | 246 | # This flag controls whether pylint should warn about no-member and similar 247 | # checks whenever an opaque object is returned when inferring. The inference 248 | # can return multiple potential results while evaluating a Python object, but 249 | # some branches might not be evaluated, which results in partial inference. In 250 | # that case, it might be useful to still emit no-member and other checks for 251 | # the rest of the inferred objects. 252 | ignore-on-opaque-inference=yes 253 | 254 | # List of class names for which member attributes should not be checked (useful 255 | # for classes with dynamically set attributes). This supports the use of 256 | # qualified names. 
257 | ignored-classes=optparse.Values,thread._local,_thread._local 258 | 259 | # List of module names for which member attributes should not be checked 260 | # (useful for modules/projects where namespaces are manipulated during runtime 261 | # and thus existing member attributes cannot be deduced by static analysis). It 262 | # supports qualified module names, as well as Unix pattern matching. 263 | ignored-modules= 264 | 265 | # Show a hint with possible names when a member name was not found. The aspect 266 | # of finding the hint is based on edit distance. 267 | missing-member-hint=yes 268 | 269 | # The minimum edit distance a name should have in order to be considered a 270 | # similar match for a missing member name. 271 | missing-member-hint-distance=1 272 | 273 | # The total number of similar names that should be taken in consideration when 274 | # showing a hint for a missing member. 275 | missing-member-max-choices=1 276 | 277 | # List of decorators that change the signature of a decorated function. 278 | signature-mutators= 279 | 280 | 281 | [VARIABLES] 282 | 283 | # List of additional names supposed to be defined in builtins. Remember that 284 | # you should avoid defining new builtins when possible. 285 | additional-builtins= 286 | 287 | # Tells whether unused global variables should be treated as a violation. 288 | allow-global-unused-variables=yes 289 | 290 | # List of strings which can identify a callback function by name. A callback 291 | # name must start or end with one of those strings. 292 | callbacks=cb_, 293 | _cb 294 | 295 | # A regular expression matching the name of dummy variables (i.e. expected to 296 | # not be used). 297 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_ 298 | 299 | # Argument names that match this expression will be ignored. Default to name 300 | # with leading underscore. 301 | ignored-argument-names=_.*|^ignored_|^unused_ 302 | 303 | # Tells whether we should check for unused import in __init__ files. 304 | init-import=no 305 | 306 | # List of qualified module names which can have objects that can redefine 307 | # builtins. 308 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io 309 | 310 | 311 | [FORMAT] 312 | 313 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF. 314 | expected-line-ending-format= 315 | 316 | # Regexp for a line that is allowed to be longer than the limit. 317 | ignore-long-lines=^\s*(# )??$ 318 | 319 | # Number of spaces of indent required inside a hanging or continued line. 320 | indent-after-paren=4 321 | 322 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1 323 | # tab). 324 | indent-string=' ' 325 | 326 | # Maximum number of characters on a single line. 327 | max-line-length=100 328 | 329 | # Maximum number of lines in a module. 330 | max-module-lines=1000 331 | 332 | # List of optional constructs for which whitespace checking is disabled. `dict- 333 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}. 334 | # `trailing-comma` allows a space between comma and closing bracket: (a, ). 335 | # `empty-line` allows space-only lines. 336 | no-space-check=trailing-comma, 337 | dict-separator 338 | 339 | # Allow the body of a class to be on the same line as the declaration if body 340 | # contains single statement. 341 | single-line-class-stmt=no 342 | 343 | # Allow the body of an if to be on the same line as the test if there is no 344 | # else. 
345 | single-line-if-stmt=no 346 | 347 | 348 | [SIMILARITIES] 349 | 350 | # Ignore comments when computing similarities. 351 | ignore-comments=yes 352 | 353 | # Ignore docstrings when computing similarities. 354 | ignore-docstrings=yes 355 | 356 | # Ignore imports when computing similarities. 357 | ignore-imports=no 358 | 359 | # Minimum lines number of a similarity. 360 | min-similarity-lines=4 361 | 362 | 363 | [BASIC] 364 | 365 | # Naming style matching correct argument names. 366 | argument-naming-style=snake_case 367 | 368 | # Regular expression matching correct argument names. Overrides argument- 369 | # naming-style. 370 | #argument-rgx= 371 | 372 | # Naming style matching correct attribute names. 373 | attr-naming-style=snake_case 374 | 375 | # Regular expression matching correct attribute names. Overrides attr-naming- 376 | # style. 377 | #attr-rgx= 378 | 379 | # Bad variable names which should always be refused, separated by a comma. 380 | bad-names=foo, 381 | bar, 382 | baz, 383 | toto, 384 | tutu, 385 | tata 386 | 387 | # Naming style matching correct class attribute names. 388 | class-attribute-naming-style=any 389 | 390 | # Regular expression matching correct class attribute names. Overrides class- 391 | # attribute-naming-style. 392 | #class-attribute-rgx= 393 | 394 | # Naming style matching correct class names. 395 | class-naming-style=PascalCase 396 | 397 | # Regular expression matching correct class names. Overrides class-naming- 398 | # style. 399 | #class-rgx= 400 | 401 | # Naming style matching correct constant names. 402 | const-naming-style=UPPER_CASE 403 | 404 | # Regular expression matching correct constant names. Overrides const-naming- 405 | # style. 406 | #const-rgx= 407 | 408 | # Minimum line length for functions/classes that require docstrings, shorter 409 | # ones are exempt. 410 | docstring-min-length=-1 411 | 412 | # Naming style matching correct function names. 413 | function-naming-style=snake_case 414 | 415 | # Regular expression matching correct function names. Overrides function- 416 | # naming-style. 417 | #function-rgx= 418 | 419 | # Good variable names which should always be accepted, separated by a comma. 420 | good-names=i, 421 | j, 422 | k, 423 | ex, 424 | Run, 425 | _ 426 | 427 | # Include a hint for the correct naming format with invalid-name. 428 | include-naming-hint=no 429 | 430 | # Naming style matching correct inline iteration names. 431 | inlinevar-naming-style=any 432 | 433 | # Regular expression matching correct inline iteration names. Overrides 434 | # inlinevar-naming-style. 435 | #inlinevar-rgx= 436 | 437 | # Naming style matching correct method names. 438 | method-naming-style=snake_case 439 | 440 | # Regular expression matching correct method names. Overrides method-naming- 441 | # style. 442 | #method-rgx= 443 | 444 | # Naming style matching correct module names. 445 | module-naming-style=snake_case 446 | 447 | # Regular expression matching correct module names. Overrides module-naming- 448 | # style. 449 | #module-rgx= 450 | 451 | # Colon-delimited sets of names that determine each other's naming style when 452 | # the name regexes allow several styles. 453 | name-group= 454 | 455 | # Regular expression which should only match function or class names that do 456 | # not require a docstring. 457 | no-docstring-rgx=^_ 458 | 459 | # List of decorators that produce properties, such as abc.abstractproperty. Add 460 | # to this list to register other decorators that produce valid properties. 
461 | # These decorators are taken in consideration only for invalid-name. 462 | property-classes=abc.abstractproperty 463 | 464 | # Naming style matching correct variable names. 465 | variable-naming-style=snake_case 466 | 467 | # Regular expression matching correct variable names. Overrides variable- 468 | # naming-style. 469 | #variable-rgx= 470 | 471 | 472 | [STRING] 473 | 474 | # This flag controls whether the implicit-str-concat-in-sequence should 475 | # generate a warning on implicit string concatenation in sequences defined over 476 | # several lines. 477 | check-str-concat-over-line-jumps=no 478 | 479 | 480 | [IMPORTS] 481 | 482 | # List of modules that can be imported at any level, not just the top level 483 | # one. 484 | allow-any-import-level= 485 | 486 | # Allow wildcard imports from modules that define __all__. 487 | allow-wildcard-with-all=no 488 | 489 | # Analyse import fallback blocks. This can be used to support both Python 2 and 490 | # 3 compatible code, which means that the block might have code that exists 491 | # only in one or another interpreter, leading to false positives when analysed. 492 | analyse-fallback-blocks=no 493 | 494 | # Deprecated modules which should not be used, separated by a comma. 495 | deprecated-modules=optparse,tkinter.tix 496 | 497 | # Create a graph of external dependencies in the given file (report RP0402 must 498 | # not be disabled). 499 | ext-import-graph= 500 | 501 | # Create a graph of every (i.e. internal and external) dependencies in the 502 | # given file (report RP0402 must not be disabled). 503 | import-graph= 504 | 505 | # Create a graph of internal dependencies in the given file (report RP0402 must 506 | # not be disabled). 507 | int-import-graph= 508 | 509 | # Force import order to recognize a module as part of the standard 510 | # compatibility libraries. 511 | known-standard-library= 512 | 513 | # Force import order to recognize a module as part of a third party library. 514 | known-third-party=enchant 515 | 516 | # Couples of modules and preferred modules, separated by a comma. 517 | preferred-modules= 518 | 519 | 520 | [CLASSES] 521 | 522 | # List of method names used to declare (i.e. assign) instance attributes. 523 | defining-attr-methods=__init__, 524 | __new__, 525 | setUp, 526 | __post_init__ 527 | 528 | # List of member names, which should be excluded from the protected access 529 | # warning. 530 | exclude-protected=_asdict, 531 | _fields, 532 | _replace, 533 | _source, 534 | _make 535 | 536 | # List of valid names for the first argument in a class method. 537 | valid-classmethod-first-arg=cls 538 | 539 | # List of valid names for the first argument in a metaclass class method. 540 | valid-metaclass-classmethod-first-arg=cls 541 | 542 | 543 | [DESIGN] 544 | 545 | # Maximum number of arguments for function / method. 546 | max-args=5 547 | 548 | # Maximum number of attributes for a class (see R0902). 549 | max-attributes=7 550 | 551 | # Maximum number of boolean expressions in an if statement (see R0916). 552 | max-bool-expr=5 553 | 554 | # Maximum number of branch for function / method body. 555 | max-branches=12 556 | 557 | # Maximum number of locals for function / method body. 558 | max-locals=15 559 | 560 | # Maximum number of parents for a class (see R0901). 561 | max-parents=7 562 | 563 | # Maximum number of public methods for a class (see R0904). 564 | max-public-methods=20 565 | 566 | # Maximum number of return / yield for function / method body. 
567 | max-returns=6 568 | 569 | # Maximum number of statements in function / method body. 570 | max-statements=50 571 | 572 | # Minimum number of public methods for a class (see R0903). 573 | min-public-methods=2 574 | 575 | 576 | [EXCEPTIONS] 577 | 578 | # Exceptions that will emit a warning when being caught. Defaults to 579 | # "BaseException, Exception". 580 | overgeneral-exceptions=BaseException, 581 | Exception 582 | --------------------------------------------------------------------------------
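A minimal sketch of how this lint configuration might be exercised locally, assuming the pinned tooling from requirements.txt is installed and the command is run from the repository root where the `crawler_news` package lives:

```bash
# Lint the crawler_news package with the repository's .pylintrc
pip install -r requirements.txt
pylint --rcfile=.pylintrc crawler_news
```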