├── __init__.py
├── crawler_news
    ├── __init__.py
    ├── spiders
    │   ├── __init__.py
    │   ├── localhost.py
    │   ├── tvbs.py
    │   ├── udn.py
    │   ├── chinatimes.py
    │   ├── cna.py
    │   ├── nownews.py
    │   ├── setn.py
    │   ├── ebc.py
    │   ├── EtToday.py
    │   └── LibertyTimes.py
    ├── pipelines
    │   ├── pipelines.py
    │   ├── jieba.py
    │   ├── ckiptagger.py
    │   ├── postgresql.py
    │   ├── cassandra.py
    │   ├── line.py
    │   └── mysql.py
    ├── items.py
    ├── extensions
    │   ├── redis.py
    │   ├── CassandraDatabase.py
    │   └── MysqlDatabase.py
    ├── settings.py.example
    └── middlewares
    │   └── middlewares.py
├── log
    └── .gitignore
├── tmp
    └── .gitignore
├── requirements.txt
├── run.sh
├── scrapy.cfg
├── app.py
├── LICENSE
├── .circleci
    └── config.yml
├── .gitignore
├── unittest.py
├── README.md
└── .pylintrc


/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/crawler_news/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/log/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/tmp/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !.gitignore
3 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Scrapy==2.7.1
2 | pylint==2.15.8
3 | redis==4.4.0
4 | requests==2.28.1
5 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
 1 | #! /bin/bash
 2 | 
 3 | # crontab -e
 4 | # */5 * * * * /home/bentsou/Project/crawler-news/run.sh
 5 | 
 6 | # set -x
 7 | 
 8 | workdir=/home/bentsou/Project/crawler-news/
 9 | cd $workdir
10 | 
11 | . $HOME/.profile;
12 | /usr/bin/python $workdir/app.py
13 | 
14 | # set +x
15 | 


--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
 1 | # Automatically created by: scrapy startproject
 2 | #
 3 | # For more information about the [deploy] section see:
 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
 5 | 
 6 | [settings]
 7 | default = crawler_news.settings
 8 | 
 9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = crawler_news
12 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/pipelines.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | 
 7 | # useful for handling different item types with a single interface
 8 | from itemadapter import ItemAdapter
 9 | 
10 | 
11 | class CrawlerNewsPipeline:
12 |     def process_item(self, item, spider):
13 |         return item
14 | 


--------------------------------------------------------------------------------
/crawler_news/items.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your scraped items
 2 | #
 3 | # See documentation in:
 4 | # https://docs.scrapy.org/en/latest/topics/items.html
 5 | 
 6 | import scrapy
 7 | 
 8 | class CrawlerNewsItem(scrapy.Item):
 9 |     url = scrapy.Field() # str
10 |     article_from = scrapy.Field() # str
11 |     article_type = scrapy.Field() # str
12 |     title = scrapy.Field() # str
13 |     publish_date = scrapy.Field() # str
14 |     authors = scrapy.Field() # list json
15 |     tags = scrapy.Field() # list json
16 |     text = scrapy.Field() # list json
17 |     text_html = scrapy.Field() # str
18 |     images = scrapy.Field() # list json
19 |     video = scrapy.Field() # list json
20 |     links = scrapy.Field() # list json
21 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
 1 | from time import sleep
 2 | import scrapy
 3 | from scrapy.crawler import CrawlerProcess
 4 | from scrapy.utils.project import get_project_settings
 5 | 
 6 | from crawler_news.spiders import LibertyTimes
 7 | from crawler_news.spiders import ebc
 8 | from crawler_news.spiders import udn
 9 | from crawler_news.spiders import EtToday
10 | 
11 | settings = get_project_settings()
12 | process = CrawlerProcess(settings)
13 | 
14 | print('start')
15 | 
16 | for spider_name in process.spiders.list():
17 |     if spider_name != 'localhost':
18 |         print ("Running spider %s" % (spider_name))
19 |         process.crawl(spider_name)
20 | 
21 | 
22 | process.start() # the script will block here until all crawling jobs are finished
23 | 
24 | print('done')
25 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/localhost.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # mac shell example
 4 | # scrapy crawl ettoday -a page=$(date +"%Y-%m-%d")
 5 | 
 6 | import scrapy
 7 | from crawler_news.items import CrawlerNewsItem
 8 | 
 9 | import time
10 | import re
11 | 
12 | class LocalhostSpider(scrapy.Spider):
13 |     name = 'localhost'
14 |     allowed_domains = ['localhost']
15 |     base_url = 'http://localhost'
16 | 
17 |     custom_settings = {
18 |         'LOG_FILE': 'log/%s-%s.log' % (name, str(int(time.time()))),
19 |         'LOG_LEVEL': 'DEBUG',
20 |         'DEFAULT_REQUEST_HEADERS': {
21 |             'Accept': '*/*',
22 |             'Referer': 'https://www.nownews.com/',
23 |             'X-Requested-With': 'XMLHttpRequest'
24 |         }
25 |     }
26 | 
27 |     def start_requests(self):
28 |         yield scrapy.Request(url=self.base_url, callback=self.parse)
29 | 
30 |     def parse(self, response):
31 |         print("[*] OK!")
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 SecondDim
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/jieba.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | 
 7 | # useful for handling different item types with a single interface
 8 | from itemadapter import ItemAdapter
 9 | 
10 | import jieba.posseg as pseg
11 | import jieba
12 | # import paddle
13 | 
14 | # paddle.enable_static()
15 | # jieba.enable_paddle()
16 | 
17 | from ckiptagger import data_utils, construct_dictionary, WS, POS, NER
18 | 
19 | class JiebaPipeline:
20 |     def process_item(self, item, spider):
21 |         text = item.get('text')
22 |         for t in text:
23 |             self._paddle_cut(t)
24 |         # print(text)
25 |         return item
26 | 
27 |     def _paddle_cut(self, test_sent):
28 |         seg_list = jieba.cut(test_sent,use_paddle=True)
29 |         print("Paddle Mode: " + '/'.join(list(seg_list)))
30 |         # for word in list(seg_list):
31 |         #     print('%s' % (word))
32 | 
33 |     def _default_cut(self, test_sent):
34 |         seg_list = jieba.cut(test_sent, cut_all=False)
35 |         print("Default Mode: " + "/ ".join(seg_list))
36 | 
37 |     def _full_cut(self, test_sent):
38 |         seg_list = jieba.cut(test_sent, cut_all=True)
39 |         print("Full Mode: " + "/ ".join(seg_list))
40 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/ckiptagger.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | 
 7 | # useful for handling different item types with a single interface
 8 | from itemadapter import ItemAdapter
 9 | 
10 | import json
11 | import time
12 | 
13 | daily_sec = 60 * 60 * 24
14 | time_epoch_unit = 5 * 60
15 | 
16 | class CkiptaggerPipeline:
17 |     def _json_dumps_item(self, item, key):
18 |         if item.get(key):
19 |             return json.dumps(item.get(key), ensure_ascii=False)
20 |         else:
21 |             return None
22 | 
23 |     def process_item(self, item, spider):
24 |         spider.logger.info('send to work queue for parse. %s' % item.get('url'))
25 | 
26 |         time_epoch = int((time.time() - time_epoch_unit) / time_epoch_unit) * time_epoch_unit
27 | 
28 |         # TODO 時間應該要來自網頁內容
29 | 
30 |         data_obj = json.dumps( {
31 |             'url': item.get('url'),
32 |             'title': item.get('title'),
33 |             'tags': item.get('tags'),
34 |             'text': item.get('text'),
35 |             'time_epoch': time_epoch
36 |         }, ensure_ascii=False )
37 | 
38 |         spider.redis_client.lpush('ckiptagger_worker_queue', data_obj)
39 | 
40 |         # spider.redis_client.set(time_epoch, data_obj, ex=daily_sec)
41 | 
42 |         return item
43 | 


--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
 1 | version: 2.1
 2 | 
 3 | orbs:
 4 |   python: circleci/python@0.2.1
 5 | 
 6 | jobs:
 7 |   build-and-test:
 8 |     docker:
 9 |       - image: circleci/python:3.7-stretch
10 |     steps:
11 |       - checkout
12 |       - python/load-cache
13 |       - python/install-deps
14 |       - python/save-cache
15 |       # - run:
16 |       #     command: python -m pylint crawler_news/spiders/*
17 |       #     name: lint
18 |       - run:
19 |           command: |
20 |             mv crawler_news/settings.py.example crawler_news/settings.py
21 |             python -m scrapy check
22 |           name: Test scrapy
23 | 
24 |   deploy-to-gcp:
25 |     docker:
26 |       - image: alpine:latest
27 |     steps:
28 |       - run:
29 |           command: |
30 |             apk --no-cache add openssh-client bash
31 |             echo ${SSH_KEY} | base64 -d > circleci && chmod 400 circleci
32 |             ssh -o "StrictHostKeyChecking no" -i circleci ${SSH_HOST} "cd /home/ubuntu/ProjectPM/projectpm-cicd && ./circleci.sh crawler-news master"
33 |           name: Deploy on gcloud
34 | 
35 | workflows:
36 |   build-test-deploy:
37 |     jobs:
38 |       - build-and-test:
39 |           filters:
40 |             branches:
41 |               only:
42 |                 - develop
43 |                 - /feature.*/
44 |                 - /hotfix.*/
45 |                 - /circleci.*/
46 |       - deploy-to-gcp:
47 |           filters:
48 |             branches:
49 |               only:
50 |                 - master
51 | 


--------------------------------------------------------------------------------
/crawler_news/extensions/redis.py:
--------------------------------------------------------------------------------
 1 | # Define here the models for your spider middleware
 2 | #
 3 | # See documentation in:
 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 5 | 
 6 | from scrapy import signals
 7 | 
 8 | import redis
 9 | 
10 | class RedisClient():
11 |     @classmethod
12 |     def from_crawler(cls, crawler):
13 |         # first check if the extension should be enabled and raise
14 |         # NotConfigured otherwise
15 |         # if not crawler.settings.getbool('MYEXT_ENABLED'):
16 |         #     raise NotConfigured
17 | 
18 |         # get the number of items from settings
19 |         # item_count = crawler.settings.getint('MYEXT_ITEMCOUNT', 1000)
20 | 
21 |         # instantiate the extension object
22 |         ext = cls()
23 | 
24 |         # connect the extension object to signals
25 |         crawler.signals.connect(ext.spider_opened, signal=signals.spider_opened)
26 |         # crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
27 |         # crawler.signals.connect(ext.item_scraped, signal=signals.item_scraped)
28 | 
29 |         # return the extension object
30 |         return ext
31 | 
32 |     def spider_opened(self, spider):
33 |         host = spider.settings.get('REDIS_HOST', 'localhost')
34 |         port = spider.settings.get('REDIS_PORT', 6379)
35 |         db = spider.settings.get('REDIS_DATABASE', 0)
36 | 
37 |         spider.logger.info(f"Connect redis host:{host}, port:{port}, db:{db}")
38 | 
39 |         spider.redis_client = redis.Redis(host, port, db)
40 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/postgresql.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | 
 7 | # useful for handling different item types with a single interface
 8 | from itemadapter import ItemAdapter
 9 | 
10 | import psycopg2
11 | 
12 | # # Connect to your postgres DB
13 | # conn = psycopg2.connect("dbname=test user=postgres")
14 | 
15 | # # Open a cursor to perform database operations
16 | # cur = conn.cursor()
17 | 
18 | # # Execute a query
19 | # cur.execute("SELECT * FROM my_data")
20 | 
21 | # # Retrieve query results
22 | # records = cur.fetchall()
23 | 
24 | class PostgresqlPipeline:
25 |     def open_spider(self, spider):
26 |         settings = spider.settings
27 |         print('[pipelines] PostgresqlPipeline open_spider')
28 |         # self.db = psycopg2.connect("dbname=crawler_news user=crawler_news")
29 |         # self.db = MysqlDatabase(host=settings['MYSQL_HOST'],
30 |         #                      port=settings['MYSQL_PORT'],
31 |         #                      user=settings['MYSQL_USER'],
32 |         #                      password=settings['MYSQL_PASSWORD'],
33 |         #                      db=settings['MYSQL_DB'],
34 |         #                      table=settings['MYSQL_TABLE'],
35 |         #                      charset=settings['MYSQL_CHARSET'])
36 | 
37 |     def close_spider(self, spider):
38 |         # self.db.close()
39 |         print('[pipelines] PostgresqlPipeline close_spider')
40 |         pass
41 | 
42 |     def process_item(self, item, spider):
43 |         print('[pipelines] PostgresqlPipeline process_item')
44 |         pass
45 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 
106 | # system
107 | .DS_Store
108 | .vscode/
109 | 
110 | crawler_news/settings.py
111 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/cassandra.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | from scrapy.exceptions import DropItem
 9 | from src.CassandraDatabase import CassandraDatabase
10 | 
11 | class CassandraPipeline(object):
12 |     # TODO set database env in setting.py
13 |     # def __init__(self, mongo_uri, mongo_db):
14 |     #     self.mongo_uri = mongo_uri
15 |     #     self.mongo_db = mongo_db
16 | 
17 |     # @classmethod
18 |     # def from_crawler(cls, crawler):
19 |     #     return cls(
20 |     #         mongo_uri=crawler.settings.get('MONGO_URI'),
21 |     #         mongo_db=crawler.settings.get('MONGO_DATABASE', 'items')
22 |     #     )
23 | 
24 |     def open_spider(self, spider):
25 |         settings = spider.settings
26 |         self.db = CassandraDatabase(
27 |                         keyspace=settings['CASSANDRA_KEYSPAC'],
28 |                         table=settings['CASSANDRA_TABLE'],
29 |                         host=settings['CASSANDRA_HOST']
30 |                     )
31 |         self.db.create_table()
32 | 
33 |     def close_spider(self, spider):
34 |         self.db.close()
35 | 
36 |     def process_item(self, item, spider):
37 |         if item.get('url'):
38 |             news = self.db.fetchOne(item['url'])
39 |             if news == None:
40 |                 # TODO 塞進資料庫前，檢查資料格式
41 | 
42 |                 try:
43 |                     self.db.insert(dict(item))
44 |                 except Exception as e:
45 |                     spider.logger.error("---------- DB ERROR ----------")
46 |                     spider.logger.error(e)
47 |                     spider.logger.error(item)
48 |                     spider.logger.error("==============================")
49 |             else:
50 |                 # TODO version2 版本判斷
51 |                 pass
52 | 
53 |         else:
54 |             raise DropItem("Missing item.url in %s" % item)
55 | 
56 |         return item
57 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/line.py:
--------------------------------------------------------------------------------
 1 | # Define your item pipelines here
 2 | #
 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 5 | 
 6 | import requests
 7 | import re
 8 | import json
 9 | 
10 | from scrapy.exceptions import DropItem
11 | 
12 | notify_key_words = 'notify_key_words'
13 | 
14 | class LineNotifyPipeline(object):
15 | 
16 |     def open_spider(self, spider):
17 |         self.token = spider.settings.get('LINE_NOTIFY_TOKEN')
18 | 
19 |     def _re(self, targets, key_words):
20 |         if type(targets) is str:
21 |             targets = [targets]
22 | 
23 |         if type(targets) is list:
24 |             for target in targets:
25 |                 for key_word in key_words:
26 |                     match = re.search(key_word, target)
27 |                     if match:
28 |                         return match.group(0)
29 | 
30 |         return False
31 | 
32 |     def line_notify_message(self, msg):
33 |         headers = {
34 |             "Authorization": "Bearer " + self.token,
35 |             "Content-Type" : "application/x-www-form-urlencoded"
36 |         }
37 | 
38 |         payload = {'message': msg}
39 |         r = requests.post("https://notify-api.line.me/api/notify", headers=headers, params=payload)
40 | 
41 |         return r.status_code
42 | 
43 |     async def process_item(self, item, spider):
44 |         if not item.get('url'):
45 |             raise DropItem("Missing item.url in %s" % item)
46 | 
47 |         key_words = json.loads(spider.redis_client.get(notify_key_words))
48 | 
49 |         conditions = ''
50 |         if self._re(item['title'], key_words):
51 |             conditions += '標題 包含關鍵字 [%s]\n' % self._re(item['title'], key_words)
52 |         if self._re(item['tags'], key_words):
53 |             conditions += '標籤 包含關鍵字 [%s]\n' % self._re(item['tags'], key_words)
54 |         if self._re(item['text'], key_words):
55 |             conditions += '內文 包含關鍵字 [%s]\n' % self._re(item['text'], key_words)
56 | 
57 |         if conditions != '':
58 |             msg = "觸發條件：\n%s\n\n新聞標題：%s\n\n新聞網址：%s" % (conditions, item.get('title'), item.get('url'))
59 |             self.line_notify_message(msg)
60 |             spider.logger.info('Send line notify message. %s' % item.get('url'))
61 | 
62 |         return item
63 | 


--------------------------------------------------------------------------------
/crawler_news/pipelines/mysql.py:
--------------------------------------------------------------------------------
 1 | # -*- coding: utf-8 -*-
 2 | 
 3 | # Define your item pipelines here
 4 | #
 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 7 | 
 8 | import json
 9 | from scrapy.exceptions import DropItem
10 | from src.MysqlDatabase import MysqlDatabase
11 | 
12 | class MysqlPipeline(object):
13 | 
14 |     def open_spider(self, spider):
15 |         settings = spider.settings
16 |         self.db = MysqlDatabase(host=settings['MYSQL_HOST'],
17 |                              port=settings['MYSQL_PORT'],
18 |                              user=settings['MYSQL_USER'],
19 |                              password=settings['MYSQL_PASSWORD'],
20 |                              db=settings['MYSQL_DB'],
21 |                              table=settings['MYSQL_TABLE'],
22 |                              charset=settings['MYSQL_CHARSET'])
23 | 
24 |     def close_spider(self, spider):
25 |         self.db.close()
26 | 
27 |     def _json_dumps_item(self, item, key):
28 |         if item.get(key):
29 |             return json.dumps(item.get(key), ensure_ascii=False)
30 |         else:
31 |             return None
32 | 
33 |     async def process_item(self, item, spider):
34 |         if not item.get('url'):
35 |             raise DropItem("Missing item.url in %s" % item)
36 | 
37 |         if not self.db.news_exist(item['url']):
38 |             # TODO 塞進資料庫前，檢查資料格式
39 | 
40 |             item['authors'] = self._json_dumps_item(item, 'authors')
41 |             item['tags'] = self._json_dumps_item(item, 'tags')
42 |             item['text'] = self._json_dumps_item(item, 'text')
43 |             item['images'] = self._json_dumps_item(item, 'images')
44 |             item['video'] = self._json_dumps_item(item, 'video')
45 |             item['links'] = self._json_dumps_item(item, 'links')
46 | 
47 |             try:
48 |                 self.db.insert(dict(item))
49 |             except Exception as e:
50 |                 spider.logger.error("---------- DB INSERT ERROR ----------")
51 |                 spider.logger.error(e)
52 |                 spider.logger.error(item)
53 |                 spider.logger.error("==============================")
54 |         else:
55 |             # TODO version2 版本判斷
56 |             pass
57 | 
58 |         return item
59 | 


--------------------------------------------------------------------------------
/unittest.py:
--------------------------------------------------------------------------------
  1 | import unittest
  2 | import json
  3 | 
  4 | import sys
  5 | 
  6 | try:
  7 |     filename = sys.argv[1]
  8 |     with open('tmp/%s' % filename, 'r') as f:
  9 |         pass
 10 | except FileNotFoundError:
 11 |     print('[X] file not exist. tmp/filename')
 12 |     sys.exit(0)
 13 | except IndexError:
 14 |     print('[X] please enter filename.')
 15 |     sys.exit(0)
 16 | 
 17 | class ResultTestCase(unittest.TestCase):
 18 |     def setUp(self):
 19 |         with open('tmp/%s' % filename, 'r') as f:
 20 |             self.data = json.loads( f.read() )
 21 | 
 22 |     def tearDown(self):
 23 |         pass
 24 | 
 25 |     def test_url(self):
 26 |         e = []
 27 |         for row in self.data:
 28 |             if not row['url']:
 29 |                 e.append( (row) )
 30 | 
 31 |         self.assertEqual(len(e), 0, e)
 32 | 
 33 |     def test_title(self):
 34 |         e = []
 35 |         for row in self.data:
 36 |             if not row['title']:
 37 |                 e.append( (row['url'], row['title']) )
 38 | 
 39 |         self.assertEqual(len(e), 0, e)
 40 | 
 41 |     def test_publish_date(self):
 42 |         e = []
 43 |         for row in self.data:
 44 |             if not row['publish_date']:
 45 |                 e.append( (row['url'], row['publish_date']) )
 46 | 
 47 |         self.assertEqual(len(e), 0, e)
 48 | 
 49 |     def test_authors(self):
 50 |         e = []
 51 |         for row in self.data:
 52 |             if not row['authors']:
 53 |                 e.append( (row['url'], row['authors']) )
 54 | 
 55 |         self.assertEqual(len(e), 0, e)
 56 | 
 57 |     def test_tags(self):
 58 |         e = []
 59 |         for row in self.data:
 60 |             if not row['tags']:
 61 |                 e.append( (row['url'], row['tags']) )
 62 | 
 63 |         self.assertEqual(len(e), 0, e)
 64 | 
 65 |     def test_text(self):
 66 |         e = []
 67 |         for row in self.data:
 68 |             if not row['text']:
 69 |                 e.append( (row['url'], row['text']) )
 70 | 
 71 |         self.assertEqual(len(e), 0, e)
 72 | 
 73 |     def test_text_html(self):
 74 |         e = []
 75 |         for row in self.data:
 76 |             if not row['text_html']:
 77 |                 e.append( (row['url'], row['text_html']) )
 78 | 
 79 |         self.assertEqual(len(e), 0, e)
 80 | 
 81 |     def test_images(self):
 82 |         e = []
 83 |         for row in self.data:
 84 |             if not row['images']:
 85 |                 e.append( (row['url'], row['images']) )
 86 | 
 87 |         self.assertEqual(len(e), 0, e)
 88 | 
 89 |     def test_video(self):
 90 |         e = []
 91 |         for row in self.data:
 92 |             if not row['video']:
 93 |                 e.append( (row['url'], row['video']) )
 94 | 
 95 |         self.assertEqual(len(e), 0, e)
 96 | 
 97 |     def test_links(self):
 98 |         e = []
 99 |         for row in self.data:
100 |             if len(row['links']) == 0:
101 |                 e.append( (row['url'], row['links']) )
102 | 
103 |         self.assertEqual(len(e), 0, e)
104 | 
105 | if __name__ == '__main__':
106 |     unittest.main(argv = [sys.argv[0]])
107 | 
108 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/tvbs.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | 
 6 | date_str = str(time.strftime("%F", time.localtime()))
 7 | 
 8 | class TVBSSpider(scrapy.Spider):
 9 |     name = 'tvbs'
10 |     allowed_domains = ['tvbs.com.tw']
11 |     base_url = 'https://news.tvbs.com.tw'
12 | 
13 |     custom_settings = {
14 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
15 |     }
16 | 
17 |     def start_requests(self):
18 |         list_url = '%s/realtime' % (self.base_url)
19 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
20 | 
21 |     def parse_list(self, response):
22 |         for page_url in response.css('div.news_list>div.list>ul>li>a:first-child::attr(href)').getall():
23 |             page_url = self.base_url + page_url
24 |             if not self.redis_client.exists(page_url):
25 |                 yield scrapy.Request(url=page_url, callback=self.parse_news,cb_kwargs=dict(req_url=page_url))
26 | 
27 |     def parse_news(self, response, req_url):
28 |         self.logger.info(f"request page: {req_url}")
29 | 
30 |         item = CrawlerNewsItem()
31 | 
32 |         item['url'] = req_url
33 |         item['article_from'] = self.name
34 |         item['article_type'] = 'news'
35 | 
36 |         item['title'] = self._parse_title(response)
37 |         item['publish_date'] = self._parse_publish_date(response)
38 |         item['authors'] = self._parse_authors(response)
39 |         item['tags'] = self._parse_tags(response)
40 |         item['text'] = self._parse_text(response)
41 |         item['text_html'] = self._parse_text_html(response)
42 |         item['images'] = self._parse_images(response)
43 |         item['video'] = self._parse_video(response)
44 |         item['links'] = self._parse_links(response)
45 | 
46 |         return item
47 | 
48 |     def _parse_title(self, response):
49 |         return response.css('h1.title::text').get()
50 | 
51 |     def _parse_publish_date(self, response):
52 |         return response.css('div.author::text').re_first(r'[0-9]+/[0-9]+/[0-9]+ [0-9]+:[0-9]+')
53 | 
54 |     def _parse_authors(self, response):
55 |         return response.css('div.author>a::text').getall()
56 | 
57 |     def _parse_tags(self, response):
58 |         tags = []
59 |         for t in response.css('div.article_keyword>a::text').getall():
60 |             tags.append(t.lstrip('#'))
61 |         return tags
62 | 
63 |     def _parse_text(self, response):
64 |         text = []
65 |         for t in response.css('#news_detail_div::text,#news_detail_div>p::text').getall():
66 |             if t.strip() != '':
67 |                 text.append(t.strip())
68 |         return text
69 | 
70 |     def _parse_text_html(self, response):
71 |         return response.css('#news_detail_div').get()
72 | 
73 |     def _parse_images(self, response):
74 |         return response.css('.article_new').css('img::attr(src)').getall()
75 | 
76 |     def _parse_video(self, response):
77 |         return response.css('.article_new #ytframe iframe::attr(src)').getall()
78 | 
79 |     def _parse_links(self, response):
80 |         return response.css('.article_new').css('a::attr(href)').getall()
81 | 


--------------------------------------------------------------------------------
/crawler_news/extensions/CassandraDatabase.py:
--------------------------------------------------------------------------------
 1 | import cassandra
 2 | from cassandra.cluster import Cluster
 3 | 
 4 | class CassandraDatabase():
 5 |     def __init__(self, keyspace, table, host=['localhost']):
 6 |         self.keyspace = keyspace
 7 |         self.host = host
 8 |         self.table = table
 9 | 
10 |         self.connect()
11 | 
12 |     def __del__(self,):
13 |         self.cluster.shutdown()
14 | 
15 |     def connect(self,):
16 |         self.cluster = Cluster(self.host)
17 |         self.session = self.cluster.connect()
18 |         self.create_keyspace()
19 |         self.session.set_keyspace(self.keyspace)
20 | 
21 |     def close(self,):
22 |         self.cluster.shutdown()
23 | 
24 |     def create_keyspace(self):
25 |         sql = """
26 |                 SELECT keyspace_name
27 |                 FROM system_schema.keyspaces
28 |                 WHERE keyspace_name='%s'
29 |             """ % self.keyspace
30 | 
31 |         if self.query(sql).one() == None:
32 |             self.query("""
33 |                 CREATE KEYSPACE %s
34 |                 WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': '2' }
35 |                 """ % self.keyspace)
36 | 
37 |     def create_table(self,):
38 |         sql = """
39 |                 CREATE TABLE IF NOT EXISTS %s (
40 |                     url varchar, article_from varchar, article_type varchar,
41 |                     title varchar, publish_date varchar, authors list<varchar>,
42 |                     tags list<varchar>, text_ list<varchar>, text_html text,
43 |                     images list<varchar>, video list<varchar>, links list<varchar>,
44 |                 PRIMARY KEY(url)
45 |                     );
46 |             """ % self.table
47 |         self.query(sql)
48 | 
49 |         sql = """
50 |                 CREATE INDEX IF NOT EXISTS ON %s(article_from);
51 |             """ % self.table
52 |         self.query(sql)
53 | 
54 |         sql = """
55 |                 CREATE INDEX IF NOT EXISTS ON %s(article_type);
56 |             """ % self.table
57 |         self.query(sql)
58 | 
59 |     def query(self, sql, args=None):
60 |         return self.session.execute(sql, args)
61 | 
62 |     def fetchOne(self, url):
63 |         sql = """
64 |                 SELECT *
65 |                 FROM %s
66 |                 WHERE url='%s'
67 |             """ % (self.table, url)
68 | 
69 |         return self.query(sql).one()
70 | 
71 |     def insert(self, data={}):
72 |         sql = "INSERT INTO %s "% (self.table)
73 |         sql = sql + """
74 |                 (url, article_from, article_type,
75 |                  title, publish_date, authors, tags,
76 |                  text_, text_html, images, video, links)
77 |                 VALUES
78 |                 (%(url)s, %(article_from)s, %(article_type)s,
79 |                  %(title)s, %(publish_date)s, %(authors)s, %(tags)s,
80 |                  %(text)s, %(text_html)s, %(images)s, %(video)s, %(links)s)
81 |             """
82 | 
83 |         return self.query(sql, data)
84 | 
85 | 
86 |     def update(self,):
87 |         raise 'Method not implemented.'
88 | 
89 |     def delete(self,):
90 |         raise 'Method not implemented.'
91 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/udn.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | import re
 6 | 
 7 | date_str = str(time.strftime("%F", time.localtime()))
 8 | 
 9 | class UdnSpider(scrapy.Spider):
10 |     name = 'udn'
11 |     allowed_domains = ['udn.com']
12 |     base_url = 'https://udn.com'
13 | 
14 |     custom_settings = {
15 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
16 |     }
17 | 
18 |     def start_requests(self):
19 |         list_url = '%s/news/breaknews/1/99' % (self.base_url)
20 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
21 | 
22 |     def parse_list(self, response):
23 |         page_url_list = response.css('div.story-list__news h2>a::attr(href)').getall()
24 | 
25 |         self.logger.info(page_url_list)
26 | 
27 |         for page_url in page_url_list:
28 |             page_url = self.base_url+page_url.split('?')[0]
29 |             if not self.redis_client.exists(page_url):
30 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
31 | 
32 |     def parse_news(self, response):
33 |         req_url = response.request.url
34 | 
35 |         self.logger.info(f"request page: {req_url}")
36 | 
37 |         item = CrawlerNewsItem()
38 | 
39 |         item['url'] = req_url
40 |         item['article_from'] = self.name
41 |         item['article_type'] = 'news'
42 | 
43 |         item['title'] = self._parse_title(response)
44 |         item['publish_date'] = self._parse_publish_date(response)
45 |         item['authors'] = self._parse_authors(response)
46 |         item['tags'] = self._parse_tags(response)
47 |         item['text'] = self._parse_text(response)
48 |         item['text_html'] = self._parse_text_html(response)
49 |         item['images'] = self._parse_images(response)
50 |         item['video'] = self._parse_video(response)
51 |         item['links'] = self._parse_links(response)
52 | 
53 |         return item
54 | 
55 |     def _parse_title(self, response):
56 |         return response.css('h1::text').get()
57 | 
58 |     def _parse_publish_date(self, response):
59 |         return response.css('time.article-content__time::text').get()
60 | 
61 |     def _parse_authors(self, response):
62 |         return response.css('span.article-content__author a::text').get()
63 | 
64 |     def _parse_tags(self, response):
65 |         return response.css('section.keyword>a::text').getall()
66 | 
67 |     def _parse_text(self, response):
68 |         text = []
69 |         for t in response.css('section.article-content__editor>p *::text').getall():
70 |             if t.strip() != '':
71 |                 text.append(t.strip())
72 |         return text
73 | 
74 |     def _parse_text_html(self, response):
75 |         return response.css('section.article-content__editor').getall()
76 | 
77 |     def _parse_images(self, response):
78 |         return response.css('div#article_body').css('img::attr(src)').getall()
79 | 
80 |     def _parse_video(self, response):
81 |         return response.css('div.video-container>iframe::attr(src)').getall()
82 | 
83 |     def _parse_links(self, response):
84 |         return response.css('div#article_body').css('a::attr(href)').getall()
85 | 
86 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/chinatimes.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | import re
 6 | 
 7 | class ChinatimesSpider(scrapy.Spider):
 8 |     name = 'chinatimes'
 9 |     allowed_domains = ['chinatimes.com']
10 |     base_url = 'https://www.chinatimes.com'
11 | 
12 |     date_str = str(time.strftime("%F", time.localtime()))
13 | 
14 |     custom_settings = {
15 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
16 |     }
17 | 
18 |     def start_requests(self):
19 |         list_url = '%s/realtimenews' % (self.base_url)
20 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
21 | 
22 |     def parse_list(self, response):
23 |         for page_url in response.css('section.article-list>ul>li h3.title>a::attr(href)').getall():
24 |             page_url = self.base_url + page_url
25 |             if not self.redis_client.exists(page_url):
26 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
27 | 
28 |     def parse_news(self, response):
29 |         item = CrawlerNewsItem()
30 | 
31 |         item['url'] = response.url
32 |         item['article_from'] = self.name
33 |         item['article_type'] = 'news'
34 | 
35 |         item['title'] = self._parse_title(response)
36 |         item['publish_date'] = self._parse_publish_date(response)
37 |         item['authors'] = self._parse_authors(response)
38 |         item['tags'] = self._parse_tags(response)
39 |         item['text'] = self._parse_text(response)
40 |         item['text_html'] = self._parse_text_html(response)
41 |         item['images'] = self._parse_images(response)
42 |         item['video'] = self._parse_video(response)
43 |         item['links'] = self._parse_links(response)
44 | 
45 |         return item
46 | 
47 |     def _parse_title(self, response):
48 |         return response.css('article.article-box h1.article-title::text').get()
49 | 
50 |     def _parse_publish_date(self, response):
51 |         return response.css('article.article-box time::attr(datetime)').get()
52 | 
53 |     def _parse_authors(self, response):
54 |         authors = response.css('article.article-box div.author>a::text').getall()
55 |         if len(authors) == 0:
56 |             authors = [response.css('article.article-box div.author::text').get(default='').strip()]
57 |         return authors
58 | 
59 |     def _parse_tags(self, response):
60 |         return response.css('article.article-box div.article-hash-tag a::text').getall()
61 | 
62 |     def _parse_text(self, response):
63 |         return response.css('article.article-box div.article-body p::text').getall()
64 | 
65 |     def _parse_text_html(self, response):
66 |         return response.css('article.article-box div.article-body').get()
67 | 
68 |     def _parse_images(self, response):
69 |         images_list = []
70 |         images_list.extend(response.css('article.article-box div.main-figure').css('img::attr(src)').getall())
71 |         images_list.extend(response.css('article.article-box div.article-body').css('img::attr(src)').getall())
72 |         return images_list
73 | 
74 |     def _parse_video(self, response):
75 |         return response.css('article.article-box div.article-body iframe::attr(src)').getall()
76 | 
77 |     def _parse_links(self, response):
78 |         return response.css('article.article-box div.article-body').css('a::attr(href)').getall()
79 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/cna.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | 
 6 | date_str = str(time.strftime("%F", time.localtime()))
 7 | 
 8 | class CnaSpider(scrapy.Spider):
 9 |     name = 'cna'
10 |     allowed_domains = ['cna.com.tw']
11 |     base_url = 'https://www.cna.com.tw'
12 | 
13 |     custom_settings = {
14 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
15 |     }
16 | 
17 |     def start_requests(self):
18 |         list_url = '%s/list/aall.aspx' % (self.base_url)
19 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
20 | 
21 |     def parse_list(self, response):
22 |         for page_url in response.css('#jsMainList>li>a::attr(href)').getall():
23 |             if not self.redis_client.exists(page_url):
24 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
25 | 
26 |     def parse_news(self, response):
27 |         req_url = response.request.url
28 | 
29 |         self.logger.info(f"request page: {req_url}")
30 | 
31 |         item = CrawlerNewsItem()
32 | 
33 |         item['url'] = req_url
34 |         item['article_from'] = self.name
35 |         item['article_type'] = 'news'
36 | 
37 |         item['title'] = self._parse_title(response)
38 |         item['publish_date'] = self._parse_publish_date(response)
39 |         item['authors'] = self._parse_authors(response)
40 |         item['tags'] = self._parse_tags(response)
41 |         item['text'] = self._parse_text(response)
42 |         item['text_html'] = self._parse_text_html(response)
43 |         item['images'] = self._parse_images(response)
44 |         item['video'] = self._parse_video(response)
45 |         item['links'] = self._parse_links(response)
46 | 
47 |         return item
48 | 
49 |     def _parse_title(self, response):
50 |         return response.css('article.article h1 *::text').get()
51 | 
52 |     def _parse_publish_date(self, response):
53 |         return response.css('article.article div.timeBox span::text').get()
54 | 
55 |     def _parse_authors(self, response):
56 |         # inconsistent format
57 |         pre_authors = response.css('article.article div.paragraph p::text').re(r'^（[^）]*）|（[^）]*）[0-9]*$')
58 |         return list(map(lambda x: x[1:].split('）')[0], pre_authors))
59 | 
60 |     def _parse_tags(self, response):
61 |         tags = []
62 |         for t in response.css('.keywordTag a::text').getall():
63 |             tags.append(t.lstrip('#'))
64 |         return tags
65 | 
66 |     def _parse_text(self, response):
67 |         ret = []
68 |         for i in range(0,10):
69 |             if len(response.css('article.article div.paragraph:nth-of-type(%s) p::text' % i).getall()) != 0:
70 |                 ret = response.css('article.article div.paragraph:nth-of-type(%s) p::text' % i).getall()
71 |                 break
72 |         return ret
73 | 
74 |     def _parse_text_html(self, response):
75 |         return response.css('article.article div.paragraph').get()
76 | 
77 |     def _parse_images(self, response):
78 |         # parser error with div.fullPic
79 |         return response.css('article.article').css('img::attr(src)').getall()
80 | 
81 |     def _parse_video(self, response):
82 |         return response.css('article.article div.media iframe::attr(data-src)').getall()
83 | 
84 |     def _parse_links(self, response):
85 |         return response.css('article.article div.paragraph p').css('a::attr(href)').getall()
86 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/nownews.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | import re
 6 | import json
 7 | 
 8 | date_str = str(time.strftime("%F", time.localtime()))
 9 | 
10 | class NownewsSpider(scrapy.Spider):
11 |     name = 'nownews'
12 |     allowed_domains = ['nownews.com']
13 |     base_url = 'https://www.nownews.com'
14 | 
15 |     custom_settings = {
16 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
17 |         # 'LOG_FILE': None,
18 |         # 'DEFAULT_REQUEST_HEADERS': {
19 |         #     'Accept': '*/*',
20 |         #     'Referer': 'https://www.nownews.com/',
21 |         # }
22 |     }
23 | 
24 |     def start_requests(self):
25 |         list_url = '%s/cat/breaking/' % (self.base_url)
26 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
27 | 
28 |     def parse_list(self, response):
29 |         page_url_list = response.css('a::attr(href)').getall()
30 |         for page_url in page_url_list:
31 |             if re.match('https://www.nownews.com/news/*', page_url) and not self.redis_client.exists(page_url):
32 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
33 | 
34 |     def parse_news(self, response):
35 |         req_url = response.request.url
36 | 
37 |         self.logger.info(f"request page: {req_url}")
38 | 
39 |         item = CrawlerNewsItem()
40 | 
41 |         item['url'] = req_url
42 |         item['article_from'] = self.name
43 |         item['article_type'] = 'news'
44 | 
45 |         item['title'] = self._parse_title(response)
46 |         item['publish_date'] = self._parse_publish_date(response)
47 |         item['authors'] = self._parse_authors(response)
48 |         item['tags'] = self._parse_tags(response)
49 |         item['text'] = self._parse_text(response)
50 |         item['text_html'] = self._parse_text_html(response)
51 |         item['images'] = self._parse_images(response)
52 |         item['video'] = self._parse_video(response)
53 |         item['links'] = self._parse_links(response)
54 | 
55 |         return item
56 | 
57 |     def _parse_title(self, response):
58 |         return response.css('h1.article-title::text').get()
59 | 
60 |     def _parse_publish_date(self, response):
61 |         return response.css('time span::text').get()
62 | 
63 |     def _parse_authors(self, response):
64 |         return [response.css('div.infoBlk>div>p::text').get()]
65 | 
66 |     def _parse_tags(self, response):
67 |         return response.css('div.keywordBlk ul.tag li>a::text').getall()
68 | 
69 |     def _parse_text(self, response):
70 |         text = []
71 |         for t in response.css('article[itemprop=articleBody]::text').getall():
72 |             if t.strip() != '':
73 |                 text.append(t.strip())
74 |         return text
75 | 
76 |     def _parse_text_html(self, response):
77 |         return response.css('article[itemprop=articleBody]').get()
78 | 
79 |     def _parse_images(self, response):
80 |         return response.css('div.containerBlk').css('img::attr(src)').getall()
81 | 
82 |     def _parse_video(self, response):
83 |         # TODO
84 |         return response.css('article noscript>iframe::attr(src)').getall()
85 | 
86 |     def _parse_links(self, response):
87 |         # TODO
88 |         # links = response.css('article div.td-post-content').css('a::attr(href)').getall()
89 |         # return list(filter(lambda x:x if not x == '#' else None , links))
90 |         return []
91 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/setn.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | import re
 6 | 
 7 | date_str = str(time.strftime("%F", time.localtime()))
 8 | 
 9 | class SetnSpider(scrapy.Spider):
10 |     name = 'setn'
11 |     allowed_domains = ['setn.com']
12 |     base_url = 'https://www.setn.com'
13 | 
14 |     custom_settings = {
15 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
16 |     }
17 | 
18 |     def start_requests(self):
19 |         list_url = 'https://www.setn.com/ViewAll.aspx?PageGroupID=1'
20 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
21 | 
22 |     def parse_list(self, response):
23 |         for page_url in response.css('h3.view-li-title>a.gt ::attr(href)').getall():
24 |             page_url = self.base_url + page_url
25 |             if not self.redis_client.exists(page_url):
26 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
27 | 
28 |     def parse_news(self, response):
29 |         req_url = response.request.url
30 | 
31 |         self.logger.info(f"request page: {req_url}")
32 | 
33 |         item = CrawlerNewsItem()
34 | 
35 |         item['url'] = req_url
36 |         item['article_from'] = self.name
37 |         item['article_type'] = 'news'
38 | 
39 |         item['title'] = self._parse_title(response)
40 |         item['publish_date'] = self._parse_publish_date(response)
41 |         item['authors'] = self._parse_authors(response)
42 |         item['tags'] = self._parse_tags(response)
43 |         item['text'] = self._parse_text(response)
44 |         item['text_html'] = self._parse_text_html(response)
45 |         item['images'] = self._parse_images(response)
46 |         item['video'] = self._parse_video(response)
47 |         item['links'] = self._parse_links(response)
48 | 
49 |         return item
50 | 
51 |     def _parse_title(self, response):
52 |         if re.match('https://www.setn.com/e', response.url):
53 |             return response.css('h1#newsTitle::text').get()
54 |         else:
55 |             return response.css('h1.news-title-3::text').get()
56 | 
57 |     def _parse_publish_date(self, response):
58 |         if re.match('https://www.setn.com/e', response.url):
59 |             return response.css('div.titleBtnBlock>div.time::text').get()
60 |         else:
61 |             return response.css('time.page-date::text').get()
62 | 
63 |     def _parse_authors(self, response):
64 |         if re.match('https://www.setn.com/e', response.url):
65 |             return [response.css('div.Content2>p::text').get()]
66 |         else:
67 |             authors = response.css('div#Content1>p::text').get()
68 | 
69 |             if re.match(r'.+[／].+', authors) == None:
70 |                 return [response.css('div.page-title-text span::text').get()]
71 |             else:
72 |                 return [authors]
73 | 
74 |     def _parse_tags(self, response):
75 |         return response.css('div.page-keyword-area ul>li>a>strong::text').getall()
76 | 
77 |     def _parse_text(self, response):
78 |         return response.css('article p *::text').getall()
79 | 
80 |     def _parse_text_html(self, response):
81 |         return response.css('article').get()
82 | 
83 |     def _parse_images(self, response):
84 |         return response.css('article').css('img::attr(src)').getall()
85 | 
86 |     def _parse_video(self, response):
87 |         return response.css('article').css('iframe::attr(src)').getall()
88 | 
89 |     def _parse_links(self, response):
90 |         return response.css('article').css('a::attr(href)').getall()
91 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/ebc.py:
--------------------------------------------------------------------------------
 1 | import scrapy
 2 | from crawler_news.items import CrawlerNewsItem
 3 | 
 4 | import time
 5 | import re
 6 | 
 7 | date_str = str(time.strftime("%F", time.localtime()))
 8 | 
 9 | class EBCSpider(scrapy.Spider):
10 |     name = 'ebc'
11 |     allowed_domains = ['news.ebc.net.tw']
12 |     base_url = 'https://news.ebc.net.tw'
13 | 
14 |     custom_settings = {
15 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
16 |     }
17 | 
18 |     def start_requests(self):
19 |         list_url = '%s/realtime' % self.base_url
20 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
21 | 
22 |     def parse_list(self, response):
23 |         page_url_list = response.css('div.white-box>a::attr(href)').getall()
24 | 
25 |         self.logger.info(page_url_list)
26 | 
27 |         for page_url in page_url_list:
28 |             page_url = self.base_url+page_url
29 |             if not self.redis_client.exists(page_url):
30 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
31 | 
32 |     def parse_news(self, response):
33 |         req_url = response.request.url
34 | 
35 |         self.logger.info(f"request page: {req_url}")
36 | 
37 |         item = CrawlerNewsItem()
38 | 
39 |         item['url'] = req_url
40 |         item['article_from'] = self.name
41 |         item['article_type'] = 'news'
42 | 
43 |         item['title'] = self._parse_title(response)
44 |         item['publish_date'] = self._parse_publish_date(response)
45 |         item['authors'] = self._parse_authors(response)
46 |         item['tags'] = self._parse_tags(response)
47 |         item['text'] = self._parse_text(response)
48 |         item['text_html'] = self._parse_text_html(response)
49 |         item['images'] = self._parse_images(response)
50 |         item['video'] = self._parse_video(response)
51 |         item['links'] = self._parse_links(response)
52 | 
53 |         return item
54 | 
55 |     def _parse_title(self, response):
56 |         return response.css('div.fncnews-content>h1::text').get()
57 | 
58 |     def _parse_publish_date(self, response):
59 |         pattern=r'(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})' #2019/12/22 13:53
60 |         string=response.css('div.info>span.small-gray-text::text').get()
61 |         return re.search(pattern,string).group(0)
62 | 
63 |     def _parse_authors(self, response):
64 |         pattern=r'(\d{4})/(\d{2})/(\d{2}) (\d{2}):(\d{2})' #2019/12/22 13:53
65 |         string=response.css('div.info>span.small-gray-text::text').get()
66 |         datetime=re.search(pattern,string).group(0)
67 |         return [string.replace(datetime,'').strip()] #去掉日期時間
68 | 
69 |     def _parse_tags(self, response):
70 |         return response.css('div.keyword>a::text').getall()
71 | 
72 |     def _parse_text(self, response):
73 |         text = []
74 |         for t in response.css('content-ad p::text').getall():
75 |             if t.strip() != '':
76 |                 text.append(t.strip())
77 |         return text
78 | 
79 |     def _parse_text_html(self, response):
80 |         return response.css('content-ad').get()
81 | 
82 |     def _parse_images(self, response):
83 |         allImgList=response.css('content-ad img::attr(src)').getall()
84 |         imgURLs=[]
85 |         for imgurl in allImgList:
86 |             if re.match(r'https://img.news.ebc.net.tw\S+',imgurl):
87 |                 imgURLs.append(imgurl)
88 |         return imgURLs
89 | 
90 |     def _parse_video(self, response):
91 |         fb_video=response.css('content-ad').css('iframe::attr(src)').getall()
92 |         youtube=response.css('content-ad').css('div.fb-video::attr(data-href)').getall()
93 |         return fb_video+youtube
94 | 
95 |     def _parse_links(self, response):
96 |         return response.css('content-ad').css('a::attr(href)').getall()
97 | 


--------------------------------------------------------------------------------
/crawler_news/extensions/MysqlDatabase.py:
--------------------------------------------------------------------------------
  1 | import pymysql.cursors
  2 | 
  3 | class MysqlDatabase():
  4 |     def __init__(self, host, port, user, password, db, table='test', charset='utf8mb4'):
  5 |         self.host = host
  6 |         self.port = int(port)
  7 |         self.user = user
  8 |         self.password = password
  9 |         self.database = db
 10 |         self.table = table
 11 |         self.charset = charset
 12 |         self.connect()
 13 | 
 14 |     def __del__(self,):
 15 |         if self.connection.open:
 16 |             self.connection.close()
 17 | 
 18 |     def connect(self,):
 19 |         self.connection = pymysql.connect(
 20 |             host=self.host, port=self.port, user=self.user, password=self.password,
 21 |             charset=self.charset, cursorclass=pymysql.cursors.DictCursor)
 22 | 
 23 |         self.create_db()
 24 |         self.connection.select_db(self.database)
 25 |         self.create_table()
 26 | 
 27 |     def close(self,):
 28 |         self.connection.close()
 29 | 
 30 |     def query(self, sql, params=()):
 31 |         with self.connection.cursor() as cursor:
 32 |             cursor.execute(sql, params)
 33 | 
 34 |     def execute(self, sql, params=()):
 35 |         with self.connection.cursor() as cursor:
 36 |             cursor.execute(sql, params)
 37 |         self.connection.commit()
 38 | 
 39 |     def fetch_one(self, sql, params=()):
 40 |         with self.connection.cursor() as cursor:
 41 |             cursor.execute(sql, params)
 42 |             result = cursor.fetchone()
 43 | 
 44 |         return result
 45 | 
 46 |     def create_db(self,):
 47 |         sql = "SHOW DATABASES LIKE '%s';" % self.database
 48 |         if self.fetch_one(sql) is None:
 49 |             sql = "CREATE DATABASE IF NOT EXISTS `%s`;" % self.database
 50 |             self.query(sql)
 51 | 
 52 |     def create_table(self,):
 53 |         sql = "SHOW TABLES LIKE '%s';" % self.table
 54 |         if self.fetch_one(sql) is None:
 55 |             sql = """
 56 |                 CREATE TABLE IF NOT EXISTS `%s`.`%s` (
 57 |                     `id` serial NOT NULL AUTO_INCREMENT,
 58 |                     `url` varchar(255) NOT NULL,
 59 |                     `article_from` varchar(255) NOT NULL DEFAULT 'UNKNOWN',
 60 |                     `article_type` varchar(255) DEFAULT NULL,
 61 |                     `title` varchar(255) DEFAULT NULL,
 62 |                     `publish_date` varchar(255) DEFAULT NULL,
 63 |                     `authors` json DEFAULT NULL,
 64 |                     `tags` json DEFAULT NULL,
 65 |                     `text` text DEFAULT NULL,
 66 |                     `text_html` text DEFAULT NULL,
 67 |                     `images` json DEFAULT NULL,
 68 |                     `video` json DEFAULT NULL,
 69 |                     `links` json DEFAULT NULL,
 70 |                     `created_at` datetime NOT NULL DEFAULT NOW(),
 71 |                     `updated_at` datetime NOT NULL DEFAULT NOW(),
 72 |                     `deleted_at` datetime DEFAULT NULL,
 73 |                     PRIMARY KEY (id),
 74 |                     UNIQUE INDEX USING BTREE (url),
 75 |                     INDEX USING BTREE (title),
 76 |                     INDEX USING BTREE (article_from),
 77 |                     INDEX USING BTREE (article_type),
 78 |                     INDEX USING BTREE (created_at),
 79 |                     INDEX USING BTREE (updated_at),
 80 |                     INDEX USING BTREE (deleted_at)
 81 |                     ) ENGINE=InnoDB;
 82 |                 """
 83 | 
 84 |             self.execute(sql % (self.database, self.table))
 85 | 
 86 |     def news_exist(self, url):
 87 |         sql = "SELECT * FROM %s.%s WHERE url='%s';"
 88 |         if self.fetch_one(sql % (self.database, self.table, url)) is None:
 89 |             return False
 90 |         return True
 91 | 
 92 |     def insert(self, data={}):
 93 |         sql = "INSERT INTO `%s`.`%s` " % (self.database, self.table)
 94 |         sql = sql + """
 95 |                 (url, article_from, article_type,
 96 |                  title, publish_date, authors, tags,
 97 |                  text, text_html, images, video, links)
 98 |                 VALUES
 99 |                 (%(url)s, %(article_from)s, %(article_type)s,
100 |                  %(title)s, %(publish_date)s, %(authors)s, %(tags)s,
101 |                  %(text)s, %(text_html)s, %(images)s, %(video)s, %(links)s)
102 |             """
103 | 
104 |         return self.execute(sql, data)
105 | 
106 |     def update(self,):
107 |         raise 'Method not implemented.'
108 | 
109 |     def delete(self,):
110 |         raise 'Method not implemented.'
111 | 


--------------------------------------------------------------------------------
/crawler_news/settings.py.example:
--------------------------------------------------------------------------------
  1 | # Scrapy settings for crawler_news project
  2 | #
  3 | # For simplicity, this file contains only settings considered important or
  4 | # commonly used. You can find more settings consulting the documentation:
  5 | #
  6 | #     https://docs.scrapy.org/en/latest/topics/settings.html
  7 | #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
  8 | #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  9 | 
 10 | BOT_NAME = 'crawler_news'
 11 | 
 12 | SPIDER_MODULES = ['crawler_news.spiders']
 13 | NEWSPIDER_MODULE = 'crawler_news.spiders'
 14 | 
 15 | 
 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
 17 | USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
 18 | 
 19 | # Obey robots.txt rules
 20 | ROBOTSTXT_OBEY = True
 21 | 
 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
 23 | #CONCURRENT_REQUESTS = 32
 24 | 
 25 | # Configure a delay for requests for the same website (default: 0)
 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 27 | # See also autothrottle settings and docs
 28 | #DOWNLOAD_DELAY = 3
 29 | # The download delay setting will honor only one of:
 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
 31 | #CONCURRENT_REQUESTS_PER_IP = 16
 32 | 
 33 | # Disable cookies (enabled by default)
 34 | #COOKIES_ENABLED = False
 35 | 
 36 | # Disable Telnet Console (enabled by default)
 37 | #TELNETCONSOLE_ENABLED = False
 38 | 
 39 | # Override the default request headers:
 40 | DEFAULT_REQUEST_HEADERS = {
 41 |   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
 42 |   'Accept-Language': 'zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7',
 43 | }
 44 | 
 45 | # Enable or disable spider middlewares
 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 47 | SPIDER_MIDDLEWARES = {
 48 |    # 'crawler_news.middlewares.CrawlerNewsSpiderMiddleware': 543,
 49 | }
 50 | 
 51 | # Enable or disable downloader middlewares
 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 53 | DOWNLOADER_MIDDLEWARES = {
 54 |    # 'crawler_news.middlewares.CrawlerNewsDownloaderMiddleware': 543,
 55 | }
 56 | 
 57 | # Enable or disable extensions
 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
 59 | EXTENSIONS = {
 60 |    'crawler_news.extensions.redis.RedisClient': 10,
 61 |    # 'crawler_news.extensions.telnet.TelnetConsole': None,
 62 | }
 63 | 
 64 | # Configure item pipelines
 65 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
 66 | ITEM_PIPELINES = {
 67 |    'crawler_news.pipelines.ckiptagger.CkiptaggerPipeline': 100,
 68 |    # 'crawler_news.pipelines.postgresql.PostgresqlPipeline': 100,
 69 |    # 'crawler_news.pipelines.jieba.JiebaPipeline': 300,
 70 |    # 'crawler_news.pipelines.line.LineNotifyPipeline': 100,
 71 | }
 72 | 
 73 | # Enable and configure the AutoThrottle extension (disabled by default)
 74 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
 75 | #AUTOTHROTTLE_ENABLED = True
 76 | # The initial download delay
 77 | #AUTOTHROTTLE_START_DELAY = 5
 78 | # The maximum download delay to be set in case of high latencies
 79 | #AUTOTHROTTLE_MAX_DELAY = 60
 80 | # The average number of requests Scrapy should be sending in parallel to
 81 | # each remote server
 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 83 | # Enable showing throttling stats for every response received:
 84 | #AUTOTHROTTLE_DEBUG = False
 85 | 
 86 | # Enable and configure HTTP caching (disabled by default)
 87 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
 88 | #HTTPCACHE_ENABLED = True
 89 | #HTTPCACHE_EXPIRATION_SECS = 0
 90 | #HTTPCACHE_DIR = 'httpcache'
 91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
 92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 93 | 
 94 | # Set settings whose default value is deprecated to a future-proof value
 95 | REQUEST_FINGERPRINTER_IMPLEMENTATION = '2.7'
 96 | # TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
 97 | 
 98 | # LOG_FILE = 'log/scrapy.log'
 99 | LOG_LEVEL = 'WARNING' # INFO # DEBUG # WARNING # ERROR # CRITICAL
100 | 
101 | CASSANDRA_HOST = ['localhost'] # localhost
102 | CASSANDRA_KEYSPAC = ''
103 | CASSANDRA_TABLE = ''
104 | 
105 | MYSQL_HOST = 'localhost'
106 | MYSQL_PORT = 3306
107 | MYSQL_USER = ''
108 | MYSQL_PASSWORD = ''
109 | MYSQL_DB = ''
110 | MYSQL_TABLE = ''
111 | MYSQL_CHARSET = ''
112 | 
113 | REDIS_HOST = 'localhost'
114 | REDIS_PORT = 6379
115 | REDIS_DATABASE = 0
116 | 
117 | LINE_NOTIFY_TOKEN = ''
118 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Python crawler for news
  2 | 
  3 | Use python scrapy build crawler for real-time Taiwan NEWS website.
  4 | 
  5 | 使用 python scrapy 建置抓取台灣新聞網站即時新聞的爬蟲
  6 | 
  7 | ## TODO LIST
  8 | 
  9 | - 整理 setting 與 cicd
 10 | - 部署 line notify
 11 | 
 12 | - 用成 k8s 部到 GKC，或是 VM 即可？
 13 | - 實作多執行緒，同步爬蟲執行，使用 python script
 14 | - 可以考慮實作 docker install shell
 15 | - 持續修改 Bug
 16 | - 實作全網站一次性爬蟲（提供給 production）
 17 | - 消滅 TODO
 18 | - 寫一隻資料庫清理爬蟲
 19 | 
 20 | ## TODO website
 21 | 
 22 | List from [Alexa台灣排名](https://www.prlass.com/2992/%E5%8F%B0%E7%81%A3%E7%B6%B2%E8%B7%AF%E6%96%B0%E8%81%9E%E5%AA%92%E9%AB%94%E6%B5%81%E9%87%8F%E6%8E%92%E5%90%8D-2018-01/)
 23 | 
 24 | [update 2022/3] [Alexa台灣排名](https://www.prlass.com/4941/%E5%8F%B0%E7%81%A3%E5%AA%92%E9%AB%94%E6%8E%92%E5%90%8D2022%E5%B9%B43%E6%9C%88/)
 25 | 
 26 | `! Alexa停止營運了，之後再看看要換成什麼`
 27 | 
 28 | 1. [自由時報](https://www.ltn.com.tw/)
 29 |     - [2022/12/30] 已更新
 30 | 1. [東森新聞](https://news.ebc.net.tw/)
 31 |     - [2022/12/30] 已更新
 32 | 1. [聯合新聞網](https://udn.com/news/index)
 33 |     - [2022/12/30] 已更新
 34 | 1. [今日新聞](https://www.nownews.com/)
 35 |     - [2023/01/03] 已更新
 36 | 1. [ettoday](https://www.ettoday.net/)
 37 |     - [2023/01/03] 已更新
 38 | 1. [NEW] [巴哈姆特電玩資訊站](https://gnn.gamer.com.tw/)
 39 |     - TODO
 40 | 1. [風傳媒](https://www.storm.mg/)
 41 |     - TODO
 42 | 1. [公司還在嗎?] [頻果新聞網](https://tw.appledaily.com/home)
 43 |     - [2022/12] 尚未檢查
 44 |     - 要使用 javascript
 45 |     - 不能用 cookie,session
 46 |     - 新聞整體格式非主流，例：文章時間
 47 | 1. [中時電子報](https://www.chinatimes.com/)
 48 |     - [2023/01/03] 已更新
 49 | 1. [今周刊](https://www.businesstoday.com.tw/)
 50 |     - [2022/12] 尚未檢查
 51 |     - Maybe need javascript
 52 |     - Non-instant news
 53 |     - Mostly for business news
 54 | 1. [TVBS](https://news.tvbs.com.tw/)
 55 |     - [2023/01/04] 已更新
 56 | 1. [商業週刊](https://www.businessweekly.com.tw/)
 57 |     - [2022/12] 尚未檢查
 58 |     - Non-instant news
 59 |     - Mostly for business news
 60 | 1. [三立新聞網](https://www.setn.com/)
 61 |     - [2023/01/03] 已更新
 62 | 1. [NEW] [民視新聞](https://www.ftvnews.com.tw/)
 63 |     - [2022/12] 尚未檢查
 64 | 1. [中央通訊社](https://www.cna.com.tw/)
 65 |     - [2023/01/04] 已更新
 66 | 1. [關鍵評論網](https://www.thenewslens.com/)
 67 |     - [2022/12] 尚未檢查
 68 |     - Non-instant news
 69 | 
 70 | 
 71 | ## Crawler step
 72 | 
 73 | 1. Request real-time news lists.
 74 | 2. Request news page from setp.1 list.
 75 | 3. Parsing html and get target value. [item.py](crawler_news/items.py)
 76 |     - url
 77 |     - article_from
 78 |     - article_type
 79 |     - title
 80 |     - publish_date
 81 |     - authors
 82 |     - tags
 83 |     - text
 84 |     - text_html
 85 |     - images
 86 |     - video
 87 |     - links
 88 | 4. Save into database. [pipelines.py](crawler_news/pipelines.py)
 89 |     - Default Use Cassandra
 90 |     - [TODO][feature] Use Mongo or Mysql
 91 | 5. Done
 92 | 
 93 | ## Requirement Install
 94 | 
 95 | ### Develop Env
 96 | 
 97 | - python 3.7.6
 98 | - scrapy >= 2.0.0
 99 | - Cassandra 3.11.4
100 | - Develop on macOS (main)
101 | 
102 | ### python scrapy
103 | 
104 | ```bash
105 |     pip install scrapy
106 |     # or
107 |     pip3 install scrapy
108 | ```
109 | 
110 | ### Install Cassandra Database
111 | 
112 | mac os
113 | 
114 | ```bash
115 |     brew install cassandra
116 | ```
117 | 
118 | python extension
119 | 
120 | ```bash
121 |     pip install cassandra-driver
122 |     # or
123 |     pip3 install cassandra-driver
124 | ```
125 | 
126 | start cassandra
127 | 
128 | ```bash
129 |     # start on bash
130 |     cassandra -f
131 | 
132 |     # start on backgroud
133 | ```
134 | 
135 | ### Install Mysql Database
136 | 
137 | mac os
138 | 
139 | ```bash
140 |     brew install mysql
141 | ```
142 | 
143 | python extension
144 | 
145 | ```bash
146 |     pip install PyMySQL
147 |     # or
148 |     pip3 install PyMySQL
149 | ```
150 | 
151 | ## RUN Project
152 | 
153 | ### Run all in localhost terminal
154 | 
155 | ```bash
156 |     ./run_spiders.sh
157 | ```
158 | 
159 | ### Run in Docker use docker-compose.yml
160 | 
161 | 1. build docker image
162 | 
163 | ```bash
164 |     docker build . -t crawler_news
165 | ```
166 | 
167 | If you want exec crawler without database. modify docker/setting.py and re-build.
168 | 
169 | ```bash
170 |     # run without database (linux base command)
171 |     docker run --rm -it -v `pwd`/tmp:/src/tmp -v `pwd`/log:/src/log crawler_news
172 | ```
173 | 
174 | If you want exec single crawler. modify Dockerfile and re-build.
175 | 
176 | ```Dockerfile
177 |     CMD ["/bin/bash"]
178 |     # or assign crawler
179 |     CMD ["scrapy", "crawl", "ettoday"]
180 | ```
181 | 
182 | 1. run docker-compose
183 | 
184 | ```bash
185 |     # start
186 |     docker-compose up -d
187 | 
188 |     # stop
189 |     docker-compose down
190 | ```
191 | 


--------------------------------------------------------------------------------
/crawler_news/middlewares/middlewares.py:
--------------------------------------------------------------------------------
  1 | # Define here the models for your spider middleware
  2 | #
  3 | # See documentation in:
  4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
  5 | 
  6 | from scrapy import signals, Item
  7 | 
  8 | # useful for handling different item types with a single interface
  9 | from itemadapter import is_item, ItemAdapter
 10 | 
 11 | import json
 12 | 
 13 | daily_sec = 60 * 60 * 24
 14 | 
 15 | class CrawlerNewsSpiderMiddleware:
 16 |     # Not all methods need to be defined. If a method is not defined,
 17 |     # scrapy acts as if the spider middleware does not modify the
 18 |     # passed objects.
 19 | 
 20 |     @classmethod
 21 |     def from_crawler(cls, crawler):
 22 |         # print('[middleware] CrawlerNewsSpiderMiddleware from_crawler')
 23 | 
 24 |         # This method is used by Scrapy to create your spiders.
 25 |         s = cls()
 26 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 27 |         return s
 28 | 
 29 |     def process_spider_input(self, response, spider):
 30 |         # print('[middleware] CrawlerNewsSpiderMiddleware process_spider_input')
 31 | 
 32 |         # Called for each response that goes through the spider
 33 |         # middleware and into the spider.
 34 | 
 35 |         # Should return None or raise an exception.
 36 |         return None
 37 | 
 38 |     def process_spider_output(self, response, result, spider):
 39 |         # print('[middleware] CrawlerNewsSpiderMiddleware process_spider_output')
 40 | 
 41 |         # Called with the results returned from the Spider, after
 42 |         # it has processed the response.
 43 | 
 44 |         # Must return an iterable of Request, or item objects.
 45 |         for i in result:
 46 |             if is_item(i) :
 47 |                 spider.redis_client.set(i.get('url'), json.dumps( dict(i) ), ex=daily_sec)
 48 |             yield i
 49 | 
 50 |     def process_spider_exception(self, response, exception, spider):
 51 |         # print('[middleware] CrawlerNewsSpiderMiddleware process_spider_exception')
 52 | 
 53 |         # Called when a spider or process_spider_input() method
 54 |         # (from other spider middleware) raises an exception.
 55 | 
 56 |         # Should return either None or an iterable of Request or item objects.
 57 |         pass
 58 | 
 59 |     def process_start_requests(self, start_requests, spider):
 60 |         # print('[middleware] CrawlerNewsSpiderMiddleware process_start_requests')
 61 | 
 62 |         # Called with the start requests of the spider, and works
 63 |         # similarly to the process_spider_output() method, except
 64 |         # that it doesn’t have a response associated.
 65 | 
 66 |         # Must return only requests (not items).
 67 |         for r in start_requests:
 68 |             yield r
 69 | 
 70 |     def spider_opened(self, spider):
 71 |         # print('[middleware] CrawlerNewsSpiderMiddleware spider_opened')
 72 | 
 73 |         spider.logger.info('Spider opened: %s' % spider.name)
 74 | 
 75 | 
 76 | class CrawlerNewsDownloaderMiddleware:
 77 |     # Not all methods need to be defined. If a method is not defined,
 78 |     # scrapy acts as if the downloader middleware does not modify the
 79 |     # passed objects.
 80 | 
 81 |     @classmethod
 82 |     def from_crawler(cls, crawler):
 83 |         # print('[middleware] CrawlerNewsDownloaderMiddleware from_crawler')
 84 | 
 85 |         # This method is used by Scrapy to create your spiders.
 86 |         s = cls()
 87 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
 88 |         return s
 89 | 
 90 |     def process_request(self, request, spider):
 91 |         # print('[middleware] CrawlerNewsDownloaderMiddleware process_request')
 92 | 
 93 |         # Called for each request that goes through the downloader
 94 |         # middleware.
 95 | 
 96 |         # Must either:
 97 |         # - return None: continue processing this request
 98 |         # - or return a Response object
 99 |         # - or return a Request object
100 |         # - or raise IgnoreRequest: process_exception() methods of
101 |         #   installed downloader middleware will be called
102 |         return None
103 | 
104 |     def process_response(self, request, response, spider):
105 |         # print('[middleware] CrawlerNewsDownloaderMiddleware process_response')
106 | 
107 |         # Called with the response returned from the downloader.
108 | 
109 |         # Must either;
110 |         # - return a Response object
111 |         # - return a Request object
112 |         # - or raise IgnoreRequest
113 |         return response
114 | 
115 |     def process_exception(self, request, exception, spider):
116 |         # print('[middleware] CrawlerNewsDownloaderMiddleware process_exception')
117 | 
118 |         # Called when a download handler or a process_request()
119 |         # (from other downloader middleware) raises an exception.
120 | 
121 |         # Must either:
122 |         # - return None: continue processing this exception
123 |         # - return a Response object: stops process_exception() chain
124 |         # - return a Request object: stops process_exception() chain
125 |         pass
126 | 
127 |     def spider_opened(self, spider):
128 |         # print('[middleware] CrawlerNewsDownloaderMiddleware spider_opened')
129 | 
130 |         spider.logger.info('Spider opened: %s' % spider.name)
131 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/EtToday.py:
--------------------------------------------------------------------------------
  1 | import scrapy
  2 | from crawler_news.items import CrawlerNewsItem
  3 | from scrapy.exceptions import IgnoreRequest
  4 | 
  5 | import time
  6 | import re
  7 | 
  8 | date_str = str(time.strftime("%F", time.localtime()))
  9 | 
 10 | class EtTodaySpider(scrapy.Spider):
 11 |     name = 'ettoday'
 12 |     allowed_domains = ['ettoday.net']
 13 |     base_url = 'https://www.ettoday.net'
 14 | 
 15 |     custom_settings = {
 16 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
 17 | 
 18 |         # https://speed.ettoday.net/robots.txt
 19 |         'ROBOTSTXT_OBEY': False
 20 |     }
 21 | 
 22 |     def start_requests(self):
 23 |         # TODO check date_page 1.exist 2.formet 3.default 2019-12-19
 24 |         date_page = getattr(self, 'page', time.strftime('%Y-%m-%d'))
 25 |         # * raise date_page.re('%Y-%m-%d')
 26 | 
 27 |         list_url = '%s/news/news-list.htm' % (self.base_url)
 28 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
 29 | 
 30 |     def parse_list(self, response):
 31 |         # * raise 404
 32 |         for page_url in response.css('div.part_list_2>h3>a::attr(href)').getall():
 33 |             page_url = self.base_url+page_url
 34 |             if not self.redis_client.exists(page_url):
 35 |                 yield scrapy.Request(url=page_url,callback=self.parse_news,cb_kwargs=dict(req_url=page_url))
 36 | 
 37 |     def parse_news(self, response, req_url):
 38 |         self.logger.info(f"request page: {req_url}")
 39 | 
 40 |         item = CrawlerNewsItem()
 41 | 
 42 |         item['url'] = req_url
 43 |         item['article_from'] = self.name
 44 |         item['article_type'] = 'news'
 45 | 
 46 |         item['title'] = self._parse_title(response)
 47 |         item['publish_date'] = self._parse_publish_date(response)
 48 |         item['authors'] = self._parse_authors(response)
 49 |         item['tags'] = self._parse_tags(response)
 50 |         item['text'] = self._parse_text(response)
 51 |         item['text_html'] = self._parse_text_html(response)
 52 |         item['images'] = self._parse_images(response)
 53 |         item['video'] = self._parse_video(response)
 54 |         item['links'] = self._parse_links(response)
 55 | 
 56 |         return item
 57 | 
 58 |     def _parse_title(self, response):
 59 |         if re.match('https://fashion.', response.url):
 60 |             return response.css('h1.title_article::text').get()
 61 |         else:
 62 |             return response.css('h1.title::text').get()
 63 | 
 64 |     def _parse_publish_date(self, response):
 65 |         if re.match('https://pets.', response.url):
 66 |             return response.css('time.news-time::text').get(default='').strip()
 67 |         if re.match('https://pets.', response.url):
 68 |             return response.css('.subject_article h1::text').get(default='').strip()
 69 |         else:
 70 |             return response.css('time.date::text').get(default='').strip()
 71 | 
 72 |     def _parse_authors(self, response):
 73 |         authors = response.css('div.story>p *::text')
 74 |         if authors.re_first(r'(^[^▲▼（\s]*／[^）\s]*)') != None:
 75 |             return [authors.re_first(r'(^[^▲▼（\s]*／[^）\s]*)')]
 76 |         elif authors.re_first(r'(^.+\/.+)') != None:
 77 |             return [authors.re_first(r'(^.+\/.+)')]
 78 | 
 79 |     def _parse_tags(self, response):
 80 |         news_tags = []
 81 |         if re.match('https://www.', response.url):
 82 |             news_tags = news_tags + response.css('div.part_menu_5>a::text').getall()
 83 |             news_tags = news_tags + response.css('div.part_tag_1>a::text').getall()
 84 |         elif re.match('https://star.', response.url):
 85 |             news_tags = response.css('div.menu_txt_2>a::text').getall()
 86 |         elif re.match('https://fashion.', response.url):
 87 |             news_tags = response.css('div.part_keyword>a::text').getall()
 88 |         elif re.match('https://pets.', response.url) \
 89 |                 or re.match('https://sports.', response.url)\
 90 |                 or re.match('https://house.', response.url)\
 91 |                 or re.match('https://travel.', response.url)\
 92 |                 or re.match('https://health.', response.url)\
 93 |                 or re.match('https://speed.', response.url)\
 94 |                 or re.match('https://discovery.', response.url):
 95 |             news_tags = response.css('div.tag>a::text').getall()
 96 |         elif re.match('https://forum.', response.url):
 97 |             news_tags = response.css('div.part_tag>a::text').getall()
 98 |         else:
 99 |             pass
100 | 
101 |         return news_tags
102 | 
103 |     def _parse_text(self, response):
104 |         text = []
105 |         for t in response.css('div.story[itemprop=articleBody]>p *::text').getall():
106 |             if t.strip() != '':
107 |                 text.append(t.strip())
108 |         return text
109 | 
110 |     def _parse_text_html(self, response):
111 |         return response.css('div.story[itemprop=articleBody]').get()
112 | 
113 |     def _parse_images(self, response):
114 |         return response.css('div.story').css('img::attr(src)').getall()
115 | 
116 |     def _parse_video(self, response):
117 |         return response.css('div.story iframe::attr(src)').getall()
118 | 
119 |     def _parse_links(self, response):
120 |         return response.css('div.story').css('a::attr(href)').getall()
121 | 


--------------------------------------------------------------------------------
/crawler_news/spiders/LibertyTimes.py:
--------------------------------------------------------------------------------
  1 | import scrapy
  2 | from crawler_news.items import CrawlerNewsItem
  3 | 
  4 | import time
  5 | import re
  6 | 
  7 | date_str = str(time.strftime("%F", time.localtime()))
  8 | 
  9 | class LibertyTimesSpider(scrapy.Spider):
 10 |     name = 'libertytimes'
 11 |     allowed_domains = ['ltn.com.tw']
 12 |     base_url = 'https://news.ltn.com.tw'
 13 | 
 14 |     custom_settings = {
 15 |         'LOG_FILE': 'log/%s-%s.log' % (name, date_str),
 16 |     }
 17 | 
 18 |     def start_requests(self):
 19 |         list_url = '%s/list/breakingnews' % self.base_url
 20 |         yield scrapy.Request(url=list_url, callback=self.parse_list)
 21 | 
 22 |     def parse_list(self, response):
 23 |         page_url_list = response.css('ul.list>li>a.tit::attr(href)').getall()
 24 | 
 25 |         self.logger.info(page_url_list)
 26 | 
 27 |         for page_url in page_url_list:
 28 |             if not self.redis_client.exists(page_url):
 29 |                 yield scrapy.Request(url=page_url, callback=self.parse_news)
 30 | 
 31 |     def parse_news(self, response):
 32 |         req_url = response.request.url
 33 | 
 34 |         self.logger.info(f"request page: {req_url}")
 35 | 
 36 |         item = CrawlerNewsItem()
 37 | 
 38 |         item['url'] = req_url
 39 |         item['article_from'] = self.name
 40 |         item['article_type'] = 'news'
 41 | 
 42 |         item['title'] = self._parse_title(response)
 43 |         item['publish_date'] = self._parse_publish_date(response)
 44 |         item['authors'] = self._parse_authors(response)
 45 |         item['tags'] = self._parse_tags(response)
 46 |         item['text'] = self._parse_text(response)
 47 |         item['text_html'] = self._parse_text_html(response)
 48 |         item['images'] = self._parse_images(response)
 49 |         item['video'] = self._parse_video(response)
 50 |         item['links'] = self._parse_links(response)
 51 | 
 52 |         return item
 53 | 
 54 |     def _parse_title(self, response):
 55 |         return response.css('h1::text').get()
 56 | 
 57 |     def _parse_publish_date(self, response):
 58 |         publish_date = response.css('div.content *::text').re_first(r'[0-9/-]+[\s]+[0-9:]+', default='').strip()
 59 | 
 60 |         if re.match('https://news', response.url):
 61 |             publish_date = response.css('div.whitecon span.time::text').get(default='').strip()
 62 |         elif re.match('https://sports', response.url):
 63 |             publish_date = response.css('div.c_time::text').get(default='').strip()
 64 |         elif re.match('https://istyle', response.url):
 65 |             publish_date = response.css('div.label-date::text').get(default='').strip()
 66 |         elif re.match('https://ent', response.url):
 67 |             publish_date = response.css('div.content div.date::text').get(default='').strip()
 68 |         elif re.match('https://auto', response.url):
 69 |             publish_date = response.css('div.con_writer span.h1dt::text').get(default='').strip()
 70 | 
 71 |         return publish_date
 72 | 
 73 |     def _parse_authors(self, response):
 74 |         return [response.css('div.content *::text').re_first(r'[\[〔［].+[／].+[］〕\]]',default='')]
 75 | 
 76 |     def _parse_tags(self, response):
 77 |         # no tags
 78 |         return []
 79 | 
 80 |     def _parse_text(self, response):
 81 |         text = []
 82 |         reStr = ''
 83 |         if re.match('https://sports', response.url):
 84 |             reStr = 'div[itemprop="articleBody"] p::text'
 85 |         elif re.match('https://ent', response.url):
 86 |             reStr = 'div.news_content p:not([class]) *::text'
 87 |         else:
 88 |             reStr = 'div.text>p:not([class]) *::text'
 89 | 
 90 |         if re.match('https://health', response.url) or re.match('https://art', response.url):
 91 |             _text = []
 92 |             for v in response.css(reStr).getall():
 93 |                 if v.strip().startswith('☆') or v.strip().startswith('自由健康網') or v.strip().startswith('自由藝文網'):
 94 |                     continue
 95 |                 _text.append(v.strip())
 96 |         else:
 97 |             _text = response.css(reStr).getall()
 98 | 
 99 |         for t in _text:
100 |             if t.strip() != '':
101 |                 text.append(t.strip())
102 | 
103 |         return text
104 | 
105 |     def _parse_text_html(self, response):
106 |         if re.match('https://sports', response.url):
107 |             return response.css('div.news_p').get()
108 |         elif re.match('https://ent', response.url):
109 |             return response.css('div.news_content').get()
110 |         else:
111 |             return response.css('div.text').get()
112 | 
113 |     def _parse_images(self, response):
114 |         if re.match('https://sports', response.url):
115 |             return response.css('div.news_p').css('img::attr(src)').getall()
116 |         elif re.match('https://ent', response.url):
117 |             return response.css('div.news_content').css('img::attr(data-original)').getall()
118 |         else:
119 |             return response.css('div.text').css('img::attr(src)').getall()
120 | 
121 | 
122 |     def _parse_video(self, response):
123 |         if re.match('https://sports', response.url):
124 |             return response.css('div.news_p').css('iframe::attr(src)').getall()
125 |         elif re.match('https://ent', response.url):
126 |             return response.css('div.news_content').css('iframe::attr(src)').getall()
127 |         else:
128 |             return response.css('div.text').css('iframe::attr(src)').getall()
129 | 
130 |     def _parse_links(self, response):
131 |         if re.match('https://sports', response.url):
132 |             return response.css('div.news_p').css('a::attr(href)').getall()
133 |         elif re.match('https://ent', response.url):
134 |             return response.css('div.news_content').css('a::attr(href)').getall()
135 |         else:
136 |             return response.css('div.text').css('a::attr(href)').getall()
137 | 
138 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
  1 | [MASTER]
  2 | 
  3 | # A comma-separated list of package or module names from where C extensions may
  4 | # be loaded. Extensions are loading into the active Python interpreter and may
  5 | # run arbitrary code.
  6 | extension-pkg-whitelist=
  7 | 
  8 | # Add files or directories to the blacklist. They should be base names, not
  9 | # paths.
 10 | ignore=CVS
 11 | 
 12 | # Add files or directories matching the regex patterns to the blacklist. The
 13 | # regex matches against base names, not paths.
 14 | ignore-patterns=
 15 | 
 16 | # Python code to execute, usually for sys.path manipulation such as
 17 | # pygtk.require().
 18 | #init-hook=
 19 | 
 20 | # Use multiple processes to speed up Pylint. Specifying 0 will auto-detect the
 21 | # number of processors available to use.
 22 | jobs=1
 23 | 
 24 | # Control the amount of potential inferred values when inferring a single
 25 | # object. This can help the performance when dealing with large functions or
 26 | # complex, nested conditions.
 27 | limit-inference-results=100
 28 | 
 29 | # List of plugins (as comma separated values of python module names) to load,
 30 | # usually to register additional checkers.
 31 | load-plugins=
 32 | 
 33 | # Pickle collected data for later comparisons.
 34 | persistent=yes
 35 | 
 36 | # Specify a configuration file.
 37 | #rcfile=
 38 | 
 39 | # When enabled, pylint would attempt to guess common misconfiguration and emit
 40 | # user-friendly hints instead of false-positive error messages.
 41 | suggestion-mode=yes
 42 | 
 43 | # Allow loading of arbitrary C extensions. Extensions are imported into the
 44 | # active Python interpreter and may run arbitrary code.
 45 | unsafe-load-any-extension=no
 46 | 
 47 | 
 48 | [MESSAGES CONTROL]
 49 | 
 50 | # Only show warnings with the listed confidence levels. Leave empty to show
 51 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED.
 52 | confidence=
 53 | 
 54 | # Disable the message, report, category or checker with the given id(s). You
 55 | # can either give multiple identifiers separated by comma (,) or put this
 56 | # option multiple times (only on the command line, not in the configuration
 57 | # file where it should appear only once). You can also use "--disable=all" to
 58 | # disable everything first and then reenable specific checks. For example, if
 59 | # you want to run only the similarities checker, you can use "--disable=all
 60 | # --enable=similarities". If you want to run only the classes checker, but have
 61 | # no Warning level messages displayed, use "--disable=all --enable=classes
 62 | # --disable=W".
 63 | disable=print-statement,
 64 |         parameter-unpacking,
 65 |         unpacking-in-except,
 66 |         old-raise-syntax,
 67 |         backtick,
 68 |         long-suffix,
 69 |         old-ne-operator,
 70 |         old-octal-literal,
 71 |         import-star-module-level,
 72 |         non-ascii-bytes-literal,
 73 |         raw-checker-failed,
 74 |         bad-inline-option,
 75 |         locally-disabled,
 76 |         file-ignored,
 77 |         suppressed-message,
 78 |         useless-suppression,
 79 |         deprecated-pragma,
 80 |         use-symbolic-message-instead,
 81 |         apply-builtin,
 82 |         basestring-builtin,
 83 |         buffer-builtin,
 84 |         cmp-builtin,
 85 |         coerce-builtin,
 86 |         execfile-builtin,
 87 |         file-builtin,
 88 |         long-builtin,
 89 |         raw_input-builtin,
 90 |         reduce-builtin,
 91 |         standarderror-builtin,
 92 |         unicode-builtin,
 93 |         xrange-builtin,
 94 |         coerce-method,
 95 |         delslice-method,
 96 |         getslice-method,
 97 |         setslice-method,
 98 |         no-absolute-import,
 99 |         old-division,
100 |         dict-iter-method,
101 |         dict-view-method,
102 |         next-method-called,
103 |         metaclass-assignment,
104 |         indexing-exception,
105 |         raising-string,
106 |         reload-builtin,
107 |         oct-method,
108 |         hex-method,
109 |         nonzero-method,
110 |         cmp-method,
111 |         input-builtin,
112 |         round-builtin,
113 |         intern-builtin,
114 |         unichr-builtin,
115 |         map-builtin-not-iterating,
116 |         zip-builtin-not-iterating,
117 |         range-builtin-not-iterating,
118 |         filter-builtin-not-iterating,
119 |         using-cmp-argument,
120 |         eq-without-hash,
121 |         div-method,
122 |         idiv-method,
123 |         rdiv-method,
124 |         exception-message-attribute,
125 |         invalid-str-codec,
126 |         sys-max-int,
127 |         bad-python3-import,
128 |         deprecated-string-function,
129 |         deprecated-str-translate-call,
130 |         deprecated-itertools-function,
131 |         deprecated-types-field,
132 |         next-method-defined,
133 |         dict-items-not-iterating,
134 |         dict-keys-not-iterating,
135 |         dict-values-not-iterating,
136 |         deprecated-operator-function,
137 |         deprecated-urllib-function,
138 |         xreadlines-attribute,
139 |         deprecated-sys-function,
140 |         exception-escape,
141 |         comprehension-escape
142 | 
143 | # Enable the message, report, category or checker with the given id(s). You can
144 | # either give multiple identifier separated by comma (,) or put this option
145 | # multiple time (only on the command line, not in the configuration file where
146 | # it should appear only once). See also the "--disable" option for examples.
147 | enable=c-extension-no-member
148 | 
149 | 
150 | [REPORTS]
151 | 
152 | # Python expression which should return a score less than or equal to 10. You
153 | # have access to the variables 'error', 'warning', 'refactor', and 'convention'
154 | # which contain the number of messages in each category, as well as 'statement'
155 | # which is the total number of statements analyzed. This score is used by the
156 | # global evaluation report (RP0004).
157 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
158 | 
159 | # Template used to display messages. This is a python new-style format string
160 | # used to format the message information. See doc for all details.
161 | #msg-template=
162 | 
163 | # Set the output format. Available formats are text, parseable, colorized, json
164 | # and msvs (visual studio). You can also give a reporter class, e.g.
165 | # mypackage.mymodule.MyReporterClass.
166 | output-format=text
167 | 
168 | # Tells whether to display a full report or only the messages.
169 | reports=no
170 | 
171 | # Activate the evaluation score.
172 | score=yes
173 | 
174 | 
175 | [REFACTORING]
176 | 
177 | # Maximum number of nested blocks for function / method body
178 | max-nested-blocks=5
179 | 
180 | # Complete name of functions that never returns. When checking for
181 | # inconsistent-return-statements if a never returning function is called then
182 | # it will be considered as an explicit return statement and no message will be
183 | # printed.
184 | never-returning-functions=sys.exit
185 | 
186 | 
187 | [LOGGING]
188 | 
189 | # Format style used to check logging format string. `old` means using %
190 | # formatting, `new` is for `{}` formatting,and `fstr` is for f-strings.
191 | logging-format-style=old
192 | 
193 | # Logging modules to check that the string format arguments are in logging
194 | # function parameter format.
195 | logging-modules=logging
196 | 
197 | 
198 | [SPELLING]
199 | 
200 | # Limits count of emitted suggestions for spelling mistakes.
201 | max-spelling-suggestions=4
202 | 
203 | # Spelling dictionary name. Available dictionaries: none. To make it work,
204 | # install the python-enchant package.
205 | spelling-dict=
206 | 
207 | # List of comma separated words that should not be checked.
208 | spelling-ignore-words=
209 | 
210 | # A path to a file that contains the private dictionary; one word per line.
211 | spelling-private-dict-file=
212 | 
213 | # Tells whether to store unknown words to the private dictionary (see the
214 | # --spelling-private-dict-file option) instead of raising a message.
215 | spelling-store-unknown-words=no
216 | 
217 | 
218 | [MISCELLANEOUS]
219 | 
220 | # List of note tags to take in consideration, separated by a comma.
221 | notes=FIXME,
222 |       XXX,
223 |       TODO
224 | 
225 | 
226 | [TYPECHECK]
227 | 
228 | # List of decorators that produce context managers, such as
229 | # contextlib.contextmanager. Add to this list to register other decorators that
230 | # produce valid context managers.
231 | contextmanager-decorators=contextlib.contextmanager
232 | 
233 | # List of members which are set dynamically and missed by pylint inference
234 | # system, and so shouldn't trigger E1101 when accessed. Python regular
235 | # expressions are accepted.
236 | generated-members=
237 | 
238 | # Tells whether missing members accessed in mixin class should be ignored. A
239 | # mixin class is detected if its name ends with "mixin" (case insensitive).
240 | ignore-mixin-members=yes
241 | 
242 | # Tells whether to warn about missing members when the owner of the attribute
243 | # is inferred to be None.
244 | ignore-none=yes
245 | 
246 | # This flag controls whether pylint should warn about no-member and similar
247 | # checks whenever an opaque object is returned when inferring. The inference
248 | # can return multiple potential results while evaluating a Python object, but
249 | # some branches might not be evaluated, which results in partial inference. In
250 | # that case, it might be useful to still emit no-member and other checks for
251 | # the rest of the inferred objects.
252 | ignore-on-opaque-inference=yes
253 | 
254 | # List of class names for which member attributes should not be checked (useful
255 | # for classes with dynamically set attributes). This supports the use of
256 | # qualified names.
257 | ignored-classes=optparse.Values,thread._local,_thread._local
258 | 
259 | # List of module names for which member attributes should not be checked
260 | # (useful for modules/projects where namespaces are manipulated during runtime
261 | # and thus existing member attributes cannot be deduced by static analysis). It
262 | # supports qualified module names, as well as Unix pattern matching.
263 | ignored-modules=
264 | 
265 | # Show a hint with possible names when a member name was not found. The aspect
266 | # of finding the hint is based on edit distance.
267 | missing-member-hint=yes
268 | 
269 | # The minimum edit distance a name should have in order to be considered a
270 | # similar match for a missing member name.
271 | missing-member-hint-distance=1
272 | 
273 | # The total number of similar names that should be taken in consideration when
274 | # showing a hint for a missing member.
275 | missing-member-max-choices=1
276 | 
277 | # List of decorators that change the signature of a decorated function.
278 | signature-mutators=
279 | 
280 | 
281 | [VARIABLES]
282 | 
283 | # List of additional names supposed to be defined in builtins. Remember that
284 | # you should avoid defining new builtins when possible.
285 | additional-builtins=
286 | 
287 | # Tells whether unused global variables should be treated as a violation.
288 | allow-global-unused-variables=yes
289 | 
290 | # List of strings which can identify a callback function by name. A callback
291 | # name must start or end with one of those strings.
292 | callbacks=cb_,
293 |           _cb
294 | 
295 | # A regular expression matching the name of dummy variables (i.e. expected to
296 | # not be used).
297 | dummy-variables-rgx=_+$|(_[a-zA-Z0-9_]*[a-zA-Z0-9]+?$)|dummy|^ignored_|^unused_
298 | 
299 | # Argument names that match this expression will be ignored. Default to name
300 | # with leading underscore.
301 | ignored-argument-names=_.*|^ignored_|^unused_
302 | 
303 | # Tells whether we should check for unused import in __init__ files.
304 | init-import=no
305 | 
306 | # List of qualified module names which can have objects that can redefine
307 | # builtins.
308 | redefining-builtins-modules=six.moves,past.builtins,future.builtins,builtins,io
309 | 
310 | 
311 | [FORMAT]
312 | 
313 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
314 | expected-line-ending-format=
315 | 
316 | # Regexp for a line that is allowed to be longer than the limit.
317 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
318 | 
319 | # Number of spaces of indent required inside a hanging or continued line.
320 | indent-after-paren=4
321 | 
322 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
323 | # tab).
324 | indent-string='    '
325 | 
326 | # Maximum number of characters on a single line.
327 | max-line-length=100
328 | 
329 | # Maximum number of lines in a module.
330 | max-module-lines=1000
331 | 
332 | # List of optional constructs for which whitespace checking is disabled. `dict-
333 | # separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
334 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
335 | # `empty-line` allows space-only lines.
336 | no-space-check=trailing-comma,
337 |                dict-separator
338 | 
339 | # Allow the body of a class to be on the same line as the declaration if body
340 | # contains single statement.
341 | single-line-class-stmt=no
342 | 
343 | # Allow the body of an if to be on the same line as the test if there is no
344 | # else.
345 | single-line-if-stmt=no
346 | 
347 | 
348 | [SIMILARITIES]
349 | 
350 | # Ignore comments when computing similarities.
351 | ignore-comments=yes
352 | 
353 | # Ignore docstrings when computing similarities.
354 | ignore-docstrings=yes
355 | 
356 | # Ignore imports when computing similarities.
357 | ignore-imports=no
358 | 
359 | # Minimum lines number of a similarity.
360 | min-similarity-lines=4
361 | 
362 | 
363 | [BASIC]
364 | 
365 | # Naming style matching correct argument names.
366 | argument-naming-style=snake_case
367 | 
368 | # Regular expression matching correct argument names. Overrides argument-
369 | # naming-style.
370 | #argument-rgx=
371 | 
372 | # Naming style matching correct attribute names.
373 | attr-naming-style=snake_case
374 | 
375 | # Regular expression matching correct attribute names. Overrides attr-naming-
376 | # style.
377 | #attr-rgx=
378 | 
379 | # Bad variable names which should always be refused, separated by a comma.
380 | bad-names=foo,
381 |           bar,
382 |           baz,
383 |           toto,
384 |           tutu,
385 |           tata
386 | 
387 | # Naming style matching correct class attribute names.
388 | class-attribute-naming-style=any
389 | 
390 | # Regular expression matching correct class attribute names. Overrides class-
391 | # attribute-naming-style.
392 | #class-attribute-rgx=
393 | 
394 | # Naming style matching correct class names.
395 | class-naming-style=PascalCase
396 | 
397 | # Regular expression matching correct class names. Overrides class-naming-
398 | # style.
399 | #class-rgx=
400 | 
401 | # Naming style matching correct constant names.
402 | const-naming-style=UPPER_CASE
403 | 
404 | # Regular expression matching correct constant names. Overrides const-naming-
405 | # style.
406 | #const-rgx=
407 | 
408 | # Minimum line length for functions/classes that require docstrings, shorter
409 | # ones are exempt.
410 | docstring-min-length=-1
411 | 
412 | # Naming style matching correct function names.
413 | function-naming-style=snake_case
414 | 
415 | # Regular expression matching correct function names. Overrides function-
416 | # naming-style.
417 | #function-rgx=
418 | 
419 | # Good variable names which should always be accepted, separated by a comma.
420 | good-names=i,
421 |            j,
422 |            k,
423 |            ex,
424 |            Run,
425 |            _
426 | 
427 | # Include a hint for the correct naming format with invalid-name.
428 | include-naming-hint=no
429 | 
430 | # Naming style matching correct inline iteration names.
431 | inlinevar-naming-style=any
432 | 
433 | # Regular expression matching correct inline iteration names. Overrides
434 | # inlinevar-naming-style.
435 | #inlinevar-rgx=
436 | 
437 | # Naming style matching correct method names.
438 | method-naming-style=snake_case
439 | 
440 | # Regular expression matching correct method names. Overrides method-naming-
441 | # style.
442 | #method-rgx=
443 | 
444 | # Naming style matching correct module names.
445 | module-naming-style=snake_case
446 | 
447 | # Regular expression matching correct module names. Overrides module-naming-
448 | # style.
449 | #module-rgx=
450 | 
451 | # Colon-delimited sets of names that determine each other's naming style when
452 | # the name regexes allow several styles.
453 | name-group=
454 | 
455 | # Regular expression which should only match function or class names that do
456 | # not require a docstring.
457 | no-docstring-rgx=^_
458 | 
459 | # List of decorators that produce properties, such as abc.abstractproperty. Add
460 | # to this list to register other decorators that produce valid properties.
461 | # These decorators are taken in consideration only for invalid-name.
462 | property-classes=abc.abstractproperty
463 | 
464 | # Naming style matching correct variable names.
465 | variable-naming-style=snake_case
466 | 
467 | # Regular expression matching correct variable names. Overrides variable-
468 | # naming-style.
469 | #variable-rgx=
470 | 
471 | 
472 | [STRING]
473 | 
474 | # This flag controls whether the implicit-str-concat-in-sequence should
475 | # generate a warning on implicit string concatenation in sequences defined over
476 | # several lines.
477 | check-str-concat-over-line-jumps=no
478 | 
479 | 
480 | [IMPORTS]
481 | 
482 | # List of modules that can be imported at any level, not just the top level
483 | # one.
484 | allow-any-import-level=
485 | 
486 | # Allow wildcard imports from modules that define __all__.
487 | allow-wildcard-with-all=no
488 | 
489 | # Analyse import fallback blocks. This can be used to support both Python 2 and
490 | # 3 compatible code, which means that the block might have code that exists
491 | # only in one or another interpreter, leading to false positives when analysed.
492 | analyse-fallback-blocks=no
493 | 
494 | # Deprecated modules which should not be used, separated by a comma.
495 | deprecated-modules=optparse,tkinter.tix
496 | 
497 | # Create a graph of external dependencies in the given file (report RP0402 must
498 | # not be disabled).
499 | ext-import-graph=
500 | 
501 | # Create a graph of every (i.e. internal and external) dependencies in the
502 | # given file (report RP0402 must not be disabled).
503 | import-graph=
504 | 
505 | # Create a graph of internal dependencies in the given file (report RP0402 must
506 | # not be disabled).
507 | int-import-graph=
508 | 
509 | # Force import order to recognize a module as part of the standard
510 | # compatibility libraries.
511 | known-standard-library=
512 | 
513 | # Force import order to recognize a module as part of a third party library.
514 | known-third-party=enchant
515 | 
516 | # Couples of modules and preferred modules, separated by a comma.
517 | preferred-modules=
518 | 
519 | 
520 | [CLASSES]
521 | 
522 | # List of method names used to declare (i.e. assign) instance attributes.
523 | defining-attr-methods=__init__,
524 |                       __new__,
525 |                       setUp,
526 |                       __post_init__
527 | 
528 | # List of member names, which should be excluded from the protected access
529 | # warning.
530 | exclude-protected=_asdict,
531 |                   _fields,
532 |                   _replace,
533 |                   _source,
534 |                   _make
535 | 
536 | # List of valid names for the first argument in a class method.
537 | valid-classmethod-first-arg=cls
538 | 
539 | # List of valid names for the first argument in a metaclass class method.
540 | valid-metaclass-classmethod-first-arg=cls
541 | 
542 | 
543 | [DESIGN]
544 | 
545 | # Maximum number of arguments for function / method.
546 | max-args=5
547 | 
548 | # Maximum number of attributes for a class (see R0902).
549 | max-attributes=7
550 | 
551 | # Maximum number of boolean expressions in an if statement (see R0916).
552 | max-bool-expr=5
553 | 
554 | # Maximum number of branch for function / method body.
555 | max-branches=12
556 | 
557 | # Maximum number of locals for function / method body.
558 | max-locals=15
559 | 
560 | # Maximum number of parents for a class (see R0901).
561 | max-parents=7
562 | 
563 | # Maximum number of public methods for a class (see R0904).
564 | max-public-methods=20
565 | 
566 | # Maximum number of return / yield for function / method body.
567 | max-returns=6
568 | 
569 | # Maximum number of statements in function / method body.
570 | max-statements=50
571 | 
572 | # Minimum number of public methods for a class (see R0903).
573 | min-public-methods=2
574 | 
575 | 
576 | [EXCEPTIONS]
577 | 
578 | # Exceptions that will emit a warning when being caught. Defaults to
579 | # "BaseException, Exception".
580 | overgeneral-exceptions=BaseException,
581 |                        Exception
582 | 


--------------------------------------------------------------------------------