├── .dockerignore ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md └── workflows │ ├── build.yml │ ├── deploy.yml │ └── package.yml ├── .gitignore ├── Dockerfile ├── README.md ├── deployment-comment.yml ├── deployment-single.yml ├── deployment-universal.yml ├── docker-compose.yml ├── requirements.txt ├── scrapy.cfg └── weibo ├── __init__.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py └── spiders ├── __init__.py ├── comment.py ├── single.py └── universal.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | env.sh -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: Germey 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 
12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Environment (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - OS Version [e.g. 22] 29 | - Python [e.g. 3.6.2] 30 | 31 | **Additional context** 32 | Add any other context about the problem here. 33 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: 3 | push: 4 | branches: 5 | - release 6 | paths-ignore: 7 | - .gitignore 8 | - README.md 9 | - '.github/ISSUE_TEMPLATE/**' 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Source 15 | uses: actions/checkout@v1 16 | - name: Docker Login 17 | run: docker login -u germey -p ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} 18 | - name: Build the Docker Image 19 | run: docker-compose build 20 | - name: Push the Docker Image 21 | run: docker-compose push 22 | -------------------------------------------------------------------------------- /.github/workflows/deploy.yml: -------------------------------------------------------------------------------- 1 | name: deploy 2 | on: 3 | push: 4 | branches: 5 | - release 6 | paths-ignore: 7 | - .gitignore 8 | - README.md 9 | - '.github/ISSUE_TEMPLATE/**' 10 | jobs: 11 | run: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout 15 | uses: actions/checkout@master 16 | - name: Docker Login 17 | uses: Azure/docker-login@v1 18 | with: 19 | username: germey 20 | password: ${{ secrets.DOCKERHUB_LOGIN_PASSWORD }} 21 | - name: Set Kubectl 22 | uses: Azure/k8s-set-context@v1 23 | with: 24 | kubeconfig: ${{ secrets.KUBE_CONFIG }} 25 | - name: Test Kubectl 26 | run: | 27 | kubectl get nodes 28 | kubectl get svc -n crawler 29 | - name: Generate Build Number 30 | uses: einaregilsson/build-number@v2 31 | with: 32 | token: ${{ secrets.github_token }} 33 | - name: Get Build Number 34 | run: | 35 | echo $BUILD_NUMBER 36 | - name: Build Push Deploy 37 | run: | 38 | docker-compose build 39 | docker tag germey/crawler-weibo-universal germey/crawler-weibo-universal:$BUILD_NUMBER 40 | docker push germey/crawler-weibo-universal:$BUILD_NUMBER 41 | docker tag germey/crawler-weibo-comment germey/crawler-weibo-comment:$BUILD_NUMBER 42 | docker push germey/crawler-weibo-comment:$BUILD_NUMBER 43 | cat deployment.yml | sed 's/\${TAG}/'"$BUILD_NUMBER"'/g' | kubectl apply -f - 44 | -------------------------------------------------------------------------------- /.github/workflows/package.yml: -------------------------------------------------------------------------------- 1 | name: package 2 | on: 3 | push: 4 | branches: 5 | - release 6 | paths-ignore: 7 | - .gitignore 8 | - README.md 9 | - '.github/ISSUE_TEMPLATE/**' 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Source 15 | uses: actions/checkout@v1 16 | - name: Setup Docker 17 | uses: docker-practice/actions-setup-docker@0.0.1 18 | - name: Docker Login 19 | run: docker login -u Germey -p ${{ secrets.github_token }} docker.pkg.github.com 20 | - name: Build the Docker Image 21 | run: docker-compose build 22 | - name: Tag the Docker Image 23 | run: docker tag 
germey/crawler-weibo-universal docker.pkg.github.com/python3webspider/weibocrawler/crawler-weibo-universal:master 24 | - name: Push the Docker Image 25 | run: docker push docker.pkg.github.com/python3webspider/weibocrawler/crawler-weibo-universal:master 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | env.sh 134 | 135 | .idea/ -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.6 2 | WORKDIR /app 3 | ADD requirements.txt . 4 | RUN pip3 install -r requirements.txt 5 | ADD . . 
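Note that the Dockerfile stops at `ADD . .` and sets no crawl command of its own: the actual `scrapy crawl comment|single|universal` invocation and all connection settings are injected at runtime by docker-compose.yml and the Kubernetes manifests that follow. A minimal sketch of that runtime contract, mirroring how settings.py and spiders/single.py read the injected environment with environs — the variable names come from the manifests, the commented endpoint values are placeholders:

```python
# Hedged sketch of the environment the container expects at runtime.
# Names match docker-compose.yml / the k8s manifests; example values are placeholders.
from environs import Env

env = Env()
redis_url = env.str('REDIS_CONNECTION_STRING')        # e.g. redis://redis:6379
es_url = env.str('ELASTICSEARCH_CONNECTION_STRING')   # e.g. http://elasticsearch:9200
proxypool_url = env.str('PROXYPOOL_URL')              # e.g. http://proxypool:5555/random
proxypool_enabled = env.bool('PROXYPOOL_ENABLED', True)
weibo_id = env.str('WEIBO_WEIBO_ID', '4467107636950632')   # used by the single spider only
```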
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weibo 2 | 3 | Weibo Spider Using Scrapy -------------------------------------------------------------------------------- /deployment-comment.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | creationTimestamp: null 5 | name: crawler 6 | --- 7 | apiVersion: v1 8 | items: 9 | - apiVersion: apps/v1 10 | kind: Deployment 11 | metadata: 12 | annotations: 13 | kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml 14 | kompose.version: 1.20.0 () 15 | creationTimestamp: null 16 | labels: 17 | io.kompose.service: crawler-weibo-comment 18 | name: crawler-weibo-comment 19 | namespace: crawler 20 | spec: 21 | replicas: 10 22 | selector: 23 | matchLabels: 24 | io.kompose.service: crawler-weibo-comment 25 | revisionHistoryLimit: 1 26 | strategy: {} 27 | template: 28 | metadata: 29 | annotations: 30 | kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml 31 | kompose.version: 1.20.0 () 32 | creationTimestamp: null 33 | labels: 34 | io.kompose.service: crawler-weibo-comment 35 | spec: 36 | containers: 37 | - args: 38 | - scrapy 39 | - crawl 40 | - comment 41 | env: 42 | - name: ELASTICSEARCH_CONNECTION_STRING 43 | valueFrom: 44 | secretKeyRef: 45 | name: elasticsearch 46 | key: connection_string 47 | - name: REDIS_CONNECTION_STRING 48 | valueFrom: 49 | secretKeyRef: 50 | name: redis 51 | key: connection_string 52 | - name: PROXYPOOL_URL 53 | valueFrom: 54 | secretKeyRef: 55 | name: adslproxy 56 | key: universal 57 | - name: PROXYPOOL_ENABLED 58 | value: 'true' 59 | image: germey/crawler-weibo-comment 60 | name: crawler-weibo-comment 61 | resources: 62 | limits: 63 | memory: "200Mi" 64 | cpu: "150m" 65 | requests: 66 | memory: "200Mi" 67 | cpu: "150m" 68 | restartPolicy: Always 69 | status: {} 70 | kind: List 71 | metadata: {} 72 | -------------------------------------------------------------------------------- /deployment-single.yml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1beta1 2 | kind: CronJob 3 | metadata: 4 | name: crawler-weibo-single 5 | namespace: crawler 6 | spec: 7 | schedule: "*/20 * * * *" 8 | jobTemplate: 9 | spec: 10 | template: 11 | spec: 12 | containers: 13 | - args: 14 | - scrapy 15 | - crawl 16 | - single 17 | env: 18 | - name: WEIBO_WEIBO_ID 19 | value: '4467107636950632' 20 | - name: WEIBO_COOKIES 21 | valueFrom: 22 | secretKeyRef: 23 | name: weibo 24 | key: cookies 25 | - name: ELASTICSEARCH_CONNECTION_STRING 26 | valueFrom: 27 | secretKeyRef: 28 | name: elasticsearch 29 | key: connection_string 30 | - name: REDIS_CONNECTION_STRING 31 | valueFrom: 32 | secretKeyRef: 33 | name: redis 34 | key: connection_string 35 | - name: PROXYPOOL_URL 36 | valueFrom: 37 | secretKeyRef: 38 | name: adslproxy 39 | key: universal 40 | - name: PROXYPOOL_ENABLED 41 | value: 'true' 42 | image: germey/crawler-weibo-single 43 | name: crawler-weibo-single 44 | imagePullPolicy: Always 45 | resources: 46 | limits: 47 | memory: "200Mi" 48 | cpu: "150m" 49 | requests: 50 | memory: "200Mi" 51 | cpu: "150m" 52 | restartPolicy: OnFailure -------------------------------------------------------------------------------- /deployment-universal.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: 
Namespace 3 | metadata: 4 | creationTimestamp: null 5 | name: crawler 6 | --- 7 | apiVersion: v1 8 | items: 9 | - apiVersion: extensions/v1beta1 10 | kind: Deployment 11 | metadata: 12 | annotations: 13 | kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml 14 | kompose.version: 1.20.0 () 15 | creationTimestamp: null 16 | labels: 17 | io.kompose.service: crawler-weibo-universal 18 | name: crawler-weibo-universal 19 | namespace: crawler 20 | spec: 21 | replicas: 5 22 | revisionHistoryLimit: 1 23 | strategy: {} 24 | template: 25 | metadata: 26 | annotations: 27 | kompose.cmd: kompose convert -f docker-compose.yml -o deployment.yml 28 | kompose.version: 1.20.0 () 29 | creationTimestamp: null 30 | labels: 31 | io.kompose.service: crawler-weibo-universal 32 | spec: 33 | containers: 34 | - args: 35 | - scrapy 36 | - crawl 37 | - universal 38 | env: 39 | - name: ELASTICSEARCH_CONNECTION_STRING 40 | valueFrom: 41 | secretKeyRef: 42 | name: elasticsearch 43 | key: connection_string 44 | - name: REDIS_CONNECTION_STRING 45 | valueFrom: 46 | secretKeyRef: 47 | name: redis 48 | key: connection_string 49 | - name: PROXYPOOL_URL 50 | valueFrom: 51 | secretKeyRef: 52 | name: adslproxy 53 | key: universal 54 | - name: PROXYPOOL_ENABLED 55 | value: 'true' 56 | image: germey/crawler-weibo-universal:${TAG} 57 | name: crawler-weibo-universal 58 | resources: 59 | limits: 60 | memory: "200Mi" 61 | cpu: "150m" 62 | requests: 63 | memory: "200Mi" 64 | cpu: "150m" 65 | restartPolicy: Always 66 | status: {} 67 | kind: List 68 | metadata: {} 69 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | # crawler-weibo-universal: 4 | # container_name: 'crawler-weibo-universal' 5 | # restart: always 6 | # build: . 7 | # image: 'germey/crawler-weibo-universal' 8 | # command: 'scrapy crawl universal' 9 | # environment: 10 | # ELASTICSEARCH_CONNECTION_STRING: 11 | # REDIS_CONNECTION_STRING: 12 | crawler-weibo-comment: 13 | container_name: 'crawler-weibo-comment' 14 | restart: always 15 | build: . 16 | image: 'germey/crawler-weibo-comment:1' 17 | command: 'scrapy crawl comment' 18 | environment: 19 | ELASTICSEARCH_CONNECTION_STRING: 20 | REDIS_CONNECTION_STRING: 21 | crawler-weibo-single: 22 | container_name: 'crawler-weibo-single' 23 | restart: always 24 | build: . 
25 | image: 'germey/crawler-weibo-single' 26 | command: 'scrapy crawl single' 27 | environment: 28 | ELASTICSEARCH_CONNECTION_STRING: 29 | REDIS_CONNECTION_STRING: 30 | PROXYPOOL_URL: 31 | PROXYTUNNEL_URL: 32 | WEIBO_COOKIES: 33 | WEIBO_WEIBO_ID: '4467107636950632' 34 | PROXYPOOL_ENABLED: 'false' 35 | START_COMMENT_ID: '4489943772285925' -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | elasticsearch==7.0.2 2 | Scrapy==1.8.1 3 | requests==2.22.0 4 | environs==7.2.0 5 | pymongo==3.10.1 6 | scrapy-redis==0.6.8 7 | pytz==2019.1 8 | dateparser==0.7.2 -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weibo.settings -------------------------------------------------------------------------------- /weibo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Python3WebSpider/WeiboCrawler/17581878f5698b60a913646f599c0600ce5dd8fe/weibo/__init__.py -------------------------------------------------------------------------------- /weibo/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | from scrapy import Item, Field 9 | 10 | 11 | class UserItem(Item): 12 | collection = 'users' 13 | index = 'weibo-users' 14 | type = 'user' 15 | 16 | id = Field() 17 | name = Field() 18 | avatar = Field() 19 | cover = Field() 20 | gender = Field() 21 | description = Field() 22 | fans_count = Field() 23 | follows_count = Field() 24 | weibos_count = Field() 25 | verified = Field() 26 | verified_reason = Field() 27 | verified_type = Field() 28 | follows = Field() 29 | fans = Field() 30 | crawled_at = Field() 31 | 32 | 33 | class WeiboItem(Item): 34 | collection = 'weibos' 35 | index = 'weibo-weibos' 36 | type = 'weibo' 37 | id = Field() 38 | attitudes_count = Field() 39 | comments_count = Field() 40 | reposts_count = Field() 41 | picture = Field() 42 | pictures = Field() 43 | source = Field() 44 | text = Field() 45 | raw_text = Field() 46 | thumbnail = Field() 47 | user = Field() 48 | user_name = Field() 49 | created_at = Field() 50 | crawled_at = Field() 51 | 52 | 53 | class CommentItem(Item): 54 | collection = 'comments' 55 | index = 'weibo-comments' 56 | type = 'comment' 57 | 58 | id = Field() 59 | likes_count = Field() 60 | source = Field() 61 | text = Field() 62 | raw_text = Field() 63 | user = Field() 64 | created_at = Field() 65 | reply_id = Field() 66 | reply_text = Field() 67 | reply_raw_text = Field() 68 | weibo = Field() 69 | crawled_at = Field() 70 | -------------------------------------------------------------------------------- /weibo/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | import logging 8 | import requests 9 | 
import json 10 | import re 11 | 12 | from scrapy.downloadermiddlewares.retry import RetryMiddleware 13 | 14 | logger = logging.getLogger(__name__) 15 | 16 | 17 | class ProxytunnelMiddleware(object): 18 | def __init__(self, proxytunnel_url): 19 | self.logger = logger 20 | self.proxytunnel_url = proxytunnel_url 21 | 22 | def process_request(self, request, spider): 23 | """ 24 | if retry_times > 0, route the request through the proxy tunnel 25 | :param request: 26 | :param spider: 27 | :return: 28 | """ 29 | if request.meta.get('retry_times'): 30 | self.logger.debug('Using proxytunnel') 31 | request.meta['proxy'] = self.proxytunnel_url 32 | 33 | @classmethod 34 | def from_crawler(cls, crawler): 35 | settings = crawler.settings 36 | return cls( 37 | proxytunnel_url=settings.get('PROXYTUNNEL_URL') 38 | ) 39 | 40 | 41 | class CSRFTokenMiddleware(object): 42 | 43 | def process_request(self, request, spider): 44 | pass 45 | 46 | 47 | class RetryCommentMiddleware(RetryMiddleware): 48 | 49 | def process_response(self, request, response, spider): 50 | try: 51 | result = json.loads(response.text) 52 | if not result.get('ok') == 1: 53 | logger.info('Retrying times %s', request.meta.get('retry_times', 0)) 54 | return self._retry(request, 'Status not OK', spider) or response 55 | return response 56 | except json.decoder.JSONDecodeError: 57 | logger.info('Json decode error, content %s', response.text) 58 | return self._retry(request, 'Json Decode Error', spider) or response 59 | 60 | 61 | class ProxypoolMiddleware(object): 62 | """ 63 | proxy middleware for changing proxy 64 | """ 65 | 66 | def __init__(self, proxypool_url): 67 | self.logger = logging.getLogger(__name__) 68 | if re.search('^https?://\S+:\S+@\S+', proxypool_url): 69 | result = re.search('https?://(\S+):(\S+)@\S+', proxypool_url) 70 | self.auth = result.group(1), result.group(2) 71 | self.proxypool_url = re.sub('(https?://)\S+:\S+@(\S+)', r'\1\2', proxypool_url) 72 | else: 73 | self.proxypool_url = proxypool_url 74 | 75 | def get_random_proxy(self): 76 | """ 77 | get random proxy from proxypool 78 | :return: 79 | """ 80 | try: 81 | if getattr(self, 'auth', None): 82 | response = requests.get(self.proxypool_url, timeout=5, auth=self.auth) 83 | else: 84 | response = requests.get(self.proxypool_url, timeout=5) 85 | if response.status_code == 200: 86 | proxy = response.text 87 | return proxy 88 | except requests.ConnectionError: 89 | return False 90 | 91 | def process_request(self, request, spider): 92 | """ 93 | if retry_times > 0, get random proxy 94 | :param request: 95 | :param spider: 96 | :return: 97 | """ 98 | if request.meta.get('retry_times'): 99 | proxy = self.get_random_proxy() 100 | self.logger.debug('Get proxy %s', proxy) 101 | if proxy: 102 | uri = 'http://{proxy}'.format(proxy=proxy) 103 | self.logger.debug('Using proxy %s', proxy) 104 | request.meta['proxy'] = uri 105 | 106 | @classmethod 107 | def from_crawler(cls, crawler): 108 | settings = crawler.settings 109 | return cls( 110 | proxypool_url=settings.get('PROXYPOOL_URL') 111 | ) 112 | -------------------------------------------------------------------------------- /weibo/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import re, time 4 | from datetime import datetime 5 | from elasticsearch import Elasticsearch 6 | from scrapy import Selector 7 | import pymongo 8 | from weibo.items import * 9 | from twisted.internet.threads import deferToThread 10 | import pytz 11 | import dateparser 12 | 13 |
logging.getLogger('scrapy.core.scraper').setLevel(logging.INFO) 14 | 15 | 16 | class TimePipeline(): 17 | """ 18 | time pipeline 19 | """ 20 | 21 | def process_item(self, item, spider): 22 | """ 23 | add crawled_at attr 24 | :param item: 25 | :param spider: 26 | :return: 27 | """ 28 | if isinstance(item, UserItem) or isinstance(item, WeiboItem) or isinstance(item, CommentItem): 29 | item['crawled_at'] = datetime.now(tz=pytz.utc) 30 | return item 31 | 32 | 33 | class WeiboPipeline(): 34 | """ 35 | weibo pipeline 36 | """ 37 | 38 | def parse_time(self, date): 39 | """ 40 | parse weibo time 41 | :param date: 42 | :return: 43 | """ 44 | if re.match('刚刚', date): 45 | date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 46 | if re.match('\d+分钟前', date): 47 | minute = re.match('(\d+)', date).group(1) 48 | date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - float(minute) * 60)) 49 | if re.match('\d+小时前', date): 50 | hour = re.match('(\d+)', date).group(1) 51 | date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - float(hour) * 60 * 60)) 52 | if re.match('昨天.*', date): 53 | date = re.match('昨天(.*)', date).group(1).strip() 54 | date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date + ':00' 55 | if re.match('\d{2}-\d{2}', date): 56 | date = time.strftime('%Y-', time.localtime()) + date + ' 00:00:00' 57 | return dateparser.parse(date) 58 | 59 | def process_item(self, item, spider): 60 | """ 61 | process weibo item 62 | :param item: 63 | :param spider: 64 | :return: 65 | """ 66 | if isinstance(item, WeiboItem): 67 | if item.get('created_at'): 68 | item['created_at'] = item['created_at'].strip() 69 | item['created_at'] = self.parse_time(item.get('created_at')) 70 | if item.get('pictures'): 71 | item['pictures'] = [pic.get('url') for pic in item.get('pictures')] 72 | if item.get('text'): 73 | item['raw_text'] = ''.join(Selector(text=item.get('text')).xpath('//text()').extract()) 74 | return item 75 | 76 | 77 | class CommentPipeline(): 78 | """ 79 | comment pipeline 80 | """ 81 | 82 | def parse_time(self, date): 83 | """ 84 | parse comment time 85 | :param date: 86 | :return: 87 | """ 88 | if re.match('刚刚', date): 89 | date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) 90 | if re.match('\d+分钟前', date): 91 | minute = re.match('(\d+)', date).group(1) 92 | date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - float(minute) * 60)) 93 | if re.match('\d+小时前', date): 94 | hour = re.match('(\d+)', date).group(1) 95 | date = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time() - float(hour) * 60 * 60)) 96 | if re.match('昨天.*', date): 97 | date = re.match('昨天(.*)', date).group(1).strip() 98 | date = time.strftime('%Y-%m-%d', time.localtime(time.time() - 24 * 60 * 60)) + ' ' + date + ':00' 99 | if re.match('\d{2}-\d{2}', date): 100 | date = time.strftime('%Y-', time.localtime()) + date + ' 00:00:00' 101 | return dateparser.parse(date) 102 | 103 | def process_item(self, item, spider): 104 | """ 105 | process comment item 106 | :param item: 107 | :param spider: 108 | :return: 109 | """ 110 | if isinstance(item, CommentItem): 111 | if item.get('user'): 112 | item['user'] = item.get('user').get('id') 113 | if item.get('text'): 114 | item['raw_text'] = ''.join(Selector(text=item.get('text')).xpath('//text()').extract()) 115 | if re.search('回复.*?\:(.*?)', item.get('raw_text')): 116 | item['raw_text'] = re.search('回复.*?\:(.*)', item.get('raw_text')).group(1) 117 | if item.get('reply_text'): 
118 | item['reply_raw_text'] = ''.join(Selector(text=item.get('reply_text')).xpath('//text()').extract()) 119 | if re.search('回复.*?\:(.*?)', item.get('reply_raw_text')): 120 | item['reply_raw_text'] = re.search('回复.*?\:(.*)', item.get('reply_raw_text')).group(1) 121 | if item.get('created_at'): 122 | item['created_at'] = item['created_at'].strip() 123 | item['created_at'] = self.parse_time(item.get('created_at')) 124 | return item 125 | 126 | 127 | class MongoPipeline(object): 128 | """ 129 | mongodb pipeline 130 | """ 131 | 132 | def __init__(self, mongo_uri, mongo_db): 133 | """ 134 | init conn 135 | :param mongo_uri: 136 | :param mongo_db: 137 | """ 138 | self.mongo_uri = mongo_uri 139 | self.mongo_db = mongo_db 140 | 141 | @classmethod 142 | def from_crawler(cls, crawler): 143 | """ 144 | get settings 145 | :param crawler: 146 | :return: 147 | """ 148 | return cls( 149 | mongo_uri=crawler.settings.get('MONGO_URI'), 150 | mongo_db=crawler.settings.get('MONGO_DATABASE') 151 | ) 152 | 153 | def open_spider(self, spider): 154 | """ 155 | create conn while creating spider 156 | :param spider: 157 | :return: 158 | """ 159 | self.client = pymongo.MongoClient(self.mongo_uri) 160 | self.db = self.client[self.mongo_db] 161 | self.db[UserItem.collection].create_index([('id', pymongo.ASCENDING)]) 162 | self.db[WeiboItem.collection].create_index([('id', pymongo.ASCENDING)]) 163 | 164 | def close_spider(self, spider): 165 | """ 166 | close conn 167 | :param spider: 168 | :return: 169 | """ 170 | self.client.close() 171 | 172 | def _process_item(self, item, spider): 173 | """ 174 | main processor 175 | :param item: 176 | :param spider: 177 | :return: 178 | """ 179 | if isinstance(item, UserItem) or isinstance(item, WeiboItem) or isinstance(item, CommentItem): 180 | self.db[item.collection].update({'id': item.get('id')}, {'$set': item}, True) 181 | return item 182 | 183 | def process_item(self, item, spider): 184 | """ 185 | process item using defer 186 | :param item: 187 | :param spider: 188 | :return: 189 | """ 190 | return deferToThread(self._process_item, item, spider) 191 | 192 | 193 | class ElasticsearchPipeline(object): 194 | """ 195 | pipeline for elasticsearch 196 | """ 197 | 198 | def __init__(self, connection_string): 199 | """ 200 | init connection_string and mappings 201 | :param connection_string: 202 | """ 203 | self.connection_string = connection_string 204 | 205 | @classmethod 206 | def from_crawler(cls, crawler): 207 | """ 208 | class method for pipeline 209 | :param crawler: scrapy crawler 210 | :return: 211 | """ 212 | return cls( 213 | connection_string=crawler.settings.get('ELASTICSEARCH_CONNECTION_STRING'), 214 | ) 215 | 216 | def open_spider(self, spider): 217 | """ 218 | open spider to do 219 | :param spider: 220 | :return: 221 | """ 222 | self.conn = Elasticsearch( 223 | hosts=[self.connection_string] 224 | ) 225 | 226 | def _process_item(self, item, spider): 227 | """ 228 | main process 229 | :param item: user or weibo or comment item 230 | :param spider: 231 | :return: 232 | """ 233 | if isinstance(item, UserItem) or isinstance(item, WeiboItem) or isinstance(item, CommentItem): 234 | self.conn.index(index=item.index, 235 | id=item['id'], 236 | doc_type=item.type, 237 | body=dict(item), timeout=60) 238 | return item 239 | 240 | def process_item(self, item, spider): 241 | """ 242 | process item using deferToThread 243 | :param item: 244 | :param spider: 245 | :return: 246 | """ 247 | return deferToThread(self._process_item, item, spider) 248 | 
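The pipelines above are ordinary classes with no Scrapy-specific state, so their behaviour is easy to check in isolation: TimePipeline stamps crawled_at, and WeiboPipeline normalises the relative Chinese timestamps (刚刚, N分钟前, 昨天 …) and strips the HTML out of text into raw_text. An illustrative sketch (not part of the repo; the field values are made up):

```python
# Illustrative only: push a hand-built item through two pipelines outside a crawl.
from weibo.items import WeiboItem
from weibo.pipelines import TimePipeline, WeiboPipeline

item = WeiboItem(
    id='4467107636950632',                               # placeholder weibo id
    text='<a href="/n/somebody">@somebody</a> 转发微博',  # raw HTML as returned by m.weibo.cn
    created_at='5分钟前',                                  # relative timestamp
)
item = TimePipeline().process_item(item, spider=None)    # adds crawled_at (UTC)
item = WeiboPipeline().process_item(item, spider=None)   # parses created_at, fills raw_text
print(item['crawled_at'], item['created_at'], item['raw_text'])
```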
-------------------------------------------------------------------------------- /weibo/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import logging 3 | import urllib3 4 | from environs import Env 5 | 6 | urllib3.disable_warnings() 7 | logging.getLogger('py.warnings').setLevel(logging.ERROR) 8 | logging.getLogger('urllib3.connectionpool').setLevel(logging.INFO) 9 | logging.getLogger('elasticsearch').setLevel(logging.INFO) 10 | logging.getLogger('universal').setLevel(logging.INFO) 11 | 12 | env = Env() 13 | 14 | # Scrapy settings for weibo project 15 | # 16 | # For simplicity, this file contains only settings considered important or 17 | # commonly used. You can find more settings consulting the documentation: 18 | # 19 | # http://doc.scrapy.org/en/latest/topics/settings.html 20 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 21 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 22 | 23 | BOT_NAME = 'weibo' 24 | 25 | SPIDER_MODULES = ['weibo.spiders'] 26 | NEWSPIDER_MODULE = 'weibo.spiders' 27 | 28 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 29 | # USER_AGENT = 'weibo (+http://www.yourdomain.com)' 30 | 31 | # Obey robots.txt rules 32 | ROBOTSTXT_OBEY = False 33 | 34 | DEFAULT_REQUEST_HEADERS = { 35 | 'Accept': 'application/json, text/plain, */*', 36 | 'Accept-Encoding': 'gzip, deflate, sdch', 37 | 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4,zh-TW;q=0.2,mt;q=0.2', 38 | 'Connection': 'keep-alive', 39 | 'Host': 'm.weibo.cn', 40 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36', 41 | 'X-Requested-With': 'XMLHttpRequest', 42 | } 43 | 44 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 45 | # CONCURRENT_REQUESTS = 32 46 | 47 | # Configure a delay for requests for the same website (default: 0) 48 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 49 | # See also autothrottle settings and docs 50 | # DOWNLOAD_DELAY = 3 51 | # The download delay setting will honor only one of: 52 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 53 | # CONCURRENT_REQUESTS_PER_IP = 16 54 | 55 | # Disable cookies (enabled by default) 56 | # COOKIES_ENABLED = True 57 | # COOKIES_DEBUG = True 58 | # Disable Telnet Console (enabled by default) 59 | # TELNETCONSOLE_ENABLED = False 60 | 61 | # Override the default request headers: 62 | # DEFAULT_REQUEST_HEADERS = { 63 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 64 | # 'Accept-Language': 'en', 65 | # } 66 | 67 | # Enable or disable spider middlewares 68 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 69 | # SPIDER_MIDDLEWARES = { 70 | # 'weibo.middlewares.WeiboSpiderMiddleware': 543, 71 | # } 72 | 73 | # Enable or disable downloader middlewares 74 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 75 | DOWNLOADER_MIDDLEWARES = { 76 | 'weibo.middlewares.ProxypoolMiddleware': 555 if env.bool('PROXYPOOL_ENABLED', True) else None, 77 | # 'weibo.middlewares.ProxytunnelMiddleware': 556 if env.bool('PROXYTUNNEL_ENABLED', True) else None, 78 | } 79 | 80 | # Enable or disable extensions 81 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 82 | EXTENSIONS = { 83 | # 'scrapy_jsonrpc.webservice.WebService': 499, 84 | # 
'scrapy_prometheus_exporter.prometheus.WebService': 500, 85 | } 86 | 87 | CONCURRENT_REQUESTS = 20 88 | 89 | # COOKIES_ENABLED = False 90 | 91 | # DOWNLOAD_DELAY = 10 92 | 93 | # Configure item pipelines 94 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 95 | ITEM_PIPELINES = { 96 | 'weibo.pipelines.TimePipeline': 300, 97 | 'weibo.pipelines.WeiboPipeline': 301, 98 | 'weibo.pipelines.CommentPipeline': 302, 99 | 'weibo.pipelines.ElasticsearchPipeline': 303 if env.bool('ELASTICSEARCH_PIPELINE_ENABLED', True) else None, 100 | } 101 | 102 | # Enable and configure the AutoThrottle extension (disabled by default) 103 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 104 | # AUTOTHROTTLE_ENABLED = True 105 | # The initial download delay 106 | # AUTOTHROTTLE_START_DELAY = 5 107 | # The maximum download delay to be set in case of high latencies 108 | # AUTOTHROTTLE_MAX_DELAY = 60 109 | # The average number of requests Scrapy should be sending in parallel to 110 | # each remote server 111 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 112 | # Enable showing throttling stats for every response received: 113 | # AUTOTHROTTLE_DEBUG = False 114 | 115 | # Enable and configure HTTP caching (disabled by default) 116 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 117 | # HTTPCACHE_ENABLED = True 118 | # HTTPCACHE_EXPIRATION_SECS = 0 119 | # HTTPCACHE_DIR = 'httpcache' 120 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 121 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 122 | 123 | # definition of distributed 124 | SCHEDULER = 'scrapy_redis.scheduler.Scheduler' 125 | DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 126 | REDIS_URL = env.str('REDIS_CONNECTION_STRING') 127 | SCHEDULER_PERSIST = True 128 | REDIS_START_URLS_BATCH_SIZE = 5 129 | SCHEDULER_QUEUE_KEY = 'weibo:%(spider)s:requests' 130 | SCHEDULER_DUPEFILTER_KEY = 'weibo:%(spider)s:dupefilter' 131 | 132 | # definition of retry 133 | RETRY_HTTP_CODES = [401, 403, 408, 414, 418, 500, 502, 503, 504] 134 | RETRY_TIMES = 20 135 | DOWNLOAD_TIMEOUT = 10 136 | 137 | # definition of proxy 138 | PROXYPOOL_URL = env.str('PROXYPOOL_URL') 139 | # PROXYTUNNEL_URL = env.str('PROXYTUNNEL_URL') 140 | 141 | # definition of elasticsearch 142 | ELASTICSEARCH_CONNECTION_STRING = env.str('ELASTICSEARCH_CONNECTION_STRING') 143 | -------------------------------------------------------------------------------- /weibo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /weibo/spiders/comment.py: -------------------------------------------------------------------------------- 1 | import json 2 | from scrapy import Request, Spider 3 | from weibo.items import * 4 | from urllib.parse import urlparse, parse_qs 5 | 6 | 7 | class CommentSpider(Spider): 8 | """ 9 | main comment spider to crawl all weibo 10 | """ 11 | name = 'comment' 12 | allowed_domains = ['m.weibo.cn'] 13 | user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}' 14 | follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}' 15 | fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}' 16 | weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}' 17 | comment_url = 'https://m.weibo.cn/api/comments/show?id={id}&page={page}' 18 | start_users = ['1195230310', '3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096'] 19 | 20 | def start_requests(self): 21 | """ 22 | start from defined users 23 | :return: 24 | """ 25 | for uid in self.start_users: 26 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user, dont_filter=True) 27 | 28 | def parse_user(self, response): 29 | """ 30 | parse user info 31 | :param response: user response 32 | """ 33 | self.logger.debug(response) 34 | result = json.loads(response.text) 35 | if result.get('data').get('userInfo'): 36 | user_info = result.get('data').get('userInfo') 37 | user_item = UserItem() 38 | field_map = { 39 | 'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone', 40 | 'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count', 41 | 'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified', 42 | 'verified_reason': 'verified_reason', 'verified_type': 'verified_type' 43 | } 44 | for field, attr in field_map.items(): 45 | user_item[field] = user_info.get(attr) 46 | yield user_item 47 | uid = user_info.get('id') 48 | # follows 49 | yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows, 50 | meta={'page': 1, 'uid': uid}, dont_filter=True) 51 | # fans 52 | yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans, 53 | meta={'page': 1, 'uid': uid}, dont_filter=True) 54 | # weibos 55 | yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos, 56 | meta={'page': 1, 'uid': uid, 'name': user_item['name']}, dont_filter=True, priority=5) 57 | 58 | def parse_follows(self, response): 59 | """ 60 | parse follows 61 | :param response: follows response 62 | """ 63 | result = json.loads(response.text) 64 | if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and \ 65 | result.get('data').get('cards')[-1].get( 66 | 'card_group'): 67 | # parse users 68 | follows = result.get('data').get('cards')[-1].get('card_group') 69 | for follow in follows: 70 | if follow.get('user'): 71 | uid = follow.get('user').get('id') 72 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user, dont_filter=True) 73 | uid = response.meta.get('uid') 74 | 75 | # next page 76 | page = response.meta.get('page') + 1 77 | yield Request(self.follow_url.format(uid=uid, page=page), 78 | callback=self.parse_follows, meta={'page': page, 'uid': uid}, 
dont_filter=True) 79 | 80 | def parse_fans(self, response): 81 | """ 82 | parse fans 83 | :param response: fans response 84 | """ 85 | result = json.loads(response.text) 86 | if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and \ 87 | result.get('data').get('cards')[-1].get( 88 | 'card_group'): 89 | # parse users 90 | fans = result.get('data').get('cards')[-1].get('card_group') 91 | for fan in fans: 92 | if fan.get('user'): 93 | uid = fan.get('user').get('id') 94 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user, dont_filter=True) 95 | 96 | uid = response.meta.get('uid') 97 | # next page 98 | page = response.meta.get('page') + 1 99 | yield Request(self.fan_url.format(uid=uid, page=page), 100 | callback=self.parse_fans, meta={'page': page, 'uid': uid}, dont_filter=True) 101 | 102 | def parse_weibos(self, response): 103 | """ 104 | parse weibos 105 | :param response: weibos response 106 | """ 107 | result = json.loads(response.text) 108 | if result.get('ok') and result.get('data').get('cards'): 109 | weibos = result.get('data').get('cards') 110 | for weibo in weibos: 111 | mblog = weibo.get('mblog') 112 | if mblog: 113 | weibo_item = WeiboItem() 114 | field_map = { 115 | 'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count', 116 | 'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics', 117 | 'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text', 118 | 'thumbnail': 'thumbnail_pic', 119 | } 120 | for field, attr in field_map.items(): 121 | weibo_item[field] = mblog.get(attr) 122 | weibo_item['user'] = response.meta.get('uid') 123 | weibo_item['user_name'] = response.meta.get('name') 124 | yield weibo_item 125 | comment_url = self.comment_url.format(id=weibo_item['id'], page=1) 126 | yield Request(comment_url, callback=self.parse_comments, dont_filter=True, priority=10) 127 | 128 | # next page 129 | uid = response.meta.get('uid') 130 | page = response.meta.get('page') + 1 131 | yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos, 132 | meta={'uid': uid, 'page': page, 'name': response.meta.get('name')}, dont_filter=True, 133 | priority=5) 134 | 135 | def parse_comments(self, response): 136 | """ 137 | parse comments 138 | :param response: 139 | :return: 140 | """ 141 | result = json.loads(response.text) 142 | if result.get('ok'): 143 | comments = result.get('data', {}).get('data') 144 | params = parse_qs(urlparse(response.url).query) 145 | if comments: 146 | for comment in comments: 147 | comment_item = CommentItem() 148 | field_map = { 149 | 'id': 'id', 150 | 'likes_count': 'like_counts', 151 | 'text': 'text', 152 | 'reply_text': 'reply_text', 153 | 'created_at': 'created_at', 154 | 'source': 'source', 155 | 'user': 'user', 156 | 'reply_id': 'reply_id', 157 | } 158 | for field, attr in field_map.items(): 159 | comment_item[field] = comment.get(attr) 160 | comment_item['weibo'] = params.get('id')[0] 161 | yield comment_item 162 | 163 | # next page 164 | page = str(int(params.get('page')[0]) + 1) if params.get('page') else '2' 165 | yield Request(self.comment_url.format(id=params.get('id')[0], page=page), 166 | callback=self.parse_comments, dont_filter=True, priority=10) 167 | -------------------------------------------------------------------------------- /weibo/spiders/single.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from 
scrapy import Request, Spider 5 | from weibo.items import * 6 | import os 7 | from environs import Env 8 | import random 9 | 10 | env = Env() 11 | 12 | START_COMMENT_ID = env.str('START_COMMENT_ID', None) 13 | print('START_COMMENT_ID', START_COMMENT_ID) 14 | 15 | 16 | class SingleSpider(Spider): 17 | """ 18 | comment spider of single weibo 19 | """ 20 | name = 'single' 21 | allowed_domains = ['m.weibo.cn'] 22 | start_url = 'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id_type=0' 23 | next_url = 'https://m.weibo.cn/comments/hotflow?id={weibo_id}&mid={weibo_id}&max_id={max_id}&max_id_type=1' 24 | weibo_id = os.getenv('WEIBO_WEIBO_ID', '4467107636950632') 25 | custom_settings = { 26 | 'DOWNLOAD_DELAY': 10, 27 | 'COOKIES_ENABLED': True, 28 | # 'LOG_LEVEL': 'INFO', 29 | # 'COOKIES_DEBUG': True, 30 | 'SCHEDULER': 'scrapy.core.scheduler.Scheduler', 31 | 'REDIS_START_URLS_BATCH_SIZE': 5, 32 | 'RETRY_TIMES': 50, 33 | 'SCHEDULER_QUEUE_KEY': 'weibo:%(spider)s:requests' + str(random.randint(0, 100000)).zfill(6), 34 | 'DOWNLOADER_MIDDLEWARES': { 35 | 'weibo.middlewares.CSRFTokenMiddleware': 701, 36 | 'weibo.middlewares.RetryCommentMiddleware': 551, 37 | 'weibo.middlewares.ProxypoolMiddleware': 555 if env.bool('PROXYPOOL_ENABLED', True) else None, 38 | # 'weibo.middlewares.ProxytunnelMiddleware': 556 if env.bool('PROXYTUNNEL_ENABLED', True) else None, 39 | } 40 | } 41 | 42 | headers = { 43 | "Accept": "application/json, text/plain, */*", 44 | "Referer": "https://m.weibo.cn/detail/" + weibo_id, 45 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Mobile Safari/537.36", 46 | "X-Requested-With": "XMLHttpRequest", 47 | } 48 | 49 | cookies = env.str('WEIBO_COOKIES', '') 50 | page = 1 51 | 52 | def start_requests(self): 53 | """ 54 | start from defined users 55 | :return: 56 | """ 57 | if not START_COMMENT_ID: 58 | url = self.start_url.format(weibo_id=self.weibo_id) 59 | else: 60 | url = self.next_url.format(weibo_id=self.weibo_id, max_id=START_COMMENT_ID) 61 | # 赋值初始 Cookies 62 | cookies = { 63 | cookies_item.split('=')[0].strip(): cookies_item.split('=')[1].strip() for cookies_item in 64 | self.cookies.split(';')} 65 | yield Request(url, headers=self.headers, cookies=cookies, callback=self.parse_comments, 66 | priority=10, dont_filter=True, meta={'page': self.page}) 67 | 68 | def parse_comments(self, response): 69 | """ 70 | parse comments 71 | :param response: 72 | :return: 73 | """ 74 | page = response.meta['page'] 75 | self.logger.info('Crawled Page %s', page) 76 | result = json.loads(response.text) 77 | if result.get('ok') == 1: 78 | data = result.get('data', {}) 79 | comments = data.get('data') 80 | max_id = data.get('max_id') 81 | if not max_id: 82 | self.logger.error('Cannot get max_id from %s', response.url) 83 | return 84 | if comments: 85 | for comment in comments: 86 | comment_item = CommentItem() 87 | field_map = { 88 | 'id': 'id', 89 | 'likes_count': 'like_counts', 90 | 'text': 'text', 91 | 'reply_text': 'reply_text', 92 | 'created_at': 'created_at', 93 | 'source': 'source', 94 | 'user': 'user', 95 | 'reply_id': 'reply_id', 96 | } 97 | for field, attr in field_map.items(): 98 | comment_item[field] = comment.get(attr) 99 | comment_item['weibo'] = self.weibo_id 100 | self.logger.info('Comment %s %s %s', 101 | comment_item['id'], 102 | comment_item['text'], 103 | comment_item['created_at']) 104 | yield comment_item 105 | else: 106 | self.logger.error('No Comments Data %s', data) 107 | # next 
page 108 | url = self.next_url.format(weibo_id=self.weibo_id, max_id=max_id) 109 | self.logger.info('Next url %s', url) 110 | yield Request(url, headers=self.headers, callback=self.parse_comments, priority=10, dont_filter=True, 111 | meta={'page': page + 1}) 112 | else: 113 | self.logger.error('Result not ok %s', result) 114 | -------------------------------------------------------------------------------- /weibo/spiders/universal.py: -------------------------------------------------------------------------------- 1 | import json 2 | from scrapy import Request, Spider 3 | from weibo.items import * 4 | from urllib.parse import urlparse, parse_qs 5 | import scrapy_redis 6 | 7 | class UniversalSpider(Spider): 8 | """ 9 | universal spider to crawl all weibo 10 | """ 11 | name = 'universal' 12 | allowed_domains = ['m.weibo.cn'] 13 | user_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&value={uid}&containerid=100505{uid}' 14 | follow_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_followers_-_{uid}&page={page}' 15 | fan_url = 'https://m.weibo.cn/api/container/getIndex?containerid=231051_-_fans_-_{uid}&page={page}' 16 | weibo_url = 'https://m.weibo.cn/api/container/getIndex?uid={uid}&type=uid&page={page}&containerid=107603{uid}' 17 | comment_url = 'https://m.weibo.cn/api/comments/show?id={id}&page={page}' 18 | start_users = ['1195230310', '3217179555', '1742566624', '2282991915', '1288739185', '3952070245', '5878659096'] 19 | 20 | def start_requests(self): 21 | """ 22 | start from defined users 23 | :return: 24 | """ 25 | for uid in self.start_users: 26 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user, dont_filter=True) 27 | 28 | def parse_user(self, response): 29 | """ 30 | parse user info 31 | :param response: user response 32 | """ 33 | self.logger.debug(response) 34 | result = json.loads(response.text) 35 | if result.get('data').get('userInfo'): 36 | user_info = result.get('data').get('userInfo') 37 | user_item = UserItem() 38 | field_map = { 39 | 'id': 'id', 'name': 'screen_name', 'avatar': 'profile_image_url', 'cover': 'cover_image_phone', 40 | 'gender': 'gender', 'description': 'description', 'fans_count': 'followers_count', 41 | 'follows_count': 'follow_count', 'weibos_count': 'statuses_count', 'verified': 'verified', 42 | 'verified_reason': 'verified_reason', 'verified_type': 'verified_type' 43 | } 44 | for field, attr in field_map.items(): 45 | user_item[field] = user_info.get(attr) 46 | yield user_item 47 | uid = user_info.get('id') 48 | # follows 49 | yield Request(self.follow_url.format(uid=uid, page=1), callback=self.parse_follows, 50 | meta={'page': 1, 'uid': uid}, dont_filter=True) 51 | # fans 52 | yield Request(self.fan_url.format(uid=uid, page=1), callback=self.parse_fans, 53 | meta={'page': 1, 'uid': uid}, dont_filter=True) 54 | # weibos 55 | yield Request(self.weibo_url.format(uid=uid, page=1), callback=self.parse_weibos, 56 | meta={'page': 1, 'uid': uid, 'name': user_item['name']}, dont_filter=True) 57 | 58 | def parse_follows(self, response): 59 | """ 60 | parse follows 61 | :param response: follows response 62 | """ 63 | result = json.loads(response.text) 64 | if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and \ 65 | result.get('data').get('cards')[-1].get( 66 | 'card_group'): 67 | # parse users 68 | follows = result.get('data').get('cards')[-1].get('card_group') 69 | for follow in follows: 70 | if follow.get('user'): 71 | uid = follow.get('user').get('id') 72 
| yield Request(self.user_url.format(uid=uid), callback=self.parse_user, dont_filter=True) 73 | uid = response.meta.get('uid') 74 | 75 | # next page 76 | page = response.meta.get('page') + 1 77 | yield Request(self.follow_url.format(uid=uid, page=page), 78 | callback=self.parse_follows, meta={'page': page, 'uid': uid}, dont_filter=True) 79 | 80 | def parse_fans(self, response): 81 | """ 82 | parse fans 83 | :param response: fans response 84 | """ 85 | result = json.loads(response.text) 86 | if result.get('ok') and result.get('data').get('cards') and len(result.get('data').get('cards')) and \ 87 | result.get('data').get('cards')[-1].get( 88 | 'card_group'): 89 | # parse users 90 | fans = result.get('data').get('cards')[-1].get('card_group') 91 | for fan in fans: 92 | if fan.get('user'): 93 | uid = fan.get('user').get('id') 94 | yield Request(self.user_url.format(uid=uid), callback=self.parse_user, dont_filter=True) 95 | 96 | uid = response.meta.get('uid') 97 | # next page 98 | page = response.meta.get('page') + 1 99 | yield Request(self.fan_url.format(uid=uid, page=page), 100 | callback=self.parse_fans, meta={'page': page, 'uid': uid}, dont_filter=True) 101 | 102 | def parse_weibos(self, response): 103 | """ 104 | parse weibos 105 | :param response: weibos response 106 | """ 107 | result = json.loads(response.text) 108 | if result.get('ok') and result.get('data').get('cards'): 109 | weibos = result.get('data').get('cards') 110 | for weibo in weibos: 111 | mblog = weibo.get('mblog') 112 | if mblog: 113 | weibo_item = WeiboItem() 114 | field_map = { 115 | 'id': 'id', 'attitudes_count': 'attitudes_count', 'comments_count': 'comments_count', 116 | 'reposts_count': 'reposts_count', 'picture': 'original_pic', 'pictures': 'pics', 117 | 'created_at': 'created_at', 'source': 'source', 'text': 'text', 'raw_text': 'raw_text', 118 | 'thumbnail': 'thumbnail_pic', 119 | } 120 | for field, attr in field_map.items(): 121 | weibo_item[field] = mblog.get(attr) 122 | weibo_item['user'] = response.meta.get('uid') 123 | weibo_item['user_name'] = response.meta.get('name') 124 | yield weibo_item 125 | comment_url = self.comment_url.format(id=weibo_item['id'], page=1) 126 | yield Request(comment_url, callback=self.parse_comments, dont_filter=True) 127 | 128 | # next page 129 | uid = response.meta.get('uid') 130 | page = response.meta.get('page') + 1 131 | yield Request(self.weibo_url.format(uid=uid, page=page), callback=self.parse_weibos, 132 | meta={'uid': uid, 'page': page, 'name': response.meta.get('name')}, dont_filter=True) 133 | 134 | def parse_comments(self, response): 135 | """ 136 | parse comments 137 | :param response: 138 | :return: 139 | """ 140 | result = json.loads(response.text) 141 | if result.get('ok'): 142 | comments = result.get('data', {}).get('data') 143 | params = parse_qs(urlparse(response.url).query) 144 | if comments: 145 | for comment in comments: 146 | comment_item = CommentItem() 147 | field_map = { 148 | 'id': 'id', 149 | 'likes_count': 'like_counts', 150 | 'text': 'text', 151 | 'reply_text': 'reply_text', 152 | 'created_at': 'created_at', 153 | 'source': 'source', 154 | 'user': 'user', 155 | 'reply_id': 'reply_id', 156 | } 157 | for field, attr in field_map.items(): 158 | comment_item[field] = comment.get(attr) 159 | comment_item['weibo'] = params.get('id')[0] 160 | yield comment_item 161 | 162 | # next page 163 | page = str(int(params.get('page')[0]) + 1) if params.get('page') else '2' 164 | yield Request(self.comment_url.format(id=params.get('id')[0], page=page), 165 | 
callback=self.parse_comments, dont_filter=True) 166 | --------------------------------------------------------------------------------
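For a quick local run outside Docker/Kubernetes: weibo/settings.py reads REDIS_CONNECTION_STRING, ELASTICSEARCH_CONNECTION_STRING and PROXYPOOL_URL with env.str and no defaults, so they must exist before the settings module is imported. A minimal sketch, assuming Redis and Elasticsearch are reachable at the placeholder endpoints below and the script is started from the project root so scrapy.cfg is found:

```python
# Hedged local-run sketch; the endpoint values are placeholders, not part of the repo.
import os

os.environ.setdefault('REDIS_CONNECTION_STRING', 'redis://localhost:6379')
os.environ.setdefault('ELASTICSEARCH_CONNECTION_STRING', 'http://localhost:9200')
os.environ.setdefault('PROXYPOOL_URL', 'http://localhost:5555/random')
os.environ.setdefault('PROXYPOOL_ENABLED', 'false')      # disable the proxy middleware locally

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('universal')   # or 'comment' / 'single'
process.start()
```

In the repo itself the same wiring is done by docker-compose.yml and the deployment manifests, which pass `scrapy crawl <spider>` as the container command and inject these variables from Kubernetes secrets.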