├── shortvideocrawl
│   ├── shortvideocrawl
│   │   ├── __init__.py
│   │   ├── spiders
│   │   │   ├── __init__.py
│   │   │   ├── quanmin.py
│   │   │   ├── kuaishou.py
│   │   │   ├── xinpianchang.py
│   │   │   ├── ixigua.py
│   │   │   └── haokan.py
│   │   ├── items.py
│   │   ├── settings.py
│   │   ├── middlewares.py
│   │   └── pipelines.py
│   └── scrapy.cfg
├── renovate.json
├── pyproject.toml
├── README.md
├── .gitignore
├── LICENSE
└── poetry.lock

/shortvideocrawl/shortvideocrawl/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 |   "$schema": "https://docs.renovatebot.com/renovate-schema.json",
3 |   "extends": [
4 |     "config:base"
5 |   ]
6 | }
7 | 
--------------------------------------------------------------------------------
/shortvideocrawl/shortvideocrawl/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/shortvideocrawl/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = shortvideocrawl.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = shortvideocrawl
12 | 
--------------------------------------------------------------------------------
/shortvideocrawl/shortvideocrawl/items.py:
--------------------------------------------------------------------------------
1 | # Define here the models for your scraped items
2 | #
3 | # See documentation in:
4 | # https://docs.scrapy.org/en/latest/topics/items.html
5 | 
6 | import scrapy
7 | 
8 | 
9 | class ShortvideocrawlItem(scrapy.Item):
10 |     # define the fields for your item here like:
11 |     # name = scrapy.Field()
12 |     id = scrapy.Field()
13 |     file_urls = scrapy.Field()
14 |     files = scrapy.Field()
15 | 
16 |     _cookies = scrapy.Field()
17 |     _headers = scrapy.Field()
18 | 
--------------------------------------------------------------------------------
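The `file_urls`/`files` pair above is the field convention Scrapy's built-in `FilesPipeline` consumes, while `_cookies`/`_headers` look like per-item request options for a custom download pipeline. `pipelines.py` is not part of this listing, so the snippet below is only a sketch of how such a pipeline could forward those fields and store each download under the item's `id`; the `.mp4` extension and the whole class body are assumptions, not the project's actual code.

```python
# Sketch only -- the real pipelines.py is not shown in this listing.
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline


class VideosPipeline(FilesPipeline):
    """Download each URL in file_urls and store it as <item id>.mp4 under FILES_STORE."""

    def get_media_requests(self, item, info):
        # Forward the optional per-item headers/cookies, e.g. the
        # "range: bytes=0-" header the xinpianchang spider attaches.
        for url in item.get("file_urls", []):
            yield Request(
                url,
                headers=item.get("_headers"),
                cookies=item.get("_cookies"),
            )

    def file_path(self, request, response=None, info=None, *, item=None):
        # Name the file after the source platform's video id (extension assumed).
        return f"{item['id']}.mp4"
```

Registered under `ITEM_PIPELINES` (settings.py wires up `shortvideocrawl.pipelines.VideosPipeline` at priority 301) and combined with `FILES_STORE = "videos"`, a pipeline along these lines would produce the `./videos/<id>` layout the README describes.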
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "shortvideocrawl"
3 | version = "0.0.0"
4 | description = ""
5 | authors = ["zeohan.dxs "]
6 | readme = "README.md"
7 | package-mode = false
8 | 
9 | [tool.poetry.dependencies]
10 | python = "^3.10"
11 | scrapy = "^2.7.1"
12 | 
13 | 
14 | [tool.poetry.group.dev.dependencies]
15 | black = {version = "^23.1a1", allow-prereleases = true}
16 | requests = "^2.31.0"
17 | 
18 | [build-system]
19 | requires = ["poetry-core"]
20 | build-backend = "poetry.core.masonry.api"
21 | 
22 | [tool.pyright]
--------------------------------------------------------------------------------
/shortvideocrawl/shortvideocrawl/spiders/quanmin.py:
--------------------------------------------------------------------------------
1 | import json
2 | from urllib.parse import urlencode
3 | 
4 | import scrapy
5 | 
6 | from ..items import ShortvideocrawlItem
7 | 
8 | SEARCH_API = "https://quanmin.baidu.com/wise/growth/api/home/searchmorelist"
9 | 
10 | 
11 | class QuanminSpider(scrapy.Spider):
12 |     name = "quanmin"
13 |     allowed_domains = ["quanmin.baidu.com", "bdstatic.com"]
14 | 
15 |     query = "蔡徐坤"
16 |     count = 20
17 | 
18 |     def start_requests(self):
19 |         yield self.request(0)
20 | 
21 |     def request(self, page: int):
22 |         query = {
23 |             "rn": 12,
24 |             "pn": page,
25 |             "q": self.query,
26 |             "type": "search",
27 |             "_format": "json",
28 |         }
29 | 
30 |         return scrapy.Request(
31 |             SEARCH_API + "?" + urlencode(query),
32 |             meta={"page": page},
33 |         )
34 | 
35 |     def parse(self, response):
36 |         resp = json.loads(response.body)
37 |         data = resp["data"]
38 |         if "list" in data.keys():
39 |             meta = data["list"]["video_list"]
40 | 
41 |             for m in meta:
42 |                 # print(m["play_url"])
43 |                 yield ShortvideocrawlItem(
44 |                     id=m["vid"],
45 |                     file_urls=[m["play_url"]],
46 |                 )
47 | 
48 |             if data["list"]["has_more"] != 0:
49 |                 # not enough yet; theoretically 10 results per page
50 |                 if (response.meta["page"] + 1) * 10 < int(self.count):
51 |                     yield self.request(response.meta["page"] + 1)
52 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ShortVideoCrawl
2 | 
3 | [![GitHub](https://img.shields.io/github/license/dxsooo/ShortVideoCrawl)](./LICENSE)
4 | [![CodeFactor](https://www.codefactor.io/repository/github/dxsooo/shortvideocrawl/badge)](https://www.codefactor.io/repository/github/dxsooo/shortvideocrawl)
5 | 
6 | A short-video crawler based on [scrapy](https://github.com/scrapy/scrapy) that crawls the target sites by search query.
7 | 
8 | Supports:
9 | 
10 | |Site|Spider|Status|
11 | |-|-|-|
12 | |kuaishou| [kuaishou](https://www.kuaishou.com/)| :heavy_check_mark: |
13 | |xigua| [ixigua](https://www.ixigua.com/)| :construction: |
14 | |新片场|[xinpianchang](https://www.xinpianchang.com/)| :heavy_check_mark: |
15 | |haokan|[haokan](https://haokan.baidu.com/)| :construction: |
16 | |度小视/全民小视频*|quanmin| :heavy_check_mark: |
17 | 
18 | 
19 | 
20 | > \*The 度小视/全民小视频 official site has been taken offline, but this project still works (tested June 2024).
21 | 
22 | ## Usage
23 | 
24 | requirements:
25 | 
26 | - python 3.10+
27 | - poetry
28 | 
29 | ### prepare
30 | 
31 | ```bash
32 | git clone https://github.com/dxsooo/ShortVideoCrawl
33 | cd ShortVideoCrawl
34 | poetry install --only main
35 | poetry shell
36 | ```
37 | 
38 | ### run
39 | 
40 | For example:
41 | 
42 | ```bash
43 | cd shortvideocrawl
44 | 
45 | # main parameters:
46 | # query: search keyword
47 | # count: target number of videos
48 | 
49 | # kuaishou
50 | scrapy crawl kuaishou -a query='蔡徐坤' -a count=50
51 | 
52 | # xigua: highest resolution, file size under 64 MB, duration under 5 min
53 | # scrapy crawl ixigua -a query='蔡徐坤' -a count=50
54 | 
55 | # xinpianchang: highest resolution, file size under 64 MB, duration under 5 min; only a fixed number of videos can be fetched (hence no count parameter)
56 | scrapy crawl xinpianchang -a query='蔡徐坤'
57 | 
58 | # haokan: highest resolution
59 | # scrapy crawl haokan -a query='蔡徐坤' -a count=50
60 | 
61 | # quanmin
62 | scrapy crawl quanmin -a query='蔡徐坤' -a count=50
63 | ```
64 | 
65 | Videos are saved in `./videos`, named with the video ID of the source platform.
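To sanity-check a search endpoint before starting a long crawl, you can call it directly with `requests` (listed in the project's dev dependencies). The sketch below mirrors how the quanmin spider builds its request and reads the response; treat the endpoint behaviour and field names as assumptions that may change.

```python
# Preview the quanmin search API outside Scrapy.
# Endpoint, parameters and response fields are copied from quanmin.py;
# the API is not guaranteed to keep returning this shape.
import requests

SEARCH_API = "https://quanmin.baidu.com/wise/growth/api/home/searchmorelist"

params = {"rn": 12, "pn": 0, "q": "蔡徐坤", "type": "search", "_format": "json"}
resp = requests.get(SEARCH_API, params=params, timeout=10).json()

for video in resp.get("data", {}).get("list", {}).get("video_list", []):
    print(video["vid"], video["play_url"])
```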
66 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | shortvideocrawl/videos -------------------------------------------------------------------------------- /shortvideocrawl/shortvideocrawl/spiders/kuaishou.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import scrapy 4 | 5 | from ..items import ShortvideocrawlItem 6 | 7 | headers = { 8 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 9 | "content-type": "application/json", 10 | } 11 | 12 | cookies = { 13 | "kpf": "PC_WEB", 14 | "did": "web_35b63ae980e92610232940034cb2dc66", 15 | "clientid": "3", 16 | "kpn": "KUAISHOU_VISION", 17 | } 18 | 19 | SEARCH_API = "https://www.kuaishou.com/graphql" 20 | 21 | 22 | class KuaishouSpider(scrapy.Spider): 23 | name = "kuaishou" 24 | allowed_domains = ["www.kuaishou.com", "kwaicdn.com"] 25 | 26 | query = "蔡徐坤" 27 | count = 40 28 | 29 | def start_requests(self): 30 | yield self.request(0) 31 | 32 | def request(self, page: int): 33 | body = { 34 | "operationName": "visionSearchPhoto", 35 | "variables": { 36 | "keyword": self.query, 37 | "pcursor": str(page), 38 | "page": "search", 39 | }, 40 | "query": "fragment photoContent on PhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n riskTagContent\n riskTagUrl\n}\n\nfragment recoPhotoFragment on recoPhotoEntity {\n __typename\n id\n duration\n caption\n originCaption\n likeCount\n viewCount\n commentCount\n realLikeCount\n coverUrl\n photoUrl\n photoH265Url\n manifest\n manifestH265\n videoResource\n coverUrls {\n url\n __typename\n }\n timestamp\n expTag\n animatedCoverUrl\n distance\n videoRatio\n liked\n stereoType\n profileUserTopPhoto\n musicBlocked\n riskTagContent\n riskTagUrl\n}\n\nfragment feedContent on Feed {\n type\n author {\n id\n name\n headerUrl\n following\n headerUrls {\n url\n __typename\n }\n __typename\n }\n photo {\n ...photoContent\n ...recoPhotoFragment\n __typename\n }\n canAddComment\n llsid\n status\n currentPcursor\n tags {\n type\n name\n __typename\n }\n __typename\n}\n\nquery visionSearchPhoto($keyword: String, $pcursor: String, $searchSessionId: String, $page: String, $webPageArea: String) {\n visionSearchPhoto(keyword: $keyword, pcursor: $pcursor, searchSessionId: $searchSessionId, page: $page, webPageArea: $webPageArea) {\n result\n llsid\n webPageArea\n feeds {\n ...feedContent\n __typename\n }\n searchSessionId\n pcursor\n aladdinBanner {\n imgUrl\n link\n __typename\n }\n __typename\n }\n}\n", 41 | } 42 | return scrapy.Request( 43 | SEARCH_API, 44 | method="POST", 45 | cookies=cookies, 46 | headers=headers, 47 | body=json.dumps(body), 48 | ) 49 | 50 | 
def parse(self, response): 51 | resp = json.loads(response.body) 52 | # print(resp) 53 | data = resp["data"]["visionSearchPhoto"] 54 | for feed in data["feeds"]: 55 | yield ShortvideocrawlItem( 56 | id=feed["photo"]["id"], 57 | file_urls=[feed["photo"]["photoUrl"]], 58 | ) 59 | 60 | # next 61 | # print(data) 62 | if data["pcursor"] != "no_more" and data["pcursor"] is not None: 63 | next_page = int(data["pcursor"]) 64 | # not enough, theoretically 20 per page 65 | if next_page * 20 < int(self.count): 66 | yield self.request(next_page) 67 | -------------------------------------------------------------------------------- /shortvideocrawl/shortvideocrawl/spiders/xinpianchang.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.parse import urlencode 3 | 4 | import scrapy 5 | 6 | from ..items import ShortvideocrawlItem 7 | 8 | GET_SEARCH_ID_API = "https://www.xinpianchang.com/api/xpc/v2/search/getSearchKwIdByKw" 9 | SEARCH_API = "https://www.xinpianchang.com/_next/data/%s/search.json" 10 | VIDEO_INFO_API = ( 11 | "https://mod-api.xinpianchang.com/mod/api/v2/media/%s?appKey=61a2f329348b3bf77" 12 | ) 13 | 14 | headers = { 15 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 16 | } 17 | 18 | 19 | class XinpianchangSpider(scrapy.Spider): 20 | name = "xinpianchang" 21 | allowed_domains = [ 22 | "www.xinpianchang.com", 23 | "mod-api.xinpianchang.com", 24 | "xpccdn.com", 25 | ] 26 | 27 | query = "蔡徐坤" 28 | 29 | def start_requests(self): 30 | yield scrapy.Request( 31 | GET_SEARCH_ID_API + "?" + urlencode({"kw": self.query}), 32 | callback=self.parse_search_id, 33 | headers=headers, 34 | ) 35 | 36 | def search_request(self, search_id, build_id): 37 | params = {"from": "inputSearch", "kw_id": search_id, "duration": "0,300"} 38 | return scrapy.Request( 39 | SEARCH_API % build_id + "?" 
+ urlencode(params), 40 | headers=headers, 41 | callback=self.parse_search, 42 | ) 43 | 44 | def parse_search_id(self, response): 45 | resp = json.loads(response.body) 46 | search_id = resp["data"]["id"] 47 | yield scrapy.Request( 48 | "https://www.xinpianchang.com/", 49 | headers=headers, 50 | callback=self.parse_build_info, 51 | meta={"search_id": search_id}, 52 | ) 53 | 54 | def parse_build_info(self, response): 55 | data = response.xpath('//*[@id="__NEXT_DATA__"]/text()').get() 56 | build_id = json.loads(data)["buildId"] 57 | yield self.search_request(response.meta["search_id"], build_id) 58 | 59 | def parse_search(self, response): 60 | resp = json.loads(response.body) 61 | search_data = resp["pageProps"]["searchData"] 62 | if "list" in search_data: 63 | for d in search_data["list"]: 64 | # print(d) 65 | if "web_url" in d: 66 | yield scrapy.Request( 67 | d["web_url"].split("?")[0], 68 | headers=headers, 69 | callback=self.parse_detail, 70 | ) 71 | 72 | def parse_detail(self, response): 73 | # print(response.body) 74 | data = response.xpath('//*[@id="__NEXT_DATA__"]/text()').get() 75 | vid = json.loads(data)["props"]["pageProps"]["detail"]["vid"] 76 | yield scrapy.Request( 77 | VIDEO_INFO_API % vid, 78 | headers=headers, 79 | callback=self.parse_video_info, 80 | ) 81 | 82 | def parse_video_info(self, response): 83 | resp = json.loads(response.body) 84 | video_id = resp["data"]["mid"] 85 | url = self.get_highest_quality(resp["data"]["resource"]["progressive"]) 86 | if url != "": 87 | yield ShortvideocrawlItem( 88 | id=video_id, 89 | file_urls=[url], 90 | _headers={ 91 | "range": "bytes=0-", 92 | }, 93 | ) 94 | 95 | @staticmethod 96 | def get_highest_quality(data) -> str: 97 | url = "" 98 | max_width = 0 99 | for v in data: 100 | if ( 101 | v["width"] > max_width and v["filesize"] < 64 * 1024 * 1024 102 | ): # file size smaller than 64MB 103 | url = v["url"] 104 | max_width = v["width"] 105 | return url 106 | -------------------------------------------------------------------------------- /shortvideocrawl/shortvideocrawl/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for shortvideocrawl project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "shortvideocrawl" 11 | 12 | SPIDER_MODULES = ["shortvideocrawl.spiders"] 13 | NEWSPIDER_MODULE = "shortvideocrawl.spiders" 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | # USER_AGENT = 'shortvideocrawl (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | # CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | # COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | # DEFAULT_REQUEST_HEADERS = { 41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 42 | # 'Accept-Language': 'en', 43 | # } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | # SPIDER_MIDDLEWARES = { 48 | # 'shortvideocrawl.middlewares.ShortvideocrawlSpiderMiddleware': 543, 49 | # } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | # DOWNLOADER_MIDDLEWARES = { 54 | # 'shortvideocrawl.middlewares.ShortvideocrawlDownloaderMiddleware': 543, 55 | # } 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | # EXTENSIONS = { 60 | # 'scrapy.extensions.telnet.TelnetConsole': None, 61 | # } 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | ITEM_PIPELINES = { 66 | "shortvideocrawl.pipelines.ShortvideocrawlPipeline": 300, 67 | "shortvideocrawl.pipelines.VideosPipeline": 301, 68 | # "scrapy.pipelines.files.FilesPipeline": 301, 69 | } 70 | 71 | FILES_STORE = "videos" 72 | 73 | # unlimit 74 | DOWNLOAD_WARNSIZE = 0 75 | 76 | # Enable and configure the AutoThrottle extension (disabled by default) 77 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 78 | # AUTOTHROTTLE_ENABLED = True 79 | # The initial download delay 80 | # AUTOTHROTTLE_START_DELAY = 5 81 | # The maximum download delay to be set in case of high latencies 82 | # AUTOTHROTTLE_MAX_DELAY = 60 83 | # The average number of requests Scrapy should be sending in parallel to 84 | # each remote server 85 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 86 | # Enable showing throttling stats for every response received: 87 | # AUTOTHROTTLE_DEBUG = False 88 | 89 | # Enable and configure HTTP caching (disabled by default) 90 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 91 | # HTTPCACHE_ENABLED = True 92 | # HTTPCACHE_EXPIRATION_SECS = 0 93 | # HTTPCACHE_DIR = 'httpcache' 94 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 95 | # HTTPCACHE_STORAGE = 
'scrapy.extensions.httpcache.FilesystemCacheStorage' 96 | 97 | # Set settings whose default value is deprecated to a future-proof value 98 | REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7" 99 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 100 | -------------------------------------------------------------------------------- /shortvideocrawl/shortvideocrawl/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class ShortvideocrawlSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ShortvideocrawlDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /shortvideocrawl/shortvideocrawl/spiders/ixigua.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import json 3 | import re 4 | from urllib.parse import quote, urlencode 5 | 6 | import scrapy 7 | 8 | from ..items import ShortvideocrawlItem 9 | 10 | SEARCH_API = "https://www.ixigua.com/api/searchv2/complex/" 11 | headers = { 12 | "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36", 13 | "referer": "https://www.ixigua.com/", 14 | } 15 | cookies = { 16 | "csrf_session_id": "812a63a05b0adb796737198645a7db48", 17 | "support_webp": "true", 18 | "support_avif": "true", 19 | "msToken": "Ty1J7te-X2i7QNFCtXSru4Ynpvmh7B6j86CVTFcV3TxacHWwyTfweP_UvK54EuLEx3v7O_NDZPvgkDOqHZmMwXJTKEL3DC5-P5Fyqg8=", 20 | "fpk1": "U2FsdGVkX1+kemf8xYpe3wksHjGni1kMAfW8OpsdXThDsx8SaJ2kwcHajsP6Gpnfb9P8kkxnuIdyfipJez8j8w==", 21 | "fpk2": "d72690806e05ab108412ee33b4c5c3e1", 22 | "_tea_utm_cache_2285": "undefined", 23 | "ixigua-a-s": "1", 24 | "tt_scid": "LSErAHS5pGlnI2UJwICxaT4WdMNw36koPGT19HAcl7kBgJR378u6-Ggkhq.W7F7zc389", 25 | "ttwid": "1%7CLr-_X8mdYVnrdMcniUQYfLP0oRZwKi3caPq8oLe1wlg%7C1717690469%7C12dec422a6ddd831b29b75e8886f79caa4d1ca13a741c2557b10de86fad89446", 26 | "msToken": "uAbhlpb9Iw4kbIb3SF_Gcwvh9RcpOKJp4VUOy1x9k2q56nU_2kOaVWP6jbP-bWmBj_hcNvgbUB-ORByLnoxKS_M5Q-Cn0FUly-BpbJPmChvTGxeSercQKbn8Um6i3SA=", 27 | } 28 | 29 | 30 | class IxiguaSpider(scrapy.Spider): 31 | name = "ixigua" 32 | allowed_domains = ["www.ixigua.com"] 33 | 34 | query = "蔡徐坤" 35 | count = 20 36 | 37 | def start_requests(self): 38 | yield self.search_request(0) 39 | 40 | def search_request(self, page: int): 41 | offset = page * 10 42 | params = { 43 | "min_duration": 1, 44 | "max_duration": 300, 45 | } 46 | return scrapy.Request( 47 | SEARCH_API 48 | + quote(self.query) 49 | + "/" 50 | + str(offset) 51 | + "?" 
52 | + urlencode(params), 53 | headers=headers, 54 | cookies=cookies, 55 | meta={"page": page}, 56 | callback=self.parse_search, 57 | ) 58 | 59 | def detail_request(self, vid): 60 | return scrapy.Request( 61 | f"https://www.ixigua.com/{vid}", 62 | cookies=cookies, 63 | callback=self.parse_detail, 64 | ) 65 | 66 | def parse_search(self, response): 67 | resp = json.loads(response.body) 68 | data = resp["data"] 69 | # print(json.dumps(data)) 70 | for d in data["data"]: 71 | # print(d["data"]["group_id"]) 72 | if "group_id" in d["data"].keys(): 73 | yield self.detail_request(d["data"]["group_id"]) 74 | 75 | if data["has_more"] != False: 76 | # not enough, theoretically 10 per page 77 | if (response.meta["page"] + 1) * 10 < int(self.count): 78 | yield self.search_request(response.meta["page"] + 1) 79 | 80 | def parse_detail(self, response): 81 | # print(response.text) 82 | pattern = re.compile( 83 | r"