├── .dockerignore ├── .env ├── .gitignore ├── Dockerfile ├── LICENSE ├── LearnSpider ├── __init__.py ├── asgi.py ├── local_settings.py ├── settings.py ├── urls.py ├── views.py └── wsgi.py ├── README.md ├── api ├── __init__.py ├── admin.py ├── apps.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py ├── urls.py └── views.py ├── collect_data ├── __init__.py ├── collect_news.py ├── collect_news_binance.py └── collect_news_block_beats.py ├── docker-compose.yml ├── docs └── 开发者文档.md ├── learn_spider-2025-02-27.sql ├── logs ├── django_debug.log └── django_error.log ├── manage.py ├── requirements.txt ├── spider_demo └── demo │ ├── demo │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── hello_scrapy.py │ │ ├── parse_demo.py │ │ └── post_spider.py │ ├── output.json │ └── scrapy.cfg ├── static ├── bootstrap │ ├── bootstrap-icons.css │ ├── bootstrap-icons.woff │ ├── bootstrap-icons.woff2 │ ├── bootstrap.bundle.min.js │ └── bootstrap.min.css ├── imgs │ ├── 404.svg │ ├── favicon.ico │ ├── friends │ │ ├── learnspider_logo.png │ │ └── stardream_logo.png │ ├── logo-img.svg │ ├── logo.jpeg │ ├── logo.svg │ ├── sandbox │ │ ├── blog.jpeg │ │ ├── music.jpeg │ │ ├── news.jpeg │ │ ├── shop.jpeg │ │ ├── video.jpeg │ │ └── wallpaper.jpeg │ ├── setu │ │ ├── 1.jpg │ │ ├── 1.png │ │ ├── 2.jpg │ │ ├── 2.png │ │ ├── 3.jpg │ │ └── 3.png │ ├── sponsors │ │ ├── evolution_host.png │ │ ├── lky_logo.png │ │ ├── qgwl_logo.png │ │ └── yrx_logo.png │ └── support │ │ ├── lky.png │ │ ├── wx.jpg │ │ ├── yrx.png │ │ └── zfb.jpg └── js │ ├── cpython666.js │ ├── jquery-3.7.1.min.js │ └── popper.min.js ├── tasks.py ├── test ├── demo.py └── dp.py └── topics ├── __init__.py ├── admin.py ├── apps.py ├── decorators.py ├── management ├── __init__.py └── commands │ ├── __init__.py │ ├── fetch_news_data.py │ ├── fetch_news_web3.py │ ├── update_category.py │ ├── update_difficulty_scores.py │ ├── update_order_ids.py │ └── 
update_pass_status.py ├── migrations ├── 0001_initial.py ├── 0002_web3newstag.py ├── 0003_web3news_web3newstag_tag_id_alter_web3newstag_name_and_more.py └── __init__.py ├── models.py ├── scheduler.py ├── serializers.py ├── static └── topics │ └── css │ └── style.css ├── tasks.py ├── templates └── topics │ ├── 404.html │ ├── base.html │ ├── demo.html │ ├── header.html │ ├── index │ ├── index.html │ ├── list.html │ ├── sandbox.html │ ├── shorthand.html │ └── tools.html │ ├── pages │ ├── 111.html │ ├── 112.html │ ├── ajax.html │ ├── asyncio.html │ ├── course-buying-guide.html │ ├── css-sprite.html │ ├── demo.html │ ├── demo_get_server_time.html │ ├── devtools.html │ ├── easy-spider.html │ ├── h1-6.html │ ├── hello-get.html │ ├── hello-post-form.html │ ├── hello-post-json.html │ ├── intro.html │ ├── lsp-spider.html │ ├── news.html │ ├── pagination-1.html │ ├── pagination-2.html │ ├── pagination-table.html │ ├── redirect.html │ ├── rowspan-table.html │ ├── spider-guide.html │ ├── spider-roadmap.html │ ├── svg.html │ ├── table-key-value.html │ └── wenjuan.html │ ├── sandbox │ ├── news │ │ ├── about_us.html │ │ ├── category.html │ │ ├── detail_category.html │ │ ├── detail_news.html │ │ ├── detail_source.html │ │ ├── news_base.html │ │ ├── news_header.html │ │ ├── news_hot.html │ │ ├── news_hot_detail.html │ │ ├── news_index.html │ │ ├── notice.html │ │ ├── technology.html │ │ └── web3.html │ └── wallpaper │ │ ├── wallpaper.html │ │ ├── wallpaper_base.html │ │ └── wallpaper_header.html │ ├── solutions.html │ ├── tools │ └── encode.html │ └── views │ ├── encode.html │ ├── hello-spider.html │ ├── request-twice-cookie.html │ ├── request-twice.html │ ├── table.html │ └── ua.html ├── tests.py ├── urls.py └── views.py /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | .DS_Store 3 | .idea 4 | .env 5 | /static_root 6 | docs 7 | spider_demo 8 | __pycache__ 
-------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | DJANGO_ENV=local -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | staticfiles 3 | __pycache__/ 4 | *.pyc 5 | .DS_Store 6 | /static/bootstrap/bootstrap-icons-1.11.3/ 7 | /static/bootstrap/bootstrap-icons-1.11.3.zip 8 | bigsetu 9 | LearnSpider/secret_settings.py 10 | static_root -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 选择基础镜像 2 | #FROM python:3.11 3 | FROM python:3.11-slim 4 | # 设置工作目录 5 | WORKDIR /app 6 | 7 | # 更新pip到最新版本 8 | RUN pip install --upgrade pip 9 | 10 | # 复制项目文件 11 | COPY . . 12 | # 安装依赖 13 | RUN pip install --no-cache-dir -r requirements.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple 14 | 15 | # 指定运行命令 16 | CMD ["python", "manage.py", "runserver", "0.0.0.0:8001"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 CodeFly 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /LearnSpider/__init__.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | 3 | pymysql.install_as_MySQLdb() 4 | -------------------------------------------------------------------------------- /LearnSpider/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for LearnSpider project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LearnSpider.settings") 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /LearnSpider/local_settings.py: -------------------------------------------------------------------------------- 1 | SECRET_KEY = "your-secret-key" 2 | # settings 3 | # 多语言配置更改语言为中文 4 | LANGUAGE_CODE = "zh-hans" 5 | # 时区 6 | TIME_ZONE = "Asia/Shanghai" 7 | USE_TZ = True 8 | APPEND_SLASH = True 9 | 10 | SITE_URL = "http://learnspider.vip" 11 | 12 | CORS_ALLOWED_ORIGINS = [ 13 | "http://localhost:63342", # 允许的前端地址 14 | "http://localhost:8005", # 允许的前端地址(如果有必要) 15 | "http://localhost:5173", # 允许的前端地址(如果有必要) 16 | "http://localhost:4173", # 允许的前端地址(如果有必要) 17 | "http://localhost:4000", # 允许的前端地址(如果有必要) 18 | "http://www.learnspider.vip", 19 | "http://learnspider.vip", 20 | "http://localhost", 21 | "http://127.0.0.1", 22 | ] 23 | ALLOWED_HOSTS = [ 24 | "www.learnspider.vip", 25 | "learnspider.vip", 26 | "127.0.0.1", 27 | "localhost", 28 | "110.42.101.196", 29 | ] 30 | 31 | # 本地mysql 32 | # DATABASES = { 33 | # "default": { 34 | # "ENGINE": "django.db.backends.mysql", 35 | # "NAME": "learn_spider", 36 | # "HOST": "127.0.0.1", 37 | # "PORT": 3306, 38 | # "USER": "root", 39 | # "PASSWORD": "1234", 40 | # } 41 | # } 42 | -------------------------------------------------------------------------------- /LearnSpider/settings.py: -------------------------------------------------------------------------------- 1 | """ 2 | Django settings for LearnSpider project. 3 | 4 | Generated by 'django-admin startproject' using Django 5.0.7. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/topics/settings/ 8 | 9 | For the full list of settings and their values, see 10 | https://docs.djangoproject.com/en/5.0/ref/settings/ 11 | """ 12 | 13 | from pathlib import Path 14 | 15 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 16 | BASE_DIR = Path(__file__).resolve().parent.parent 17 | 18 | # Quick-start development settings - unsuitable for production 19 | # See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ 20 | 21 | # SECURITY WARNING: keep the secret key used in production secret! 22 | SECRET_KEY = "django-insecure-3$@35gg*j)v%o)*82^1n-x7eh!iq22!y3#g@gr35=3hazt)=!g" 23 | 24 | # SECURITY WARNING: don't run with debug turned on in production! 25 | # Application definition 26 | 27 | INSTALLED_APPS = [ 28 | "django.contrib.admin", 29 | "django.contrib.auth", 30 | "django.contrib.contenttypes", 31 | "django.contrib.sessions", 32 | "django.contrib.messages", 33 | "django.contrib.staticfiles", 34 | "rest_framework", 35 | "topics", 36 | "api", 37 | ] 38 | 39 | MIDDLEWARE = [ 40 | "django.middleware.security.SecurityMiddleware", 41 | "django.contrib.sessions.middleware.SessionMiddleware", 42 | "django.middleware.common.CommonMiddleware", 43 | "django.middleware.csrf.CsrfViewMiddleware", 44 | "django.contrib.auth.middleware.AuthenticationMiddleware", 45 | "django.contrib.messages.middleware.MessageMiddleware", 46 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 47 | ] 48 | 49 | ROOT_URLCONF = "LearnSpider.urls" 50 | 51 | TEMPLATES = [ 52 | { 53 | "BACKEND": "django.template.backends.django.DjangoTemplates", 54 | "DIRS": [], 55 | "APP_DIRS": True, 56 | "OPTIONS": { 57 | "context_processors": [ 58 | "django.template.context_processors.debug", 59 | "django.template.context_processors.request", 60 | "django.contrib.auth.context_processors.auth", 61 | "django.contrib.messages.context_processors.messages", 62 | ], 63 | }, 64 | }, 65 | 
] 66 | 67 | WSGI_APPLICATION = "LearnSpider.wsgi.application" 68 | 69 | # Database 70 | # https://docs.djangoproject.com/en/5.0/ref/settings/#databases 71 | 72 | DATABASES = { 73 | "default": { 74 | "ENGINE": "django.db.backends.sqlite3", 75 | "NAME": BASE_DIR / "db.sqlite3", 76 | } 77 | } 78 | 79 | # Password validation 80 | # https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators 81 | 82 | AUTH_PASSWORD_VALIDATORS = [ 83 | { 84 | "NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator", 85 | }, 86 | { 87 | "NAME": "django.contrib.auth.password_validation.MinimumLengthValidator", 88 | }, 89 | { 90 | "NAME": "django.contrib.auth.password_validation.CommonPasswordValidator", 91 | }, 92 | { 93 | "NAME": "django.contrib.auth.password_validation.NumericPasswordValidator", 94 | }, 95 | ] 96 | 97 | # Internationalization 98 | # https://docs.djangoproject.com/en/5.0/topics/i18n/ 99 | 100 | 101 | USE_I18N = True 102 | TIME_ZONE = "Asia/Shanghai" 103 | USE_TZ = True 104 | 105 | LANGUAGE_CODE = "zh-Hans" 106 | # Static files (CSS, JavaScript, Images) 107 | # https://docs.djangoproject.com/en/5.0/howto/static-files/ 108 | 109 | STATIC_URL = "static/" 110 | STATICFILES_DIRS = [ 111 | BASE_DIR / "static", 112 | ] 113 | import os 114 | 115 | STATIC_ROOT = os.path.join(BASE_DIR, "static_root") 116 | # Default primary key field type 117 | # https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field 118 | 119 | DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField" 120 | 121 | DEBUG = True 122 | # DEBUG = False 123 | ALLOWED_HOSTS = [ 124 | "www.stardream.vip", 125 | "stardream.vip", 126 | "0.0.0.0", 127 | "127.0.0.1", 128 | "localhost", 129 | ] 130 | 131 | # PORT=8001 132 | topics_path_prefix = "topic/" 133 | 134 | # settings.py 的末尾添加以下代码 135 | 136 | try: 137 | from .local_settings import * 138 | from .secret_settings import * 139 | except ImportError as e: 140 | print(e) 141 | pass 142 | 143 | from dotenv import 
load_dotenv 144 | # 定义日志目录 145 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 146 | LOG_DIR = os.path.join(BASE_DIR, "logs") 147 | 148 | # 如果日志目录不存在,则创建 149 | if not os.path.exists(LOG_DIR): 150 | os.makedirs(LOG_DIR) 151 | load_dotenv() 152 | DJANGO_ENV = os.getenv("DJANGO_ENV") 153 | if DJANGO_ENV == "local": 154 | DEBUG = True 155 | else: 156 | DEBUG = False 157 | 158 | LOGGING = { 159 | "version": 1, 160 | "disable_existing_loggers": False, 161 | "formatters": { 162 | "verbose": { 163 | "format": "[%(asctime)s] %(message)s", 164 | "datefmt": "%d/%b/%Y %H:%M:%S", 165 | }, 166 | }, 167 | "handlers": { 168 | "debug_file": { 169 | "level": "DEBUG", 170 | "class": "logging.FileHandler", 171 | "filename": os.path.join(LOG_DIR, "django_debug.log"), 172 | "formatter": "verbose", 173 | }, 174 | "error_file": { 175 | "level": "ERROR", 176 | "class": "logging.FileHandler", 177 | "filename": os.path.join(LOG_DIR, "django_error.log"), 178 | "formatter": "verbose", 179 | }, 180 | }, 181 | "loggers": { 182 | "django": { 183 | "handlers": ["debug_file", "error_file"], 184 | "level": "DEBUG", 185 | "propagate": True, 186 | }, 187 | }, 188 | } 189 | print("DEBUG",DEBUG) 190 | -------------------------------------------------------------------------------- /LearnSpider/urls.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from django.urls import include, path, re_path 3 | from django.views.static import serve 4 | from LearnSpider.settings import STATIC_ROOT 5 | 6 | # https://www.cnblogs.com/ddb1-1/p/12455147.html 7 | urlpatterns = [ 8 | path("admin/", admin.site.urls), 9 | path("", include("topics.urls")), 10 | path("", include("api.urls")), 11 | ] 12 | # re_path(r'^static/(?P.*)$', serve, {'document_root': STATIC_ROOT}),#static文件 13 | # 在项目根目录的 urls.py 中定义全局404处理 14 | from topics.views import error404 15 | from django.conf.urls import handler404 16 | 17 | handler404 = error404 
18 | -------------------------------------------------------------------------------- /LearnSpider/views.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/LearnSpider/views.py -------------------------------------------------------------------------------- /LearnSpider/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for LearnSpider project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LearnSpider.settings") 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

爬虫百战成神 (LearnSpider)

2 |

3 | 4 | StarDreamSpider 5 | 6 |

7 |

来一场爬虫与成神相结合的旅行吧!

8 | 9 | ## 项目介绍 10 | 11 | 爬虫百战成神(LearnSpider)是一个面向初学者到高级用户的爬虫练习网站。我们提供了多种技术示例代码、详细的文档讲解和视频演示,帮助用户从零开始学习并掌握爬虫技术。 12 | 13 | ## 详细介绍 14 | 15 | 这是一个使用django+drf做一个爬虫刷题网站,也就是一个靶场,我创建这个项目的代码仓库名为LearnSpider,中文叫:爬虫百战成神。 16 | 17 | 它不仅是一个练习场,也配套有每道题目的多种技术示例代码,文档讲解,视频演示。题目由易到难,由浅入深,想让大家在刷题与实践的过程中甚至是从零学会爬虫(因为思想学会后,剩下的代码其实就是工具的使用),在这个过程中增加自己对于代码和场景的理解。本项目目标覆盖爬虫初级,进阶和高级。涉及到requests,scrapy这些请求工具,还有selenium,drissionpage这些自动化工具框架。涉及到接口请求,静态页面解析,也涉及到代码混淆,接口加密,也包含各种抓包工具的使用,chrome开发者工具的使用等。包括一些新颖的反爬技术,比如前端层面的反爬,svg反爬,css反爬,雪碧图等,也比如新兴的反爬技术比如wasm,总之就是我会什么,就像教大家什么。所以此仓库的内容也会无限拓展。也欢迎大家的贡献。 18 | 19 | 此仓库旨在让大家在刷题的过程中以结果和成就感驱动学习,学习到某个知识点后可以快速应用,从而感受到学到了东西,爬虫是如此的简单有趣。而不是学完之后因为网站内容变动而没有刷题的地方,久而久之像没学一样。并且本项目最想让大家养成举一反三,逻辑推理的思考思维习惯。 20 | 21 | 搭建此项目使用的技术栈是Django+DRF+JQuery。使用django的模板语法实现前端,使用jquery实现页面js逻辑与请求,drf实现请求限流。数据库使用sqlite。前端样式实现使用的bootstrap,本来想着手搓的,后面做的时候有感觉没必要给自己增加无意义的工作量。 22 | 23 | ### TODO 24 | 25 | - docker部署(我的mac好像连接不上docker的网络,暂时搁置等后面再说) 26 | - 用户系统 27 | 28 | ### 项目目标 29 | 30 | - **覆盖范围**:从初级到高级的爬虫技术 31 | - **工具与框架**:requests、scrapy、selenium、drissionpage等 32 | - **技术点**: 33 | - 接口请求与静态页面解析 34 | - 代码混淆与接口加密 35 | - 各种抓包工具与Chrome开发者工具使用 36 | - 新颖的反爬技术(前端层面、SVG、CSS、雪碧图、WASM等) 37 | 38 | ### 项目特色 39 | 40 | 1. **全面覆盖**:包含从入门到高级的各类爬虫技术与工具使用。 41 | 2. **示例丰富**:每道题目提供多种技术示例代码。 42 | 3. **详细讲解**:文档与视频讲解,帮助理解每个技术点。 43 | 4. **持续更新**:内容会随着新技术的出现不断扩展。 44 | 5. 
**社区贡献**:欢迎大家贡献自己的题目和解法。 45 | 46 | ### 学习方式 47 | 48 | - 通过刷题和实践,从零开始学习爬虫技术。 49 | - 以结果和成就感驱动学习,快速应用所学知识。 50 | - 培养举一反三和逻辑推理的思维习惯。 51 | 52 | ## 账号密码 53 | 54 | LearnSpider 55 | LearnSpider (线上密码已被更改) 56 | 邮箱:cpython666@gmail.com 57 | 58 | ## 技术栈 59 | 60 | - 后端框架:Django + Django REST Framework (DRF) 61 | - 前端:使用Django模板语法 62 | 63 | ## 项目结构 64 | 65 | ```plaintext 66 | LearnSpider/ 67 | ├── backend/ # 后端代码 68 | ├── frontend/ # 前端代码 69 | ├── templates/ # Django 模板文件 70 | ├── static/ # 静态文件 71 | ├── docs/ # 项目文档 72 | ├── videos/ # 视频演示 73 | └── README.md # 项目说明 74 | ``` 75 | 76 | ## 安装与运行 77 | 78 | ### 环境要求 79 | 80 | - Python 3.11+ 81 | - Django 4.2+ 82 | - Django REST Framework 83 | 84 | ### 安装步骤 85 | 86 | 1. 克隆项目代码: 87 | 88 | ```bash 89 | git clone https://github.com/cpython666/LearnSpider.git 90 | cd LearnSpider 91 | ``` 92 | 93 | 2. 创建并激活虚拟环境: 94 | 95 | ```bash 96 | python -m venv venv 97 | source venv/bin/activate # 对于Windows用户:venv\Scripts\activate 98 | ``` 99 | 100 | 3. 安装依赖: 101 | 102 | ```bash 103 | pip install -r requirements.txt 104 | ``` 105 | 106 | 4. 导入数据到mysql,修改数据库连接配置!!!!!!!!!! 107 | 108 | 5. 运行数据库迁移: 109 | 110 | ```bash 111 | python manage.py migrate 112 | ``` 113 | 114 | 6. 【可选】收集静态文件。 115 | 116 | 线上运行的时候用nginx代理静态文件; 117 | 118 | 本地运行的时候确保debug为true,否则访问不到静态资源 119 | 120 | ``` 121 | python manage.py collectstatic --noinput 122 | ``` 123 | 124 | 125 | 126 | 1. 启动开发服务器: 127 | 128 | ```bash 129 | python manage.py runserver 130 | ``` 131 | 132 | 2. 在浏览器中打开 `http://127.0.0.1:8000` 查看项目。 133 | 134 | ### docker部署 135 | 136 | 环境:windows+dockerdesktop 137 | 启动命令 138 | 139 | ```bash 140 | docker build -t learn-spider-app . 141 | ``` 142 | 143 | ```bash 144 | docker run -d -p 80:8000 learn-spider-app 145 | ``` 146 | 147 | ```bash 148 | docker compose up -d 149 | ``` 150 | 151 | ## 贡献指南 152 | 153 | 1. Fork 本仓库 154 | 2. 创建一个新的分支 (`git checkout -b feature/你的特性`) 155 | 3. 提交你的更改 (`git commit -am '添加了新的功能'`) 156 | 4. 推送到分支 (`git push origin feature/你的特性`) 157 | 5. 
创建一个新的 Pull Request 158 | 159 | ## 联系我们 160 | 161 | 如有任何问题或建议,请通过以下方式联系我们: 162 | 163 | - Email: 你的邮箱@example.com 164 | - GitHub Issues: https://github.com/你的用户名/LearnSpider/issues 165 | 166 | --- 167 | 168 | ### 建议 169 | 170 | 1. **用户体验优化**:虽然前端使用Django模板语法,但可以考虑引入一些现代的前端库和框架,如Bootstrap或Tailwind CSS,以提升用户体验。 171 | 2. **单元测试与持续集成**:增加单元测试,使用CI工具如Travis CI或GitHub Actions,确保代码质量。 172 | 3. **文档与教程**:持续完善文档,增加更多详细的教程和示例代码,帮助用户更好地理解和应用技术。 173 | 4. **社区互动**:建立一个论坛或使用GitHub Discussions,促进用户间的交流与分享。 174 | 5. **安全性与性能优化**:关注爬虫的安全性,避免被反爬机制检测,并优化性能,提升爬取效率。 175 | 176 | 希望这些建议对你的项目有所帮助!如果有更多问题,随时联系我。 177 | 178 | **注意:** 该项目仅供学习和交流使用,不得用于非法活动。作者对任何滥用项目所导致的问题概不负责。 179 | 180 | ### 常用工具命令 181 | 182 | 根据难度分数计算显示顺序 183 | 184 | ```bash 185 | python manage.py update_pass_status 186 | ``` 187 | 188 | 迁移模型 189 | 190 | ```bash 191 | python manage.py makemigrations 192 | python manage.py migrate 193 | ``` 194 | 195 | ```bash 196 | #更新题目的显示顺序id 197 | python manage.py update_difficulty_scores 198 | python manage.py update_order_ids 199 | ``` 200 | 201 | ## 项目赞助 202 | 203 | 赞助支持可以备注github名,会显示在下方列表 204 | 205 | | 日期 | 姓名 | 金额 | 206 | |------------|----------------------------------------------|----| 207 | | 2024.08.20 | [@cpython666](https://github.com/cpython666) | ¥0 | 208 | 209 |

210 | 211 | 微信 212 | 支付宝 213 |

214 | 215 | ## 部署时 216 | 217 | ### 收集静态文件 218 | 219 | ```bash 220 | python manage.py collectstatic --noinput 221 | ``` 222 | 223 | ## 部署时nginx配置 224 | 225 | ``` 226 | server { 227 | listen 80; 228 | server_name learnspider.vip; 229 | 230 | location / { 231 | proxy_pass http://127.0.0.1:8001; # 反向代理到 Django 服务器 232 | proxy_set_header Host $host; 233 | proxy_set_header X-Real-IP $remote_addr; 234 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 235 | proxy_set_header X-Forwarded-Proto $scheme; 236 | 237 | # 防止 WebSocket 断开(可选,若有 WebSocket 需求) 238 | proxy_http_version 1.1; 239 | proxy_set_header Upgrade $http_upgrade; 240 | proxy_set_header Connection "Upgrade"; 241 | } 242 | 243 | # 处理静态文件(如果 Django 直接提供静态文件,可以忽略) 244 | location /static/ { 245 | alias /usr/local/projects/learnspider_static/; 246 | } 247 | 248 | location /media/ { 249 | alias /path/to/your/media/; 250 | } 251 | } 252 | ``` -------------------------------------------------------------------------------- /api/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/api/__init__.py -------------------------------------------------------------------------------- /api/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 
4 | -------------------------------------------------------------------------------- /api/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class ApiConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "api" 7 | -------------------------------------------------------------------------------- /api/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/api/migrations/__init__.py -------------------------------------------------------------------------------- /api/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | 3 | # Create your models here. 4 | -------------------------------------------------------------------------------- /api/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /api/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import include 2 | from django.urls import path 3 | from rest_framework.routers import DefaultRouter 4 | 5 | from . 
import views 6 | from .views import TopicsViewSet 7 | from .views import check_answer 8 | 9 | router = DefaultRouter() 10 | router.register(r"api/topics", TopicsViewSet) 11 | 12 | urlpatterns = [ 13 | # 关于题目模型的的api接口 14 | path("", include(router.urls)), 15 | # ---------------页面所需的数据接口------开始----- 16 | path("api/ajax/", views.ajax, name="ajax"), 17 | path("api/pagination1//", views.pagination1, name="pagination1"), 18 | path( 19 | "api/pagination_table//", 20 | views.pagination_table, 21 | name="pagination_table", 22 | ), 23 | path("api/pagination1//", views.pagination1, name="pagination1"), 24 | path("api/post_intro_json/", views.post_intro_json, name="post_intro_json"), 25 | path("api/post_intro_form/", views.post_intro_form, name="post_intro_form"), 26 | # ---------------页面所需的数据接口--------结束-------- 27 | # ------------------工具接口-------------------- 28 | path("api/delay//", views.delay, name="delay"), 29 | # 延迟多少秒返回结果 30 | path("api/delay//", views.delay, name="delay"), 31 | # 返回请求客户端的IP 32 | path("api/ip/", views.get_client_ip, name="get_client_ip"), 33 | # 检查答案是否正确 34 | path("api/check-answer/", check_answer, name="check_answer"), 35 | path("api/server_time/", views.get_server_time, name="get_server_time"), 36 | path("api/ua/", views.get_user_agent, name="get_user_agent"), 37 | path("api/health/", views.health_check, name="health_check"), 38 | path("api/headers/", views.get_request_headers, name="get_request_headers"), 39 | path("api/reverse_string/", views.reverse_string, name="reverse_string"), 40 | path("api/base64_encode/", views.base64_encode, name="base64_encode"), 41 | path("api/base64_decode/", views.base64_decode, name="base64_decode"), 42 | # 返回服务器的时间戳,加密格式 43 | path( 44 | "api/server-timestamp/", views.get_server_timestamp, name="get_server_timestamp" 45 | ), 46 | # ------------------工具接口-------------------- 47 | ] 48 | -------------------------------------------------------------------------------- /collect_data/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/collect_data/__init__.py -------------------------------------------------------------------------------- /collect_data/collect_news.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from pprint import pprint 3 | from topics.models import Topics, News, NewsCategory, NewsPlatform 4 | 5 | 6 | def fetch_platform_news(platform_slug): 7 | # 来源:https://hot.hlds.fun/#/ 8 | url = f"https://dailyhotapi.hlds.fun/{platform_slug}" 9 | print(url) 10 | response = requests.get(url) 11 | if response.status_code == 200: 12 | res = response.json() 13 | data = res["data"] 14 | # pprint(data) 15 | res = [] 16 | for _ in data: 17 | if type(_["timestamp"]) != "int": 18 | _["timestamp"] = None 19 | res.append( 20 | { 21 | "title": _["title"], 22 | "url": _["url"], 23 | "timestamp": _["timestamp"], 24 | "hot": _.get("hot"), 25 | "desc": _.get("desc"), 26 | } 27 | ) 28 | return res 29 | else: 30 | print(response, response.text) 31 | 32 | 33 | def get_all_platforms(): 34 | platforms = NewsPlatform.objects.values_list("slug", flat=True).all() 35 | return list(platforms) 36 | 37 | 38 | if __name__ == "__main__": 39 | get_all_platforms() 40 | -------------------------------------------------------------------------------- /collect_data/collect_news_binance.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from topics.models import Web3NewsTag 3 | from pprint import pprint 4 | from urllib.parse import quote 5 | from datetime import datetime 6 | 7 | 8 | class BianNewsSpider: 9 | def __init__(self): 10 | self.headers = {"clienttype": "web", "lang": "zh-cn"} 11 | self.headers_en = { 12 | "clienttype": "web", 13 | } 14 | self.id_info_mapping = self.merge_tags() 15 | 16 | def fetch_tags(self, en=False): 17 | 
json_data = {} 18 | response = requests.post( 19 | "https://www.binance.com/bapi/composite/v1/friendly/pgc/news/tags", 20 | headers=self.headers_en if en else self.headers, 21 | json=json_data, 22 | ) 23 | if response.status_code != 200: 24 | raise Exception("请求失败:", response.status_code, response.text) 25 | else: 26 | res = response.json() 27 | tags = res["data"]["data"] 28 | return {_["id"]: _ for _ in tags} 29 | 30 | def merge_tags(self): 31 | tags = list(Web3NewsTag.objects.values_list("tag", flat=True).all()) 32 | id_info_mapping = self.fetch_tags() 33 | id_info_mapping_en = self.fetch_tags(en=True) 34 | for tag_id, info in id_info_mapping.items(): 35 | if tag_id in id_info_mapping_en: 36 | info_en = id_info_mapping_en[tag_id] 37 | info_en.pop("id") 38 | info_en.pop("tag") 39 | info |= {f"{k}_en": v for k, v in info_en.items()} 40 | id_info_mapping[tag_id] = info 41 | if info.get("tag") in tags: 42 | continue 43 | else: 44 | Web3NewsTag.objects.create( 45 | tag_id=info.get("id"), 46 | name=info.get("name"), 47 | desc=info.get("description"), 48 | tag=info.get("tag"), 49 | name_en=info.get("name_en"), 50 | desc_en=info.get("description_en"), 51 | url=f"https://www.binance.com/zh-CN/square/news/{quote(info.get('tag').lower())}", 52 | ) 53 | return id_info_mapping 54 | 55 | def fetch_new_by_tag(self, last_time=None): 56 | if not last_time: 57 | page_num = 100 58 | else: 59 | page_num = 0 60 | params = { 61 | "pageIndex": "1", 62 | "pageSize": "20", 63 | "tagId": "15", 64 | } 65 | response = requests.get( 66 | "https://www.binance.com/bapi/composite/v4/friendly/pgc/feed/news/list", 67 | params=params, 68 | headers=self.headers, 69 | ) 70 | if response.status_code != 200: 71 | raise Exception("请求失败", response.status_code, response.text) 72 | res = response.json() 73 | data = res["data"]["vos"] 74 | pprint(data) 75 | 76 | news_lst = [ 77 | { 78 | "title": _["title"], 79 | "subTitle": _["subTitle"], 80 | "webLink": _["webLink"], 81 | "authorName": _["title"], 82 | 
"published_time": datetime.fromtimestamp(_["date"]), 83 | } 84 | for _ in data 85 | ] 86 | pprint(news_lst) 87 | for news in news_lst: 88 | if news["published_time"] <= last_time: 89 | break 90 | 91 | # pprint(response.json()) 92 | 93 | 94 | # if __name__ == '__main__': 95 | # spider = BianNewsSpider() 96 | # print(spider.tags) 97 | -------------------------------------------------------------------------------- /collect_data/collect_news_block_beats.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from pprint import pprint 3 | 4 | params = { 5 | 'is_import': '0', 6 | 'page': '1', 7 | 'type': '0', 8 | } 9 | 10 | response = requests.get('https://appapi.blockbeats.cn/v6/flash/list', params=params) 11 | res = response.json() 12 | if res["status"] == 0: 13 | data = res["data"] 14 | total = data["total"] 15 | total_page = data["totalPage"] 16 | data_lst = data["data"] 17 | for data_item in data_lst: 18 | id = data_item["id"] 19 | url = data_item["h5"] 20 | title = data_item["title"] 21 | content = data_item["content"] 22 | time_str = data_item["time"] 23 | add_time = data_item["add_time"] 24 | rich_text = data_item["rich_text"] 25 | 26 | pprint(res) 27 | print(res.keys()) 28 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | web: 5 | image: python:3.11-slim 6 | container_name: learn_spider_app 7 | restart: unless-stopped 8 | working_dir: /app 9 | volumes: 10 | - .:/app # 映射当前目录 11 | ports: 12 | - "8001:8001" 13 | environment: 14 | - PYTHONUNBUFFERED=1 15 | command: > 16 | bash -c " 17 | pip install --no-cache-dir -r /app/requirements.txt -i https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple && 18 | python /app/manage.py collectstatic --noinput && # 收集静态文件 19 | python /app/manage.py runserver 0.0.0.0:8001 # 使用 6666 端口 20 | " 21 | 
-------------------------------------------------------------------------------- /docs/开发者文档.md: -------------------------------------------------------------------------------- 1 | ## 写在前面 2 | 3 | 或许是受够了没有文档的项目,所以自己的项目一定要有文档。 4 | 可以说我的文档写的不好,但我会尽我所能写好文档。 5 | 6 | ## 文档 7 | 8 | ## 题目地址 9 | 10 | 由两部分组成,分别是response_path和api_prefix 11 | response_path就是题目链接的后缀 12 | 而api_prefix是题目链接的前缀,也就是题目接口的类型 13 | 目前题目类型分为三种: 14 | 15 | - url/:题目链接不是本项目提供,跳转到一个公网url,比如序言中会有很多文档写在github,那么会重定向过去 16 | - page/:【静态页面】题目只有一个静态html,这样则可以直接通过一个视图返回一个静态的html页面,也包括注入python代码片段的页面。页面存放在topics/pages文件夹中。 17 | - view/:【需要django渲染数据进模板】题目需要通过单独的django视图控制,像校验ua,或者请求头中一些参数的时候,之后再返回页面。页面存放在topics/views文件夹中。 18 | - 可能还会有一种题目,需要大量的页面,目前还没设计,取个什么前缀名好呢🤔。 19 | 20 | ## 一些工具接口 21 | 22 | > 以 /api 开头 23 | 24 | - 返回请求客户端的ip http://localhost:8001/api/ip/ 25 | - 返回ua http://localhost:8001/api/ua/ 26 | - 返回请求头 http://localhost:8001/api/headers/ 27 | - 返回服务器时间 http://localhost:8001/api/server_time/ 28 | - 加密返回服务器时间 http://localhost:8001/api/server-timestamp/ 29 | - 返回服务器的健康状态 http://localhost:8001/api/health/ 30 | - 延迟多少秒返回结果 http://localhost:8001/api/delay/3/ 31 | - base64加密 http://localhost:8001/api/base64_encode/ 32 | - base64解密 http://localhost:8001/api/base64_decode/ 33 | - 反转字符串 http://localhost:8001/api/reverse_string/ 34 | 35 | # 关于题目模型的api接口 36 | 37 | ## 细碎重要点‼️ 38 | 39 | - 40 | 页面的标题【title标签里面的内容】需要与数据库中的title字段保持一致【html中空格换行会被去除】,因为回答问题发送的请求会携带答案和页面的title字段去数据库中找到对应的问题,然后找到答案,对比答案...很蠢,但没办法。没想到更好的解决办法。传id意味着还需要把id传进去。视图函数类型又多。id的序号也乱,因为会中间加题。 -------------------------------------------------------------------------------- /logs/django_error.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/logs/django_error.log -------------------------------------------------------------------------------- /manage.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | BANNER = r""" 7 | _ _ __ __ __ 8 | | | | | / / / / / / 9 | ___ _ __ _ _ | |_ | |__ ___ _ __ / /_ / /_ / /_ 10 | / __| | '_ \ | | | | | __| | '_ \ / _ \ | '_ \ | '_ \ | '_ \ | '_ \ 11 | | (__ | |_) | | |_| | | |_ | | | | | (_) | | | | | | (_) | | (_) | | (_) | 12 | \___| | .__/ \__, | \__| |_| |_| \___/ |_| |_| \___/ \___/ \___/ 13 | | | __/ | 14 | |_| |___/ 15 | 访问我的博客:https://cpython666.github.io/ 16 | 访问我的博客:https://stardreamfly.github.io/ 17 | """ 18 | 19 | 20 | def main(): 21 | """Run administrative tasks.""" 22 | os.environ.setdefault("DJANGO_SETTINGS_MODULE", "LearnSpider.settings") 23 | try: 24 | from django.core.management import execute_from_command_line 25 | except ImportError as exc: 26 | raise ImportError( 27 | "Couldn't import Django. Are you sure it's installed and " 28 | "available on your PYTHONPATH environment variable? Did you " 29 | "forget to activate a virtual environment?" 
30 | ) from exc 31 | if "runserver" in sys.argv: 32 | if not os.environ.get("RUN_MAIN", None): 33 | print(BANNER) # Print the banner only if RUN_MAIN is not set 34 | else: 35 | print(BANNER) 36 | execute_from_command_line(sys.argv) 37 | 38 | 39 | if __name__ == "__main__": 40 | main() 41 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Django==5.0.7 2 | DrissionPage==4.1.0.17 3 | PyMySQL==1.1.1 4 | python-dotenv==1.0.1 5 | Requests==2.32.3 6 | djangorestframework 7 | apscheduler==3.11.0 8 | redis 9 | invoke -------------------------------------------------------------------------------- /spider_demo/demo/demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/spider_demo/demo/demo/__init__.py -------------------------------------------------------------------------------- /spider_demo/demo/demo/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class NewsItem(scrapy.Item): 10 | title = scrapy.Field() # 新闻标题 11 | url = scrapy.Field() # 新闻链接 12 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class 
DemoSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info("Spider opened: %s" % spider.name) 57 | 58 | 59 | class DemoDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 
67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | print("请求之前") 82 | return None 83 | 84 | def process_response(self, request, response, spider): 85 | # Called with the response returned from the downloader. 86 | 87 | # Must either; 88 | # - return a Response object 89 | # - return a Request object 90 | # - or raise IgnoreRequest 91 | print("响应之后") 92 | return response 93 | 94 | def process_exception(self, request, exception, spider): 95 | # Called when a download handler or a process_request() 96 | # (from other downloader middleware) raises an exception. 
97 | 98 | # Must either: 99 | # - return None: continue processing this exception 100 | # - return a Response object: stops process_exception() chain 101 | # - return a Request object: stops process_exception() chain 102 | pass 103 | 104 | def spider_opened(self, spider): 105 | spider.logger.info("Spider opened: %s" % spider.name) 106 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class NewsPipeline: 12 | def process_item(self, item, spider): 13 | # 数据清洗或验证的逻辑 14 | item["title"] = item["title"].strip() # 去除标题前后的空格 15 | print(f"pipeline item: {item}") 16 | return item 17 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for demo project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = "demo" 11 | 12 | SPIDER_MODULES = ["demo.spiders"] 13 | NEWSPIDER_MODULE = "demo.spiders" 14 | 15 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 16 | # USER_AGENT = "demo (+http://www.yourdomain.com)" 17 | 18 | # Obey robots.txt rules 19 | ROBOTSTXT_OBEY = False 20 | # ROBOTSTXT_OBEY = True 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | # CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | # DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | # CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | # COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | # TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | # DEFAULT_REQUEST_HEADERS = { 41 | # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", 42 | # "Accept-Language": "en", 43 | # } 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | # SPIDER_MIDDLEWARES = { 48 | # "demo.middlewares.DemoSpiderMiddleware": 543, 49 | # } 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | # DOWNLOADER_MIDDLEWARES = { 54 | # "demo.middlewares.DemoDownloaderMiddleware": 543, 55 | # } 56 | 57 | # Enable or disable extensions 58 | # See 
https://docs.scrapy.org/en/latest/topics/extensions.html 59 | # EXTENSIONS = { 60 | # "scrapy.extensions.telnet.TelnetConsole": None, 61 | # } 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | # ITEM_PIPELINES = { 66 | # "demo.pipelines.DemoPipeline": 300, 67 | # } 68 | 69 | 70 | # Enable and configure the AutoThrottle extension (disabled by default) 71 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 72 | # AUTOTHROTTLE_ENABLED = True 73 | # The initial download delay 74 | # AUTOTHROTTLE_START_DELAY = 5 75 | # The maximum download delay to be set in case of high latencies 76 | # AUTOTHROTTLE_MAX_DELAY = 60 77 | # The average number of requests Scrapy should be sending in parallel to 78 | # each remote server 79 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 80 | # Enable showing throttling stats for every response received: 81 | # AUTOTHROTTLE_DEBUG = False 82 | 83 | # Enable and configure HTTP caching (disabled by default) 84 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 85 | # HTTPCACHE_ENABLED = True 86 | # HTTPCACHE_EXPIRATION_SECS = 0 87 | # HTTPCACHE_DIR = "httpcache" 88 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 89 | # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage" 90 | 91 | # Set settings whose default value is deprecated to a future-proof value 92 | TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor" 93 | FEED_EXPORT_ENCODING = "utf-8" 94 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/spiders/hello_scrapy.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | 3 | import scrapy 4 | from scrapy import Request 5 | 6 | 7 | class HelloScrapySpider(scrapy.Spider): 8 | name = "hello_scrapy" 9 | # allowed_domains = ["baidu.com"] 10 | start_urls = ["https://cn.bing.com/"] 11 | 12 | def start_requests(self) -> Iterable[Request]: 13 | yield Request("http://localhost:8001/", callback=self.parse) 14 | 15 | def parse(self, response): 16 | print(1, response) 17 | print(response.css(".h1::text").get()) 18 | urls = [ 19 | "http://localhost:8001/?sort=asc¶m=1", 20 | "http://localhost:8001/?param=1&sort=asc", 21 | ] 22 | for url in urls: 23 | yield scrapy.Request( 24 | url, callback=self.parse_local, meta={"aaa": 111}, dont_filter=True 25 | ) 26 | 27 | def parse_local(self, response): 28 | print("请求了一次") 29 | # print(response.meta.get('aaa')) 30 | # print(response) 31 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/spiders/parse_demo.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from demo.items import NewsItem 3 | 4 | 5 | class ParseDemoSpider(scrapy.Spider): 6 | custom_settings = { 7 | # "ITEM_PIPELINES": { 8 | # 'demo.pipelines.NewsPipeline': 1, # 启用 NewsPipeline 9 | # }, 10 | # "DOWNLOADER_MIDDLEWARES": { 11 | # 'demo.middlewares.DemoDownloaderMiddleware': 543 12 | # } 13 | } 14 | 15 | name = "parse_demo" 16 | start_urls = ["http://localhost:8001/sandbox/news/hot/"] 17 | 18 | def parse(self, response): 19 | card_lst = response.css(".card-body") 20 | for card in card_lst: 21 | # item = NewsItem() 22 | # item['title'] = card.css('a span::text').get() 23 | # item['url'] = card.css('a::attr(href)').get() 24 | yield { 25 | "title": card.css("a span::text").get(), 26 | "url": 
card.css("a::attr(href)").get(), 27 | } 28 | # print(card.css('a span::text').get(), card.css('a::attr(href)').get()) 29 | # yield item 30 | 31 | 32 | if __name__ == "__main__": 33 | from scrapy import Selector 34 | 35 | text = "111" 36 | response = Selector(text=text) 37 | print(response.xpath("//a/text()").get()) 38 | -------------------------------------------------------------------------------- /spider_demo/demo/demo/spiders/post_spider.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | import json 3 | 4 | 5 | class JsonRequestSpider(scrapy.Spider): 6 | name = "json_spider" 7 | 8 | start_urls = ["http://localhost:8001/api/post_intro_json/"] 9 | 10 | def start_requests(self): 11 | # JSON 数据 12 | data = { 13 | "password": "post", 14 | } 15 | 16 | # 请求头,指定发送 JSON 数据 17 | headers = { 18 | "Content-Type": "application/json", 19 | } 20 | 21 | # 发送 POST 请求 22 | yield scrapy.Request( 23 | url=self.start_urls[0], 24 | method="POST", 25 | headers=headers, 26 | body=json.dumps(data), # 将字典转换为 JSON 字符串 27 | callback=self.parse, 28 | ) 29 | 30 | def parse(self, response): 31 | # 解析响应数据 32 | data = json.loads(response.text) 33 | self.log(f"响应数据: {data}") 34 | 35 | 36 | class FormRequestSpider(scrapy.Spider): 37 | name = "form_spider" 38 | 39 | start_urls = ["http://localhost:8001/api/post_intro_form/"] 40 | 41 | def start_requests(self): 42 | # JSON 数据 43 | formdata = { 44 | "password": "post", 45 | } 46 | 47 | # 发送 POST 请求 48 | yield scrapy.FormRequest( 49 | url=self.start_urls[0], 50 | method="POST", 51 | formdata=formdata, # 将字典转换为 JSON 字符串 52 | callback=self.parse, 53 | ) 54 | 55 | def parse(self, response): 56 | # 解析响应数据 57 | data = json.loads(response.text) 58 | self.log(f"响应数据: {data}") 59 | -------------------------------------------------------------------------------- /spider_demo/demo/output.json: -------------------------------------------------------------------------------- 1 | [ 2 | {"title": "研究二次元的专家", 
"url": "https://b23.tv/BV1L6PKeyEpF"}, 3 | {"title": "春天的第一次公路旅行", "url": "https://www.douyin.com/hot/2009696"}, 4 | {"title": "博士配偶为初中学历被安排工作,湖北师范大学回应「符合政策,属劳务派遣」,怎样看待此事?", "url": "https://www.zhihu.com/question/13286722569"}, 5 | {"title": "老人离世4名后人治丧后意外身亡", "url": "https://www.baidu.com/s?wd=%E8%80%81%E4%BA%BA%E7%A6%BB%E4%B8%964%E5%90%8D%E5%90%8E%E4%BA%BA%E6%B2%BB%E4%B8%A7%E5%90%8E%E6%84%8F%E5%A4%96%E8%BA%AB%E4%BA%A1"}, 6 | {"title": "刚刚,全球首个混合推理模型Claude 3.7降世!最强编程大脑暴击DeepSeek R1", "url": "https://www.36kr.com/p/3181320656196736"}, 7 | {"title": "华为余承东预告“想不到的产品”:首款为原生鸿蒙而生的新形态手机 3 月见", "url": "https://www.ithome.com/0/833/121.htm"}, 8 | {"title": "本周看什么 | 最近值得一看的 9 部作品", "url": "https://sspai.com/post/96593"}, 9 | {"title": "中美俄均投下赞成票,联合国安理会通过涉乌决议", "url": "https://www.thepaper.cn/newsDetail_forward_30236174"}, 10 | {"title": "中美俄立场罕见一致释放什么信号", "url": "https://www.toutiao.com/trending/7475304273840983578/"}, 11 | {"title": "王祖贤在贴吧官宣创业", "url": "https://tieba.baidu.com/hottopic/browse/hottopic?topic_id=28339488&topic_name=%E7%8E%8B%E7%A5%96%E8%B4%A4%E5%9C%A8%E8%B4%B4%E5%90%A7%E5%AE%98%E5%AE%A3%E5%88%9B%E4%B8%9A"}, 12 | {"title": "MobVue 开源啦!", "url": "https://juejin.cn/post/7474782353463787556"}, 13 | {"title": "【6.9】误判", "url": "https://movie.douban.com/subject/36401937/"}, 14 | {"title": "总书记关心的事丨“绿电”点亮班彦村", "url": "https://new.qq.com/rain/a/20250225A01EPV00"}, 15 | {"title": "男子夜晚去找小三偷情 上门发现小三正和情人发生关系", "url": "https://www.163.com/dy/article/JP8MH31K0523WUD9.html"}, 16 | {"title": "AI未来进行式(DeepSeek、宇树科技、人形机器人、AI面试官……本书全部预言)", "url": "https://weread.qq.com/web/bookDetail/0bc32b20813ab6d9fg0114c1"}, 17 | {"title": "2025年2月25日 不停机更新 ", "url": "https://lol.qq.com/news/detail.shtml?docid=13900088934026462960"}, 18 | {"title": "3.1版本活动跃迁 | 第一期", "url": "https://www.miyoushe.com/sr/article/62249173"}, 19 | {"title": "「征战勇者淬炼之界」活动:突破试炼得名片", "url": "https://www.miyoushe.com/ys/article/62181653"}, 20 | {"title": "关于我就是李大嘴的几点原因", "url": 
"https://www.douban.com/group/topic/319699118/?_spm_id=MTIzMDA2NjUx"}, 21 | {"title": "AI 训练跟踪与可视化平台", "url": "https://hellogithub.com/repository/b442a9fa270e4ccb8847c9ee3445e41b"}, 22 | {"title": "开普勒当时是如何计算出行星轨道是椭圆的呢?", "url": "https://daily.zhihu.com/story/9779237"}, 23 | {"title": "18禁恋爱老婆来到三次元!一见面竟被作者做这样的事情?", "url": "https://www.jianshu.com/p/ca32a84e3861"} 24 | ] -------------------------------------------------------------------------------- /spider_demo/demo/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = demo.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = demo 12 | -------------------------------------------------------------------------------- /static/bootstrap/bootstrap-icons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/bootstrap/bootstrap-icons.woff -------------------------------------------------------------------------------- /static/bootstrap/bootstrap-icons.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/bootstrap/bootstrap-icons.woff2 -------------------------------------------------------------------------------- /static/imgs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/favicon.ico -------------------------------------------------------------------------------- /static/imgs/friends/learnspider_logo.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/friends/learnspider_logo.png -------------------------------------------------------------------------------- /static/imgs/friends/stardream_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/friends/stardream_logo.png -------------------------------------------------------------------------------- /static/imgs/logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/logo.jpeg -------------------------------------------------------------------------------- /static/imgs/sandbox/blog.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sandbox/blog.jpeg -------------------------------------------------------------------------------- /static/imgs/sandbox/music.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sandbox/music.jpeg -------------------------------------------------------------------------------- /static/imgs/sandbox/news.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sandbox/news.jpeg -------------------------------------------------------------------------------- /static/imgs/sandbox/shop.jpeg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sandbox/shop.jpeg -------------------------------------------------------------------------------- /static/imgs/sandbox/video.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sandbox/video.jpeg -------------------------------------------------------------------------------- /static/imgs/sandbox/wallpaper.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sandbox/wallpaper.jpeg -------------------------------------------------------------------------------- /static/imgs/setu/1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/setu/1.jpg -------------------------------------------------------------------------------- /static/imgs/setu/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/setu/1.png -------------------------------------------------------------------------------- /static/imgs/setu/2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/setu/2.jpg -------------------------------------------------------------------------------- /static/imgs/setu/2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/setu/2.png -------------------------------------------------------------------------------- /static/imgs/setu/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/setu/3.jpg -------------------------------------------------------------------------------- /static/imgs/setu/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/setu/3.png -------------------------------------------------------------------------------- /static/imgs/sponsors/evolution_host.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sponsors/evolution_host.png -------------------------------------------------------------------------------- /static/imgs/sponsors/lky_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sponsors/lky_logo.png -------------------------------------------------------------------------------- /static/imgs/sponsors/qgwl_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sponsors/qgwl_logo.png -------------------------------------------------------------------------------- /static/imgs/sponsors/yrx_logo.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/sponsors/yrx_logo.png -------------------------------------------------------------------------------- /static/imgs/support/lky.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/support/lky.png -------------------------------------------------------------------------------- /static/imgs/support/wx.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/support/wx.jpg -------------------------------------------------------------------------------- /static/imgs/support/yrx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/support/yrx.png -------------------------------------------------------------------------------- /static/imgs/support/zfb.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/static/imgs/support/zfb.jpg -------------------------------------------------------------------------------- /static/js/cpython666.js: -------------------------------------------------------------------------------- 1 | console.log( 2 | ` 3 | _ _ __ __ __ 4 | | | | | / / / / / / 5 | ___ _ __ _ _ | |_ | |__ ___ _ __ / /_ / /_ / /_ 6 | / __| | '_ \\ | | | | | __| | '_ \\ / _ \\ | '_ \\ | '_ \\ | '_ \\ | '_ \\ 7 | | (__ | |_) | | |_| | | |_ | | | | | (_) | | | | | | (_) | | (_) | | (_) | 8 | \\___| | .__/ \\__, | \\__| |_| |_| \\___/ |_| |_| \\___/ \\___/ \\___/ 9 | | | __/ | 10 | |_| |___/ 11 | `); 12 | 
console.log('访问我的博客:https://cpython666.github.io/'); 13 | console.log('访问我的博客:https://stardreamfly.github.io/'); -------------------------------------------------------------------------------- /tasks.py: -------------------------------------------------------------------------------- 1 | from invoke import task 2 | 3 | 4 | @task 5 | def up(c): 6 | c.run("docker compose up -d --build") 7 | 8 | 9 | @task 10 | def b(c): 11 | c.run("black .") 12 | 13 | 14 | @task 15 | def makemigrations(c): 16 | c.run("python manage.py makemigrations") 17 | 18 | 19 | @task(pre=[makemigrations]) 20 | def m(c): 21 | c.run("python manage.py migrate") 22 | 23 | print("运行迁移命令") 24 | -------------------------------------------------------------------------------- /test/demo.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | # res=requests.get('https://www.whu.edu.cn/') 4 | res = requests.get("http://localhost:8001/view/encode/") 5 | # res=requests.get('http://localhost:8001/demo1/') 6 | print(res.text) 7 | print(res.encoding) 8 | -------------------------------------------------------------------------------- /test/dp.py: -------------------------------------------------------------------------------- 1 | from DrissionPage import ChromiumPage 2 | 3 | page = ChromiumPage() 4 | page.get("http://localhost:8001/view/request-twice/") 5 | print(page.html) 6 | page.quit() 7 | -------------------------------------------------------------------------------- /topics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/__init__.py -------------------------------------------------------------------------------- /topics/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from .models import Topics 3 | 4 | 5 | class 
TopicsAdmin(admin.ModelAdmin): 6 | list_display = ("title", "order_id", "category", "difficulty", "pass_status") 7 | list_editable = ("order_id",) # 允许在列表中编辑 order_id 8 | 9 | 10 | admin.site.register(Topics, TopicsAdmin) 11 | -------------------------------------------------------------------------------- /topics/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class TopicsConfig(AppConfig): 5 | default_auto_field = "django.db.models.BigAutoField" 6 | name = "topics" 7 | 8 | def ready(self): 9 | from topics.scheduler import start_scheduler 10 | from LearnSpider.settings import DJANGO_ENV 11 | 12 | # if DJANGO_ENV != "local": 13 | # start_scheduler() 14 | start_scheduler() 15 | -------------------------------------------------------------------------------- /topics/decorators.py: -------------------------------------------------------------------------------- 1 | from django.http import JsonResponse 2 | 3 | 4 | def require_ua(view_func): 5 | def _wrapped_view(request, *args, **kwargs): 6 | user_agent = request.META.get("HTTP_USER_AGENT") 7 | if not user_agent: 8 | return JsonResponse({"error": "User-Agent header is required"}, status=400) 9 | return view_func(request, *args, **kwargs) 10 | 11 | return _wrapped_view 12 | -------------------------------------------------------------------------------- /topics/management/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/management/__init__.py -------------------------------------------------------------------------------- /topics/management/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/management/commands/__init__.py 
-------------------------------------------------------------------------------- /topics/management/commands/fetch_news_data.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from topics.tasks import fetch_and_save_news 3 | 4 | 5 | # python manage.py fetch_news_data 6 | class Command(BaseCommand): 7 | help = "采集一批热点新闻" 8 | 9 | def handle(self, *args, **kwargs): 10 | fetch_and_save_news() 11 | -------------------------------------------------------------------------------- /topics/management/commands/fetch_news_web3.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from collect_data.collect_news_binance import BianNewsSpider 3 | 4 | 5 | # python manage.py fetch_news_web3 6 | class Command(BaseCommand): 7 | help = "批量更新题目的 category,规范化 category" 8 | 9 | def handle(self, *args, **kwargs): 10 | spider = BianNewsSpider() 11 | print(spider.id_info_mapping) 12 | # spider.fetch_new_by_tag() 13 | -------------------------------------------------------------------------------- /topics/management/commands/update_category.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from topics.models import Topics 3 | 4 | 5 | # python manage.py update_category 6 | class Command(BaseCommand): 7 | help = "批量更新题目的 category,规范化 category" 8 | 9 | def handle(self, *args, **kwargs): 10 | questions = Topics.objects.all() 11 | for question in questions: 12 | if question.category: 13 | question.category = ",".join( 14 | question.category.replace(",", ",") 15 | .replace(";", ",") 16 | .replace(";", ",") 17 | .split(",") 18 | ) 19 | question.save() 20 | self.stdout.write(self.style.SUCCESS("已成功规范化 category 字段")) 21 | -------------------------------------------------------------------------------- 
/topics/management/commands/update_difficulty_scores.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from topics.models import Topics 3 | 4 | 5 | # python manage.py update_difficulty_scores 6 | class Command(BaseCommand): 7 | help = "批量更新题目的 difficulty_score 字段,规则为目前的 difficulty_score 排序号乘以 10" 8 | 9 | def handle(self, *args, **kwargs): 10 | questions = Topics.objects.all().order_by("difficulty_score") 11 | for idx, question in enumerate(questions): 12 | question.difficulty_score = (idx + 1) * 10 13 | question.save() 14 | self.stdout.write(self.style.SUCCESS("Successfully updated difficulty_scores")) 15 | -------------------------------------------------------------------------------- /topics/management/commands/update_order_ids.py: -------------------------------------------------------------------------------- 1 | from django.core.management.base import BaseCommand 2 | from topics.models import Topics 3 | 4 | 5 | # python manage.py update_order_ids 6 | class Command(BaseCommand): 7 | help = "批量更新题目的 order_id,根绝题目的 order_id 排序好乘以 10" 8 | 9 | def handle(self, *args, **kwargs): 10 | questions = Topics.objects.all().order_by("order_id") 11 | for idx, question in enumerate(questions): 12 | question.order_id = (idx + 1) * 10 13 | question.save() 14 | self.stdout.write(self.style.SUCCESS("已成功更新 order_ids")) 15 | -------------------------------------------------------------------------------- /topics/management/commands/update_pass_status.py: -------------------------------------------------------------------------------- 1 | # api/management/commands/update_pass_status.py 2 | 3 | from django.core.management.base import BaseCommand 4 | from topics.models import Topics 5 | 6 | 7 | class Command(BaseCommand): 8 | help = "批量更新题目的 pass_status 字段为 false" 9 | 10 | def handle(self, *args, **kwargs): 11 | updated_count = Topics.objects.update(pass_status=False) 12 | 
self.stdout.write( 13 | self.style.SUCCESS( 14 | f"Successfully updated pass_status for {updated_count} topics" 15 | ) 16 | ) 17 | -------------------------------------------------------------------------------- /topics/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.7 on 2025-02-16 17:02 2 | 3 | import django.db.models.deletion 4 | from django.db import migrations, models 5 | 6 | 7 | class Migration(migrations.Migration): 8 | 9 | initial = True 10 | 11 | dependencies = [] 12 | 13 | operations = [ 14 | migrations.CreateModel( 15 | name="Category", 16 | fields=[ 17 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 18 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 19 | ("id", models.AutoField(primary_key=True, serialize=False)), 20 | ( 21 | "name", 22 | models.CharField( 23 | blank=True, help_text="类别名", max_length=200, null=True 24 | ), 25 | ), 26 | ( 27 | "detail", 28 | models.CharField( 29 | blank=True, 30 | default=None, 31 | help_text="类别的描述", 32 | max_length=200, 33 | null=True, 34 | ), 35 | ), 36 | ], 37 | options={ 38 | "db_table": "sd_ls_category", 39 | }, 40 | ), 41 | migrations.CreateModel( 42 | name="NewsCategory", 43 | fields=[ 44 | ( 45 | "id", 46 | models.BigAutoField( 47 | auto_created=True, 48 | primary_key=True, 49 | serialize=False, 50 | verbose_name="ID", 51 | ), 52 | ), 53 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 54 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 55 | ("name", models.CharField(max_length=100)), 56 | ("slug", models.CharField(max_length=255, unique=True)), 57 | ("description", models.TextField(blank=True, null=True)), 58 | ], 59 | options={ 60 | "db_table": "sd_ls_news_category", 61 | }, 62 | ), 63 | migrations.CreateModel( 64 | name="NewsPlatform", 65 | fields=[ 66 | ( 67 | "id", 68 | models.BigAutoField( 69 | auto_created=True, 70 | primary_key=True, 71 | 
serialize=False, 72 | verbose_name="ID", 73 | ), 74 | ), 75 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 76 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 77 | ("name", models.CharField(max_length=100)), 78 | ("slug", models.SlugField(unique=True)), 79 | ( 80 | "description", 81 | models.CharField(blank=True, max_length=255, null=True), 82 | ), 83 | ], 84 | options={ 85 | "db_table": "sd_ls_news_platform", 86 | }, 87 | ), 88 | migrations.CreateModel( 89 | name="Topics", 90 | fields=[ 91 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 92 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 93 | ("id", models.AutoField(primary_key=True, serialize=False)), 94 | ( 95 | "order_id", 96 | models.PositiveIntegerField( 97 | blank=True, 98 | default=3, 99 | help_text="题目排序,根据难度分排序,会变", 100 | null=True, 101 | ), 102 | ), 103 | ( 104 | "title", 105 | models.CharField( 106 | blank=True, 107 | default=None, 108 | help_text="题目的标题,最好有趣个性化一点", 109 | max_length=200, 110 | ), 111 | ), 112 | ( 113 | "detail", 114 | models.TextField( 115 | blank=True, 116 | default="暂无表述", 117 | help_text="题目的描述:简单创造一个背景故事", 118 | null=True, 119 | ), 120 | ), 121 | ( 122 | "goal", 123 | models.TextField( 124 | blank=True, 125 | default="暂无描述", 126 | help_text="题目的目标:掌握xxx", 127 | null=True, 128 | ), 129 | ), 130 | ( 131 | "question", 132 | models.TextField( 133 | blank=True, 134 | default="暂无题目要求", 135 | help_text="题目要求", 136 | null=True, 137 | ), 138 | ), 139 | ( 140 | "answer", 141 | models.CharField( 142 | blank=True, help_text="题目的答案", max_length=255, null=True 143 | ), 144 | ), 145 | ( 146 | "category", 147 | models.CharField( 148 | blank=True, 149 | default="成神之路", 150 | help_text="题目类别:成神之路,xpath特训", 151 | max_length=100, 152 | null=True, 153 | ), 154 | ), 155 | ( 156 | "difficulty", 157 | models.CharField( 158 | blank=True, 159 | choices=[ 160 | ("beginner", "初级"), 161 | ("intermediate", "中级"), 162 | ("advanced", 
"高级"), 163 | ("ultimate", "终极"), 164 | ], 165 | default="简单", 166 | help_text="难度", 167 | max_length=12, 168 | null=True, 169 | ), 170 | ), 171 | ( 172 | "difficulty_score", 173 | models.BigIntegerField( 174 | blank=True, 175 | default=200, 176 | help_text="难度分数,后续根据此字段排序order_id", 177 | null=True, 178 | ), 179 | ), 180 | ( 181 | "points", 182 | models.TextField( 183 | blank=True, 184 | default="暂未更新考点", 185 | help_text="本题的考点", 186 | null=True, 187 | ), 188 | ), 189 | ( 190 | "published", 191 | models.BooleanField( 192 | blank=True, default=False, help_text="是否发布", null=True 193 | ), 194 | ), 195 | ( 196 | "pass_status", 197 | models.BooleanField( 198 | blank=True, default=False, help_text="是否通过", null=True 199 | ), 200 | ), 201 | ( 202 | "solution_txt", 203 | models.URLField( 204 | blank=True, 205 | default="暂无表述", 206 | help_text="题解,图文讲解", 207 | null=True, 208 | ), 209 | ), 210 | ( 211 | "solution_video", 212 | models.URLField( 213 | blank=True, default="暂无表述", help_text="视频讲解", null=True 214 | ), 215 | ), 216 | ( 217 | "api_type", 218 | models.CharField( 219 | blank=True, 220 | default="直接对应视图", 221 | help_text="此题目的接口类型:直接对应视图,访问一个接口判断后决定是否返回视图,返回一个视图+【多个】api", 222 | max_length=255, 223 | null=True, 224 | ), 225 | ), 226 | ( 227 | "api_prefix", 228 | models.CharField( 229 | blank=True, 230 | default="topic/", 231 | help_text="topic/,view/,api/", 232 | max_length=255, 233 | null=True, 234 | ), 235 | ), 236 | ( 237 | "response_path", 238 | models.TextField( 239 | blank=True, help_text="题目路径【文件名】", null=True 240 | ), 241 | ), 242 | ], 243 | options={ 244 | "db_table": "sd_ls_topic", 245 | "ordering": ["order_id"], 246 | }, 247 | ), 248 | migrations.CreateModel( 249 | name="News", 250 | fields=[ 251 | ( 252 | "id", 253 | models.BigAutoField( 254 | auto_created=True, 255 | primary_key=True, 256 | serialize=False, 257 | verbose_name="ID", 258 | ), 259 | ), 260 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 261 | ("updated_at", 
models.DateTimeField(auto_now=True, null=True)), 262 | ("title", models.CharField(max_length=200)), 263 | ( 264 | "url", 265 | models.CharField( 266 | blank=True, max_length=255, null=True, unique=True 267 | ), 268 | ), 269 | ("desc", models.CharField(blank=True, max_length=500, null=True)), 270 | ("publish_time", models.DateTimeField(blank=True, null=True)), 271 | ("timestamp", models.BigIntegerField(blank=True, default=0, null=True)), 272 | ("author", models.CharField(blank=True, max_length=255, null=True)), 273 | ("content", models.TextField()), 274 | ("hot", models.IntegerField(default=0)), 275 | ( 276 | "category", 277 | models.ForeignKey( 278 | blank=True, 279 | null=True, 280 | on_delete=django.db.models.deletion.CASCADE, 281 | related_name="news", 282 | to="topics.newscategory", 283 | ), 284 | ), 285 | ( 286 | "platform", 287 | models.ForeignKey( 288 | blank=True, 289 | null=True, 290 | on_delete=django.db.models.deletion.CASCADE, 291 | related_name="news", 292 | to="topics.newsplatform", 293 | ), 294 | ), 295 | ], 296 | options={ 297 | "db_table": "sd_ls_news", 298 | }, 299 | ), 300 | migrations.CreateModel( 301 | name="NewsRequestHistory", 302 | fields=[ 303 | ( 304 | "id", 305 | models.BigAutoField( 306 | auto_created=True, 307 | primary_key=True, 308 | serialize=False, 309 | verbose_name="ID", 310 | ), 311 | ), 312 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 313 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 314 | ("request_time", models.DateTimeField(auto_now_add=True)), 315 | ("response_data", models.JSONField()), 316 | ("status", models.CharField(blank=True, max_length=255, null=True)), 317 | ( 318 | "platform", 319 | models.ForeignKey( 320 | on_delete=django.db.models.deletion.CASCADE, 321 | related_name="history", 322 | to="topics.newsplatform", 323 | ), 324 | ), 325 | ], 326 | options={ 327 | "db_table": "sd_ls_news_request_history", 328 | }, 329 | ), 330 | migrations.CreateModel( 331 | 
name="TopicCategoryRelation", 332 | fields=[ 333 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 334 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 335 | ( 336 | "display_order", 337 | models.PositiveIntegerField(blank=True, default=100, null=True), 338 | ), 339 | ("id", models.AutoField(primary_key=True, serialize=False)), 340 | ( 341 | "category", 342 | models.ForeignKey( 343 | on_delete=django.db.models.deletion.CASCADE, 344 | to="topics.category", 345 | ), 346 | ), 347 | ( 348 | "topic", 349 | models.ForeignKey( 350 | on_delete=django.db.models.deletion.CASCADE, to="topics.topics" 351 | ), 352 | ), 353 | ], 354 | options={ 355 | "db_table": "sd_ls_topic_category_relation", 356 | "unique_together": {("topic", "category")}, 357 | }, 358 | ), 359 | ] 360 | -------------------------------------------------------------------------------- /topics/migrations/0002_web3newstag.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.7 on 2025-02-27 15:25 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ("topics", "0001_initial"), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name="Web3NewsTag", 15 | fields=[ 16 | ( 17 | "id", 18 | models.BigAutoField( 19 | auto_created=True, 20 | primary_key=True, 21 | serialize=False, 22 | verbose_name="ID", 23 | ), 24 | ), 25 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 26 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 27 | ("name", models.CharField(max_length=200)), 28 | ("tag", models.CharField(max_length=200)), 29 | ("name_en", models.CharField(max_length=200)), 30 | ("url", models.CharField(blank=True, max_length=255, null=True)), 31 | ("desc", models.CharField(blank=True, max_length=500, null=True)), 32 | ("desc_en", models.CharField(blank=True, max_length=500, null=True)), 33 | ], 34 | 
options={ 35 | "db_table": "sd_ls_news_web3_tags", 36 | }, 37 | ), 38 | ] 39 | -------------------------------------------------------------------------------- /topics/migrations/0003_web3news_web3newstag_tag_id_alter_web3newstag_name_and_more.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 5.0.7 on 2025-02-27 16:06 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ("topics", "0002_web3newstag"), 10 | ] 11 | 12 | operations = [ 13 | migrations.CreateModel( 14 | name="Web3News", 15 | fields=[ 16 | ( 17 | "id", 18 | models.BigAutoField( 19 | auto_created=True, 20 | primary_key=True, 21 | serialize=False, 22 | verbose_name="ID", 23 | ), 24 | ), 25 | ("created_at", models.DateTimeField(auto_now_add=True, null=True)), 26 | ("updated_at", models.DateTimeField(auto_now=True, null=True)), 27 | ("title", models.CharField(blank=True, max_length=200, null=True)), 28 | ("sub_title", models.CharField(blank=True, max_length=500, null=True)), 29 | ( 30 | "web_link", 31 | models.CharField( 32 | blank=True, max_length=255, null=True, unique=True 33 | ), 34 | ), 35 | ("author_name", models.CharField(blank=True, max_length=50, null=True)), 36 | ("published_time", models.DateTimeField(blank=True, null=True)), 37 | ], 38 | options={ 39 | "db_table": "sd_ls_news_web3", 40 | }, 41 | ), 42 | migrations.AddField( 43 | model_name="web3newstag", 44 | name="tag_id", 45 | field=models.IntegerField(blank=True, null=True), 46 | ), 47 | migrations.AlterField( 48 | model_name="web3newstag", 49 | name="name", 50 | field=models.CharField(blank=True, max_length=200, null=True), 51 | ), 52 | migrations.AlterField( 53 | model_name="web3newstag", 54 | name="name_en", 55 | field=models.CharField(blank=True, max_length=200, null=True), 56 | ), 57 | migrations.AlterField( 58 | model_name="web3newstag", 59 | name="tag", 60 | field=models.CharField(blank=True, 
max_length=200, null=True), 61 | ), 62 | ] 63 | -------------------------------------------------------------------------------- /topics/migrations/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/migrations/__init__.py -------------------------------------------------------------------------------- /topics/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | import json 3 | 4 | 5 | class BaseModel(models.Model): 6 | created_at = models.DateTimeField( 7 | auto_now_add=True, null=True, blank=True 8 | ) # 创建时间 9 | updated_at = models.DateTimeField(auto_now=True, null=True, blank=True) # 修改时间 10 | 11 | class Meta: 12 | abstract = True 13 | 14 | 15 | class OrderMixin(models.Model): 16 | display_order = models.PositiveIntegerField( 17 | blank=True, null=True, default=100 18 | ) # 记录显示顺序 19 | 20 | class Meta: 21 | abstract = True 22 | 23 | 24 | class Category(BaseModel): 25 | id = models.AutoField(primary_key=True) 26 | name = models.CharField(blank=True, null=True, max_length=200, help_text="类别名") 27 | detail = models.CharField( 28 | blank=True, null=True, default=None, max_length=200, help_text="类别的描述" 29 | ) 30 | 31 | class Meta: 32 | db_table = "sd_ls_category" # 自定义表名 33 | 34 | 35 | class Topics(BaseModel): 36 | DIFFICULTY_CHOICES = [ 37 | ("beginner", "初级"), 38 | ("intermediate", "中级"), 39 | ("advanced", "高级"), 40 | ("ultimate", "终极"), 41 | ] 42 | 43 | id = models.AutoField(primary_key=True) 44 | order_id = models.PositiveIntegerField( 45 | blank=True, null=True, default=3, help_text="题目排序,根据难度分排序,会变" 46 | ) 47 | title = models.CharField( 48 | blank=True, 49 | default=None, 50 | max_length=200, 51 | help_text="题目的标题,最好有趣个性化一点", 52 | ) 53 | detail = models.TextField( 54 | blank=True, 55 | null=True, 56 | default="暂无表述", 57 | 
help_text="题目的描述:简单创造一个背景故事", 58 | ) 59 | goal = models.TextField( 60 | blank=True, null=True, default="暂无描述", help_text="题目的目标:掌握xxx" 61 | ) 62 | question = models.TextField( 63 | blank=True, null=True, default="暂无题目要求", help_text="题目要求" 64 | ) 65 | answer = models.CharField( 66 | blank=True, null=True, max_length=255, help_text="题目的答案" 67 | ) 68 | category = models.CharField( 69 | blank=True, 70 | null=True, 71 | default="成神之路", 72 | max_length=100, 73 | help_text="题目类别:成神之路,xpath特训", 74 | ) 75 | difficulty = models.CharField( 76 | blank=True, 77 | null=True, 78 | default="简单", 79 | max_length=12, 80 | choices=DIFFICULTY_CHOICES, 81 | help_text="难度", 82 | ) 83 | difficulty_score = models.BigIntegerField( 84 | blank=True, 85 | null=True, 86 | default=200, 87 | help_text="难度分数,后续根据此字段排序order_id", 88 | ) 89 | points = models.TextField( 90 | blank=True, 91 | null=True, 92 | default="暂未更新考点", 93 | help_text="本题的考点", 94 | ) 95 | published = models.BooleanField( 96 | blank=True, null=True, default=False, help_text="是否发布" 97 | ) 98 | pass_status = models.BooleanField( 99 | blank=True, null=True, default=False, help_text="是否通过" 100 | ) 101 | solution_txt = models.URLField( 102 | blank=True, null=True, default="暂无表述", help_text="题解,图文讲解" 103 | ) 104 | solution_video = models.URLField( 105 | blank=True, null=True, default="暂无表述", help_text="视频讲解" 106 | ) 107 | api_type = models.CharField( 108 | blank=True, 109 | null=True, 110 | max_length=255, 111 | default="直接对应视图", 112 | help_text="此题目的接口类型:直接对应视图,访问一个接口判断后决定是否返回视图,返回一个视图+【多个】api", 113 | ) 114 | api_prefix = models.CharField( 115 | blank=True, 116 | null=True, 117 | max_length=255, 118 | default="topic/", 119 | help_text="topic/,view/,api/", 120 | ) 121 | response_path = models.TextField( 122 | blank=True, null=True, help_text="题目路径【文件名】" 123 | ) 124 | 125 | class Meta: 126 | db_table = "sd_ls_topic" # 自定义表名 127 | ordering = ["order_id"] # 默认按 order_id 排序 128 | 129 | def __str__(self): 130 | return self.title 131 | 132 | 
133 | # 题目与 Category 的中间表 134 | class TopicCategoryRelation(BaseModel, OrderMixin): 135 | id = models.AutoField(primary_key=True) 136 | 137 | topic = models.ForeignKey(Topics, on_delete=models.CASCADE) 138 | category = models.ForeignKey(Category, on_delete=models.CASCADE) 139 | 140 | class Meta: 141 | db_table = "sd_ls_topic_category_relation" # 自定义表名 142 | unique_together = ("topic", "category") # 确保唯一关系 143 | 144 | def __str__(self): 145 | return f"Topic {self.topic_id} - Category {self.category_id}" 146 | 147 | 148 | # 平台表 149 | class NewsPlatform(BaseModel): 150 | name = models.CharField(max_length=100) # 平台名称 151 | slug = models.SlugField(unique=True) # 用于URL的标识符 152 | description = models.CharField(blank=True, null=True, max_length=255) # 平台简介 153 | 154 | class Meta: 155 | db_table = "sd_ls_news_platform" 156 | 157 | def __str__(self): 158 | return self.name 159 | 160 | 161 | # 新闻类别表 162 | class NewsCategory(BaseModel): 163 | name = models.CharField(max_length=100) # 类别名称 164 | slug = models.CharField(unique=True, max_length=255) # 用于URL的标识符 165 | description = models.TextField(blank=True, null=True) # 类别简介 166 | 167 | class Meta: 168 | db_table = "sd_ls_news_category" 169 | 170 | def __str__(self): 171 | return self.name 172 | 173 | 174 | class NewsRequestHistory(BaseModel): 175 | request_time = models.DateTimeField(auto_now_add=True) 176 | response_data = models.JSONField() # 处理后的结果数据 177 | status = models.CharField(blank=True, null=True, max_length=255) 178 | platform = models.ForeignKey( 179 | NewsPlatform, related_name="history", on_delete=models.CASCADE 180 | ) # 所属平台 181 | 182 | class Meta: 183 | db_table = "sd_ls_news_request_history" 184 | 185 | def __str__(self): 186 | return f"Request at {self.request_time}" 187 | 188 | 189 | # 新闻表 190 | class News(BaseModel): 191 | title = models.CharField(max_length=200) # 新闻标题 192 | url = models.CharField( 193 | blank=True, null=True, unique=True, max_length=255 194 | ) # 新闻链接 195 | desc = 
models.CharField(max_length=500, blank=True, null=True) 196 | publish_time = models.DateTimeField(blank=True, null=True) 197 | timestamp = models.BigIntegerField(default=0, blank=True, null=True) 198 | author = models.CharField(max_length=255, blank=True, null=True) 199 | content = models.TextField() # 新闻内容 200 | hot = models.IntegerField(default=0) # 新闻内容 201 | platform = models.ForeignKey( 202 | NewsPlatform, 203 | related_name="news", 204 | on_delete=models.CASCADE, 205 | blank=True, 206 | null=True, 207 | ) # 所属平台 208 | category = models.ForeignKey( 209 | NewsCategory, 210 | related_name="news", 211 | on_delete=models.CASCADE, 212 | blank=True, 213 | null=True, 214 | ) # 新闻类别 215 | 216 | class Meta: 217 | db_table = "sd_ls_news" 218 | 219 | def __str__(self): 220 | return self.title 221 | 222 | 223 | # 新闻表 224 | class Web3NewsTag(BaseModel): 225 | tag_id = models.IntegerField(blank=True, null=True) 226 | name = models.CharField(max_length=200, blank=True, null=True) 227 | tag = models.CharField(max_length=200, blank=True, null=True) 228 | name_en = models.CharField(max_length=200, blank=True, null=True) 229 | url = models.CharField(blank=True, null=True, max_length=255) 230 | desc = models.CharField(max_length=500, blank=True, null=True) 231 | desc_en = models.CharField(max_length=500, blank=True, null=True) 232 | 233 | class Meta: 234 | db_table = "sd_ls_news_web3_tags" 235 | 236 | def __str__(self): 237 | return self.name 238 | 239 | 240 | class Web3News(BaseModel): 241 | title = models.CharField(max_length=200, blank=True, null=True) 242 | sub_title = models.CharField(max_length=500, blank=True, null=True) 243 | web_link = models.CharField(unique=True, max_length=255, blank=True, null=True) 244 | author_name = models.CharField(max_length=50, blank=True, null=True) 245 | published_time = models.DateTimeField(blank=True, null=True) 246 | 247 | class Meta: 248 | db_table = "sd_ls_news_web3" 249 | 250 | def __str__(self): 251 | return self.title 252 | 
-------------------------------------------------------------------------------- /topics/scheduler.py: -------------------------------------------------------------------------------- 1 | # myapp/scheduler.py 2 | from apscheduler.schedulers.background import BackgroundScheduler 3 | from apscheduler.triggers.interval import IntervalTrigger 4 | from apscheduler.triggers.cron import CronTrigger # 使用 CronTrigger 5 | from topics.tasks import fetch_and_save_news # 假设你有任务在 tasks.py 中 6 | 7 | 8 | def start_scheduler(): 9 | scheduler = BackgroundScheduler() 10 | scheduler.add_job( 11 | fetch_and_save_news, 12 | trigger=CronTrigger( 13 | minute="0,10,20,30,40,50" 14 | ), # 每个整十分(00, 10, 20, 30, 40, 50)执行一次 15 | # trigger=IntervalTrigger(minutes=10), # 每 10 分钟执行一次 16 | id="fetch_and_save_news", 17 | name="定时任务:十分钟一次,获取新闻", 18 | replace_existing=True, 19 | ) 20 | scheduler.start() 21 | -------------------------------------------------------------------------------- /topics/serializers.py: -------------------------------------------------------------------------------- 1 | from rest_framework import serializers 2 | from .models import Topics 3 | 4 | 5 | class TopicsSerializer(serializers.ModelSerializer): 6 | class Meta: 7 | model = Topics 8 | fields = "__all__" 9 | 10 | def to_representation(self, instance): 11 | representation = super().to_representation(instance) 12 | response_path = representation.get("response_path") 13 | api_prefix = representation.get("api_prefix") 14 | 15 | representation["response_path"] = f"{api_prefix}{response_path}" 16 | 17 | # if api_prefix.startswith('url/'): 18 | # pass 19 | # elif response_path: 20 | # representation['response_path'] = f"{api_prefix}{response_path}" 21 | 22 | return representation 23 | -------------------------------------------------------------------------------- /topics/static/topics/css/style.css: -------------------------------------------------------------------------------- 1 | td, th { 2 | white-space: nowrap; 3 | 
def fetch_and_save_news():
    """Fetch the latest news for every known platform and persist it.

    For each platform slug returned by ``get_all_platforms()``:
      * download the news payload via ``fetch_platform_news``,
      * insert each item as a ``News`` row (failed rows — typically
        unique-constraint duplicates — are logged and skipped),
      * archive the raw payload in ``NewsRequestHistory``,
      * trim the history table so only the newest 100 rows survive once it
        grows past 100 (the old inline comments claimed 50, contradicting
        the actual slice; the comments are corrected here to match the code).

    Sleeps 1s between platforms to avoid hammering the upstream sources.
    """
    for platform_slug in get_all_platforms():
        # NOTE(review): ``get`` raises NewsPlatform.DoesNotExist if the slug is
        # unknown — presumably slugs are pre-seeded; verify against the seeder.
        platform = NewsPlatform.objects.get(slug=platform_slug)
        print(f"采集{platform.name} {platform.slug}中...")
        data = fetch_platform_news(platform.slug)
        if data:
            for item in data:
                try:
                    # Attach the owning platform before insertion (requires py3.9+ dict |).
                    News.objects.create(**(item | {"platform": platform}))
                except Exception as e:
                    # Most failures are duplicate web_link/title rows; log instead of
                    # silently discarding so genuine errors remain visible.
                    print(f"保存新闻失败,已跳过: {e}")
                    continue
            NewsRequestHistory.objects.create(response_data=data, platform=platform)
            # History trimming: once more than 100 rows exist, keep only the
            # newest 100 and delete the rest.
            total = NewsRequestHistory.objects.count()
            print(f"记录数量:{total}")
            if total > 100:
                # IDs of the newest 100 rows; everything else is purged.
                preserve_ids = list(
                    NewsRequestHistory.objects.order_by("-id")[:100].values_list("id", flat=True)
                )
                NewsRequestHistory.objects.exclude(id__in=preserve_ids).delete()
            print(f"{platform.name} {platform.slug} 保存完成")
        else:
            # Plain string — the old code used an f-string with no placeholders.
            print("请求出错了!")
        sleep(1)
-------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | 4 | {% block content %} 5 | 6 | 9 |
10 | 找不到你想要的页面了~ 11 |
12 | {% endblock %} 13 | -------------------------------------------------------------------------------- /topics/templates/topics/base.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 3 | 4 | 5 | 6 | 7 | 8 | {% block charset %} 9 | {% endblock %} 10 | 11 | 12 | {% block title %}爬虫百战成神{% endblock %} 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 33 | 34 | 35 | 36 |
37 | {% block header %} 38 | {% include "topics/header.html" %} 39 | {% endblock %} 40 |
41 | 42 |
43 | 44 | {% block question %}{% endblock %} 45 | 46 | {% block answer %} 47 | 48 |
49 |

提交答案

50 |
51 |
52 | 53 | 54 |
55 | 56 |
57 |
58 |
59 | 60 | 92 | {% endblock %} 93 | {% block content %}{% endblock %} 94 |
95 | 96 | 97 | 98 | 99 | 121 | {% block script %} 122 | 123 | {% endblock %} 124 | -------------------------------------------------------------------------------- /topics/templates/topics/demo.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %}新闻类别{% endblock %} 4 | {% block answer %} 5 | 6 | {% endblock %} 7 | {% block content %} 8 |
9 |

新闻类别

10 | 11 |
12 | 13 |
14 |
15 | 国际新闻 16 |
17 |
国际新闻
18 |

了解全球范围内的新闻热点,聚焦国际局势。

19 |

来源:BBC, CNN, Al Jazeera

20 | 查看国际新闻 21 |
22 |
23 |
24 | 25 | 26 |
27 |
28 | 国内新闻 29 |
30 |
国内新闻
31 |

关注本国的时事新闻,涵盖社会、政治、经济等各个方面。

32 |

来源:新华网, 人民日报, 中国日报

33 | 查看国内新闻 34 |
35 |
36 |
37 | 38 | 39 |
40 |
41 | 体育新闻 42 |
43 |
体育新闻
44 |

关注各类体育赛事、运动员动态及体育评论。

45 |

来源:ESPN, 体育日报, FIFA

46 | 查看体育新闻 47 |
48 |
49 |
50 | 51 | 52 |
53 |
54 | 科技新闻 55 |
56 |
科技新闻
57 |

报道最新的科技趋势、创新产品、以及技术突破。

58 |

来源:TechCrunch, Wired, The Verge

59 | 查看科技新闻 60 |
61 |
62 |
63 | 64 | 65 |
66 |
67 | 娱乐新闻 68 |
69 |
娱乐新闻
70 |

聚焦电影、电视剧、明星八卦及娱乐圈动态。

71 |

来源:娱乐头条, 新浪娱乐, 腾讯娱乐

72 | 查看娱乐新闻 73 |
74 |
75 |
76 | 77 | 78 |
79 |
80 | 财经新闻 81 |
82 |
财经新闻
83 |

提供全球财经新闻,涉及股票、市场、投资等领域。

84 |

来源:华尔街日报, 财新网, Bloomberg

85 | 查看财经新闻 86 |
87 |
88 |
89 |
90 |
91 | {% endblock %} 92 | -------------------------------------------------------------------------------- /topics/templates/topics/header.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 3 | 4 | -------------------------------------------------------------------------------- /topics/templates/topics/index/index.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block answer %} 4 | 5 | {% endblock %} 6 | {% block content %} 7 | 54 | 55 |
56 | 57 | 58 |
59 |
60 | 莱卡云 61 |
62 |
莱卡云
63 |

低价格高性能服务器,速速上车,开启学习之旅~

莱卡云拥有精选华中/香港/美国/欧洲多种(境内境外)云服务器产品(境外服务器可以免备案),低至25元每月。 64 |

65 | 立即购买 67 |
68 |
69 |
70 | 71 | 72 |
73 |
74 | ... 76 |

爬虫百战成神 - LearnSpider

77 |
78 | 靶场 79 | 爬虫 80 | 教程 81 | 自动化 82 | 一站式 83 | DrissionPage 84 | Python 85 |
86 |
87 | 88 |
89 |
90 | 不仅是一个练习场,也配套有每道题目的多种技术示例代码,文档讲解,视频演示。 91 |
92 |
93 | 题目由易到难,由浅入深,想让大家在刷题与实践的过程中甚至是从零学会爬虫(因为思想学会后,剩下的代码其实就是工具的使用),在这个过程中增加自己对于代码和场景的理解。 94 |
95 |
96 | 本项目目标覆盖爬虫初级,进阶和高级。涉及到requests,scrapy这些请求工具,还有selenium,drissionpage这些自动化工具框架。 97 |
98 |
99 | 涉及到接口请求,静态页面解析,也涉及到代码混淆,接口加密,也包含各种抓包工具的使用,chrome开发者工具的使用等。 100 |
101 |
102 | 包括一些新颖的反爬技术,比如前端层面的反爬,svg反爬,css反爬,雪碧图等,也比如新兴的反爬技术比如wasm。 103 |
104 |
105 | 总之就是我会什么,就想教大家什么。所以此仓库的内容也会无限拓展。也欢迎大家的贡献。 106 |
107 | 108 |
109 | 此仓库旨在让大家在刷题的过程中以结果和成就感驱动学习,学习到某个知识点后可以快速应用,从而感受到学到了东西,感受到爬虫是如此的简单有趣。而不是学完之后因为网站内容变动而没有刷题的地方,久而久之像没学一样。 110 |
111 |
112 | 并且本项目最想让大家养成举一反三,逻辑推理的思考思维习惯。 113 |
114 |
115 | 116 |
117 |

cpython666 出品

118 | 开启爬虫成神之旅 119 |
120 |
121 | 122 | 123 |
124 |
125 | 猿人学 126 |
127 |
猿人学
128 |

爬虫逆向反爬虫课程,3000爬虫er圈子,不定期聚会,行业顶级资源。

爬虫行业每10人中有一人就是猿人学的学员。

报课提站长名字可享内部优惠价格。 129 |

130 | 了解更多 131 |
132 |
133 |
134 | 135 |
136 | 137 | 138 | 144 | 145 |
146 |

赞助商

147 |
148 | 157 | 166 | 175 | 185 |
186 |
187 | 188 | 189 |
190 |

友情链接

191 |
192 | 201 | 211 |
212 |
213 | 214 | {% endblock %} 215 | -------------------------------------------------------------------------------- /topics/templates/topics/index/list.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block answer %}{% endblock %} 4 | 5 | {% block content %} 6 |
7 |
8 | 题目列表 9 |
10 |
11 | 12 |
13 | 14 | 25 |
26 | 27 |
28 | 29 | 30 | 31 | 32 | 36 | 37 | 38 | 39 | 40 | 41 |
# 34 | 名称 35 | 链接
42 |
43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 |
难度发布状态题解类别简介
57 |
58 |
59 | 63 |
64 |
65 | 66 | {% endblock %} 67 | 68 | {% block script %} 69 | 237 | 238 | {% endblock %} -------------------------------------------------------------------------------- /topics/templates/topics/index/sandbox.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %}试金场{% endblock %} 4 | {% block answer %} 5 | 6 | {% endblock %} 7 | {% block content %} 8 |
9 |
10 | 11 |
12 | 13 |
14 | 新闻类型爬虫工具 15 |
16 |
星梦News
17 |

这个工具可以帮助你爬取新闻网站的新闻内容,如标题、正文和作者等。

18 | 进入工具 19 |
20 |
21 |
22 | 23 |
24 | 25 |
26 | 图片类型爬虫工具 27 |
28 |
星梦WallPaper
29 |

通过此工具,你可以爬取图片资源并进行批量下载。

30 | 进入工具 31 |
32 |
33 |
34 | 35 |
36 | 37 |
38 | 视频评论爬虫工具 39 |
40 |
星梦Video
41 |

这个工具帮助你抓取视频平台的评论数据,包括点赞、回复等。

42 | 敬请期待~ 43 |
44 |
45 |
46 | 47 |
48 | 49 |
50 | 音乐评论爬虫工具 51 |
52 |
星梦Music
53 |

该工具可爬取音乐平台中的歌曲评论,并按用户或时间排序。

54 | 敬请期待~ 55 |
56 |
57 |
58 | 59 |
60 | 61 |
62 | 电商商品爬虫工具 63 |
64 |
星梦Shop
65 |

此工具能帮助你爬取电商平台的商品信息,包括价格、评价、销量等。

66 | 敬请期待~ 67 |
68 |
69 |
70 |
71 |
72 | {% endblock %} 73 | -------------------------------------------------------------------------------- /topics/templates/topics/index/shorthand.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %}速记手册{% endblock %} 4 | {% block answer %} 5 | 6 | {% endblock %} 7 | {% block content %} 8 |
9 | ... 10 |
11 | 速记手册 12 |
13 |
14 | {% endblock %} -------------------------------------------------------------------------------- /topics/templates/topics/index/tools.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %}工具箱{% endblock %} 4 | 5 | {% block answer %} 6 | 7 | {% endblock %} 8 | 9 | {% block content %} 10 |
11 |
12 | 13 | 27 | 28 | 29 |
30 |
31 | 32 |
33 |
34 | 工具1 35 |
36 |
工具1
37 |

工具1的简短描述。它帮助你进行各种任务。

38 | 进入工具 39 |
40 |
41 |
42 | 43 | 44 |
45 |
46 | 工具2 47 |
48 |
工具2
49 |

工具2的简短描述。它用于处理复杂的数据分析任务。

50 | 进入工具 51 |
52 |
53 |
54 | 55 | 56 |
57 |
58 | 工具3 59 |
60 |
工具3
61 |

工具3是一个高效的文本处理工具,适用于各种文本分析需求。

62 | 进入工具 63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 | {% endblock %} 71 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/111.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 雪碧图 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

雪碧图

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/112.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 雪碧图 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

雪碧图

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/ajax.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %} 4 | 何为动态网页 5 | {% endblock %} 6 | {% block question %} 7 |

您好🦆, 爬虫er~👋

8 |

9 | 什么是动态网页呢?🤠 10 |

11 | 查看这个网页的源代码,可以找到数据吗?
12 | 请问:动态网页中加载的数据在浏览器的开发者工具的网络标签栏下面的哪个筛选框里?【完整的内容,区分大小写,包括分隔符】 13 | 14 | {% endblock %} 15 | 16 | {% block content %} 17 |
18 |

动态内容

19 |
20 | 21 |
22 |
23 | 55 | {% endblock %} 56 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/asyncio.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 协程,不是携程奥😁 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

协程,不是携程奥😁

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/course-buying-guide.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 爬虫买课指南 5 | {% endblock %} 6 | {% block question %} 7 |

8 | 又又又看完一篇文章之后,这个时候是否对于报班学习有一定认识了呢? 9 | 请回答:《有点感觉了》 10 |

11 | {% endblock %} 12 | {% block content %} 13 |

爬虫买课指南

14 |

15 | 学习爬虫需要报班吗?你需要知道的事情。 16 |

17 | 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/css-sprite.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 雪碧图 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

雪碧图

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/demo.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {# #} 6 | Title 7 | 8 | 9 | 吧小伙伴还是喜爱u不误 10 | 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/demo_get_server_time.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 7 | 8 | Document 9 | 10 | 11 |

12 | 13 | 63 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/devtools.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 开发者工具 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

开发者工具

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/easy-spider.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 对爬虫祛魅 5 | {% endblock %} 6 | {% block question %} 7 |

8 | 看完本篇文章之后,你还害怕爬虫吗? 9 | 请回答:《不害怕!!!》 10 |

11 | {% endblock %} 12 | {% block content %} 13 |

对爬虫祛魅

14 |

15 | 知己知彼,方能百战不胜,最好对将要学习的东西先有个大局观。不然会一次又一次遇到新东西,好像看不到尽头,进而磨灭热情。 16 |

17 | 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/h1-6.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 杂牌军写的前端代码 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

杂牌军写的前端代码

9 | 10 | 11 | 12 | {% endblock %} 13 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/hello-get.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | GET请求 5 | {% endblock %} 6 | 7 | {% block question %} 8 |

GET 请求介绍

9 |

10 | GET 请求是 HTTP 协议中最常见的请求方法之一,通常用于从服务器获取资源。你了解它吗?
11 | 请回答:【了解】或【不了解】 12 |

13 | {% endblock %} 14 | 15 | {% block content %} 16 |

17 | GET 请求是用于向服务器请求数据的一种方法。它的特点是:所有的数据(如查询参数)都会附加在 URL 中, 18 | 并通过浏览器的地址栏进行发送。GET 请求不会对服务器的数据进行修改,只是用于获取资源。 19 |

20 |

GET 请求的特点:

21 |
    22 |
  • 请求参数通过 URL 传递(例如:`?name=value&age=30`)。
  • 23 |
  • 数据大小受限(通常约为 2048 字符)。
  • 24 |
  • 请求是安全的,因为它不会修改服务器上的资源。
  • 25 |
  • GET 请求通常是幂等的,也就是说多次请求相同的 URL 会得到相同的响应。
  • 26 |
  • GET 请求支持浏览器缓存。
  • 27 |
28 | 29 |

示例:

30 |

如果我们通过以下 GET 请求访问一个页面:

31 |
32 | GET /search?q=python HTTP/1.1
33 | Host: example.com
34 | 
35 | 36 |

服务器会根据查询参数 `q=python` 返回相应的资源(比如搜索结果)。

37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/hello-post-form.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | POST请求-表单请求体 5 | {% endblock %} 6 | 7 | {% block question %} 8 |

POST 请求类型:JSON 与表单的区别

9 |

10 | 你知道 POST 请求发送数据的不同格式吗?我们有 JSON 格式和表单格式的区别哦!
11 | 请回答:【了解】或【不了解】 12 |

13 | {% endblock %} 14 | 15 | {% block content %} 16 |

17 | POST 请求是 HTTP 协议中常用的一种请求方法,主要用于向服务器提交数据。在提交数据时,POST 请求有不同的 18 | **数据格式**,其中最常用的有: 19 |

20 | 21 |
22 | 23 | 45 | 46 | {% endblock %} 47 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/hello-post-json.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | POST请求-JSON请求体 5 | {% endblock %} 6 | 7 | {% block question %} 8 |

POST 请求介绍

9 |

10 | POST 请求用于向服务器提交数据,你知道怎么通过代码发送一个 POST 请求吗?
11 | 请回答:【了解】或【不了解】 12 |

13 | {% endblock %} 14 | 15 | {% block content %} 16 | 17 |
18 | 19 | 43 | 44 | {% endblock %} 45 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/intro.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 题目介绍 5 | {% endblock %} 6 | {% block question %} 7 |

8 | 主要介绍这些题目的特点 9 |
10 | 11 | 请问:如何获取到数据呢?请回答:【灵活变通】 12 |

13 | {% endblock %} 14 | {% block content %} 15 |

题目介绍

16 |

17 | 不管你是用什么方式,自动化,协议,甚至是,复制粘贴。没错,不择手段,只要你能够获取答案!当然,我会尽量把题目设计成让你无法用手动解决,嘿嘿嘿~。 18 |
尽量做到从易到难,由浅入深,遇到黄色报错页面不要慌,是因为题目还没出到那里。 19 |

20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/lsp-spider.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %} 4 | 色图收集者 5 | {% endblock %} 6 | {% block question %} 7 |

您好🦆, 爬虫er~👋

8 |

9 | 下面是一些精彩的色图展示,请欣赏!🎨 10 |

11 | 将所有的图片下载到本地,存放到【学习资料】文件夹,图片命名为:标题-描述-图片名 12 |
13 | 偷着乐吧小伙子,这道题目答案是:【我只是爱学习】 14 |
15 | 记得常常温习哟~😉 16 | 20 | {% endblock %} 21 | 22 | {% block content %} 23 |
24 |

色图收集者

25 |
26 | 27 |
28 |
29 | 色图 1 30 |
31 |
色图标题 1
32 |

描述:这是一幅色彩鲜艳的图像,展示了美丽的风景。

33 |
34 |
35 |
36 | 37 |
38 |
39 | 色图 2 40 |
41 |
色图标题 2
42 |

描述:这幅图像展示了丰富的色彩和动态的设计。

43 |
44 |
45 |
46 | 47 |
48 |
49 | 色图 3 50 |
51 |
色图标题 3
52 |

描述:这幅图像以抽象的形式展现了色彩的碰撞。

53 |
54 |
55 |
56 | 57 |
58 |
59 | 色图 4 60 |
61 |
色图标题 4
62 |

描述:这幅图像以优雅的色调展示了艺术的美感。

63 |
64 |
65 |
66 | 67 |
68 |
69 | 色图 5 70 |
71 |
色图标题 5
72 |

描述:这幅图像充满了生动的色彩和细腻的细节。

73 |
74 |
75 |
76 | 77 |
78 |
79 | 色图 6 80 |
81 |
色图标题 6
82 |

描述:这幅图像展示了色彩和形状的完美结合。

83 |
84 |
85 |
86 |
87 |
88 | {% endblock %} 89 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/news.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 新闻收集者 5 | {% endblock %} 6 | {% block question %} 7 |

您好🦆, 爬虫er~👋

8 |

9 | 以下是一些与爬虫相关的新闻,希望你喜欢!🕷 10 |

11 | 获取所有的新闻信息,输出这样的格式:[{"title":"xxx","auther":"xxx","publish_date":"xxx","detail":"xxx"},...] 12 |
13 | 成功获取了吗?请回答【成功了!】 14 | {% endblock %} 15 | 16 | {% block content %} 17 |
18 |

新闻收集者

19 |
20 | 21 |
22 |
23 |
24 |
Python 爬虫挑战赛圆满结束
25 |
发布日期: 2024-07-30
26 |

作者: 爬虫大师

27 |

本次 Python 爬虫挑战赛吸引了来自全球的数百名爬虫爱好者。经过激烈的比拼,最终产生了前三名获奖者。他们的作品展示了爬虫技术的无限可能。

28 |
29 |
30 |
31 | 32 |
33 |
34 |
35 |
新型反爬虫技术亮相科技大会
36 |
发布日期: 2024-07-28
37 |

作者: 安全先锋

38 |

在今年的科技大会上,多个公司展示了他们最新的反爬虫技术。这些技术旨在保护网站内容,防止未经授权的爬取行为,引起了广泛关注。

39 |
40 |
41 |
42 | 43 |
44 |
45 |
46 |
知名数据科学家分享爬虫最佳实践
47 |
发布日期: 2024-07-25
48 |

作者: 数据达人

49 |

著名数据科学家在最近的研讨会上分享了爬虫的最佳实践,包括如何处理反爬机制、如何提高爬虫效率以及数据清洗的技巧,受到了与会者的热烈欢迎。

50 |
51 |
52 |
53 | 54 |
55 |
56 |
57 |
Web Scraping 在商业中的应用
58 |
发布日期: 2024-07-22
59 |

作者: 商业分析师

60 |

越来越多的企业开始利用 Web Scraping 技术来收集竞争对手的信息、市场趋势以及客户反馈。本文介绍了几种典型的商业应用案例。

61 |
62 |
63 |
64 | 65 |
66 |
67 |
68 |
AI 与爬虫技术的结合:未来展望
69 |
发布日期: 2024-07-20
70 |

作者: 技术前沿

71 |

随着 AI 技术的发展,爬虫技术也在不断进步。未来,AI 将在爬虫中发挥更加重要的作用,帮助爬虫变得更加智能和高效。

72 |
73 |
74 |
75 | 76 |
77 |
78 |
79 |
爬虫技术在学术研究中的应用
80 |
发布日期: 2024-07-18
81 |

作者: 学术达人

82 |

爬虫技术在学术研究中有着广泛的应用,例如数据收集、文献综述和社会网络分析。本文探讨了几种典型的学术应用场景。

83 |
84 |
85 |
86 |
87 |
88 | {% endblock %} 89 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/pagination-1.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %} 4 | 翻页网页 5 | {% endblock %} 6 | {% block question %} 7 |

您好🦆, 爬虫er~👋

8 |

9 | 这是需要翻页的网页🫣 10 |

11 | 它与上一道题目没太大区别对吧,只是多加了一个循环。
12 | 请问:第一百页返回的内容是什么?【一个json格式的答案】 13 | 14 | {% endblock %} 15 | 16 | {% block content %} 17 |
18 |

翻页内容

19 |
20 | 21 |
22 | 35 |
36 | 37 | 81 | {% endblock %} 82 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/pagination-2.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %}翻页网页2{% endblock %} 4 | {% block question %} 5 |

您好🦆, 爬虫er~👋

6 |

7 | 出bug了吗?🫣 8 |

9 | 怎么又回到列表页了?哎,对不起了🥺,直接把题目列表页面copy过来了。
10 | 发现这个翻页列表与上个翻页列表的区别了吧!请问每页都点一下请求了几次数据接口? 11 | 12 | {% endblock %} 13 | {% block content %} 14 |
15 |
16 | 题目列表 17 |
18 |
19 |
20 | 21 | 22 | 23 | 24 | 28 | 29 | 30 | 31 | 32 | 33 |
# 26 | 名称 27 | 链接
34 |
35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 |
难度状态题解类别简介
48 |
49 |
50 | 54 |
55 |
56 | 57 | {% endblock %} 58 | 59 | {% block script %} 60 | 212 | 213 | {% endblock %} -------------------------------------------------------------------------------- /topics/templates/topics/pages/pagination-table.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %} 4 | 翻页表格 5 | {% endblock %} 6 | {% block question %} 7 |

您好🦆, 爬虫er~👋

8 |

9 | 这是一个翻页的网页,并且数据格式是表格🫣 10 |

11 | 它与翻页题目没太大区别对吧,只是显示成了一个表格。
12 | {% endblock %} 13 | {% block answer %}{% endblock %} 14 | {% block content %} 15 |
16 |

翻页表格

17 |
18 | 19 |
20 | 33 |
34 | 35 | 115 | {% endblock %} 116 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/redirect.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 网页重定向 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

网页重定向

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/rowspan-table.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 跨行表格 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

跨行表格

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/spider-guide.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 爬虫学习指南 5 | {% endblock %} 6 | {% block question %} 7 |

8 | 又又看完一篇文章之后,你还害怕爬虫吗?
9 | 请问up推荐使用的浏览器是什么浏览器?【五个汉字】 10 |

11 | {% endblock %} 12 | {% block content %} 13 |

爬虫学习指南

14 |

15 | 爬虫涉及到的学科,如何学编程,网上自学优缺点,我的学习方法和建议。 16 |

17 | 20 | {% endblock %} 21 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/spider-roadmap.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 爬虫学习路线 5 | {% endblock %} 6 | {% block question %} 7 |

8 | 又又又看完一篇文章之后,有没有对于爬虫更清晰一点? 9 |
10 | 11 | 请问:dp的全称是什么?【12个字母,其中有两个是大写】 12 |

13 | {% endblock %} 14 | {% block content %} 15 |

爬虫学习路线

16 |

17 | 爬虫的学习路线,爬虫的人生规划 18 |

19 | 22 | {% endblock %} 23 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/svg.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | svg 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

svg

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/table-key-value.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 表格键值对获取 5 | {% endblock %} 6 | 7 | {% block content %} 8 |

表格键值对获取

9 | 10 | {% endblock %} 11 | -------------------------------------------------------------------------------- /topics/templates/topics/pages/wenjuan.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %} 4 | 自动化问卷填写 5 | {% endblock %} 6 | {% block question %} 7 |

自动化问卷填写🤖

8 |

9 | 解决自己的需求或者赚钱...都是一个有力的理由!🫣 10 |

11 |

12 | 小新为了一个作业,要填写几千份问卷!!!!!!
13 | 他无法想象要填写到什么时候,直到他从朋友那里得知了Python自动化...
14 | 没有花里胡哨的操作,只是拼接了一些简单基础的代码来自动化填写问卷。 15 |

16 | {% endblock %} 17 | 18 | {% block content %} 19 | 20 |
21 |
22 | 23 |
24 | 25 |
26 | 27 |
28 |
29 | 30 |
31 |
32 | 33 |
34 | 35 |
36 | 37 |
38 |
39 | 40 |
41 |
42 |
43 |
44 | 45 |
46 |
47 | 48 |
49 | 50 |
51 | 52 |
53 | 54 |
55 |
56 | 57 |
58 |
59 | 60 |
61 | 62 |
63 | 64 |
65 | 66 |
67 |
68 | 69 |
70 |
71 |
78 |
79 | 80 |
81 |
82 | 83 |
84 | 85 |
86 | 87 |
88 | 89 |
90 |
91 | 92 |
93 |
94 | 95 |
96 | 97 |
98 |
99 | 100 |
101 |
102 | 103 |
104 | 105 |
106 | 107 |
108 | 109 |
110 |
111 | 112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 | 121 | 122 | 130 | {% endblock %} 131 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/about_us.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% load static %} 3 | {% block content %} 4 |
5 | 6 |

关于我们

7 | 8 | 9 |
10 |

我们的使命

11 |

12 | 星梦新闻致力于为用户提供最新、最全面的新闻信息。我们的目标是让用户能够随时随地了解全球范围内的热点新闻,覆盖各类领域,包括科技、娱乐、体育等。通过高效的新闻推荐和精准的搜索引擎,我们希望帮助用户更好地获取感兴趣的内容。 13 |

14 |
15 | 16 | 17 |
18 |

我们的团队

19 |
20 |
21 |
22 | 团队成员1 23 |
24 |
张三
25 |

项目经理,负责整体项目的规划和管理。

26 |
27 |
28 |
29 |
30 |
31 | 团队成员2 32 |
33 |
李四
34 |

前端开发,负责用户界面的设计与开发。

35 |
36 |
37 |
38 |
39 |
40 | 团队成员3 41 |
42 |
王五
43 |

后端开发,负责数据库和服务端的开发与优化。

44 |
45 |
46 |
47 |
48 |
49 | 50 | 51 |
52 |

联系我们

53 |

如果你有任何问题或建议,欢迎通过以下方式联系我们:

54 |
    55 |
  • 电子邮件:2942581284@qq.com
  • 56 |
  • 官方B站账号:@Python斗罗
  • 57 | {#
  • 客服热线:400-123-4567
  • #} 58 |
59 |
60 |
61 | {% endblock %} 62 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/category.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% load static %} 3 | {% block title %}新闻来源与类别{% endblock %} 4 | {% block content %} 5 |
6 |
7 | 8 |
9 |
10 |
11 |

新闻来源

12 |
13 |
14 | 23 |
24 |
25 |
26 | 27 | 28 |
29 |
30 |
31 |

新闻类别

32 |
33 |
34 | 45 |
46 |
47 |
48 |
49 |
50 | {% endblock %} 51 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/detail_category.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% block title %}{{ category.name }} 详情{% endblock %} 3 | {% block content %} 4 |
5 |

{{ category.name }}

6 |

{{ category.description }}

7 |
8 | {% endblock %} 9 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/detail_news.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% block content %} 3 |
4 |

{{ news_item.title }}

5 |

{{ news_item.content }}

6 | 返回首页 7 |
8 | {% endblock %} 9 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/detail_source.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% block title %}{{ source.name }} 详情{% endblock %} 3 | {% block content %} 4 |
5 |

{{ source.name }}

6 |

{{ source.description }}

7 |
8 | {% endblock %} 9 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/news_base.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %}星梦新闻{% endblock %} 4 | {% block header %} 5 | {% include "topics/sandbox/news/news_header.html" %} 6 | 7 | {% endblock %} 8 | {% block answer %} 9 | 10 | {% endblock %} 11 | {% block content %} 12 | {# 每个页面的内容#} 13 | {% endblock %} 14 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/news_header.html: -------------------------------------------------------------------------------- 1 | {% load static %} 2 | 3 | 4 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/news_hot.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | 3 | {% block title %}热榜{% endblock %} 4 | 5 | {% block content %} 6 |
7 |
8 | {% for platform, news_list in platform_news_mapping.items %} 9 |
10 |
11 |
12 |
{{ platform.name }}
13 | 查看更多 15 |
16 |
17 | {% if news_list %} 18 |
    19 | {% for news in news_list %} 20 |
  • 21 | 22 |
    23 | 26 | {#{% if forloop.counter == 1 %}bg-warning text-white{% elif forloop.counter == 2 %}bg-secondary text-white{% elif forloop.counter == 3 %}bg-success text-white{% else %}bg-secondary text-white{% endif %}">#} 27 | {{ forloop.counter }} 28 | 29 | 31 | {{ news.title }} 33 | 34 |
    35 |
  • 36 | {% endfor %} 37 |
38 | {% else %} 39 |

暂无新闻

40 | {% endif %} 41 |
42 |
43 |
44 | {% endfor %} 45 |
46 |
47 | 48 | 49 | 55 | 56 | {% endblock %} 57 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/news_hot_detail.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | 3 | {% block title %}热榜{% endblock %} 4 | 5 | {% block content %} 6 |
7 |

平台热榜

8 | 9 | 10 |
11 |
12 |
13 | {% for platform in platforms %} 14 | 21 | {% endfor %} 22 |
23 |
24 |
25 | 26 | {% if selected_platform %} 27 |
28 | 29 |
30 |
31 |
32 |

{{ selected_platform.name }} 热榜

33 |
34 |
35 | 36 |
37 | {% for news in news_lst %} 38 |
39 |
40 | 41 |
42 | #} 45 | {% if forloop.counter == 1 %}bg-warning text-white{% elif forloop.counter == 2 %}bg-secondary text-white{% elif forloop.counter == 3 %}bg-success text-white{% else %}bg-light text-dark{% endif %}"> 46 | 47 | {{ forloop.counter }} 48 | 49 |
50 | 61 |
62 |
63 | {% empty %} 64 |
65 |

该平台暂无热榜新闻。

66 |
67 | {% endfor %} 68 |
69 |
70 |
71 |
72 |
73 | {% else %} 74 |
75 |

请选择一个平台来查看热榜。

76 |
77 | {% endif %} 78 |
79 | {% endblock %} 80 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/news_index.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% load static %} 3 | {% block content %} 4 |
5 | 6 |
7 | 星梦新闻 Logo 9 |

星梦新闻

10 |
11 | 12 | 13 | 22 | 23 | 24 |
25 |

最新热点新闻

26 |
27 | {% for news_item in latest_news %} 28 |
29 |
30 |
31 |
32 | {{ news_item.title }} 34 |
35 |

{{ news_item.summary }}

36 | {{ news_item.publish_date }} 37 |
38 |
39 |
40 | {% endfor %} 41 |
42 |
43 |
44 | {% endblock %} 45 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/notice.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/sandbox/news/news_base.html" %} 2 | {% load static %} 3 | {% block content %} 4 |
5 |

推送通知设置

6 | 7 | 8 |
9 |

是否接收通知

10 |
11 | 12 | 15 |
16 |
17 | 18 | 19 |
20 |

选择您感兴趣的新闻类型

21 |
22 | 23 | 26 |
27 |
28 | 29 | 32 |
33 |
34 | 35 | 38 |
39 |
40 | 41 | 44 |
45 |
46 | 47 | 48 |
49 |

选择订阅套餐

50 |
51 | 52 | 57 |
58 |
59 | 60 | 61 |
62 | 63 |
64 |
65 | 66 | {% endblock %} 67 | -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/technology.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/templates/topics/sandbox/news/technology.html -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/news/web3.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/templates/topics/sandbox/news/web3.html -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/wallpaper/wallpaper.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/templates/topics/sandbox/wallpaper/wallpaper.html -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/wallpaper/wallpaper_base.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/templates/topics/sandbox/wallpaper/wallpaper_base.html -------------------------------------------------------------------------------- /topics/templates/topics/sandbox/wallpaper/wallpaper_header.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/templates/topics/sandbox/wallpaper/wallpaper_header.html -------------------------------------------------------------------------------- 
/topics/templates/topics/solutions.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {% load static %} 3 | {% block title %}题解{% endblock %} 4 | 5 | {% block answer %} 6 | {% endblock %} 7 | 8 | {% block content %} 9 |
10 | ... 11 |
12 | 题解 13 |
14 |
15 | {% endblock %} -------------------------------------------------------------------------------- /topics/templates/topics/tools/encode.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cpython666/LearnSpider/c86901bb0e6066fe9f60030c861b24828d35e0f7/topics/templates/topics/tools/encode.html -------------------------------------------------------------------------------- /topics/templates/topics/views/encode.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | {#{% block charset %} {% endblock %}#} 3 | {#{% block charset %} {% endblock %}#} 4 | {#{% block charset %} {% endblock %}#} 5 | {% block title %} 6 | 网页编码 7 | {% endblock %} 8 | {% block question %} 9 |

您好🦆, 爬虫er~👋

10 |

11 | 尝试用requests请求这个页面,查看结果是否有什么不一样呢?
12 | 尝试解决它!【可以看教程的奥】
13 | 请回答:【解决了】 14 |

15 | {% endblock %} 16 | 17 | {% block content %} 18 |

有的网页编码是gbk,有的是utf-8,也有的是一些变种比如ISO-8859-1等等。当你的解码方法与网页的编码方法不同的时候,就会产生乱码。所以,如何判断知道网页的编码呢?

19 | {% endblock %} 20 | -------------------------------------------------------------------------------- /topics/templates/topics/views/hello-spider.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | 4 | {% block title %} 5 | Hello,爬虫! 6 | {% endblock %} 7 | {% block question %} 8 |

您好🦆, 爬虫er~👋

9 |

10 | 请问,这个页面有多少个:《Hello, Spider~》?【ps:题目中的这个不算!】 11 | 12 |

13 | {% endblock %} 14 | 15 | {% block content %} 16 | {% for greeting, button_class in greeting_buttons %} 17 | 18 | {% endfor %} 19 | {% endblock %} 20 | -------------------------------------------------------------------------------- /topics/templates/topics/views/request-twice-cookie.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Request-Twice 7 | 8 | 9 | 10 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /topics/templates/topics/views/request-twice.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | 4 | {% block title %} 5 | cookie反爬-pro 6 | {% endblock %} 7 | {% block question %} 8 |

您好🦆, 爬虫er~👋

9 |

10 | 请问,你用代码获取到这个页面的源代码了吗? 11 |

12 | {% endblock %} 13 | 14 | {% block content %} 15 | {% for greeting, button_class in greeting_buttons %} 16 | 17 | {% endfor %} 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /topics/templates/topics/views/table.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | 表格求和 5 | {% endblock %} 6 | {% block question %} 7 |

您好🦆, 爬虫er~👋

8 |

9 | 请问,表格中所有数字的总和为多少?【ctrl+f搜不到了吧,哈哈😂】 10 | 11 |

12 | {% endblock %} 13 | 14 | {% block content %} 15 | 16 |
17 |

表格求和

18 | 19 | 20 | 21 | {% for col in table_data.0 %} 22 | 23 | {% endfor %} 24 | 25 | 26 | 27 | {% for row in table_data %} 28 | 29 | {% for cell in row %} 30 | 31 | {% endfor %} 32 | 33 | {% endfor %} 34 | 35 |
列{{ forloop.counter }}
{{ cell }}
36 |
37 | {% endblock %} 38 | -------------------------------------------------------------------------------- /topics/templates/topics/views/ua.html: -------------------------------------------------------------------------------- 1 | {% extends "topics/base.html" %} 2 | 3 | {% block title %} 4 | UA检测 5 | {% endblock %} 6 | 7 | {% block question %} 8 |

您好🦆, 爬虫er~👋

9 |

10 | 请问,你用代码获取到这个页面的源代码了吗?[老实点🥸]
11 | 请回答:【获取到了】 12 |

13 | {% endblock %} 14 | 15 | {% block content %} 16 |

小新第一次请求之后成功获取了网页的源代码,虽然不知道网络的各种原理,但是还是很有成就感的,只是...这就是爬虫吗,就这么简单?尝试下这个网页吧!

17 | {% endblock %} 18 | -------------------------------------------------------------------------------- /topics/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /topics/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path 2 | from . import views 3 | from django.urls import path 4 | from .views import topic_view 5 | 6 | urlpatterns = [ 7 | path("", views.index, name="index"), 8 | path("list", views.list, name="list"), 9 | path("tools", views.tools, name="tools"), 10 | path("shorthand", views.shorthand, name="shorthand"), 11 | path("solutions", views.solutions, name="solutions"), 12 | # ------------------ 试金场 ----------------- 13 | path("sandbox/", views.sandbox, name="sandbox"), 14 | path("sandbox/news/", views.sandbox_news, name="sandbox_news"), 15 | path("sandbox/news/search/", views.sandbox_news, name="sandbox_news_search"), 16 | path( 17 | "sandbox/news/about/", views.sandbox_news_about_us, name="sandbox_news_about_us" 18 | ), # 关于我们页面 19 | path("sandbox/news/notice/", views.sandbox_news_notice, name="sandbox_news_notice"), 20 | path( 21 | "sandbox/news/category/", 22 | views.sandbox_news_category, 23 | name="sandbox_news_category", 24 | ), 25 | path( 26 | "sandbox/news/category//", 27 | views.sandbox_news_category_detail, 28 | name="sandbox_news_category_detail", 29 | ), 30 | # 详情页面路由 31 | path( 32 | "sandbox/news/source//", 33 | views.sandbox_news_source_detail, 34 | name="sandbox_news_source_detail", 35 | ), 36 | # path('news/category//', views.category_detail, name='category_detail'), 37 | path("sandbox/news/hot/", views.sandbox_news_hot, name="sandbox_news_hot"), 38 | path( 39 | "sandbox/news/hot//", 40 | views.sandbox_news_hot_detail, 41 | name="sandbox_news_hot_detail", 42 | ), 43 | 
path("sandbox/news/category/technology/", views.sandbox_news, name="sandbox_news"), 44 | path("sandbox/news/category/web3/", views.sandbox_news, name="sandbox_news"), 45 | path( 46 | "sandbox/news/news_detail//", 47 | views.sandbox_news_detail, 48 | name="sandbox_news_detail", 49 | ), 50 | # ------------------------------------------ 51 | # topic开头 重定向到视图返回 /html pages静态页面类型 52 | path("page//", topic_view, name="topic_view"), 53 | # ------------------------------------------ 54 | # 请求视图类型 55 | path("view/hello-spider/", views.hello_spider, name="request_twice"), 56 | path("view/request-twice/", views.request_twice, name="request-twice"), 57 | path("view/ua/", views.ua, name="ua"), 58 | path("view/encode/", views.encode_page, name="encode"), 59 | path("view/table/", views.table, name="table"), 60 | # 混合请求接口类型 61 | path("demo/", views.demo), 62 | path("demo1/", views.demo1), 63 | ] 64 | -------------------------------------------------------------------------------- /topics/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import get_object_or_404 2 | from datetime import datetime 3 | from .models import Topics, NewsPlatform, News, NewsCategory, NewsRequestHistory 4 | from .decorators import require_ua 5 | import time 6 | import random 7 | from django.shortcuts import render 8 | 9 | 10 | def demo(request): 11 | return render(request, "topics/pages/demo_get_server_time.html") 12 | 13 | 14 | def demo1(request): 15 | return render(request, "topics/pages/demo.html") 16 | 17 | 18 | def hello_spider(request): # random_greetings 19 | greetings = [] 20 | button_classes = [ 21 | "btn btn-primary", 22 | "btn btn-secondary", 23 | "btn btn-success", 24 | "btn btn-danger", 25 | "btn btn-warning", 26 | "btn btn-info", 27 | "btn btn-light", 28 | "btn btn-dark", 29 | "btn btn-link", 30 | ] 31 | # 随机生成 666 个 "Hello, Spider~" 32 | for _ in range(666): 33 | greetings.append("Hello, Spider~") 34 | # 随机生成 "你好~世界!" 
的数量(例如,随机 1 到 100 个) 35 | nihao_count = random.randint(1, 100) 36 | for _ in range(nihao_count): 37 | greetings.append("你好~世界!") 38 | # 随机生成 "Hello, World~" 的数量(例如,随机 1 到 100 个) 39 | hello_world_count = random.randint(1, 100) 40 | for _ in range(hello_world_count): 41 | greetings.append("Hello, World~") 42 | # 为每个 greeting 随机选择一个按钮样式 43 | greeting_buttons = [ 44 | (greeting, random.choice(button_classes)) for greeting in greetings 45 | ] 46 | # 打乱顺序 47 | random.shuffle(greeting_buttons) 48 | 49 | return render( 50 | request, 51 | "topics/views/hello-spider.html", 52 | {"greeting_buttons": greeting_buttons}, 53 | ) 54 | 55 | 56 | @require_ua 57 | def ua(request): 58 | return render(request, "topics/views/ua.html") 59 | 60 | 61 | def encode_page(request): 62 | response = render(request, "topics/views/encode.html") 63 | response["Content-Type"] = "text/html;" 64 | # response['Content-Type'] = 'text/html; charset=GB2312' 65 | # response['Content-Type'] = 'text/html;UTF-8' 66 | # response['Content-Type'] = 'text/html; charset=ISO-8859-1' 67 | return response 68 | 69 | 70 | def table(request): 71 | # 定义行数和列数,这里可以随机生成,或者根据你的需求来确定 72 | rows = random.randint(5, 10) 73 | cols = random.randint(5, 10) 74 | 75 | # 生成随机的表格数据,确保总和为666666 76 | total_sum = 666666 77 | table_data = [[0] * cols for _ in range(rows)] 78 | remaining_sum = total_sum 79 | 80 | for r in range(rows): 81 | for c in range(cols): 82 | if r == rows - 1 and c == cols - 1: 83 | table_data[r][c] = remaining_sum # 最后一个单元格填充剩余的数值 84 | else: 85 | # 确保 max_value 始终大于等于 1 86 | max_value = max(1, remaining_sum - (rows - r - 1) * (cols - c - 1)) 87 | value = random.randint(1, max_value) 88 | table_data[r][c] = value 89 | remaining_sum -= value 90 | 91 | context = {"table_data": table_data} 92 | return render(request, "topics/views/table.html", context) 93 | 94 | 95 | def request_twice(request): 96 | # get_content_or_script 97 | # 设定 Cookie 的过期时间为一秒 98 | # 考虑到以下几点,仍然进行过期时间的判断是一个更健壮的设计: 99 | # 浏览器行为不一致: 
不同浏览器可能在处理过期Cookie时有不同的行为,有些可能不会立即删除。 100 | # 用户行为不确定: 用户可能会手动修改浏览器时间,或者在极端情况下,浏览器可能不会及时删除过期的Cookie。 101 | # 潜在的安全问题: 不信任客户端数据的完整性始终是一个好的安全实践。 102 | # 因此,尽管浏览器应该删除过期的Cookie,后端进行过期时间的验证仍然是推荐的做法,以确保系统的可靠性和安全性。 103 | COOKIE_NAME = "timestamp" 104 | COOKIE_EXPIRATION = 1 # 秒 105 | # 读取 Cookie 106 | cookie_value = request.COOKIES.get(COOKIE_NAME) 107 | if cookie_value: 108 | try: 109 | # 验证 Cookie 是否过期 110 | cookie_timestamp = float(cookie_value) 111 | current_time = time.time() 112 | if current_time - cookie_timestamp <= COOKIE_EXPIRATION: 113 | # 如果 Cookie 仍然有效,返回 HTML 内容 114 | return render(request, "topics/views/request-twice.html") 115 | except ValueError: 116 | pass 117 | # 如果没有有效的 Cookie,返回 JavaScript 代码来设置 Cookie 118 | return render(request, "topics/views/request-twice-cookie.html") 119 | 120 | 121 | def index(request): 122 | return render(request, "topics/index/index.html") 123 | 124 | 125 | def list(request): 126 | return render(request, "topics/index/list.html") 127 | 128 | 129 | def tools(request): 130 | return render(request, "topics/index/tools.html") 131 | 132 | 133 | def sandbox(request): 134 | return render(request, "topics/index/sandbox.html") 135 | 136 | 137 | def sandbox_news(request): 138 | # 模拟一些假新闻数据 139 | latest_news = [ 140 | { 141 | "id": 1, 142 | "title": "新科技革命:AI 将重塑未来", 143 | "summary": "随着 AI 技术的飞速发展,未来的科技将发生翻天覆地的变化...", 144 | "publish_date": datetime.now().strftime("%Y-%m-%d"), 145 | }, 146 | { 147 | "id": 2, 148 | "title": "2025年全球互联网将迎来新变革", 149 | "summary": "在未来几年,全球互联网将经历一场前所未有的革命...", 150 | "publish_date": datetime.now().strftime("%Y-%m-%d"), 151 | }, 152 | { 153 | "id": 3, 154 | "title": "量子计算的突破性进展", 155 | "summary": "量子计算作为一种新型计算模式,正在逐步突破技术瓶颈...", 156 | "publish_date": datetime.now().strftime("%Y-%m-%d"), 157 | }, 158 | { 159 | "id": 4, 160 | "title": "5G网络加速全球数字化进程", 161 | "summary": "5G网络的普及正在改变全球通信格局,推动各行各业的数字化转型...", 162 | "publish_date": datetime.now().strftime("%Y-%m-%d"), 163 | }, 164 | { 165 | "id": 5, 166 | "title": 
"未来科技:机器人将进入家庭生活", 167 | "summary": "随着人工智能和机器人技术的发展,智能机器人正在进入普通家庭...", 168 | "publish_date": datetime.now().strftime("%Y-%m-%d"), 169 | }, 170 | ] 171 | 172 | return render( 173 | request, 174 | "topics/sandbox/news/news_index.html", 175 | {"latest_news": latest_news, "search": "/sandbox/news/search"}, 176 | ) 177 | 178 | 179 | # 模拟分类和新闻数据 180 | categories = [ 181 | { 182 | "id": 1, 183 | "char_name": "technology", 184 | "name": "科技", 185 | "news": [ 186 | { 187 | "id": 1, 188 | "title": "AI 的未来", 189 | "summary": "探索人工智能的最新发展...", 190 | "publish_date": "2025-02-13", 191 | }, 192 | { 193 | "id": 2, 194 | "title": "5G 网络的全球影响", 195 | "summary": "5G 网络带来的技术革新...", 196 | "publish_date": "2025-02-12", 197 | }, 198 | ], 199 | }, 200 | { 201 | "id": 2, 202 | "char_name": "happy", 203 | "name": "娱乐", 204 | "news": [ 205 | { 206 | "id": 3, 207 | "title": "明星动态:新电影发布", 208 | "summary": "最新电影上映,明星动态...", 209 | "publish_date": "2025-02-14", 210 | }, 211 | { 212 | "id": 4, 213 | "title": "2025年超级碗回顾", 214 | "summary": "今年超级碗的精彩瞬间...", 215 | "publish_date": "2025-02-10", 216 | }, 217 | ], 218 | }, 219 | { 220 | "id": 3, 221 | "char_name": "sport", 222 | "name": "体育", 223 | "news": [ 224 | { 225 | "id": 5, 226 | "title": "足球世界杯的传奇时刻", 227 | "summary": "回顾世界杯历史上的经典时刻...", 228 | "publish_date": "2025-02-11", 229 | }, 230 | { 231 | "id": 6, 232 | "title": "NBA 历史最佳球员排名", 233 | "summary": "NBA 球员排名持续更新...", 234 | "publish_date": "2025-02-09", 235 | }, 236 | ], 237 | }, 238 | { 239 | "id": 4, 240 | "char_name": "web3", 241 | "name": "Web3", 242 | "news": [ 243 | { 244 | "id": 7, 245 | "title": "Web3:去中心化互联网的崛起", 246 | "summary": "Web3 作为去中心化的互联网理念,正在改变许多行业...", 247 | "publish_date": "2025-02-15", 248 | }, 249 | { 250 | "id": 8, 251 | "title": "NFT 的未来:如何定义数字所有权", 252 | "summary": "NFT 已成为区块链中的一个重要领域,它带来了数字资产的革命...", 253 | "publish_date": "2025-02-14", 254 | }, 255 | { 256 | "id": 9, 257 | "title": "DeFi:去中心化金融的现状与未来", 258 | "summary": "DeFi 带来了无银行的金融模式,它能否挑战传统金融体系?", 259 | 
"publish_date": "2025-02-13", 260 | }, 261 | ], 262 | }, 263 | ] 264 | 265 | 266 | def sandbox_news_category(request): 267 | # 模拟数据:新闻来源平台和新闻类别 268 | sources = [ 269 | {"name": "抖音", "slug": "douyin"}, 270 | {"name": "B站", "slug": "bilibili"}, 271 | {"name": "知乎", "slug": "zhihu"}, 272 | ] 273 | # 模拟数据:新闻类别 274 | categories = [ 275 | {"name": "国际新闻", "slug": "international"}, 276 | {"name": "国内新闻", "slug": "domestic"}, 277 | {"name": "科技新闻", "slug": "technology"}, 278 | {"name": "体育新闻", "slug": "sports"}, 279 | {"name": "娱乐新闻", "slug": "entertainment"}, 280 | ] 281 | 282 | # 将数据传递到模板 283 | return render( 284 | request, 285 | "topics/sandbox/news/category.html", 286 | {"sources": sources, "categories": categories}, 287 | ) 288 | 289 | 290 | def sandbox_news_category_detail(request, slug): 291 | # 模拟数据:新闻类别详情 292 | categories_details = { 293 | "international": { 294 | "name": "国际新闻", 295 | "description": "全球范围内的新闻热点,聚焦国际局势。", 296 | }, 297 | "domestic": { 298 | "name": "国内新闻", 299 | "description": "关注本国的时事新闻,涵盖社会、政治、经济等各个方面。", 300 | }, 301 | "technology": { 302 | "name": "科技新闻", 303 | "description": "报道最新的科技趋势、创新产品和技术突破。", 304 | }, 305 | "sports": { 306 | "name": "体育新闻", 307 | "description": "关注体育赛事、运动员动态及全球体育新闻。", 308 | }, 309 | "entertainment": { 310 | "name": "娱乐新闻", 311 | "description": "报道娱乐圈的最新动态、明星资讯、影视作品等。", 312 | }, 313 | } 314 | 315 | category = categories_details.get(slug, {}) 316 | return render( 317 | request, "topics/sandbox/news/detail_category.html", {"category": category} 318 | ) 319 | 320 | 321 | def sandbox_news_source_detail(request, slug): 322 | # 模拟数据:来源平台详情 323 | sources_details = { 324 | "douyin": { 325 | "name": "抖音", 326 | "description": "抖音是一款短视频分享社交平台,用户可以发布和观看短视频。", 327 | }, 328 | "bilibili": { 329 | "name": "B站", 330 | "description": "B站是一家以二次元文化为主的在线视频平台,提供丰富的视频内容。", 331 | }, 332 | "zhihu": { 333 | "name": "知乎", 334 | "description": "知乎是一个知识分享和问答社区,汇集了大量专业内容和用户互动。", 335 | }, 336 | } 337 | 338 | source = sources_details.get(slug, {}) 339 
| print(source) 340 | return render(request, "topics/sandbox/news/detail_source.html", {"source": source}) 341 | 342 | 343 | def sandbox_news_hot(request): 344 | platforms = NewsPlatform.objects.all() 345 | # 第一次查询,获取所有数据行的 id 和 platform_id 346 | res = {} 347 | first_query_results = NewsRequestHistory.objects.order_by("-id").values( 348 | "id", "platform_id" 349 | )[: 2 * len(platforms)] 350 | for _ in first_query_results[::-1]: 351 | res[_["platform_id"]] = _["id"] 352 | # 使用 ids 查询真实数据 353 | latest_records = NewsRequestHistory.objects.filter(id__in=res.values()) 354 | id_news_mapping = {} 355 | # 将日志对象映射到新闻数据 356 | for log_obj in latest_records: 357 | id_news_mapping[log_obj.platform_id] = log_obj.response_data[:10] 358 | # 为每个平台添加新闻数据 359 | platform_news_mapping = {} 360 | for platform in platforms: 361 | platform_news_mapping[platform] = id_news_mapping.get(platform.id, []) 362 | # 将平台及其对应的新闻传递给模板 363 | return render( 364 | request, 365 | "topics/sandbox/news/news_hot.html", 366 | { 367 | "platform_news_mapping": platform_news_mapping, 368 | }, 369 | ) 370 | 371 | 372 | def sandbox_news_hot_detail(request, slug): 373 | platforms = NewsPlatform.objects.all() 374 | news = None 375 | if slug: 376 | selected_platform = NewsPlatform.objects.get(slug=slug) 377 | log_obj = ( 378 | NewsRequestHistory.objects.filter(platform_id=selected_platform.id) 379 | .order_by("-id") 380 | .first() 381 | ) 382 | news = log_obj.response_data 383 | else: 384 | selected_platform = None 385 | 386 | return render( 387 | request, 388 | "topics/sandbox/news/news_hot_detail.html", 389 | { 390 | "platforms": platforms, 391 | "selected_platform": selected_platform, 392 | "news_lst": news, 393 | }, 394 | ) 395 | 396 | 397 | def hot_news_detail_view(request, id): 398 | ... 
399 | # news = get_object_or_404(HotNews, id=id) 400 | # return render(request, 'hot_news_detail.html', {'news': news}) 401 | 402 | 403 | def sandbox_news_detail(request, id): 404 | # 假设根据id获取新闻,实际上只是返回假数据 405 | news_item = { 406 | "id": id, 407 | "title": f"新闻 {id} 详情", 408 | "content": "这是新闻的详细内容,更多的细节信息可以在这里展示。", 409 | } 410 | return render( 411 | request, "topics/sandbox/news/detail_news.html", {"news_item": news_item} 412 | ) 413 | 414 | 415 | def sandbox_news_about_us(request): 416 | return render(request, "topics/sandbox/news/about_us.html") 417 | 418 | 419 | def sandbox_news_notice(request): 420 | return render(request, "topics/sandbox/news/notice.html") 421 | 422 | 423 | def shorthand(request): 424 | return render(request, "topics/index/shorthand.html") 425 | 426 | 427 | def solutions(request): 428 | return render(request, "topics/solutions.html") 429 | 430 | 431 | def topic_view(request, response_path): 432 | # 根据 path 获取对应的题目 433 | topic = get_object_or_404(Topics, response_path=response_path) 434 | # 返回对应的 HTML 视图 435 | return render(request, "topics/pages/" + response_path + ".html", {"topic": topic}) 436 | 437 | 438 | def error404(request, exception): 439 | return render(request, "topics/404.html", status=404) 440 | --------------------------------------------------------------------------------