├── Backend ├── Backend │ ├── __init__.py │ ├── asgi.py │ ├── wsgi.py │ ├── urls.py │ └── settings.py ├── accounts │ ├── __init__.py │ ├── migrations │ │ ├── __init__.py │ │ ├── 0003_customuser_avator.py │ │ ├── 0004_auto_20211205_2048.py │ │ ├── 0002_auto_20211204_1703.py │ │ └── 0001_initial.py │ ├── urls.py │ ├── tests.py │ ├── admin.py │ ├── views.py │ ├── apps.py │ ├── serializers.py │ └── models.py ├── search_blogs │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── tests.py │ ├── admin.py │ ├── apps.py │ ├── urls.py │ ├── models.py │ └── views.py ├── static │ └── avator │ │ └── default.jpg └── manage.py ├── Crawler ├── Crawler │ ├── __init__.py │ ├── models │ │ ├── tempCodeRunnerFile.py │ │ └── es_blogs.py │ ├── spiders │ │ ├── __init__.py │ │ └── blog1.py │ ├── utils │ │ └── common.py │ ├── settings.py │ ├── items.py │ ├── middlewares.py │ └── pipelines.py └── scrapy.cfg ├── frontend ├── src │ ├── components │ │ ├── Footer.vue │ │ ├── ResultList │ │ │ ├── RelatedSearch.vue │ │ │ ├── SomeTips.vue │ │ │ ├── List.vue │ │ │ ├── DetailNav.vue │ │ │ ├── PageIndex.vue │ │ │ └── SearchBoxDetail.vue │ │ └── Home │ │ │ ├── Logo.vue │ │ │ ├── SampleNav.vue │ │ │ └── SearchBox.vue │ ├── assets │ │ ├── logo.png │ │ ├── img │ │ │ └── ava.jpg │ │ ├── logo.svg │ │ ├── empty.svg │ │ ├── userprofile.svg │ │ └── register.svg │ ├── App.vue │ ├── main.js │ ├── api │ │ ├── request.js │ │ └── index.js │ ├── views │ │ ├── Home.vue │ │ ├── activate.vue │ │ ├── ResultList.vue │ │ ├── UserProfile.vue │ │ ├── Login.vue │ │ └── Register.vue │ ├── store │ │ └── index.js │ └── router │ │ └── index.js ├── .browserslistrc ├── public │ ├── logo.png │ ├── js │ │ └── rem.js │ └── index.html ├── babel.config.js └── package.json ├── Engine ├── gerapy_auto_extractor │ ├── schemas │ │ ├── __init__.py │ │ └── tag.py │ ├── utils │ │ ├── __init__.py │ │ ├── helper.py │ │ ├── lcs.py │ │ ├── similarity.py │ │ ├── cluster.py │ │ └── preprocess.py │ ├── patterns │ │ ├── __init__.py │ │ ├── title.py │ │ └── datetime.py │ ├── helpers.py │ ├── __version__.py │ ├── settings.py │ ├── classifiers │ │ ├── models │ │ │ ├── list_model.pkl │ │ │ └── list_scaler.pkl │ │ ├── __init__.py │ │ ├── detail.py │ │ ├── base.py │ │ └── list.py │ ├── __init__.py │ └── extractors │ │ ├── __init__.py │ │ ├── base.py │ │ ├── datetime.py │ │ ├── content.py │ │ ├── title.py │ │ └── list.py ├── url_parser.py ├── pagerank.py └── html_extractor.py ├── requirements.txt ├── config.py ├── requirements_.txt ├── .vscode └── launch.json ├── README.md ├── .gitignore ├── draw ├── 功能分析.drawio └── 系统架构.drawio └── LICENSE /Backend/Backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Backend/accounts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Crawler/Crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Backend/search_blogs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/components/Footer.vue: -------------------------------------------------------------------------------- 1 | 
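The Engine/ directory in the tree above ships pagerank.py, and the README at the end of this listing names PageRank as one of the project's ranking algorithms. The repository's own implementation is not reproduced in this dump; purely as an illustration of the idea, here is a minimal power-iteration sketch over a toy link graph (assumed parameters: damping factor 0.85, uniform teleport, dangling pages redistribute their score evenly).

```python
# Illustrative sketch only -- not Engine/pagerank.py. Assumptions: damping 0.85,
# uniform teleport term, dangling pages spread their rank over all pages.
def pagerank(links, damping=0.85, iterations=50):
    """links: dict mapping each page to the list of pages it links out to."""
    pages = set(links) | {p for outs in links.values() for p in outs}
    rank = {p: 1.0 / len(pages) for p in pages}
    for _ in range(iterations):
        new_rank = {p: (1.0 - damping) / len(pages) for p in pages}
        for page in pages:
            targets = links.get(page) or list(pages)  # dangling page: spread evenly
            share = damping * rank[page] / len(targets)
            for target in targets:
                new_rank[target] += share
        rank = new_rank
    return rank


if __name__ == '__main__':
    demo = {'a': ['b', 'c'], 'b': ['c'], 'c': ['a']}
    print(pagerank(demo))  # pages with more incoming links end up with higher scores
```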
-------------------------------------------------------------------------------- /Backend/accounts/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Backend/search_blogs/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/patterns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/RelatedSearch.vue: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Crawler/Crawler/models/tempCodeRunnerFile.py: -------------------------------------------------------------------------------- 1 | Completion -------------------------------------------------------------------------------- /frontend/.browserslistrc: -------------------------------------------------------------------------------- 1 | > 1% 2 | last 2 versions 3 | not dead 4 | -------------------------------------------------------------------------------- /Backend/accounts/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path 2 | 3 | 4 | urlpatterns = [] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/requirements.txt -------------------------------------------------------------------------------- /Backend/accounts/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /Backend/accounts/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /Backend/accounts/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /Backend/search_blogs/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | -------------------------------------------------------------------------------- /Backend/search_blogs/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/helpers.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.utils.helper import jsonify, content 2 | -------------------------------------------------------------------------------- /frontend/public/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/frontend/public/logo.png -------------------------------------------------------------------------------- /frontend/src/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/frontend/src/assets/logo.png -------------------------------------------------------------------------------- /frontend/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [ 3 | '@vue/cli-plugin-babel/preset' 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /frontend/src/assets/img/ava.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/frontend/src/assets/img/ava.jpg -------------------------------------------------------------------------------- /Backend/static/avator/default.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/Backend/static/avator/default.jpg -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 1, '2') 2 | 3 | version = __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/schemas/tag.py: -------------------------------------------------------------------------------- 1 | SCRIPT = 'script' 2 | STYLE = 'style' 3 | P = 'p' 4 | BODY = 'body' 5 | HEAD = 'head' 6 | -------------------------------------------------------------------------------- /Backend/accounts/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class AccountsConfig(AppConfig): 5 | name = 'accounts' 6 | -------------------------------------------------------------------------------- /Backend/search_blogs/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SearchBlogsConfig(AppConfig): 5 | name = 'search_blogs' 6 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/settings.py: -------------------------------------------------------------------------------- 1 | import environs 2 | 3 | env = environs.Env() 4 | env.read_env() 5 | 6 | APP_DEBUG = env.bool('APP_DEBUG', False) 7 | 
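APP_DEBUG above is read from the process environment (env.read_env() also loads a local .env file if present), and the package __init__ further down in this listing uses it to raise the priority of the custom 'inspect' log level. A minimal sketch of toggling it, assuming Engine/ is on PYTHONPATH:

```python
# Sketch only: the value could equally come from a line `APP_DEBUG=true` in a .env file.
import os

os.environ['APP_DEBUG'] = 'true'   # environs' env.bool() treats 'true'/'1'/'yes' as True

from gerapy_auto_extractor.settings import APP_DEBUG

print(APP_DEBUG)  # True -> the 'inspect' log level in gerapy_auto_extractor/__init__.py becomes active
```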
-------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/models/list_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/Engine/gerapy_auto_extractor/classifiers/models/list_model.pkl -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/models/list_scaler.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/Engine/gerapy_auto_extractor/classifiers/models/list_scaler.pkl -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | REDIS_HOST = "127.0.0.1" 2 | REDIS_PASSWORD = 123456 3 | ES_HOST = "xxxx" 4 | # ES_HOST = "localhost:9200" 5 | 6 | MYSQL_HOST = "xxxx" 7 | MYSQL_DBNAME = "xiusearch" 8 | MYSQL_USER = "justin3go" 9 | MYSQL_PASSWORD = "xxxx" -------------------------------------------------------------------------------- /Backend/search_blogs/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path 2 | from .import views 3 | from django.conf.urls import url 4 | 5 | 6 | urlpatterns = [ 7 | url('search/$', views.SearchView.as_view()), 8 | url('search/suggest', views.SearchSuggest.as_view()) 9 | ] -------------------------------------------------------------------------------- /Crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Crawler 12 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/patterns/title.py: -------------------------------------------------------------------------------- 1 | METAS = [ 2 | '//meta[starts-with(@property, "og:title")]/@content', 3 | '//meta[starts-with(@name, "og:title")]/@content', 4 | '//meta[starts-with(@property, "title")]/@content', 5 | '//meta[starts-with(@name, "title")]/@content', 6 | '//meta[starts-with(@property, "page:title")]/@content', 7 | ] 8 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.settings import APP_DEBUG 2 | from gerapy_auto_extractor.extractors.content import extract_content 3 | from gerapy_auto_extractor.extractors.title import extract_title 4 | from gerapy_auto_extractor.extractors.datetime import extract_datetime 5 | from gerapy_auto_extractor.extractors.list import extract_list 6 | 
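classifiers/__init__.py above pulls the extractor entry points into one place; combined with extract_detail and is_detail exported from the package root (both shown later in this listing), typical usage looks roughly like the sketch below. Assumptions: Engine/ is on PYTHONPATH and page.html is a placeholder name for a locally saved copy of a crawled blog page.

```python
# Usage sketch, not part of the repository; 'page.html' is a hypothetical input file.
from gerapy_auto_extractor import extract_detail, is_detail
from gerapy_auto_extractor.utils.helper import content

html = content('page.html')        # helper shown later in this listing: reads the file as utf-8 text

if is_detail(html):                # classifier that separates detail pages from list pages
    detail = extract_detail(html)  # -> {'title': ..., 'datetime': ..., 'content': ...}
    print(detail['title'])
```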
-------------------------------------------------------------------------------- /requirements_.txt: -------------------------------------------------------------------------------- 1 | django==3.1.13 2 | djangorestframework==3.12.4 3 | djoser==2.1.0 4 | drf-yasg==1.20.0 5 | redis==3.5.3 6 | scrapy==2.5.1 7 | elasticsearch==7.15.2 8 | elasticsearch-dsl==7.4.0 9 | fake-useragent==0.1.11 10 | coreapi==2.3.3 11 | django-cors-headers==3.10.0 12 | djangorestframework-simplejwt==4.8.0 13 | jieba 14 | pymysql 15 | lxml 16 | beautifulsoup4 17 | PyJWT==2.1.0 -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Django", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${workspaceFolder}\\Backend\\manage.py", 12 | "args": [ 13 | "runserver" 14 | ], 15 | "django": true 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /Backend/Backend/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for Backend project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.1/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Backend.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /Backend/Backend/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for Backend project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.1/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Backend.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /Backend/accounts/serializers.py: -------------------------------------------------------------------------------- 1 | from djoser.serializers import UserCreateSerializer,UserSerializer 2 | from django.contrib.auth import get_user_model 3 | 4 | User = get_user_model() 5 | 6 | 7 | class MyUserCreateSerializer(UserCreateSerializer): 8 | class Meta(UserCreateSerializer.Meta): 9 | model = User 10 | fields = ("id", "email", "username", "password") 11 | 12 | class MyUserSerializer(UserSerializer): 13 | class Meta: 14 | model = User 15 | fields = ("id", "email", "username", "avator") -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/helper.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def jsonify(data): 5 | """ 6 | format the output data 7 | :param data: 8 | :return: 9 | """ 10 | return json.dumps(data, indent=2, ensure_ascii=False, default=str) 11 | 12 | 13 | def content(file_path, encoding='utf-8'): 14 | """ 15 | get content of html file 16 | :param encoding: file encoding 17 | :param file_path: 18 | :return: 19 | """ 20 | with open(file_path, encoding=encoding) as f: 21 | return f.read() -------------------------------------------------------------------------------- /Backend/accounts/migrations/0003_customuser_avator.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-12-04 13:44 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('accounts', '0002_auto_20211204_1703'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='customuser', 15 | name='avator', 16 | field=models.CharField(default='/static/avator/default.png', max_length=255), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /Backend/accounts/migrations/0004_auto_20211205_2048.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-12-05 12:48 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('accounts', '0003_customuser_avator'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='customuser', 15 | name='avator', 16 | field=models.CharField(default='/static/avator/default.jpg', max_length=255), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /Backend/accounts/migrations/0002_auto_20211204_1703.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-12-04 09:03 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('accounts', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='customuser', 15 | name='first_name', 16 | ), 17 | migrations.RemoveField( 18 | 
model_name='customuser', 19 | name='last_name', 20 | ), 21 | ] 22 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.settings import APP_DEBUG 2 | from gerapy_auto_extractor.extractors import extract_detail, extract_list, extract_datetime, extract_content, \ 3 | extract_title 4 | from gerapy_auto_extractor.classifiers.list import is_list, probability_of_list 5 | from gerapy_auto_extractor.classifiers.detail import is_detail, probability_of_detail 6 | from loguru import logger 7 | 8 | try: 9 | logger.level('inspect', no=100000 if APP_DEBUG else 0, color='') 10 | except (ValueError, TypeError): 11 | pass 12 | -------------------------------------------------------------------------------- /Crawler/Crawler/utils/common.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import redis 3 | import re 4 | import hashlib 5 | import sys 6 | sys.path.append("C:\My_app\code\咻Search") 7 | from config import REDIS_HOST, REDIS_PASSWORD 8 | 9 | def real_time_count(key, init): 10 | redis_cli = redis.Redis(host=REDIS_HOST, password=REDIS_PASSWORD) 11 | if redis_cli.get(key): 12 | count = pickle.loads(redis_cli.get(key)) 13 | count = count + 1 14 | count = pickle.dumps(count) 15 | redis_cli.set(key, count) 16 | else: 17 | count = pickle.dumps(init) 18 | redis_cli.set(key, count) -------------------------------------------------------------------------------- /frontend/src/App.vue: -------------------------------------------------------------------------------- 1 | 12 | 13 | 30 | -------------------------------------------------------------------------------- /frontend/src/main.js: -------------------------------------------------------------------------------- 1 | import router from './router' 2 | import store from './store' 3 | import { createApp } from 'vue' 4 | import ElementPlus from 'element-plus' 5 | import 'element-plus/dist/index.css' 6 | import App from './App.vue' 7 | 8 | const app = createApp(App) 9 | 10 | app.use(store) 11 | app.use(router) 12 | app.use(ElementPlus) 13 | app.mount('#app') 14 | 15 | 16 | // 跳转界面后到顶部 17 | router.beforeEach((to, from, next) => { 18 | // chrome 19 | document.body.scrollTop = 0 20 | // firefox 21 | document.documentElement.scrollTop = 0 22 | // safari 23 | window.pageYOffset = 0 24 | next() 25 | }) 26 | 27 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.extractors.content import extract_content 2 | from gerapy_auto_extractor.extractors.title import extract_title 3 | from gerapy_auto_extractor.extractors.datetime import extract_datetime 4 | from gerapy_auto_extractor.extractors.list import extract_list 5 | 6 | 7 | def extract_detail(html): 8 | """ 9 | extract detail information 10 | :param html: 11 | :return: 12 | """ 13 | return { 14 | 'title': extract_title(html), 15 | 'datetime': extract_datetime(html), 16 | 'content': extract_content(html) 17 | } 18 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/lcs.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | 3 | 4 | def lcs_of_2(a, b): 5 | 
""" 6 | get longest common string 7 | :param a: 8 | :param b: 9 | :return: 10 | """ 11 | match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b)) 12 | return a[match[0]: match[0] + match[2]] 13 | 14 | 15 | def lcs_of_list(*args): 16 | """ 17 | get longest common string of list 18 | :param args: 19 | :return: 20 | """ 21 | if len(args) == 2: 22 | return lcs_of_2(args[0], args[1]) 23 | first = args[0] 24 | remains = args[1:] 25 | return lcs_of_2(first, lcs_of_list(*remains)) 26 | -------------------------------------------------------------------------------- /frontend/src/api/request.js: -------------------------------------------------------------------------------- 1 | import axios from 'axios' 2 | import store from '@/store' 3 | 4 | axios.defaults.timeout = 10000; 5 | axios.defaults.headers.post['Content-Type'] = 'application/x-www-form-urlencoded;charset=UTF-8;multipart/form-data'; 6 | 7 | // 添加请求拦截器,在请求头中加token 8 | axios.interceptors.request.use( 9 | config => { 10 | console.log("store.state.Jwt: ", store.state.Jwt) 11 | if (store.state.Jwt != '') { 12 | console.log("将token添加进入请求头之中...") 13 | config.headers.Authorization = 'JWT ' + store.state.Jwt.access; 14 | } 15 | return config; 16 | }, 17 | error => { 18 | return Promise.reject(error); 19 | }); 20 | 21 | export default axios; -------------------------------------------------------------------------------- /frontend/public/js/rem.js: -------------------------------------------------------------------------------- 1 | function remSize(){ 2 | // 获取屏幕的宽度 3 | var deviceWidth = document.documentElement.clientWidth || window.innerWidth; 4 | // 限制屏幕的宽度 5 | if(deviceWidth >= 750){ 6 | deviceWidth = 750 7 | } 8 | if(deviceWidth <= 320){ 9 | deviceWidth = 320 10 | } 11 | document.documentElement.style.fontSize = (deviceWidth/7.5) + 'px' 12 | // 设计稿是750px 13 | // 设置一半的宽度,那么就是375px 14 | // 1rem == 100px的设计稿宽度 15 | // 表达一半的宽度就是3.75rem 16 | 17 | // 设置字体大小 18 | document.querySelector('body').style.fontSize = 0.16 + 'rem' 19 | } 20 | 21 | remSize(); 22 | // 当窗口发生变化时我们也调用一下这个函数 23 | window.onresize = function(){ 24 | remSize(); 25 | } -------------------------------------------------------------------------------- /frontend/src/components/ResultList/SomeTips.vue: -------------------------------------------------------------------------------- 1 | 9 | 10 | 19 | 20 | -------------------------------------------------------------------------------- /frontend/src/views/Home.vue: -------------------------------------------------------------------------------- 1 | 9 | 10 | 27 | 28 | 32 | -------------------------------------------------------------------------------- /Backend/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Backend.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 
17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/detail.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.classifiers.list import probability_of_list 2 | 3 | 4 | def probability_of_detail(html, **kwargs): 5 | """ 6 | get probability of detail page 7 | :param html: 8 | :param kwargs: other kwargs 9 | :return: 10 | """ 11 | return 1 - probability_of_list(html, **kwargs) 12 | 13 | 14 | def is_detail(html, threshold=0.5, **kwargs): 15 | """ 16 | judge if this page is detail page 17 | :param html: source of html 18 | :param threshold: 19 | :param kwargs: 20 | :return: 21 | """ 22 | _probability_of_detail = probability_of_detail(html, **kwargs) 23 | if _probability_of_detail > threshold: 24 | return True 25 | return False 26 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "serve": "vue-cli-service serve", 7 | "build": "vue-cli-service build" 8 | }, 9 | "dependencies": { 10 | "@trevoreyre/autocomplete-vue": "^2.2.0", 11 | "axios": "^0.24.0", 12 | "core-js": "^3.6.5", 13 | "element-plus": "^1.2.0-beta.6", 14 | "vue": "^3.0.0", 15 | "vue-router": "^4.0.0-0", 16 | "vuex": "^4.0.0-0" 17 | }, 18 | "devDependencies": { 19 | "@vue/cli-plugin-babel": "~4.5.0", 20 | "@vue/cli-plugin-router": "~4.5.0", 21 | "@vue/cli-plugin-vuex": "~4.5.0", 22 | "@vue/cli-service": "~4.5.0", 23 | "@vue/compiler-sfc": "^3.0.0", 24 | "less": "^3.0.4", 25 | "less-loader": "^5.0.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /frontend/src/views/activate.vue: -------------------------------------------------------------------------------- 1 | 4 | 5 | 30 | -------------------------------------------------------------------------------- /Crawler/Crawler/models/es_blogs.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_dsl import connections, Document, Keyword, Text, Integer, Date, Completion, analyzer, Float 2 | import sys 3 | sys.path.append("C:\My_app\code\咻Search") 4 | from config import ES_HOST 5 | 6 | connections.create_connection(hosts=[ES_HOST]) 7 | 8 | my_analyzer = analyzer('ik_smart') 9 | 10 | 11 | class BlogsIndex(Document): 12 | suggest = Completion(analyzer=my_analyzer) 13 | page_url = Keyword() 14 | title = Text(analyzer="ik_max_word") 15 | keywords = Text(analyzer="ik_max_word") 16 | description = Text(analyzer="ik_max_word") 17 | content = Text(analyzer="ik_max_word") 18 | PR = Float() 19 | publish_time = Date() 20 | 21 | class Index: 22 | name = 'blogs' 23 | 24 | 25 | if __name__ == "__main__": 26 | BlogsIndex.init() 27 | -------------------------------------------------------------------------------- /Backend/search_blogs/models.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_dsl import Text, Date, Keyword, Integer, Document, Completion, Double, Float 2 | from elasticsearch_dsl.connections import connections 3 | from elasticsearch_dsl import analyzer 4 | import sys 5 | sys.path.append("C:/My_app/code/咻Search") 6 | from config import ES_HOST 7 | 8 | 9 | # 
Create your models here. 10 | connections.create_connection(hosts=ES_HOST) 11 | 12 | my_analyzer = analyzer('ik_smart') 13 | 14 | class BlogsIndex(Document): 15 | suggest = Completion(analyzer=my_analyzer) 16 | title = Text(analyzer="ik_max_word") 17 | keywords = Text(analyzer="ik_max_word") 18 | description = Text(analyzer="ik_max_word") 19 | content = Text(analyzer="ik_max_word") 20 | PR = Float() 21 | publish_time = Date() 22 | 23 | class Index: 24 | name = 'blogs' 25 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/base.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from gerapy_auto_extractor.schemas.element import Element 3 | 4 | 5 | class BaseClassifier(object): 6 | 7 | def process(self, element: Element): 8 | """ 9 | you must implement this method in child class 10 | :param element: 11 | :return: 12 | """ 13 | raise NotImplementedError 14 | 15 | def classify(self, html, **kwargs): 16 | """ 17 | base extract method, firstly, it will convert html to WebElement, then it call 18 | process method that child class implements 19 | :param html: 20 | :return: 21 | """ 22 | self.kwargs = kwargs 23 | element = fromstring(html=html) 24 | element.__class__ = Element 25 | return self.process(element) 26 | -------------------------------------------------------------------------------- /Backend/Backend/urls.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from django.urls import path 3 | from django.urls.conf import include, re_path 4 | 5 | # DRF YASG 6 | from rest_framework import permissions 7 | from drf_yasg.views import get_schema_view 8 | from drf_yasg import openapi 9 | 10 | schema_view = get_schema_view( 11 | openapi.Info( 12 | title="XiuSearch API", 13 | default_version="v1", 14 | description="XiuSearch的接口文档......", 15 | contact=openapi.Contact(email="justin3go@foxmail.com"), 16 | license=openapi.License(name="BSD License"), 17 | ), 18 | public=True, 19 | permission_classes=(permissions.AllowAny,), 20 | ) 21 | 22 | urlpatterns = [ 23 | path("admin/", admin.site.urls), 24 | re_path( 25 | r"^api/v1/docs/$", 26 | schema_view.with_ui("swagger", cache_timeout=0), 27 | name="schema-swagger-ui", 28 | ), 29 | path("api/v1/", include("accounts.urls")), 30 | path("api/v1/", include("djoser.urls")), 31 | path("api/v1/", include("djoser.urls.jwt")), 32 | path("api/v1/", include("search_blogs.urls")), 33 | ] -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/similarity.py: -------------------------------------------------------------------------------- 1 | import distance 2 | 3 | 4 | def similarity1(s1, s2): 5 | """ 6 | get similarity of two strings 7 | :param s1: 8 | :param s2: 9 | :return: 10 | """ 11 | if not s1 or not s2: 12 | return 0 13 | edit_distance = distance.levenshtein(s1, s2) 14 | similarity_score = 1 - edit_distance / (len(s1) + len(s2)) 15 | return similarity_score 16 | 17 | 18 | def similarity2(s1, s2): 19 | """ 20 | get similarity of two strings 21 | :param s1: 22 | :param s2: 23 | :return: 24 | """ 25 | if not s1 or not s2: 26 | return 0 27 | s1_set = set(list(s1)) 28 | s2_set = set(list(s2)) 29 | intersection = s1_set.intersection(s2_set) 30 | union = s1_set.union(s2_set) 31 | return len(intersection) / len(union) 32 | 33 | 34 | def similarity(s1, s2): 35 | """ 36 | get similarity of
two strings 37 | :param s1: 38 | :param s2: 39 | :return: 40 | """ 41 | return similarity2(s1, s2) 42 | 43 | 44 | if __name__ == '__main__': 45 | s1 = 'hello' 46 | s2 = 'world' 47 | print(similarity(s1, s2)) 48 | -------------------------------------------------------------------------------- /frontend/src/components/Home/Logo.vue: -------------------------------------------------------------------------------- 1 | 15 | 16 | 53 | -------------------------------------------------------------------------------- /frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | XiuSearch 10 | 11 | 12 |
13 | 14 | 15 | 36 | 37 | -------------------------------------------------------------------------------- /Crawler/Crawler/settings.py: -------------------------------------------------------------------------------- 1 | from fake_useragent import UserAgent 2 | import time 3 | import sys 4 | sys.path.append("C:/My_app/code/咻Search") 5 | 6 | 7 | BOT_NAME = 'Crawler' 8 | SPIDER_MODULES = ['Crawler.spiders'] 9 | NEWSPIDER_MODULE = 'Crawler.spiders' 10 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 11 | USER_AGENT = UserAgent().random 12 | # Obey robots.txt rules 13 | ROBOTSTXT_OBEY = True 14 | DOWNLOAD_DELAY = 0.5 15 | COOKIES_ENABLED = False 16 | ITEM_PIPELINES = { 17 | 'Crawler.pipelines.MysqlTwistedPipeline': 200, 18 | 'Crawler.pipelines.ElasticSearchPipeline': 300, 19 | } 20 | # Broad Crawls --广泛的爬取官网推荐的设置 21 | # 应用推荐的优先级队列 22 | SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue' 23 | # 增加并发--Scrapy 下载器将执行的最大并发(即同时)请求数。 24 | CONCURRENT_REQUESTS = 100 25 | # 增加 Twisted IO 线程池最大大小 26 | REACTOR_THREADPOOL_MAXSIZE = 20 27 | # 降低日志级别 28 | LOG_LEVEL = 'INFO' 29 | # 禁用 cookie 30 | COOKIES_ENABLED = False 31 | # 禁用重试 32 | RETRY_ENABLED = False 33 | # 减少下载超时 34 | DOWNLOAD_TIMEOUT = 15 35 | # 禁用重定向 36 | REDIRECT_ENABLED = False 37 | # 启用“Ajax 可抓取页面”的抓取 38 | AJAXCRAWL_ENABLED = True 39 | 40 | # My Settings 41 | # 爬行深度 42 | DEPTH_LIMIT = 10 43 | # log 44 | LOG_FILE = "all.log" 45 | now_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) 46 | JOBDIR="breakpoints/" + str(now_time) 47 | 48 | 49 | -------------------------------------------------------------------------------- /frontend/src/store/index.js: -------------------------------------------------------------------------------- 1 | import { createStore } from 'vuex' 2 | // 接口引入 3 | import { login, authorization } from '@/api/index.js' 4 | 5 | export default createStore({ 6 | state: { 7 | SearchValue: '', 8 | SearchResult: { 9 | hitList: [], 10 | }, 11 | Jwt: JSON.parse(localStorage.getItem("jwt")) || '', 12 | UserProfile: JSON.parse(localStorage.getItem("user")) || '', 13 | }, 14 | mutations: { 15 | SetSearchValue(state, value) { 16 | state.SearchValue = value 17 | // 同时保存到本地 18 | let historyList = JSON.parse(localStorage.getItem("historyList")) || []; 19 | // 不添加重复值及空值 20 | if (historyList.indexOf(value) == -1 && value) { 21 | historyList.push(value) 22 | localStorage.setItem("historyList", JSON.stringify(historyList)); 23 | } 24 | 25 | }, 26 | SetSearchResult(state, value) { 27 | state.SearchResult = value 28 | }, 29 | SetJwt(state, value) { 30 | console.log("将jwt提交到了vuex") 31 | state.Jwt = value 32 | localStorage.setItem("jwt", JSON.stringify(value)); 33 | }, 34 | SetUserProfile(state, value) { 35 | state.UserProfile = value 36 | localStorage.setItem("user", JSON.stringify(value)); 37 | }, 38 | }, 39 | actions: { 40 | // TODO 这些方法放在vuex里面有什么作用,其他地方直接调用api里面的不好吗 41 | }, 42 | modules: { 43 | } 44 | }) 45 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/base.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from loguru import logger 3 | from lxml.html import etree 4 | from gerapy_auto_extractor.schemas.element import Element 5 | 6 | 7 | class BaseExtractor(object): 8 | """ 9 | Base Extractor which provide common methods 10 | """ 11 | 12 | kwargs = None 13 | 14 | def to_string(self, element: Element, limit: int = None): 15 | """ 16 | 
convert element to string 17 | :param element: 18 | :param limit: 19 | :return: 20 | """ 21 | result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8') 22 | if limit: 23 | return result[:limit] 24 | return result 25 | 26 | def process(self, element: Element): 27 | """ 28 | process method that you should implement 29 | :param element: 30 | :return: 31 | """ 32 | logger.error('You must implement process method in your extractor.') 33 | return False # 随便返回一个,不然VSCODE无法识别语法,后面上线的时候再改,而且不改也不影响 34 | raise NotImplementedError 35 | 36 | def extract(self, html, **kwargs): 37 | """ 38 | base extract method, firstly, it will convert html to WebElement, then it call 39 | process method that child class implements 40 | :param html: 41 | :return: 42 | """ 43 | self.kwargs = kwargs 44 | element = fromstring(html=html) 45 | element.__class__ = Element 46 | return self.process(element) 47 | -------------------------------------------------------------------------------- /frontend/src/api/index.js: -------------------------------------------------------------------------------- 1 | import axios from './request.js' 2 | import store from '@/store' 3 | 4 | // let baseUrl = 'http://39.106.132.154:8000/api/v1' 5 | const ip = 'http://39.106.132.154:8000' 6 | // const ip = 'http://localhost:8000' 7 | const baseUrl = `${ip}/api/v1` 8 | 9 | 10 | export function getIP(){ 11 | return ip 12 | } 13 | 14 | //获取搜索结果 15 | export function getSearchResult(q, p) { 16 | return axios.get(`${baseUrl}/search?q=${q}&p=${p}`) 17 | } 18 | //根据输入的部分文本获取搜索建议 19 | export function getSearchSuggest(someText) { 20 | return axios.get(`${baseUrl}/search/suggest?input=${someText}`) 21 | } 22 | //登录 23 | export function login(email, password) { 24 | return axios.post(`${baseUrl}/jwt/create`, { 25 | "email": email, 26 | "password": password 27 | }) 28 | } 29 | //注册 30 | export function register(username, email, password, re_password) { 31 | return axios.post(`${baseUrl}/users/`, { 32 | "username": username, 33 | "email": email, 34 | "password": password, 35 | "re_password": re_password 36 | }) 37 | } 38 | //获取用户资料 39 | export function getUserProfile() { 40 | return axios.get(`${baseUrl}/users/me`) 41 | } 42 | //验证token是否失效 43 | export function authorization(token) { 44 | return axios.post(`${baseUrl}/jwt/verify`,{ 45 | "token": token, 46 | }) 47 | } 48 | //通过刷新token进行刷新 49 | export function refreshToken(refresh){ 50 | return axios.post(`${baseUrl}/jwt/refresh`,{ 51 | "refresh": refresh, 52 | }) 53 | } 54 | //激活账号 55 | export function activate(uid, token){ 56 | return axios.post(`${baseUrl}/users/activation/`,{ 57 | "uid": uid, 58 | "token": token 59 | }) 60 | } 61 | 62 | -------------------------------------------------------------------------------- /frontend/src/views/ResultList.vue: -------------------------------------------------------------------------------- 1 | 11 | 12 | 58 | 63 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/cluster.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.utils.similarity import similarity 2 | from collections import defaultdict 3 | 4 | 5 | def cluster(items, threshold=0.9): 6 | """ 7 | cluster names 8 | :param items: 9 | :param threshold: 10 | :return: cluster map, for example {"foo": 0, "bar": 1} 11 | """ 12 | number = -1 13 | clusters_map = {} 14 | clusters = [] 15 | for name in items: 16 | for c in clusters: 17 | if 
all(similarity(name, w) > threshold for w in c): 18 | c.append(name) 19 | clusters_map[name] = number 20 | break 21 | else: 22 | number += 1 23 | clusters.append([name]) 24 | clusters_map[name] = number 25 | return clusters_map 26 | 27 | 28 | def cluster_dict(data: dict, threshold=0.8): 29 | """ 30 | cluster dict, convert id key to cluster id key 31 | :param threshold: 32 | :param data: 33 | :return: 34 | """ 35 | ids = data.keys() 36 | clusters_map = cluster(ids, threshold) 37 | result = defaultdict(list) 38 | for k, v in data.items(): 39 | if isinstance(v, list): 40 | for i in v: 41 | result[clusters_map[k]].append(i) 42 | else: 43 | result[clusters_map[k]].append(v) 44 | return dict(result) 45 | 46 | 47 | if __name__ == '__main__': 48 | data = { 49 | '/html/body/div[@class="main"]/div[1]/ul': ['child1', 'child2', 'child3'], 50 | '/html/body/div[@class="main"]/div[2]/ul': ['child4', 'child5', 'child6'], 51 | '/html/body/div[@class="main"]/div[3]/ul': ['child7', 'child8', 'child9'], 52 | '/html/body/header/div[1]': ['child10', 'child11', 'child12'], 53 | '/html/body/header/div[2]': ['child13', 'child14', 'child15'], 54 | } 55 | print(cluster_dict(data, threshold=0.7)) 56 | -------------------------------------------------------------------------------- /Backend/accounts/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.contrib.auth.models import ( 3 | AbstractBaseUser, 4 | PermissionsMixin, 5 | BaseUserManager, 6 | ) 7 | 8 | 9 | class CustomUserManager(BaseUserManager): 10 | def create_user(self, email, username, password=None, **extra_fields): 11 | if not email: 12 | raise ValueError("User must have an email") 13 | email = self.normalize_email(email) 14 | user = self.model(email=email, username=username, **extra_fields) 15 | user.set_password(password) 16 | user.save(using=self._db) 17 | return user 18 | 19 | def create_superuser(self, username, email, password=None, **extra_fields): 20 | user = self.create_user( 21 | username, email, password=password, **extra_fields) 22 | user.is_active = True 23 | user.is_staff = True 24 | user.is_admin = True 25 | user.save(using=self._db) 26 | return user 27 | 28 | 29 | class CustomUser(AbstractBaseUser, PermissionsMixin): 30 | email = models.EmailField(max_length=255, unique=True) 31 | username = models.CharField(max_length=255, unique=True) 32 | # TODO 用户上传头像需要将图片路径修改为uid加头像 33 | avator = models.CharField( 34 | max_length=255, default='/static/avator/default.jpg') 35 | # first_name = models.CharField(max_length=255) 36 | # last_name = models.CharField(max_length=255) 37 | is_active = models.BooleanField(default=True) 38 | is_staff = models.BooleanField(default=False) 39 | is_admin = models.BooleanField(default=False) 40 | 41 | objects = CustomUserManager() 42 | 43 | USERNAME_FIELD = "email" 44 | REQUIRED_FIELDS = ["username"] 45 | 46 | def get_name(self): 47 | return self.username 48 | 49 | def has_perm(self, perm, obj=None): 50 | return True 51 | 52 | def has_module_perms(self, app_label): 53 | return True 54 | 55 | def __str__(self): 56 | return self.email 57 | -------------------------------------------------------------------------------- /Engine/url_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import parse 3 | # 这类词加分减分 4 | # 可以把这些url分词排序然后认为判断加入消极还是积极 5 | NEG_WORDS = ['user', 'list', 'author', 'comment','writer'] 6 | POS_WORDS = ['article', 'blog', 'details', 'question'] 7 | # 
参考https://help.aliyun.com/document_detail/65096.html 8 | # 还要记住匹配的时候不能区分大小写,同时匹配的时候也仅仅需要匹配url的最后四位就可以了 9 | # 这类词一票否决 10 | FILE_WORDS = ['.gif','.png','.bmp','.jpeg','.jpg', '.svg', 11 | '.mp3','.wma','.flv','.mp4','.wmv','.ogg','.avi', 12 | '.doc','.docx','.xls','.xlsx','.ppt','.pptx','.txt','.pdf', 13 | '.zip','.exe','.tat','.ico','.css','.js','.swf','.apk','.m3u8','.ts'] 14 | 15 | # 还有就是如果包含很长一串数字的一般都是内容界面 16 | # 不应该是各种文件名的后缀 17 | def is_static_url(url): 18 | ''' 19 | 20 | ''' 21 | for w in FILE_WORDS: 22 | if w in url[-5:]: 23 | return True 24 | 25 | return False 26 | 27 | # 暂时不用 28 | # 这个有点麻烦,先用别人实现的,自己后面再参考着来实现在我这种应用场景下的判断 29 | def is_content_url(url, threshold=0.4): 30 | ''' 31 | 判断一个url是否为内容界面,而不是列表界面或者主页又或者用户页等; 32 | param: 33 | url:传入的url; 34 | threshold:决定是否为内容界面的阈值; 35 | return: 36 | bool; 37 | ''' 38 | suffix = re.findall('[a-z]+', (url[-5:]).lower()) 39 | if len(suffix) != 0: 40 | if suffix[-1] in FILE_WORDS: 41 | return False 42 | score = 0 43 | if re.match("[0-9]"*10, url, flags=0) != None: 44 | score += 30 45 | for w in NEG_WORDS: 46 | if w in url: 47 | score -= 10 48 | for w in POS_WORDS: 49 | if w in url: 50 | score += 15 51 | if(score/(len(url)*2) >= threshold): 52 | return True 53 | else: 54 | return False 55 | 56 | # 暂时不用 57 | STOP_WORD = "javascript:" 58 | def url_filter(urls): 59 | cleaned_urls = [] 60 | for url in urls: 61 | if is_static_url(url): 62 | continue 63 | if STOP_WORD in url.lower(): 64 | continue 65 | cleaned_urls.append(url) 66 | return cleaned_urls 67 | -------------------------------------------------------------------------------- /Backend/accounts/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-10-25 09:45 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | initial = True 9 | 10 | dependencies = [ 11 | ('auth', '0012_alter_user_first_name_max_length'), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='CustomUser', 17 | fields=[ 18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 19 | ('password', models.CharField(max_length=128, verbose_name='password')), 20 | ('last_login', models.DateTimeField(blank=True, null=True, verbose_name='last login')), 21 | ('is_superuser', models.BooleanField(default=False, help_text='Designates that this user has all permissions without explicitly assigning them.', verbose_name='superuser status')), 22 | ('email', models.EmailField(max_length=255, unique=True)), 23 | ('username', models.CharField(max_length=255, unique=True)), 24 | ('first_name', models.CharField(max_length=255)), 25 | ('last_name', models.CharField(max_length=255)), 26 | ('is_active', models.BooleanField(default=True)), 27 | ('is_staff', models.BooleanField(default=False)), 28 | ('is_admin', models.BooleanField(default=False)), 29 | ('groups', models.ManyToManyField(blank=True, help_text='The groups this user belongs to. 
A user will get all permissions granted to each of their groups.', related_name='user_set', related_query_name='user', to='auth.Group', verbose_name='groups')), 30 | ('user_permissions', models.ManyToManyField(blank=True, help_text='Specific permissions for this user.', related_name='user_set', related_query_name='user', to='auth.Permission', verbose_name='user permissions')), 31 | ], 32 | options={ 33 | 'abstract': False, 34 | }, 35 | ), 36 | ] 37 | -------------------------------------------------------------------------------- /frontend/src/components/Home/SampleNav.vue: -------------------------------------------------------------------------------- 1 | 24 | 38 | 39 | 86 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/datetime.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dateparser import parse 3 | from lxml.html import HtmlElement 4 | from gerapy_auto_extractor.patterns.datetime import METAS_CONTENT, REGEXES 5 | from loguru import logger 6 | from gerapy_auto_extractor.extractors.base import BaseExtractor 7 | 8 | 9 | class DatetimeExtractor(BaseExtractor): 10 | """ 11 | Datetime Extractor which auto extract datetime info. 12 | """ 13 | 14 | def extract_by_regex(self, element: HtmlElement) -> str: 15 | """ 16 | extract datetime according to predefined regex 17 | :param element: 18 | :return: 19 | """ 20 | text = ''.join(element.xpath('.//text()')) 21 | for regex in REGEXES: 22 | result = re.search(regex, text) 23 | if result: 24 | return result.group(1) 25 | 26 | def extract_by_meta(self, element: HtmlElement) -> str: 27 | """ 28 | extract according to meta 29 | :param element: 30 | :return: str 31 | """ 32 | for xpath in METAS_CONTENT: 33 | datetime = element.xpath(xpath) 34 | if datetime: 35 | return ''.join(datetime) 36 | 37 | 38 | def process(self, element: HtmlElement): 39 | """ 40 | extract datetime 41 | :param html: 42 | :return: 43 | """ 44 | return self.extract_by_meta(element) or \ 45 | self.extract_by_regex(element) 46 | 47 | 48 | datetime_extractor = DatetimeExtractor() 49 | 50 | 51 | def parse_datetime(datetime): 52 | """ 53 | parse datetime using dateparser lib 54 | :param datetime: 55 | :return: 56 | """ 57 | if not datetime: 58 | return None 59 | try: 60 | return parse(datetime) 61 | except TypeError: 62 | logger.exception(f'Error Occurred while parsing datetime extracted. 
datetime is {datetime}') 63 | 64 | 65 | def extract_datetime(html, parse=True): 66 | """ 67 | extract datetime from html 68 | :param parse: 69 | :param html: 70 | :return: 71 | """ 72 | result = datetime_extractor.extract(html) 73 | if not parse: 74 | return result 75 | return parse_datetime(result) 76 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/content.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gerapy_auto_extractor.schemas.element import Element 3 | from gerapy_auto_extractor.utils.preprocess import preprocess4content_extractor 4 | from gerapy_auto_extractor.extractors.base import BaseExtractor 5 | from gerapy_auto_extractor.utils.element import descendants_of_body 6 | 7 | 8 | class ContentExtractor(BaseExtractor): 9 | """ 10 | extract content from detail page 11 | """ 12 | 13 | def process(self, element: Element): 14 | """ 15 | extract content from html 16 | :param element: 17 | :return: 18 | """ 19 | # preprocess 20 | preprocess4content_extractor(element) 21 | 22 | # start to evaluate every child element 23 | element_infos = [] 24 | descendants = descendants_of_body(element) 25 | 26 | # get std of density_of_text among all elements 27 | density_of_text = [descendant.density_of_text for descendant in descendants] 28 | density_of_text_std = np.std(density_of_text, ddof=1) 29 | 30 | # get density_score of every element 31 | for descendant in descendants: 32 | score = np.log(density_of_text_std) * \ 33 | descendant.density_of_text * \ 34 | np.log10(descendant.number_of_p_descendants + 2) * \ 35 | np.log(descendant.density_of_punctuation) 36 | descendant.density_score = score 37 | 38 | # sort element info by density_score 39 | descendants = sorted(descendants, key=lambda x: x.density_score, reverse=True) 40 | descendant_first = descendants[0] if descendants else None 41 | if descendant_first is None: 42 | return None 43 | paragraphs = descendant_first.xpath('.//p//text()') 44 | paragraphs = [paragraph.strip() if paragraph else '' for paragraph in paragraphs] 45 | paragraphs = list(filter(lambda x: x, paragraphs)) 46 | text = '\n'.join(paragraphs) 47 | text = text.strip() 48 | return text 49 | 50 | 51 | content_extractor = ContentExtractor() 52 | 53 | 54 | def extract_content(html): 55 | """ 56 | extract content from detail html 57 | :return: 58 | """ 59 | return content_extractor.extract(html) 60 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/List.vue: -------------------------------------------------------------------------------- 1 | 28 | 29 | 60 | 61 | 98 | -------------------------------------------------------------------------------- /frontend/src/router/index.js: -------------------------------------------------------------------------------- 1 | import { createRouter, createWebHistory } from 'vue-router' 2 | import Home from '../views/Home.vue' 3 | import store from '@/store/index.js' 4 | import { authorization, refreshToken } from "@/api/index.js"; 5 | 6 | const routes = [ 7 | { 8 | path: '/', 9 | name: 'Home', 10 | component: Home 11 | }, 12 | { 13 | path: '/search', 14 | name: 'search', 15 | component: () => import('../views/ResultList.vue') 16 | }, 17 | { 18 | path: '/login', 19 | name: 'login', 20 | component: () => import('../views/Login.vue') 21 | }, 22 | { 23 | path: '/register', 24 | name: 'register', 25 | component: () => 
import('../views/Register.vue') 26 | }, 27 | { 28 | path: '/userprofile', 29 | name: 'userprofile', 30 | component: () => import('../views/UserProfile.vue') 31 | }, 32 | { 33 | path: '/activate', 34 | name: 'activate', 35 | component: () => import('../views/activate.vue') 36 | } 37 | ] 38 | 39 | const router = createRouter({ 40 | history: createWebHistory(process.env.BASE_URL), 41 | routes 42 | }) 43 | // 注册全局前置守卫 44 | router.beforeEach(async (to, from, next) => { 45 | // 动态设置title 46 | // to.meta && setTitle(to.meta.title) 47 | // 获取token 48 | console.log() 49 | const access = store.state.Jwt.access || '' 50 | const refresh = store.state.Jwt.refresh || '' 51 | 52 | if (access) { // 已登录 53 | console.log("已经登录:", access) 54 | // 调用接口判断access是否失效 55 | let res = await authorization(access).then((data) => data).catch((err) => err) 56 | let code = res.status || '' 57 | if (code == 200) { 58 | if (to.name === 'login') next({ name: 'Home' }) 59 | else next() 60 | } else { 61 | // 失效就使用刷新token 62 | console.log("使用刷新token") 63 | let res = await refreshToken(refresh).then((data) => data).catch((err) => err) 64 | let code = res.status || '' 65 | if (code == 200) { 66 | console.log("刷新成功...") 67 | store.commit('SetJwt', { "access": res.data.access, "refresh": refresh }) 68 | } else { 69 | store.commit('SetJwt', '') 70 | next({ name: 'login' }) 71 | } 72 | } 73 | } else { // 未登录 74 | // 如果去的页面是登陆页,直接跳到登陆页 75 | if (to.name != 'userprofile') next() 76 | // 如果不是登陆页,强行跳转到登陆页 77 | else next({ name: 'login' }) 78 | } 79 | }) 80 | 81 | export default router 82 | -------------------------------------------------------------------------------- /Crawler/Crawler/items.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from Crawler.models.es_blogs import BlogsIndex 3 | from Crawler.utils.common import real_time_count 4 | 5 | # 统计索引的数据 6 | COUNT_INIT = 0 7 | 8 | 9 | class DetailItem(scrapy.Item): 10 | page_url = scrapy.Field() # 当前网页的url 11 | 12 | encode = scrapy.Field() 13 | keywords = scrapy.Field() 14 | description = scrapy.Field() 15 | lang = scrapy.Field() 16 | 17 | title = scrapy.Field() 18 | content = scrapy.Field() 19 | publish_time = scrapy.Field() 20 | 21 | urls = scrapy.Field() # 包含的url 22 | 23 | def save_to_mysql(self): 24 | # 插入的sql语句 25 | insert_sql = """ 26 | insert into search_blogs(page_url, urls) 27 | VALUES (%s, %s) 28 | """ 29 | 30 | sql_params = ( 31 | str(self['page_url']) or 'NAN', str(self['urls']) or 'NAN' 32 | ) 33 | return insert_sql, sql_params 34 | 35 | def save_to_es(self): 36 | blogs = BlogsIndex() 37 | blogs.suggest = self['title'] # 为title建立建议字段 38 | blogs.page_url = self['page_url'] 39 | blogs.title = self['title'] 40 | blogs.keywords = self['keywords'] 41 | blogs.description = self['description'] 42 | blogs.content = self['content'] 43 | blogs.publish_time = self['publish_time'] 44 | 45 | real_time_count('view_count', COUNT_INIT) 46 | blogs.save() 47 | print("已建立索引到elasticsearch中......") 48 | 49 | def help_fields(self): 50 | for field in self.fields: 51 | print(field, "= scrapy.Field()") 52 | 53 | 54 | class ListItem(scrapy.Item): 55 | # 列表页不需要存内容以及标题,并且暂时不用建立索引,保存到数据库中就可以了 56 | page_url = scrapy.Field() # 当前网页的url 57 | 58 | encode = scrapy.Field() 59 | keywords = scrapy.Field() 60 | description = scrapy.Field() 61 | lang = scrapy.Field() 62 | 63 | publish_time = scrapy.Field() 64 | 65 | urls = scrapy.Field() # 包含的url 66 | 67 | def save_to_mysql(self): 68 | # 插入的sql语句 69 | insert_sql = """ 70 | insert into 
search_list(page_url, urls) 71 | VALUES (%s, %s) 72 | """ 73 | sql_params = ( 74 | str(self['page_url']) or 'NAN', str(self['urls']) or 'NAN' 75 | ) 76 | 77 | return insert_sql, sql_params 78 | 79 | 80 | def help_fields(self): 81 | for field in self.fields: 82 | print(field, "= scrapy.Field()") -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # XiuSearch 2 | 3 | ### 简介 4 | XiuSearch是一款搜索技术博客的搜索引擎,当然,如果你将种子网址换成新闻网站,这就是一个新闻搜索引擎,它对于文章搜索来说是通用的。 5 | 6 | [演示链接](http://justin3go.cc/) 7 | [接口文档](http://justin3go.cc:8000/api/v1/docs/) 8 | 9 | > 如果网址失效,下方视频中也有演示的效果. 10 | 11 | [视频介绍链接](https://www.bilibili.com/video/BV16m4y1X78V) 12 | 13 | 项目架构图 14 | 15 | ![image-20220122123051493](https://webplus-cn-shenzhen-s-6130b804f968dd14cecc43e2.oss-cn-shenzhen.aliyuncs.com/blogs/image-20220122123051493.png) 16 | 17 | ### 功能 18 | + 历史记录与搜索建议 19 | + 检索使用elasticsearch→快 20 | + 倒排索引 21 | + 向量空间模型与布尔模型 22 | + 关键词高亮 23 | + Swagger文档(采用前后端分离开发) 24 | + 适合搜索引擎的爬虫 25 | + 断点续爬 26 | + 分页显示 27 | + JWT登录 28 | + 邮箱注册(重置密码、重置邮箱) 29 | + pagerank 30 | + 正文标题提取 31 | + 列表页详情页区分 32 | + redis统计实时爬取数量(没有展示在前端) 33 | 34 | 35 | ### 主要技术栈 36 | + Scrapy 2.5.1 37 | + ElasticSearch 7.15.2 38 | + Django 3.1 39 | + DjangoRestFramework 3.12 40 | + Vue3 41 | 42 | ### 相关算法 43 | + PageRank 44 | + 投票机制实现内容提取 45 | + SVM二分类模型区分列表页与详情页 46 | 47 | ### 安装教程 48 | 49 | ```shell 50 | # 这个是直接导出的完整环境 51 | pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 52 | # 这个是我印象中使用的技术栈,也可以直接安装这个 53 | pip install -r requirements_.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 54 | ``` 55 | 56 | ### 使用说明 57 | 58 | 1. 修改根目录的config.py,其中包含elasticsearch,mysql,redis的配置(这里省略这三部分的安装,请自行百度google)。 59 | 60 | 2. 修改/Backend/Backend/settings.py 61 | 62 | ```python 63 | # 修改数据库配置 64 | DATABASES = { 65 | 'default': { 66 | 'ENGINE': 'django.db.backends.mysql', 67 | 'PASSWORD': 'xxxxxx', 68 | 'NAME': 'xxxx', 69 | 'USER': 'xxxx', 70 | } 71 | } 72 | # 修改邮箱配置 73 | # EMAIL CONFIG 74 | EMAIL_BACKEND = "django.core.mail.backends.smtp.EmailBackend" 75 | EMAIL_HOST = "smtp.qq.com" 76 | EMAIL_HOST_USER = "justin3go@qq.com" 77 | EMAIL_HOST_PASSWORD = "xxxxxxx" # 这个不是qq密码,需要自己去qq邮箱申请 78 | EMAIL_PORT = 25 79 | # 如果部署,则需要如下配置,原因是阿里云不支持25端口发邮件 80 | EMAIL_USE_TLS = True 81 | EMAIL_PORT = 465 82 | DEFAULT_FROM_EMAIL = EMAIL_HOST_USER 83 | ``` 84 | 85 | 3. 爬取数据 86 | 87 | ```shell 88 | cd ./Crawler 89 | scrapy crawl blog1 90 | ``` 91 | 92 | 4. 运行django 93 | 94 | ```shell 95 | cd ./Backend 96 | # 迁移数据库 97 | python manage.py makemigrations 98 | python manage.py migrate 99 | # 运行 100 | python manage.py runserver 101 | # 打开localhost:8000/api/v1/docs/ 可以看到swagger文档 102 | # 效果应该和 http://justin3go.cc:8000/api/v1/docs/ 一样 103 | ``` 104 | 105 | 5. 运行vue 106 | 107 | ```shell 108 | cd frontend 109 | npm run serve 110 | ``` 111 | **欢迎issue,感谢⭐star⭐** 112 | 113 |
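As a quick sanity check after step 4, the two endpoints implemented in `Backend/search_blogs/views.py` (`SearchView` and `SearchSuggest`) can be called directly. The sketch below is illustrative only and requires the `requests` package; the URL prefixes (`/api/v1/search/`, `/api/v1/suggest/`) are assumptions, since the actual routes are registered in `Backend/urls.py` / `search_blogs/urls.py`, so adjust the paths to whatever the Swagger page at `/api/v1/docs/` shows.

```python
# Minimal smoke test for the search backend (sketch; URL paths are assumptions).
import requests

BASE = "http://localhost:8000/api/v1"  # assumed prefix, check Backend/urls.py

# SearchView: `q` is the query string, `p` is the 1-based page number (10 hits per page).
data = requests.get(f"{BASE}/search/", params={"q": "python", "p": 1}).json()
print(data["totalNums"], "hits,", data["pageNums"], "pages,", data["searchCostTime"], "s")
for hit in data["hitList"]:
    # each hit carries the relevance score, the source URL and the (highlighted) title
    print(hit["score"], hit["page_url"], hit["title"])

# SearchSuggest: `input` is the partial query; the response is a plain list of suggested titles.
print(requests.get(f"{BASE}/suggest/", params={"input": "pyth"}).json())
```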
-------------------------------------------------------------------------------- /frontend/src/assets/logo.svg: --------------------------------------------------------------------------------
1 | 2 | 3 | 5 | 9 | 10 | 11 | 12 | 13 | 16 | 17 | 18 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | 32 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | node_modules 141 | 142 | breakpoints -------------------------------------------------------------------------------- /draw/功能分析.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/DetailNav.vue: -------------------------------------------------------------------------------- 1 | 31 | 40 | 57 | 58 | 119 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/title.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.extractors.base import BaseExtractor 2 | from lxml.html import HtmlElement 3 | from gerapy_auto_extractor.patterns.title import METAS 4 | from gerapy_auto_extractor.utils.lcs import lcs_of_2 5 | from gerapy_auto_extractor.utils.similarity import similarity2 6 | 7 | 8 | class TitleExtractor(BaseExtractor): 9 | """ 10 | Title Extractor which extract title of page 11 | """ 12 | 13 | def extract_by_meta(self, element: HtmlElement) -> str: 14 | """ 15 | extract according to meta 16 | :param element: 17 | :return: str 18 | """ 19 | for xpath in METAS: 20 | title = element.xpath(xpath) 21 | if title: 22 | return ''.join(title) 23 | 24 | def extract_by_title(self, element: HtmlElement): 25 | """ 26 | get title from tag 27 | :param element: 28 | :return: 29 | """ 30 | return ''.join(element.xpath('//title//text()')).strip() 31 | 32 | def extract_by_hs(self, element: HtmlElement): 33 | """ 34 | get title from all h1-h3 tag 35 | :param element: 36 | :return: 37 | """ 38 | hs = element.xpath('//h1//text()|//h2//text()|//h3//text()') 39 | return hs or [] 40 | 41 | def extract_by_h(self, element: HtmlElement): 42 | """ 43 | extract by h tag, priority h1, h2, h3 44 | :param elemeent: 45 | :return: 46 | """ 47 | for xpath in ['//h1', '//h2', '//h3']: 48 | children = element.xpath(xpath) 49 | if not children: 50 | continue 51 | child = children[0] 52 | texts = child.xpath('./text()') 53 | if texts and len(texts): 54 | return texts[0].strip() 55 | 56 | def process(self, element: HtmlElement): 57 | """ 58 | extract title from element 59 | :param element: 60 | :return: 61 | """ 62 | title_extracted_by_meta = self.extract_by_meta(element) 63 | title_extracted_by_h = self.extract_by_h(element) 64 | title_extracted_by_hs = self.extract_by_hs(element) 65 | title_extracted_by_title = self.extract_by_title(element) 66 | 67 | # split logic to add more 68 | if title_extracted_by_meta: 69 | return title_extracted_by_meta 70 | 71 | # get most similar h 72 | title_extracted_by_hs = 
sorted(title_extracted_by_hs, 73 | key=lambda x: similarity2(x, title_extracted_by_title), 74 | reverse=True) 75 | if title_extracted_by_hs: 76 | return lcs_of_2(title_extracted_by_hs[0], title_extracted_by_title) 77 | 78 | if title_extracted_by_title: 79 | return title_extracted_by_title 80 | 81 | return title_extracted_by_h 82 | 83 | 84 | title_extractor = TitleExtractor() 85 | 86 | 87 | def extract_title(html): 88 | """ 89 | extract title from html 90 | :param html: 91 | :return: 92 | """ 93 | result = title_extractor.extract(html) 94 | return result 95 | -------------------------------------------------------------------------------- /Engine/pagerank.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import random 3 | import pymysql 4 | import ast 5 | import sys 6 | sys.path.append("C:/My_app/code/咻Search") 7 | from config import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWORD 8 | 9 | 10 | class PageRank(): 11 | ''' 12 | G: 传入图的邻接矩阵 13 | T: 迭代计算次数上限 14 | eps: 误差上限 15 | beta: 公式里面的beta 16 | return: list 17 | 注:误差小于eps或者迭代次数大于T结束迭代计算 18 | ''' 19 | def __init__(self, G, T=300, eps=1e-6, beta=0.8) -> None: 20 | self.G = G 21 | self.N = len(G) 22 | self.T = T 23 | self.eps = eps 24 | self.beta = beta 25 | 26 | 27 | def GtoM(self, G): 28 | ''' 29 | 创建概率转换矩阵 30 | ''' 31 | M = np.zeros((self.N, self.N)) 32 | for i in range(self.N): 33 | D_i = sum(G[i]) 34 | if D_i == 0: 35 | continue 36 | for j in range(self.N): 37 | M[j][i] = G[i][j] / D_i #归一化并转置 38 | return M 39 | 40 | def computePR(self, M): 41 | ''' 42 | 计算PR值 43 | ''' 44 | R = np.ones(self.N) / self.N 45 | teleport = np.ones(self.N) / self.N 46 | for time in range(self.T): 47 | A = self.beta * M + (1-self.beta)*teleport 48 | R_new = np.dot(A, R) 49 | if np.linalg.norm(R_new - R) < self.eps: 50 | break 51 | R = R_new.copy() 52 | return np.around(R_new, 5) 53 | 54 | def getPR(self): 55 | M = self.GtoM(self.G) 56 | return self.computePR(M) 57 | 58 | def urls2G(): 59 | ''' 60 | 将数据库中urls的关系转化为图 61 | ''' 62 | # 连接数据库 63 | # 加上charset='utf8',避免 'latin-1' encoding 报错等问题 64 | conn = pymysql.connect(host=MYSQL_HOST, user=MYSQL_USER, passwd=MYSQL_PASSWORD, 65 | db=MYSQL_DBNAME, charset='utf8') 66 | # 创建cursor 67 | cursor_blogs = conn.cursor() 68 | cursor_list = conn.cursor() 69 | sql_blogs = 'SELECT page_url, urls FROM search_blogs;' 70 | sql_list = 'SELECT page_url, urls FROM search_list;' 71 | # 执行sql语句 72 | cursor_blogs.execute(sql_blogs) 73 | cursor_list.execute(sql_list) 74 | # 获取全部查询信息 75 | re_blogs = cursor_blogs.fetchall() 76 | re_list = cursor_list.fetchall() 77 | 78 | # 将获取的元组信息转换为图 79 | blogs_index = [url[0] for url in re_blogs] 80 | blogs_point = [ast.literal_eval(url[1]) for url in re_blogs] 81 | 82 | list_index = [url[0] for url in re_list] 83 | list_point = [ast.literal_eval(url[1]) for url in re_list] 84 | indexs = blogs_index + list_index 85 | points = blogs_point + list_point 86 | G = np.zeros((len(indexs), len(indexs))) 87 | for i, index in enumerate(indexs): 88 | # 依次判断包含的url是否在爬取过的列表中,有些广告之类的链接页会包含,但没爬取 89 | for p_url in points[i]: 90 | try: 91 | p_index = indexs.index(p_url) 92 | except: 93 | p_index = -1 94 | if p_index != -1: 95 | G[i][p_index] = 1 96 | 97 | return G 98 | 99 | if __name__ == "__main__": 100 | # def create_data(N, alpha=0.5): 101 | # G = np.zeros((N, N)) 102 | # for i in range(N): 103 | # for j in range(N): 104 | # if i == j: 105 | # continue 106 | # if random.random() < alpha: 107 | # G[i][j] = 1 108 | # return G 109 | # G = create_data(10) 
110 | # PR = PageRank(G) 111 | # print(PR.getPR()) 112 | G = urls2G() 113 | print(type(G)) 114 | PR = PageRank(G) 115 | print(PR.getPR()) -------------------------------------------------------------------------------- /frontend/src/views/UserProfile.vue: -------------------------------------------------------------------------------- 1 | <template> 2 | <div class="user-profile"> 3 | <div v-show="leftShow" class="left"></div> 4 | <div class="center"> 5 | <div class="user"> 6 | <img class="avator" :src="avator" alt="" /> 7 | <div class="username">{{ username }}</div> 8 | <div class="email">邮箱:{{ email }}</div> 9 | <div class="btns"> 10 | <div 11 | class="btn1" 12 | :class="{ active: IsActive1 }" 13 | @mouseenter="IsActive1 = true" 14 | @mouseleave="IsActive1 = false" 15 | @click="resetemail" 16 | > 17 | 重置邮箱 18 | </div> 19 | <div 20 | class="btn2" 21 | :class="{ active: IsActive2 }" 22 | @mouseenter="IsActive2 = true" 23 | @mouseleave="IsActive2 = false" 24 | @click="resetpassword" 25 | > 26 | 重置密码 27 | </div> 28 | </div> 29 | </div> 30 | <div class="content"> 31 | <img src="@/assets/userprofile.svg" alt="" /> 32 | </div> 33 | </div> 34 | <div v-show="rightShow" class="right"></div> 35 | </div> 36 | </template> 37 | <script> 38 | export default { 39 | data() { 40 | return { 41 | IsActive1: false, 42 | IsActive2: false, 43 | leftShow: false, 44 | rightShow: false, 45 | }; 46 | }, 47 | methods: { 48 | resetemail() { 49 | this.leftShow = !this.leftShow 50 | alert("别点了,不想写前端了...") 51 | }, 52 | resetpassword() { 53 | this.rightShow = !this.rightShow 54 | alert("别点了,不想写前端了...") 55 | }, 56 | }, 57 | }; 58 | </script> 59 | <script setup> 60 | import { onMounted } from "vue"; 61 | import { getIP } from "@/api/index.js"; 62 | import store from "@/store/index.js"; 63 | 64 | const avator = getIP() + store.state.UserProfile.avator || ""; 65 | const email = store.state.UserProfile.email; 66 | const id = store.state.UserProfile.id; 67 | const username = store.state.UserProfile.username; 68 | 69 | onMounted(async () => {}); 70 | </script> 71 | <style lang="less" scoped> 72 | .user-profile { 73 | margin: 0.3rem 0.3rem 0rem; 74 | .center { 75 | .user { 76 | margin: auto; 77 | padding: 0.2rem 0.2rem 0.3rem 0.2rem; 78 | background-color: #fff; 79 | width: 4.5rem; 80 | border-radius: 0.2rem; 81 | box-shadow: 0.1rem 0.1rem 0.1rem #999; 82 | .avator { 83 | margin-left: 1.55rem; 84 | height: 1rem; 85 | width: 1rem; 86 | border-radius: 0.5rem; 87 | } 88 | .username { 89 | text-align: center; 90 | font-size: 0.2rem; 91 | } 92 | .email { 93 | margin-top: 0.3rem; 94 | } 95 | .btns { 96 | margin-top: 0.2rem; 97 | color: #999; 98 | .btn1 { 99 | display: block; 100 | margin-top: 0.1rem; 101 | cursor: pointer; 102 | height: 0.3rem; 103 | line-height: 0.3rem; 104 | width: 0.75rem; 105 | text-align: center; 106 | border-radius: 0.05rem; 107 | } 108 | .btn2 { 109 | margin-top: 0.1rem; 110 | cursor: pointer; 111 | height: 0.3rem; 112 | line-height: 0.3rem; 113 | width: 0.75rem; 114 | text-align: center; 115 | border-radius: 0.05rem; 116 | } 117 | .active { 118 | color: #111; 119 | background-color: #999; 120 | } 121 | } 122 | } 123 | .content { 124 | z-index: 20; 125 | img { 126 | margin: 0.1rem 2.25rem 0; 127 | z-index: 1000; 128 | } 129 | } 130 | } 131 | } 132 | .left { 133 | position: absolute; 134 | top: 0.3rem; 135 | left: 0.3rem; 136 | margin: auto; 137 | padding: 0.2rem 0.2rem 0.3rem 0.2rem; 138 | background-color: #fff; 139 | width: 4.5rem; 140 | height: 2.5rem; 141 | border-radius: 0.2rem; 142 | box-shadow: 0.1rem 
0.1rem 0.1rem #999; 143 | } 144 | .right { 145 | position: absolute; 146 | top: 0.3rem; 147 | right: 0.3rem; 148 | margin: auto; 149 | padding: 0.2rem 0.2rem 0.3rem 0.2rem; 150 | background-color: #fff; 151 | width: 4.5rem; 152 | height: 2.5rem; 153 | border-radius: 0.2rem; 154 | box-shadow: 0.1rem 0.1rem 0.1rem #999; 155 | } 156 | </style> 157 | -------------------------------------------------------------------------------- /Crawler/Crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class CrawlerSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CrawlerDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /frontend/src/assets/empty.svg: -------------------------------------------------------------------------------- 1 | <svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" width="797.5" height="834.5" viewBox="0 0 797.5 834.5" xmlns:xlink="http://www.w3.org/1999/xlink"><title>void -------------------------------------------------------------------------------- /Crawler/Crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | from Crawler.items import DetailItem, ListItem 10 | import codecs,os,json 11 | import copy 12 | import pymysql 13 | import MySQLdb 14 | import MySQLdb.cursors 15 | from twisted.enterprise import adbapi 16 | from config import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWORD 17 | 18 | 19 | class ElasticSearchPipeline(object): 20 | """通用的ElasticSearch存储方法""" 21 | 22 | def process_item(self, item, spider): 23 | # 只有详情页才加索引 24 | if isinstance(item, DetailItem): 25 | item.save_to_es() 26 | return item 27 | 28 | class MysqlTwistedPipeline(object): 29 | 30 | def __init__(self, dbpool): 31 | self.dbpool = dbpool 32 | 33 | @classmethod 34 | def from_crawler(cls, crawler): 35 | # 读取settings中的配置 36 | params = dict( 37 | host=MYSQL_HOST, 38 | db=MYSQL_DBNAME, 39 | user=MYSQL_USER, 40 | passwd=MYSQL_PASSWORD, 41 | charset='utf8', 42 | cursorclass=pymysql.cursors.DictCursor, 43 | use_unicode=False 44 | ) 45 | # 创建连接池,pymysql为使用的连接模块 46 | dbpool = adbapi.ConnectionPool('pymysql', **params) 47 | return cls(dbpool) 48 | 49 | def process_item(self, item, spider): 50 | query = self.dbpool.runInteraction(self.do_insert, item) 51 | query.addErrback(self.handle_error, item, spider) 52 | print("已存入mysql中......") 53 | return item 54 | 55 | # 执行数据库操作的回调函数 56 | def do_insert(self, cursor, item): 57 | sql, params = item.save_to_mysql() 58 | cursor.execute(sql, params) 59 | 60 | # 当数据库操作失败的回调函数 61 | def handle_error(self, failue, item, spider): 62 | print(failue) 63 | 64 | # 使用json存到本地文件的代码 65 | # class CrawlerPipeline: 66 | # def __init__(self): 67 | # # 必须使用 w+ 模式打开文件,以便后续进行 读写操作(w+模式,意味既可读,亦可写) 68 | # # 注意:此处打开文件使用的不是 python 的 open 方法,而是 codecs 中的 open 方法 69 | # self.json_file = codecs.open('data.json', 'w+', encoding='UTF-8') 70 | 71 | # def open_spider(self, spider): 72 | # # 在爬虫开始时,首先写入一个 '[' 符号,构造一个 json 数组 73 | # # 为使得 Json 文件具有更高的易读性,我们辅助输出了 '\n'(换行符) 74 | # self.json_file.write('[\n') 75 | 76 | 77 | # def process_item(self, item, spider): 78 | # item_json = json.dumps(dict(item), 
ensure_ascii=False) 79 | # self.json_file.write('\t' + item_json + ',\n') 80 | # return item 81 | 82 | # if isinstance(item, DetailItem): 83 | # page_url = item['page_url'] 84 | # encode = item['encode'] 85 | # keywords = item['keywords'] 86 | # description = item['description'] 87 | # lang = item['lang'] 88 | # title = item['title'] 89 | # content = item['content'] 90 | # urls_cleaned = item['urls'] 91 | # publish_time = item['publish_time'] 92 | # f = open("./data.json", 'w+', encoding="utf-8") 93 | 94 | 95 | # if isinstance(item, ListItem): 96 | # page_url = item['page_url'] 97 | # encode = item['encode'] 98 | # keywords = item['keywords'] 99 | # description = item['description'] 100 | # lang = item['lang'] 101 | # urls_cleaned = item['urls'] 102 | # publish_time = item['publish_time'] 103 | 104 | # # 爬虫结束时执行的方法 105 | # def close_spider(self, spider): 106 | # # 在结束后,需要对 process_item 最后一次执行输出的 “逗号” 去除 107 | # # 当前文件指针处于文件尾,我们需要首先使用 SEEK 方法,定位到文件尾前的两个字符(一个','(逗号), 一个'\n'(换行符))的位置 108 | # self.json_file.seek(-2, os.SEEK_END) 109 | # # 使用 truncate() 方法,将后面的数据清空 110 | # self.json_file.truncate() 111 | # # 重新输出'\n',并输出']',与 open_spider(self, spider) 时输出的 '[' 相对应,构成一个完整的数组格式 112 | # self.json_file.write('\n]') 113 | # # 关闭文件 114 | # self.json_file.close() 115 | 116 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/patterns/datetime.py: -------------------------------------------------------------------------------- 1 | REGEXES = [ 2 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 3 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 4 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])", 5 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])", 6 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 7 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 8 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 9 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])", 10 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])", 11 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 12 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 13 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 14 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])", 15 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])", 16 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 17 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 18 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 19 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])", 20 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])", 21 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 22 | "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 23 | "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 24 | "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])", 25 | "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])", 26 | "(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 27 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})", 28 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})", 29 | "(\d{4}年\d{1,2}月\d{1,2}日)", 30 | "(\d{2}年\d{1,2}月\d{1,2}日)", 31 | "(\d{1,2}月\d{1,2}日)" 32 | ] 33 | 34 | METAS_CONTENT = [ 35 | '//meta[starts-with(@property, "rnews:datePublished")]/@content', 36 | '//meta[starts-with(@property, 
"article:published_time")]/@content', 37 | '//meta[starts-with(@property, "og:published_time")]/@content', 38 | '//meta[starts-with(@property, "og:release_date")]/@content', 39 | '//meta[starts-with(@itemprop, "datePublished")]/@content', 40 | '//meta[starts-with(@itemprop, "dateUpdate")]/@content', 41 | '//meta[starts-with(@name, "OriginalPublicationDate")]/@content', 42 | '//meta[starts-with(@name, "article_date_original")]/@content', 43 | '//meta[starts-with(@name, "og:time")]/@content', 44 | '//meta[starts-with(@name, "apub:time")]/@content', 45 | '//meta[starts-with(@name, "publication_date")]/@content', 46 | '//meta[starts-with(@name, "sailthru.date")]/@content', 47 | '//meta[starts-with(@name, "PublishDate")]/@content', 48 | '//meta[starts-with(@name, "publishdate")]/@content', 49 | '//meta[starts-with(@name, "PubDate")]/@content', 50 | '//meta[starts-with(@name, "pubtime")]/@content', 51 | '//meta[starts-with(@name, "_pubtime")]/@content', 52 | '//meta[starts-with(@name, "weibo: article:create_at")]/@content', 53 | '//meta[starts-with(@pubdate, "pubdate")]/@content', 54 | ] 55 | 56 | METAS_MATCH = [ 57 | '//meta[starts-with(@property, "rnews:datePublished")]', 58 | '//meta[starts-with(@property, "article:published_time")]', 59 | '//meta[starts-with(@property, "og:published_time")]', 60 | '//meta[starts-with(@property, "og:release_date")]', 61 | '//meta[starts-with(@itemprop, "datePublished")]', 62 | '//meta[starts-with(@itemprop, "dateUpdate")]', 63 | '//meta[starts-with(@name, "OriginalPublicationDate")]', 64 | '//meta[starts-with(@name, "article_date_original")]', 65 | '//meta[starts-with(@name, "og:time")]', 66 | '//meta[starts-with(@name, "apub:time")]', 67 | '//meta[starts-with(@name, "publication_date")]', 68 | '//meta[starts-with(@name, "sailthru.date")]', 69 | '//meta[starts-with(@name, "PublishDate")]', 70 | '//meta[starts-with(@name, "publishdate")]', 71 | '//meta[starts-with(@name, "PubDate")]', 72 | '//meta[starts-with(@name, "pubtime")]', 73 | '//meta[starts-with(@name, "_pubtime")]', 74 | '//meta[starts-with(@name, "weibo: article:create_at")]', 75 | '//meta[starts-with(@pubdate, "pubdate")]', 76 | ] 77 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/PageIndex.vue: -------------------------------------------------------------------------------- 1 | 39 | 40 | 115 | 116 | 169 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | from lxml.html import HtmlElement, etree 2 | 3 | from gerapy_auto_extractor.schemas.element import Element 4 | from gerapy_auto_extractor.utils.element import children, remove_element, remove_children 5 | 6 | CONTENT_EXTRACTOR_USELESS_TAGS = ['meta', 'style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 7 | 'path', 8 | 'symbol', 'img', 'footer', 'header'] 9 | CONTENT_EXTRACTOR_STRIP_TAGS = ['span', 'blockquote'] 10 | CONTENT_EXTRACTOR_NOISE_XPATHS = [ 11 | '//div[contains(@class, "comment")]', 12 | '//div[contains(@class, "advertisement")]', 13 | '//div[contains(@class, "advert")]', 14 | '//div[contains(@style, "display: none")]', 15 | ] 16 | 17 | 18 | def preprocess4content_extractor(element: HtmlElement): 19 | """ 20 | preprocess element for content extraction 21 | :param element: 22 | :return: 23 | """ 24 | # remove tag and its content 25 | etree.strip_elements(element, 
*CONTENT_EXTRACTOR_USELESS_TAGS) 26 | # only move tag pair 27 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 28 | 29 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATHS) 30 | 31 | for child in children(element): 32 | 33 | # merge text in span or strong to parent p tag 34 | if child.tag.lower() == 'p': 35 | etree.strip_tags(child, 'span') 36 | etree.strip_tags(child, 'strong') 37 | 38 | if not (child.text and child.text.strip()): 39 | remove_element(child) 40 | 41 | # if a div tag does not contain any sub node, it could be converted to p node. 42 | if child.tag.lower() == 'div' and not child.getchildren(): 43 | child.tag = 'p' 44 | 45 | 46 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS 47 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS 48 | LIST_EXTRACTOR_NOISE_XPATHS = CONTENT_EXTRACTOR_NOISE_XPATHS 49 | 50 | 51 | def preprocess4list_extractor(element: Element): 52 | """ 53 | preprocess element for list extraction 54 | :param element: 55 | :return: 56 | """ 57 | # remove tag and its content 58 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS) 59 | # only move tag pair 60 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 61 | 62 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATHS) 63 | 64 | for child in children(element): 65 | 66 | # merge text in span or strong to parent p tag 67 | if child.tag.lower() == 'p': 68 | etree.strip_tags(child, 'span') 69 | etree.strip_tags(child, 'strong') 70 | 71 | if not (child.text and child.text.strip()): 72 | remove_element(child) 73 | 74 | # if a div tag does not contain any sub node, it could be converted to p node. 75 | if child.tag.lower() == 'div' and not child.getchildren(): 76 | child.tag = 'p' 77 | 78 | 79 | LIST_CLASSIFIER_USELESS_TAGS = ['style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path', 80 | 'symbol', 'footer', 'header'] 81 | LIST_CLASSIFIER_STRIP_TAGS = ['span', 'blockquote'] 82 | LIST_CLASSIFIER_NOISE_XPATHS = [ 83 | '//div[contains(@class, "comment")]', 84 | '//div[contains(@class, "advertisement")]', 85 | '//div[contains(@class, "advert")]', 86 | '//div[contains(@style, "display: none")]', 87 | ] 88 | 89 | 90 | def preprocess4list_classifier(element: HtmlElement): 91 | """ 92 | preprocess element for list classifier 93 | :param element: 94 | :return: 95 | """ 96 | # remove tag and its content 97 | etree.strip_elements(element, *LIST_CLASSIFIER_USELESS_TAGS) 98 | # only move tag pair 99 | etree.strip_tags(element, *LIST_CLASSIFIER_STRIP_TAGS) 100 | 101 | remove_children(element, LIST_CLASSIFIER_NOISE_XPATHS) 102 | 103 | for child in children(element): 104 | 105 | # merge text in span or strong to parent p tag 106 | if child.tag.lower() == 'p': 107 | etree.strip_tags(child, 'span') 108 | etree.strip_tags(child, 'strong') 109 | 110 | if not (child.text and child.text.strip()): 111 | remove_element(child) 112 | 113 | # if a div tag does not contain any sub node, it could be converted to p node. 
114 | if child.tag.lower() == 'div' and not child.getchildren(): 115 | child.tag = 'p' 116 | -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/blog1.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("C:/My_app/code/咻Search/Engine") 3 | from Crawler.items import DetailItem, ListItem 4 | from urllib import parse 5 | import re 6 | import scrapy 7 | from url_parser import is_static_url 8 | from gerapy_auto_extractor.extractors.title import extract_title 9 | from gerapy_auto_extractor.extractors.datetime import extract_datetime 10 | from gerapy_auto_extractor.classifiers.detail import is_detail 11 | from gerapy_auto_extractor.classifiers.list import is_list 12 | from html_extractor import MainContent 13 | 14 | # from lxml.html.clean import Cleaner 15 | 16 | 17 | class Blog1Spider(scrapy.Spider): 18 | name = 'blog1' 19 | # allowed_domains = ['*'] 20 | start_urls = ['https://www.51cto.com/', 21 | 'https://www.iteye.com/', 'https://www.cnblogs.com/', 22 | 'http://www.blogjava.net/','https://blogread.cn//it/', 23 | 'http://blog.chinaunix.net/', 'https://www.oschina.net/', 24 | 'http://blog.itpub.net/', 'https://cuiqingcai.com/', 25 | 'http://blog.jobbole.com/', 'https://segmentfault.com/', 26 | 'https://www.infoq.cn/','https://www.v2ex.com/', 27 | 'https://www.jianshu.com/','https://blogs.360.cn/', 28 | 'https://tech.meituan.com/','http://www.ruanyifeng.com/blog/', 29 | 'http://it.deepinmind.com/','https://coolshell.cn/', 30 | 'https://imzl.com/','https://www.itzhai.com/', 31 | 'http://macshuo.com/','http://ifeve.com/', 32 | 'http://blog.zhaojie.me/','https://juejin.cn/', 33 | 'https://www.runoob.com/'] 34 | 35 | # 规则 36 | rule_encode = "//meta/@charset" 37 | rule_keywords = "//meta[@name='keywords']/@content" 38 | rule_description = "//meta[@name='description']/@content" 39 | rule_lang = "//@lang" 40 | rule_url = "//@href" # 简答地提取url的规则 41 | # 保留标签的 src 属性 42 | # safe_attrs = frozenset(['src']) 43 | # 删除 a 标签 44 | # remove_tags = frozenset(['script','style','link']) 45 | # 实例化 46 | # cleaner = Cleaner( 47 | # style=True, 48 | # scripts=True, 49 | # javascript=True, 50 | # meta=False, 51 | # # safe_attrs=safe_attrs, 52 | # # remove_tags=remove_tags, 53 | # ) 54 | 55 | def parse(self, response): 56 | page_url = response.request.url 57 | print("-"*100) 58 | print("开始爬取%s......" % page_url) 59 | if response.status == 200: 60 | # cleaned_html = self.cleaner.clean_html(response.body.decode('utf-8')) 61 | # with open("./test.html", 'w', encoding="utf-8") as f: 62 | # f.write(str(cleaned_html)) 63 | # sys.exit() 64 | # 获取内容 65 | encode = response.xpath(self.rule_encode).extract() 66 | keywords = response.xpath(self.rule_keywords).extract() 67 | description = response.xpath(self.rule_description).extract() 68 | lang = response.xpath(self.rule_lang).extract() 69 | # 这里代码检测有问题,实际没问题,只能说VS有点垃圾,继承关系都搞不懂 70 | publish_time = extract_datetime(response.body.decode('utf-8')) 71 | 72 | urls = response.xpath(self.rule_url).extract() 73 | urls_cleaned = [] 74 | for url in urls: 75 | if is_static_url(url) or "javascript:" in url.lower(): 76 | continue 77 | # 绝对链接不变,相对链接转换为绝对链接 78 | full_url = parse.urljoin(page_url, url) 79 | urls_cleaned.append(full_url) 80 | 81 | # 如果符合详情页规则,就下载该网页,提取其正文 82 | if is_detail(response.body, 0.3): 83 | print("该网页符合详情页规则.....") 84 | print("提取[ %s ]携带的正文标题中......" 
% page_url) 85 | 86 | extractor = MainContent() 87 | title, content = extractor.extract(page_url, response.body) 88 | 89 | # 保存... 90 | detail_item = DetailItem() 91 | detail_item['page_url'] = page_url 92 | detail_item['encode'] = encode 93 | detail_item['keywords'] = keywords 94 | detail_item['description'] = description 95 | detail_item['lang'] = lang 96 | detail_item['title'] = title 97 | detail_item['content'] = content 98 | detail_item['urls'] = urls_cleaned 99 | detail_item['publish_time'] = publish_time 100 | 101 | yield detail_item 102 | 103 | # 如果不符合详情页规则,就下载该网页,不提取其正文 104 | elif is_list(response.body, 0.9): 105 | print("该网页符合列表页规则.....") 106 | # 保存... 107 | list_item = ListItem() 108 | list_item['page_url'] = page_url 109 | list_item['encode'] = encode 110 | list_item['keywords'] = keywords 111 | list_item['description'] = description 112 | list_item['lang'] = lang 113 | list_item['urls'] = urls_cleaned 114 | list_item['publish_time'] = publish_time 115 | 116 | yield list_item 117 | else: 118 | print("跳过爬取!!!!!!") 119 | 120 | for url in urls_cleaned: 121 | yield scrapy.Request(url=url, callback=self.parse) 122 | 123 | else: 124 | print("[ %s ]未爬取成功......" % page_url) 125 | return 126 | -------------------------------------------------------------------------------- /frontend/src/views/Login.vue: -------------------------------------------------------------------------------- 1 | 54 | 55 | 162 | 211 | -------------------------------------------------------------------------------- /frontend/src/components/Home/SearchBox.vue: -------------------------------------------------------------------------------- 1 | 45 | 46 | 152 | 153 | 219 | -------------------------------------------------------------------------------- /draw/系统架构.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /Backend/Backend/settings.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from pathlib import Path 3 | import os 4 | 5 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 6 | BASE_DIR = Path(__file__).resolve().parent.parent 7 | 8 | # Quick-start development settings - unsuitable for production 9 | # See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ 10 | 11 | # SECURITY WARNING: keep the secret key used in production secret! 12 | SECRET_KEY = 'vk3^9_hs96iew8f%*$v_ir=_)3eq-=y#jw#e0^x1nq%as^c^3#' 13 | 14 | # SECURITY WARNING: don't run with debug turned on in production! 
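# A minimal hardening sketch (not wired into this project): the hard-coded values in this
# settings module (SECRET_KEY above, DEBUG below, and the database/email passwords further
# down) would normally be read from environment variables in a deployment, for example:
#   SECRET_KEY = os.environ.get("DJANGO_SECRET_KEY", SECRET_KEY)
#   DEBUG = os.environ.get("DJANGO_DEBUG", "false").lower() == "true"
# `os` is already imported at the top of this file, so only the assignments change.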
15 | DEBUG = True 16 | 17 | ALLOWED_HOSTS = [] 18 | 19 | # Installed Apps 20 | DJANGO_APPS = [ 21 | "django.contrib.admin", 22 | "django.contrib.auth", 23 | "django.contrib.contenttypes", 24 | "django.contrib.sessions", 25 | "django.contrib.messages", 26 | "django.contrib.staticfiles", 27 | ] 28 | 29 | PROJECT_APPS = [ 30 | "accounts", 31 | "search_blogs" 32 | ] 33 | 34 | THIRD_PARTY_APPS = [ 35 | "rest_framework", 36 | "drf_yasg", 37 | "djoser", 38 | "corsheaders", 39 | "rest_framework_simplejwt", 40 | "rest_framework_simplejwt.token_blacklist", 41 | ] 42 | 43 | INSTALLED_APPS = DJANGO_APPS + PROJECT_APPS + THIRD_PARTY_APPS 44 | 45 | MIDDLEWARE = [ 46 | "django.middleware.security.SecurityMiddleware", 47 | "django.contrib.sessions.middleware.SessionMiddleware", 48 | "corsheaders.middleware.CorsMiddleware", # middleware for cors-headers 49 | "django.middleware.common.CommonMiddleware", 50 | "django.middleware.csrf.CsrfViewMiddleware", 51 | "django.contrib.auth.middleware.AuthenticationMiddleware", 52 | "django.contrib.messages.middleware.MessageMiddleware", 53 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 54 | ] 55 | 56 | ROOT_URLCONF = 'Backend.urls' 57 | 58 | TEMPLATES = [ 59 | { 60 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 61 | 'DIRS': [], 62 | 'APP_DIRS': True, 63 | 'OPTIONS': { 64 | 'context_processors': [ 65 | 'django.template.context_processors.debug', 66 | 'django.template.context_processors.request', 67 | 'django.contrib.auth.context_processors.auth', 68 | 'django.contrib.messages.context_processors.messages', 69 | ], 70 | }, 71 | }, 72 | ] 73 | 74 | WSGI_APPLICATION = 'Backend.wsgi.application' 75 | 76 | # Database 77 | # https://docs.djangoproject.com/en/3.1/ref/settings/#databases 78 | 79 | DATABASES = { 80 | 'default': { 81 | 'ENGINE': 'django.db.backends.mysql', 82 | 'PASSWORD': 'xxxx', 83 | 'NAME': 'xxxx', 84 | 'USER': 'root', 85 | } 86 | } 87 | 88 | # Password validation 89 | # https://docs.djangoproject.com/en/3.1/ref/settings/#auth-password-validators 90 | 91 | AUTH_PASSWORD_VALIDATORS = [ 92 | { 93 | 'NAME': 94 | 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 95 | }, 96 | { 97 | 'NAME': 98 | 'django.contrib.auth.password_validation.MinimumLengthValidator', 99 | }, 100 | { 101 | 'NAME': 102 | 'django.contrib.auth.password_validation.CommonPasswordValidator', 103 | }, 104 | { 105 | 'NAME': 106 | 'django.contrib.auth.password_validation.NumericPasswordValidator', 107 | }, 108 | ] 109 | 110 | # Internationalization 111 | # https://docs.djangoproject.com/en/3.1/topics/i18n/ 112 | 113 | LANGUAGE_CODE = 'en-us' 114 | 115 | TIME_ZONE = 'UTC' 116 | 117 | USE_I18N = True 118 | 119 | USE_L10N = True 120 | 121 | USE_TZ = True 122 | 123 | # Static files (CSS, JavaScript, Images) 124 | # https://docs.djangoproject.com/en/3.1/howto/static-files/ 125 | 126 | STATIC_URL = '/static/' 127 | STATICFILES_DIRS=[ 128 | os.path.join(BASE_DIR,'static') 129 | ] 130 | 131 | 132 | # EMAIL CONFIG 133 | EMAIL_BACKEND = "django.core.mail.backends.smtp.EmailBackend" 134 | EMAIL_HOST = "smtp.qq.com" 135 | EMAIL_PORT = 25 136 | EMAIL_HOST_USER = "justin3go@qq.com" 137 | EMAIL_HOST_PASSWORD = "xxxxxxxxx" 138 | EMAIL_USE_TLS = True 139 | DEFAULT_FROM_EMAIL = EMAIL_HOST_USER 140 | 141 | REST_FRAMEWORK = { 142 | # "DEFAULT_PERMISSION_CLASSES": ["rest_framework.permissions.IsAuthenticated"], 143 | "DEFAULT_PERMISSION_CLASSES": ["rest_framework.permissions.AllowAny"], 144 | "DEFAULT_AUTHENTICATION_CLASSES": ( 145 | 
"rest_framework_simplejwt.authentication.JWTAuthentication", 146 | ), 147 | } 148 | 149 | 150 | SIMPLE_JWT = { 151 | "AUTH_HEADER_TYPES": ("JWT",), 152 | "ACCESS_TOKEN_LIFETIME": timedelta(minutes=60), 153 | "REFRESH_TOKEN_LIFETIME": timedelta(days=1), 154 | "AUTH_TOKEN_CLASSES": ("rest_framework_simplejwt.tokens.AccessToken",), 155 | } 156 | DOMAIN = ('localhost:8080') 157 | SITE_NAME = ('XiuSearch') 158 | # DJOSER CONFIG 159 | DJOSER = { 160 | "LOGIN_FIELD": "email", 161 | "USER_CREATE_PASSWORD_RETYPE": True, 162 | "USERNAME_CHANGED_EMAIL_CONFIRMATION": True, 163 | "PASSWORD_CHANGED_EMAIL_CONFIRMATION": True, 164 | "SEND_CONFIRMATION_EMAIL": True, 165 | "SET_USERNAME_RETYPE": True, 166 | "SET_PASSWORD_RETYPE": True, 167 | "USERNAME_RESET_CONFIRM_URL": "password/reset/confirm?uid={uid}&token={token}", 168 | "PASSWORD_RESET_CONFIRM_URL": "email/reset/confirm?uid={uid}&token={token}", 169 | "ACTIVATION_URL": "activate?uid={uid}&token={token}", 170 | "SEND_ACTIVATION_EMAIL": True, 171 | "SOCIAL_AUTH_TOKEN_STRATEGY": "djoser.social.token.jwt.TokenStrategy", 172 | # TODO 173 | "SOCIAL_AUTH_ALLOWED_REDIRECT_URIS": [ 174 | "your redirect url", 175 | "your redirect url", 176 | ], 177 | "SERIALIZERS": { 178 | "user_create": "accounts.serializers.MyUserCreateSerializer", # custom serializer 179 | "user": "accounts.serializers.MyUserSerializer", 180 | "current_user": "accounts.serializers.MyUserSerializer", 181 | "user_delete": "djoser.serializers.UserSerializer", 182 | }, 183 | } 184 | 185 | # CORS HEADERS 186 | CORS_ORIGIN_ALLOW_ALL = True 187 | CORS_ALLOW_CREDENTIALS = True 188 | 189 | # 覆盖django的用户类 190 | AUTH_USER_MODEL = 'accounts.CustomUser' -------------------------------------------------------------------------------- /Backend/search_blogs/views.py: -------------------------------------------------------------------------------- 1 | from config import ES_HOST 2 | from datetime import datetime 3 | 4 | from elasticsearch import Elasticsearch 5 | 6 | from django.utils.datastructures import OrderedSet 7 | from drf_yasg import openapi 8 | from drf_yasg.utils import swagger_auto_schema 9 | from rest_framework.views import APIView 10 | from rest_framework.response import Response 11 | from rest_framework import status 12 | from rest_framework.permissions import AllowAny 13 | 14 | from search_blogs.models import BlogsIndex 15 | import sys 16 | sys.path.append("C:/My_app/code/咻Search") 17 | # Create your views here. 
18 | # class IndexView(View): 19 | # pass 20 | 21 | client = Elasticsearch(hosts=[ES_HOST]) 22 | 23 | 24 | class SearchView(APIView): 25 | ''' 26 | 返回搜索结果的接口 27 | ''' 28 | permission_classes = [AllowAny] 29 | 30 | q = openapi.Parameter('q', 31 | openapi.IN_QUERY, 32 | description="查询语句", 33 | type=openapi.TYPE_STRING) 34 | p = openapi.Parameter('p', 35 | openapi.IN_QUERY, 36 | description="页码", 37 | type=openapi.TYPE_STRING) 38 | 39 | @swagger_auto_schema(manual_parameters=[q, p], responses={200: {}}) 40 | def get(self, request): 41 | # 获取参数 42 | key_words = request.query_params.get("q", "") 43 | page = request.query_params.get("p", "1") 44 | # key_words = q 45 | # s_type = ["title", "keywords", "description", "content"] 46 | # page = p 47 | 48 | try: 49 | page = int(page) 50 | except: 51 | page = 1 52 | try: 53 | start_time = datetime.now() # 计时 54 | response = client.search(index="blogs", 55 | body={ 56 | "query": { 57 | "multi_match": { 58 | "query": key_words, 59 | "fields": 60 | ["title", "content"] 61 | } 62 | }, 63 | "from": (page - 1) * 10, 64 | "size": 10, 65 | "highlight": { 66 | "pre_tags": 67 | [""], 68 | "post_tags": [""], 69 | "fields": { 70 | "title": {}, 71 | "content": {}, 72 | }, 73 | "fragment_size": 74 | 40 75 | } 76 | }) 77 | end_time = datetime.now() 78 | search_cost_time = (end_time - start_time).total_seconds() 79 | 80 | total_nums = response["hits"]["total"]["value"] 81 | 82 | if (total_nums % 10) > 0: 83 | page_nums = int(total_nums / 10) + 1 84 | else: 85 | page_nums = int(total_nums / 10) 86 | 87 | hit_list = [] 88 | # 这里封装的时候也可以重新排序-->不过elastic里面应该有,后面可以看看 89 | for hit in response["hits"]["hits"]: 90 | hit_dict = {} 91 | # title 92 | if "title" in hit["highlight"]: 93 | hit_dict["title"] = "".join(hit["highlight"].get( 94 | "title", "")) 95 | else: 96 | hit_dict["title"] = hit["_source"].get("title", "") 97 | 98 | # content 99 | if "content" in hit["highlight"]: 100 | hit_dict["content"] = "".join(hit["highlight"].get( 101 | "content", "")) # 取前五百个词 102 | else: 103 | hit_dict["content"] = hit["_source"].get("content", "") 104 | 105 | hit_dict["page_url"] = hit["_source"].get("page_url", "") 106 | hit_dict["score"] = hit["_score"] 107 | 108 | hit_list.append(hit_dict) 109 | 110 | result = { 111 | "page": page, 112 | "searchCostTime": search_cost_time, 113 | "totalNums": total_nums, 114 | "pageNums": page_nums, 115 | "hitList": hit_list, 116 | } 117 | return Response(result, status=status.HTTP_200_OK) 118 | except Exception as e: 119 | return Response(e, status=status.HTTP_500_INTERNAL_SERVER_ERROR) 120 | 121 | 122 | class SearchSuggest(APIView): 123 | ''' 124 | 根据输入返回搜索建议的接口 125 | ''' 126 | permission_classes = [AllowAny] 127 | 128 | input = openapi.Parameter('input', 129 | openapi.IN_QUERY, 130 | description="输入文本", 131 | type=openapi.TYPE_STRING) 132 | 133 | @swagger_auto_schema(manual_parameters=[input], responses={200: []}) 134 | def get(self, request): 135 | input_text = request.query_params.get("input", "") 136 | suggest_list = [] 137 | if input_text: 138 | s_ = BlogsIndex.search() 139 | s = s_.suggest('my_suggest', 140 | input_text, 141 | completion={ 142 | "field": "suggest", 143 | "fuzzy": { 144 | "fuzziness": 2 145 | }, 146 | "size": 8 147 | }) 148 | suggestions = s.execute() 149 | name_set = OrderedSet() 150 | for match in suggestions.suggest.my_suggest[0].options[:10]: 151 | source = match._source 152 | name_set.add(source["title"]) 153 | for name in name_set: 154 | suggest_list.append(name) 155 | return Response(suggest_list, 
status=status.HTTP_200_OK) 156 | -------------------------------------------------------------------------------- /frontend/src/views/Register.vue: -------------------------------------------------------------------------------- 1 | 67 | 68 | 202 | 253 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/SearchBoxDetail.vue: -------------------------------------------------------------------------------- 1 | 46 | 47 | 189 | 190 | 256 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 木兰宽松许可证, 第2版 2 | 3 | 木兰宽松许可证, 第2版 4 | 2020年1月 http://license.coscl.org.cn/MulanPSL2 5 | 6 | 7 | 您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: 8 | 9 | 0. 定义 10 | 11 | “软件”是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 12 | 13 | “贡献”是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 14 | 15 | “贡献者”是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 16 | 17 | “法人实体”是指提交贡献的机构及其“关联实体”。 18 | 19 | “关联实体”是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 20 | 21 | 1. 授予版权许可 22 | 23 | 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 24 | 25 | 2. 授予专利许可 26 | 27 | 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 28 | 29 | 3. 无商标许可 30 | 31 | “本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 32 | 33 | 4. 分发限制 34 | 35 | 您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 36 | 37 | 5. 免责声明与责任限制 38 | 39 | “软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 40 | 41 | 6. 语言 42 | “本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 43 | 44 | 条款结束 45 | 46 | 如何将木兰宽松许可证,第2版,应用到您的软件 47 | 48 | 如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: 49 | 50 | 1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; 51 | 52 | 2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; 53 | 54 | 3, 请将如下声明文本放入每个源文件的头部注释中。 55 | 56 | Copyright (c) [Year] [name of copyright holder] 57 | [Software Name] is licensed under Mulan PSL v2. 58 | You can use this software according to the terms and conditions of the Mulan PSL v2. 59 | You may obtain a copy of Mulan PSL v2 at: 60 | http://license.coscl.org.cn/MulanPSL2 61 | THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 62 | See the Mulan PSL v2 for more details. 63 | 64 | 65 | Mulan Permissive Software License,Version 2 66 | 67 | Mulan Permissive Software License,Version 2 (Mulan PSL v2) 68 | January 2020 http://license.coscl.org.cn/MulanPSL2 69 | 70 | Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: 71 | 72 | 0. Definition 73 | 74 | Software means the program and related documents which are licensed under this License and comprise all Contribution(s). 75 | 76 | Contribution means the copyrightable work licensed by a particular Contributor under this License. 77 | 78 | Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. 
79 | 80 | Legal Entity means the entity making a Contribution and all its Affiliates. 81 | 82 | Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. 83 | 84 | 1. Grant of Copyright License 85 | 86 | Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. 87 | 88 | 2. Grant of Patent License 89 | 90 | Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. 91 | 92 | 3. No Trademark License 93 | 94 | No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in Section 4. 95 | 96 | 4. Distribution Restriction 97 | 98 | You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. 99 | 100 | 5. Disclaimer of Warranty and Limitation of Liability 101 | 102 | THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 103 | 104 | 6. Language 105 | 106 | THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. 
107 | 108 | END OF THE TERMS AND CONDITIONS 109 | 110 | How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software 111 | 112 | To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: 113 | 114 | i Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; 115 | 116 | ii Create a file named “LICENSE” which contains the whole context of this License in the first directory of your software package; 117 | 118 | iii Attach the statement to the appropriate annotated syntax at the beginning of each source file. 119 | 120 | 121 | Copyright (c) [Year] [name of copyright holder] 122 | [Software Name] is licensed under Mulan PSL v2. 123 | You can use this software according to the terms and conditions of the Mulan PSL v2. 124 | You may obtain a copy of Mulan PSL v2 at: 125 | http://license.coscl.org.cn/MulanPSL2 126 | THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 127 | See the Mulan PSL v2 for more details. 128 | -------------------------------------------------------------------------------- /Engine/html_extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | import traceback 3 | 4 | import cchardet 5 | import lxml 6 | import lxml.html 7 | from lxml.html import HtmlComment 8 | 9 | REGEXES = { 10 | 'okMaybeItsACandidateRe': re.compile( 11 | 'and|article|artical|body|column|main|shadow', re.I), 12 | 'positiveRe': re.compile( 13 | ('article|arti|body|content|entry|hentry|main|page|' 14 | 'artical|zoom|arti|context|message|editor|' 15 | 'pagination|post|txt|text|blog|story'), re.I), 16 | 'negativeRe': re.compile( 17 | ('copyright|combx|comment|com-|contact|foot|footer|footnote|decl|copy|' 18 | 'notice|' 19 | 'masthead|media|meta|outbrain|promo|related|scroll|link|pagebottom|bottom|' 20 | 'other|shoutbox|sidebar|sponsor|shopping|tags|tool|widget'), re.I), 21 | } 22 | 23 | 24 | 25 | class MainContent: 26 | def __init__(self,): 27 | self.non_content_tag = set([ 28 | 'head', 29 | 'meta', 30 | 'script', 31 | 'style', 32 | 'object', 'embed', 33 | 'iframe', 34 | 'marquee', 35 | 'select', 36 | ]) 37 | self.title = '' 38 | self.p_space = re.compile(r'\s') 39 | self.p_html = re.compile(r'', re.IGNORECASE|re.DOTALL) 40 | self.p_content_stop = re.compile(r'正文.*结束|正文下|相关阅读|声明') 41 | self.p_clean_tree = re.compile(r'author|post-add|copyright') 42 | 43 | def get_title(self, doc): 44 | title = '' 45 | title_el = doc.xpath('//title') 46 | if title_el: 47 | title = title_el[0].text_content().strip() 48 | if len(title) < 7: 49 | tt = doc.xpath('//meta[@name="title"]') 50 | if tt: 51 | title = tt[0].get('content', '') 52 | if len(title) < 7: 53 | tt = doc.xpath('//*[contains(@id, "title") or contains(@class, "title")]') 54 | if not tt: 55 | tt = doc.xpath('//*[contains(@id, "font01") or contains(@class, "font01")]') 56 | for t in tt: 57 | ti = t.text_content().strip() 58 | if ti in title and len(ti)*2 > len(title): 59 | title = ti 60 | break 61 | if len(ti) > 20: continue 62 | if len(ti) > len(title) or len(ti) > 7: 63 | title = ti 64 | return title 65 | 66 | def shorten_title(self, title): 67 | spliters = [' - ', '–', '—', '-', '|', '::'] 68 | for s in spliters: 69 | if s not in title: 
70 | continue 71 | tts = title.split(s) 72 | if len(tts) < 2: 73 | continue 74 | title = tts[0] 75 | break 76 | return title 77 | 78 | def calc_node_weight(self, node): 79 | weight = 1 80 | attr = '%s %s %s' % ( 81 | node.get('class', ''), 82 | node.get('id', ''), 83 | node.get('style', '') 84 | ) 85 | if attr: 86 | mm = REGEXES['negativeRe'].findall(attr) 87 | weight -= 2 * len(mm) 88 | mm = REGEXES['positiveRe'].findall(attr) 89 | weight += 4 * len(mm) 90 | if node.tag in ['div', 'p', 'table']: 91 | weight += 2 92 | return weight 93 | 94 | def get_main_block(self, url, html, short_title=True): 95 | ''' return (title, etree_of_main_content_block) 96 | ''' 97 | if isinstance(html, bytes): 98 | encoding = cchardet.detect(html)['encoding'] 99 | if encoding is None: 100 | return None, None 101 | html = html.decode(encoding, 'ignore') 102 | try: 103 | doc = lxml.html.fromstring(html) 104 | doc.make_links_absolute(base_url=url) 105 | except : 106 | traceback.print_exc() 107 | return None, None 108 | self.title = self.get_title(doc) 109 | if short_title: 110 | self.title = self.shorten_title(self.title) 111 | body = doc.xpath('//body') 112 | if not body: 113 | return self.title, None 114 | candidates = [] 115 | nodes = body[0].getchildren() 116 | while nodes: 117 | node = nodes.pop(0) 118 | children = node.getchildren() 119 | tlen = 0 120 | for child in children: 121 | if isinstance(child, HtmlComment): 122 | continue 123 | if child.tag in self.non_content_tag: 124 | continue 125 | if child.tag == 'a': 126 | continue 127 | if child.tag == 'textarea': 128 | # FIXME: this tag is only part of content? 129 | continue 130 | attr = '%s%s%s' % (child.get('class', ''), 131 | child.get('id', ''), 132 | child.get('style')) 133 | if 'display' in attr and 'none' in attr: 134 | continue 135 | nodes.append(child) 136 | if child.tag == 'p': 137 | weight = 3 138 | else: 139 | weight = 1 140 | text = '' if not child.text else child.text.strip() 141 | tail = '' if not child.tail else child.tail.strip() 142 | tlen += (len(text) + len(tail)) * weight 143 | if tlen < 10: 144 | continue 145 | weight = self.calc_node_weight(node) 146 | candidates.append((node, tlen*weight)) 147 | if not candidates: 148 | return self.title, None 149 | candidates.sort(key=lambda a: a[1], reverse=True) 150 | good = candidates[0][0] 151 | if good.tag in ['p', 'pre', 'code', 'blockquote']: 152 | for i in range(5): 153 | good = good.getparent() 154 | if good.tag == 'div': 155 | break 156 | good = self.clean_etree(good, url) 157 | return self.title, good 158 | 159 | def clean_etree(self, tree, url=''): 160 | to_drop = [] 161 | drop_left = False 162 | for node in tree.iterdescendants(): 163 | if drop_left: 164 | to_drop.append(node) 165 | continue 166 | if isinstance(node, HtmlComment): 167 | to_drop.append(node) 168 | if self.p_content_stop.search(node.text): 169 | drop_left = True 170 | continue 171 | if node.tag in self.non_content_tag: 172 | to_drop.append(node) 173 | continue 174 | attr = '%s %s' % ( 175 | node.get('class', ''), 176 | node.get('id', '') 177 | ) 178 | if self.p_clean_tree.search(attr): 179 | to_drop.append(node) 180 | continue 181 | aa = node.xpath('.//a') 182 | if aa: 183 | text_node = len(self.p_space.sub('', node.text_content())) 184 | text_aa = 0 185 | for a in aa: 186 | alen = len(self.p_space.sub('', a.text_content())) 187 | if alen > 5: 188 | text_aa += alen 189 | if text_aa > text_node * 0.4: 190 | to_drop.append(node) 191 | for node in to_drop: 192 | try: 193 | node.drop_tree() 194 | except: 195 | pass 196 | 
return tree 197 | 198 | def get_text(self, doc): 199 | lxml.etree.strip_elements(doc, 'script') 200 | lxml.etree.strip_elements(doc, 'style') 201 | for ch in doc.iterdescendants(): 202 | if not isinstance(ch.tag, str): 203 | continue 204 | if ch.tag in ['div', 'h1', 'h2', 'h3', 'p', 'br', 'table', 'tr', 'dl']: 205 | if not ch.tail: 206 | ch.tail = '\n' 207 | else: 208 | ch.tail = '\n' + ch.tail.strip() + '\n' 209 | if ch.tag in ['th', 'td']: 210 | if not ch.text: 211 | ch.text = ' ' 212 | else: 213 | ch.text += ' ' 214 | # if ch.tail: 215 | # ch.tail = ch.tail.strip() 216 | lines = doc.text_content().split('\n') 217 | content = [] 218 | for l in lines: 219 | l = l.strip() 220 | if not l: 221 | continue 222 | content.append(l) 223 | return '\n'.join(content) 224 | 225 | def extract(self, url, html): 226 | '''return (title, content) 227 | ''' 228 | title, node = self.get_main_block(url, html) 229 | if node is None: 230 | print('\tno main block got !!!!!', url) 231 | return title, '', '' 232 | content = self.get_text(node) 233 | return title, content -------------------------------------------------------------------------------- /frontend/src/assets/userprofile.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/list.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import numpy as np 3 | from glob import glob 4 | from loguru import logger 5 | from os.path import join, dirname, abspath 6 | from sklearn.metrics import classification_report 7 | from sklearn.model_selection import train_test_split, GridSearchCV 8 | from sklearn.svm import SVC 9 | from sklearn.preprocessing import StandardScaler 10 | from gerapy_auto_extractor.extractors.title import TitleExtractor 11 | from gerapy_auto_extractor.patterns.datetime import METAS_MATCH as DATETIME_METAS 12 | from gerapy_auto_extractor.schemas.element import Element 13 | from gerapy_auto_extractor.utils.element import number_of_p_descendants, \ 14 | number_of_a_descendants, number_of_punctuation, density_of_punctuation, density_of_text, number_of_clusters, \ 15 | file2element, number_of_a_char, number_of_char, number_of_p_children 16 | from gerapy_auto_extractor.utils.preprocess import preprocess4list_classifier 17 | from gerapy_auto_extractor.utils.similarity import similarity1 18 | from gerapy_auto_extractor.classifiers.base import BaseClassifier 19 | 20 | DATASETS_DIR = join(dirname(dirname(dirname(abspath(__file__)))), 'datasets') 21 | DATASETS_LIST_DIR = join(DATASETS_DIR, 'list') 22 | DATASETS_DETAIL_DIR = join(DATASETS_DIR, 'detail') 23 | 24 | MODELS_DIR = join(dirname(abspath(__file__)), 'models') 25 | 26 | 27 | class ListClassifier(BaseClassifier): 28 | 29 | def __init__(self, model_path=None, scaler_path=None): 30 | """ 31 | init features and extractors 32 | :param model_path: classifier model file 33 | """ 34 | self.model_path = model_path if model_path else join(MODELS_DIR, 'list_model.pkl') 35 | self.scaler_path = scaler_path if scaler_path else join(MODELS_DIR, 'list_scaler.pkl') 36 | self.title_extractor = TitleExtractor() 37 | self.feature_funcs = { 38 | 'number_of_a_char': number_of_a_char, 39 | 'number_of_a_char_log10': self._number_of_a_char_log10, 40 | 'number_of_char': number_of_char, 41 | 'number_of_char_log10': self._number_of_char_log10, 42 | 'rate_of_a_char': self._rate_of_a_char, 43 | 
'number_of_p_descendants': number_of_p_descendants, 44 | 'number_of_a_descendants': number_of_a_descendants, 45 | 'number_of_punctuation': number_of_punctuation, 46 | 'density_of_punctuation': density_of_punctuation, 47 | 'number_of_clusters': self._number_of_clusters, 48 | 'density_of_text': density_of_text, 49 | 'max_density_of_text': self._max_density_of_text, 50 | 'max_number_of_p_children': self._max_number_of_p_children, 51 | 'has_datetime_meta': self._has_datetime_mata, 52 | 'similarity_of_title': self._similarity_of_title, 53 | } 54 | self.feature_names = self.feature_funcs.keys() 55 | 56 | def _number_of_clusters(self, element: Element): 57 | """ 58 | get number of clusters like list 59 | :param element: 60 | :return: 61 | """ 62 | tags = ['div', 'li', 'ul'] 63 | return number_of_clusters(element, tags=tags) 64 | 65 | def _similarity_of_title(self, element: Element): 66 | """ 67 | get similarity of and (<h> or <meta>) 68 | :param element: 69 | :return: 70 | """ 71 | _title_extract_by_title = self.title_extractor.extract_by_title(element) 72 | _title_extract_by_meta = self.title_extractor.extract_by_meta(element) 73 | _title_extract_by_h = self.title_extractor.extract_by_h(element) 74 | 75 | _title_target = None 76 | if _title_extract_by_meta: 77 | _title_target = _title_extract_by_meta 78 | elif _title_extract_by_h: 79 | _title_target = _title_extract_by_h 80 | 81 | if not _title_target: 82 | return 2 83 | if not _title_extract_by_title: 84 | return 3 85 | return similarity1(_title_target, _title_extract_by_title) 86 | 87 | def _has_datetime_mata(self, element: Element): 88 | """ 89 | has datetime meta 90 | :param element: 91 | :return: 92 | """ 93 | for xpath in DATETIME_METAS: 94 | datetime = element.xpath(xpath) 95 | if datetime: 96 | return True 97 | return False 98 | 99 | def _max_number_of_p_children(self, element: Element): 100 | """ 101 | get max number of p children an element contains 102 | :param element: 103 | :return: 104 | """ 105 | _number_of_p_children_list = [] 106 | for descendant in element.descendants: 107 | _number_of_p_children = number_of_p_children(descendant) 108 | _number_of_p_children_list.append(_number_of_p_children) 109 | return max(_number_of_p_children_list) 110 | 111 | def _max_density_of_text(self, element: Element): 112 | """ 113 | get max density_of_text 114 | :param element: 115 | :return: 116 | """ 117 | _density_of_text_list = [] 118 | for descendant in element.descendants: 119 | _density_of_text = density_of_text(descendant) 120 | _density_of_text_list.append(_density_of_text) 121 | return np.max(_density_of_text_list) 122 | 123 | def _rate_of_a_char(self, element: Element): 124 | """ 125 | rate of a 126 | :param element: 127 | :return: 128 | """ 129 | _number_of_a_char = number_of_a_char(element) 130 | _number_of_char = number_of_char(element) 131 | if _number_of_char == 0: 132 | return 0 133 | return _number_of_a_char / _number_of_char 134 | 135 | def _number_of_char_log10(self, element: Element): 136 | """ 137 | log10 of number of char 138 | :param element: 139 | :return: 140 | """ 141 | if element is None: 142 | return 0 143 | return np.log10(number_of_char(element) + 1) 144 | 145 | def _number_of_a_char_log10(self, element: Element): 146 | """ 147 | log10 of number of a char 148 | :param element: 149 | :return: 150 | """ 151 | if element is None: 152 | return 0 153 | return np.log10(number_of_a_char(element) + 1) 154 | 155 | def features_to_list(self, features: dict): 156 | """ 157 | convert features to list 158 | :param features: 159 | 
:param label: 160 | :return: 161 | """ 162 | return [features.get(feature_name) for feature_name in self.feature_names] 163 | 164 | def features(self, element: Element): 165 | """ 166 | build feature map using element 167 | :param element: 168 | :return: 169 | """ 170 | features = {} 171 | for feature_name, feature_func in self.feature_funcs.items(): 172 | features[feature_name] = feature_func(element) 173 | return features 174 | 175 | def process(self, element: Element): 176 | """ 177 | get probability of list 178 | :param element: 179 | :return: 180 | """ 181 | preprocess4list_classifier(element) 182 | x = [self.features_to_list(self.features(element))] 183 | # scale 184 | ss = joblib.load(self.scaler_path) 185 | x = ss.transform(x) 186 | # load model 187 | clf = joblib.load(self.model_path) 188 | # predict 189 | result = clf.predict_proba(x) 190 | if result.any() and len(result) and len(result[0]): 191 | return result[0][1] 192 | return 0 193 | 194 | def train(self): 195 | """ 196 | build dataset 197 | :return: 198 | """ 199 | list_file_paths = list(glob(f'{DATASETS_LIST_DIR}/*.html')) 200 | detail_file_paths = list(glob(f'{DATASETS_DETAIL_DIR}/*.html')) 201 | 202 | x_data, y_data = [], [] 203 | 204 | for index, list_file_path in enumerate(list_file_paths): 205 | logger.log('inspect', f'list_file_path {list_file_path}') 206 | element = file2element(list_file_path) 207 | if element is None: 208 | continue 209 | preprocess4list_classifier(element) 210 | x = self.features_to_list(self.features(element)) 211 | x_data.append(x) 212 | y_data.append(1) 213 | 214 | for index, detail_file_path in enumerate(detail_file_paths): 215 | logger.log('inspect', f'detail_file_path {detail_file_path}') 216 | element = file2element(detail_file_path) 217 | if element is None: 218 | continue 219 | preprocess4list_classifier(element) 220 | x = self.features_to_list(self.features(element)) 221 | x_data.append(x) 222 | y_data.append(0) 223 | 224 | # preprocess data 225 | ss = StandardScaler() 226 | x_data = ss.fit_transform(x_data) 227 | joblib.dump(ss, self.scaler_path) 228 | x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=5) 229 | 230 | # set up grid search 231 | c_range = np.logspace(-5, 20, 5, base=2) 232 | gamma_range = np.logspace(-9, 10, 5, base=2) 233 | param_grid = [ 234 | {'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}, 235 | {'kernel': ['linear'], 'C': c_range}, 236 | ] 237 | grid = GridSearchCV(SVC(probability=True), param_grid, cv=5, verbose=10, n_jobs=-1) 238 | clf = grid.fit(x_train, y_train) 239 | y_true, y_pred = y_test, clf.predict(x_test) 240 | logger.log('inspect', f'\n{classification_report(y_true, y_pred)}') 241 | score = grid.score(x_test, y_test) 242 | logger.log('inspect', f'test accuracy {score}') 243 | # save model 244 | joblib.dump(grid.best_estimator_, self.model_path) 245 | 246 | 247 | list_classifier = ListClassifier() 248 | 249 | 250 | def probability_of_list(html, **kwargs): 251 | """ 252 | get probability of list page 253 | :param html: 254 | :param kwargs: other kwargs 255 | :return: 256 | """ 257 | return list_classifier.classify(html, **kwargs) 258 | 259 | 260 | def is_list(html, threshold=0.5, **kwargs): 261 | """ 262 | judge if this page is list page 263 | :param html: source of html 264 | :param threshold: 265 | :param kwargs: 266 | :return: 267 | """ 268 | _probability_of_list = probability_of_list(html, **kwargs) 269 | if _probability_of_list > threshold: 270 | return True 271 | return False 272 | 
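# --- Illustrative usage sketch (editor-added for clarity; not part of the original file) ---
# The module-level helpers above wrap the pretrained SVM in models/list_model.pkl and its
# scaler, so classifying a page only needs its raw HTML. 'page.html' below is a hypothetical
# path, and classify() is assumed to be inherited from BaseClassifier (not shown here).
#
#     from gerapy_auto_extractor.classifiers.list import probability_of_list, is_list
#
#     with open('page.html', encoding='utf-8') as f:
#         html = f.read()
#     print(probability_of_list(html))       # probability that this is a list/index page
#     print(is_list(html, threshold=0.5))    # True when that probability exceeds the threshold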
-------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/list.py: -------------------------------------------------------------------------------- 1 | import math 2 | import operator 3 | from loguru import logger 4 | import numpy as np 5 | from collections import defaultdict 6 | from urllib.parse import urljoin 7 | from gerapy_auto_extractor.utils.cluster import cluster_dict 8 | from gerapy_auto_extractor.utils.preprocess import preprocess4list_extractor 9 | from gerapy_auto_extractor.extractors.base import BaseExtractor 10 | from gerapy_auto_extractor.utils.element import descendants_of_body 11 | from gerapy_auto_extractor.schemas.element import Element 12 | 13 | LIST_MIN_NUMBER = 5 14 | LIST_MIN_LENGTH = 8 15 | LIST_MAX_LENGTH = 44 16 | SIMILARITY_THRESHOLD = 0.8 17 | 18 | 19 | class ListExtractor(BaseExtractor): 20 | """ 21 | extract list from index page 22 | """ 23 | 24 | def __init__(self, min_number=LIST_MIN_NUMBER, min_length=LIST_MIN_LENGTH, max_length=LIST_MAX_LENGTH, 25 | similarity_threshold=SIMILARITY_THRESHOLD): 26 | """ 27 | init list extractor 28 | """ 29 | super(ListExtractor, self).__init__() 30 | self.min_number = min_number 31 | self.min_length = min_length 32 | self.max_length = max_length 33 | self.avg_length = (self.min_length + self.max_length) / 2 34 | self.similarity_threshold = similarity_threshold 35 | 36 | def _probability_of_title_with_length(self, length): 37 | """ 38 | get the probability of title according to length 39 | import matplotlib.pyplot as plt 40 | x = np.asarray(range(5, 40)) 41 | y = list_extractor.probability_of_title_with_length(x) 42 | plt.plot(x, y, 'g', label='m=0, sig=2') 43 | plt.show() 44 | :param length: 45 | :return: 46 | """ 47 | sigma = 6 48 | return np.exp(-1 * ((length - self.avg_length) ** 2) / (2 * (sigma ** 2))) / (math.sqrt(2 * np.pi) * sigma) 49 | 50 | def _build_clusters(self, element): 51 | """ 52 | build candidate clusters according to element 53 | :return: 54 | """ 55 | descendants_tree = defaultdict(list) 56 | descendants = descendants_of_body(element) 57 | for descendant in descendants: 58 | # if one element does not have enough siblings, it can not become a child of candidate element 59 | if descendant.number_of_siblings + 1 < self.min_number: 60 | continue 61 | # if min length is larger than specified max length, it can not become a child of candidate element 62 | if descendant.a_descendants_group_text_min_length > self.max_length: 63 | continue 64 | # if max length is smaller than specified min length, it can not become a child of candidate element 65 | if descendant.a_descendants_group_text_max_length < self.min_length: 66 | continue 67 | # descendant element must have same siblings which their similarity should not below similarity_threshold 68 | if descendant.similarity_with_siblings < self.similarity_threshold: 69 | continue 70 | descendants_tree[descendant.parent_selector].append(descendant) 71 | descendants_tree = dict(descendants_tree) 72 | 73 | # cut tree, remove parent block 74 | selectors = sorted(list(descendants_tree.keys())) 75 | last_selector = None 76 | for selector in selectors[::-1]: 77 | # if later selector 78 | if last_selector and selector and last_selector.startswith(selector): 79 | del descendants_tree[selector] 80 | last_selector = selector 81 | clusters = cluster_dict(descendants_tree) 82 | return clusters 83 | 84 | def _evaluate_cluster(self, cluster): 85 | """ 86 | calculate score of cluster using similarity, numbers, or other 
info 87 | :param cluster: 88 | :return: 89 | """ 90 | score = dict() 91 | 92 | # calculate avg_similarity_with_siblings 93 | score['avg_similarity_with_siblings'] = np.mean( 94 | [element.similarity_with_siblings for element in cluster]) 95 | 96 | # calculate number of elements 97 | score['number_of_elements'] = len(cluster) 98 | 99 | # calculate probability of it contains title 100 | # score['probability_of_title_with_length'] = np.mean([ 101 | # self._probability_of_title_with_length(len(a_descendant.text)) \ 102 | # for a_descendant in itertools.chain(*[element.a_descendants for element in cluster]) \ 103 | # ]) 104 | 105 | # TODO: add more quota to select best cluster 106 | score['clusters_score'] = \ 107 | score['avg_similarity_with_siblings'] \ 108 | * np.log10(score['number_of_elements'] + 1) \ 109 | # * clusters_score[cluster_id]['probability_of_title_with_length'] 110 | return score 111 | 112 | def _extend_cluster(self, cluster): 113 | """ 114 | extend cluster's elements by adding missed siblings 115 | :param cluster: 116 | :return: 117 | """ 118 | result = [element.selector for element in cluster] 119 | for element in cluster: 120 | path_raw = element.path_raw 121 | siblings = list(element.siblings) 122 | for sibling in siblings: 123 | # skip invalid element 124 | if not isinstance(sibling, Element): 125 | continue 126 | sibling_selector = sibling.selector 127 | sibling_path_raw = sibling.path_raw 128 | if sibling_path_raw != path_raw: 129 | continue 130 | # add missed sibling 131 | if sibling_selector not in result: 132 | cluster.append(sibling) 133 | result.append(sibling_selector) 134 | 135 | cluster = sorted(cluster, key=lambda x: x.nth) 136 | logger.log('inspect', f'cluster after extend {cluster}') 137 | return cluster 138 | 139 | def _best_cluster(self, clusters): 140 | """ 141 | use clustering algorithm to choose best cluster from candidate clusters 142 | :param clusters: 143 | :return: 144 | """ 145 | if not clusters: 146 | logger.log('inspect', 'there is no cluster, just return empty result') 147 | return [] 148 | if len(clusters) == 1: 149 | logger.log('inspect', 'there is only one cluster, just return first cluster') 150 | return clusters[0] 151 | # choose best cluster using score 152 | clusters_score = defaultdict(dict) 153 | clusters_score_arg_max = 0 154 | clusters_score_max = -1 155 | for cluster_id, cluster in clusters.items(): 156 | # calculate avg_similarity_with_siblings 157 | clusters_score[cluster_id] = self._evaluate_cluster(cluster) 158 | # get max score arg index 159 | if clusters_score[cluster_id]['clusters_score'] > clusters_score_max: 160 | clusters_score_max = clusters_score[cluster_id]['clusters_score'] 161 | clusters_score_arg_max = cluster_id 162 | logger.log('inspect', f'clusters_score {clusters_score}') 163 | best_cluster = clusters[clusters_score_arg_max] 164 | return best_cluster 165 | 166 | def _extract_cluster(self, cluster): 167 | """ 168 | extract title and href from best cluster 169 | :param cluster: 170 | :return: 171 | """ 172 | if not cluster: 173 | return None 174 | # get best tag path of title 175 | probabilities_of_title = defaultdict(list) 176 | for element in cluster: 177 | descendants = element.a_descendants 178 | for descendant in descendants: 179 | path = descendant.path 180 | descendant_text = descendant.text 181 | probability_of_title_with_length = self._probability_of_title_with_length(len(descendant_text)) 182 | # probability_of_title_with_descendants = self.probability_of_title_with_descendants(descendant) 183 | # TODO: add
more quota to calculate probability_of_title 184 | probability_of_title = probability_of_title_with_length 185 | probabilities_of_title[path].append(probability_of_title) 186 | 187 | # get most probable tag_path 188 | probabilities_of_title_avg = {k: np.mean(v) for k, v in probabilities_of_title.items()} 189 | if not probabilities_of_title_avg: 190 | return None 191 | best_path = max(probabilities_of_title_avg.items(), key=operator.itemgetter(1))[0] 192 | logger.log('inspect', f'best tag path {best_path}') 193 | 194 | # extract according to best tag path 195 | result = [] 196 | for element in cluster: 197 | descendants = element.a_descendants 198 | for descendant in descendants: 199 | path = descendant.path 200 | if path != best_path: 201 | continue 202 | title = descendant.text 203 | url = descendant.attrib.get('href') 204 | if not url: 205 | continue 206 | if url.startswith('//'): 207 | url = 'http:' + url 208 | base_url = self.kwargs.get('base_url') 209 | if base_url: 210 | url = urljoin(base_url, url) 211 | result.append({ 212 | 'title': title, 213 | 'url': url 214 | }) 215 | return result 216 | 217 | def process(self, element: Element): 218 | """ 219 | extract content from html 220 | :param element: 221 | :return: 222 | """ 223 | # preprocess 224 | preprocess4list_extractor(element) 225 | 226 | # build clusters 227 | clusters = self._build_clusters(element) 228 | logger.log('inspect', f'after build clusters {clusters}') 229 | 230 | # choose best cluster 231 | best_cluster = self._best_cluster(clusters) 232 | logger.log('inspect', f'best cluster {best_cluster}') 233 | 234 | extended_cluster = self._extend_cluster(best_cluster) 235 | logger.log('inspect', f'extended cluster {extended_cluster}') 236 | 237 | # extract result from best cluster 238 | return self._extract_cluster(best_cluster) 239 | 240 | 241 | list_extractor = ListExtractor() 242 | 243 | 244 | def extract_list(html, **kwargs): 245 | """ 246 | extract list from index html 247 | :param: base_url 248 | :return: 249 | """ 250 | return list_extractor.extract(html, **kwargs) 251 | -------------------------------------------------------------------------------- /frontend/src/assets/register.svg: -------------------------------------------------------------------------------- 1 | <svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" width="861.34285" height="648.67936" viewBox="0 0 861.34285 648.67936" xmlns:xlink="http://www.w3.org/1999/xlink"><path 
d="M487.96668,298.31443c37.10109-23.94307,88.20046-14.64015,124.34793,10.72884a196.9347,196.9347,0,0,1,32.30722,28.82658c5.64318,6.18962,10.99283,12.68743,16.10981,19.409,1.26068,1.64909,2.51531,3.31471,3.74573,4.99679q1.26287,1.70574,2.50473,3.43526c10.37769,14.42492,19.89188,29.60015,29.20575,44.63751,23.25089,37.5382,47.10783,76.19386,82.53441,102.54908,7.29895,5.42842,17.482,10.31693,24.95986,5.13332,6.48912-4.50162,6.7671-13.96718,5.29023-21.72428-5.49032-28.74-25.29548-52.55662-46.33553-72.88759-21.03978-20.33084-44.332-38.97058-60.22963-63.5374-15.8979-24.56687-23.16127-57.36431-9.23631-83.09406,12.00085-22.17255,37.8208-34.60974,63.02364-35.189,25.20065-.57377,49.68543,9.22225,71.00676,22.6735,21.31941,13.45676,40.066,30.53133,59.63091,46.42986,11.79386,9.5917,28.26094,19.10907,41.11381,10.98808,10.85632-6.863,11.70313-23.33834,5.233-34.43131s-18.01809-18.0644-29.08878-24.56575q-51.5577-30.28842-103.11733-60.5713c-10.54549-6.18859-21.32085-12.575-29.10512-22.01161-7.78648-9.43112-12.10823-22.68629-7.81685-34.1389,4.23393-11.2862,16.05933-18.45281,28.02325-19.94843,11.95605-1.49217,24.0148,1.823,35.19936,6.31353,41.91245,16.83224,140.602,126.95133,166.90453,145.77839,16.90328,12.0919,34.65985,27.40012,36.36931,48.11155,2.1296,25.78425-23.526,47.18186-49.27763,49.68313-25.74808,2.50866-50.70386-8.68727-73.71019-20.52382s-46.48564-24.88724-72.30913-26.53029c-25.823-1.64282-54.79532,12.16074-60.91654,37.29958-5.32348,21.87428,8.20967,44.44617,25.83789,58.44235,13.79843,10.95842,29.99225,18.25607,45.63451,26.46205,4.3438,2.27173,8.64194,4.62083,12.8341,7.132a180.25086,180.25086,0,0,1,29.93459,22.5045,183.67679,183.67679,0,0,1,25.77414,29.64094,179.18159,179.18159,0,0,1,15.62068,27.17341q1.00641,2.1759,1.95446,4.4124c.59864,1.42551,1.17874,2.86947,1.72867,4.32782,7.17972,18.91307,9.97662,40.10751.8045,57.76316-9.14885,17.60784-28.96516,28.03412-48.75259,29.48a89.80071,89.80071,0,0,1-33.791-4.46408,137.2297,137.2297,0,0,1-23.12969-9.94084C794.52567,600.7,767.40355,571.54178,742.6339,541.566c-24.7637-29.9737-48.08255-61.5336-77.48153-86.97744-29.39734-25.44945-66.23669-44.69009-105.12049-44.39784-38.88188.2867-78.96477,23.97778-90.00792,61.25757-16.09646-27.73614-26.98109-59.35475-25.544-91.388S461.024,315.70525,487.96668,298.31443Z" transform="translate(-169.32858 -125.66032)" fill="#3f3d56"/><circle cx="469.48709" cy="272.01678" r="4.2718" fill="#251aff"/><circle cx="308.49926" cy="275.79175" r="6.60457" fill="#ff6584"/><circle cx="785.86508" cy="221.51745" r="2.44032" fill="#ff6584"/><circle cx="736.48895" cy="90.36211" r="2.44032" fill="#ff6584"/><path d="M843.02633,456.74236a26.07891,26.07891,0,0,1,2.78-5.68327c4.3438,2.27173,8.64194,4.62083,12.8341,7.132a180.25086,180.25086,0,0,1,29.93459,22.5045,26.02472,26.02472,0,0,1-45.54867-23.95319Z" transform="translate(-169.32858 -125.66032)" fill="#ff6584"/><circle cx="362.47378" cy="249.1678" r="2.49578" fill="#f0f0f0"/><circle cx="668.2498" cy="64.51306" r="2.49578" fill="#f0f0f0"/><circle cx="645.77831" cy="20.05675" r="2.49578" fill="#f0f0f0"/><circle cx="822.50254" cy="185.77006" r="2.49578" fill="#f0f0f0"/><circle cx="691.4035" cy="189.35566" r="2.49578" fill="#f0f0f0"/><circle cx="621.83508" cy="427.40632" r="2.49578" fill="#f0f0f0"/><circle cx="543.40782" cy="193.29038" r="2.49578" fill="#f0f0f0"/><circle cx="539.90427" cy="146.09722" r="2.49578" fill="#f0f0f0"/><path 
d="M866.14147,532.28381c-3.70615-.15179-7.30281-.16975-10.76983-.04989-21.35628.73815-35.58147,6.85689-39.02807,16.788s3.92974,23.54668,20.23724,37.35613q2.419,2.0484,5.04258,4.06347a80.0276,80.0276,0,0,1,24.51808-58.15768Z" transform="translate(-169.32858 -125.66032)" fill="none"/><path d="M846.04959,563.68214a79.49915,79.49915,0,0,1,20.09249-31.39921q1.80051-1.72372,3.7035-3.33106a80.08128,80.08128,0,0,1,44.50356-18.61538,179.18438,179.18438,0,0,1,15.62068,27.17341q1.00639,2.17594,1.95446,4.4124c.59864,1.42551,1.17874,2.86947,1.72867,4.32782,7.17972,18.91307,9.97662,40.10751.8045,57.76316-9.14885,17.60784-28.96517,28.03416-48.7526,29.48007a89.80131,89.80131,0,0,1-33.791-4.46411,79.90212,79.90212,0,0,1-10.1306-34.05964c-.09842-1.50666-.14666-3.01457-.16151-4.52952A79.68475,79.68475,0,0,1,846.04959,563.68214Z" transform="translate(-169.32858 -125.66032)" fill="#251aff"/><path d="M813.01622,547.86864c4.00872-11.55075,19.00577-18.34978,42.2361-19.15377,4.64842-.16374,9.76062,1.76955,14.59326.237,23.44495-7.43488,44.71546-2.57642,62.07868,12.9704.59863,1.42553,1.17876,2.86948,1.72869,4.32783-.23093-.08636-.46379-.16717-.69664-.248-23.462-8.14252-46.75216-12.8952-66.81423-13.7192-3.707-.14949-8.129-2.29424-10.76958-.04705-21.20758,18.04834-35.55151,10.486-39.02911,16.78748-5.07941,9.204,3.93167,23.5454,20.23837,37.35631q2.41941,2.05122,5.04,4.06041c.01485,1.51495.06309,3.02286.16151,4.52952q-3.94577-2.91643-7.47707-5.90033C816.56833,574.04719,809.00944,559.41384,813.01622,547.86864Z" transform="translate(-169.32858 -125.66032)" fill="#e4e4e4"/><circle cx="731.55933" cy="501.52259" r="2.49578" fill="#f0f0f0"/><circle cx="687.40279" cy="469.42248" r="2.49578" fill="#f0f0f0"/><circle cx="721.18621" cy="454.43059" r="24.27028" fill="#f0f0f0"/><path d="M509.19491,311.783c5.67345.26659,17.40774,3.08394,30.12317,6.72643,12.70782,3.64607,26.49182,8.09431,40.10029,12.4941q35.73268,11.54366,71.46211,23.08,4.91208,1.5836,9.85116,3.19521c1.26068,1.64909,2.51531,3.31471,3.74573,4.99679-2.89437-1.02316-5.8003-2.013-8.69749-2.95637-12.37844-4.02258-23.1198-6.67554-35.13327-10.33536-18.48821-5.62729-37.99248-13.0176-56.909-19.8995C544.823,322.19693,523.12131,314.78335,509.19491,311.783Z" transform="translate(-169.32858 -125.66032)" fill="#f0f0f0" opacity="0.3"/><path d="M932.75759,262.92414l53.58031,88.947-50.55816-90.71607a1.84948,1.84948,0,1,0-3.02215,1.76906Z" transform="translate(-169.32858 -125.66032)" fill="#f0f0f0" opacity="0.3"/><circle cx="737.2964" cy="397.26895" r="2.49578" fill="#f0f0f0"/><circle cx="299.53813" cy="375.5268" r="16" fill="#251aff"/><path d="M459.3879,492.40273a11.44042,11.44042,0,0,1-7.3141-15.94512L423.10256,447.935l20.66168-4.40845,24.344,27.76138a11.50245,11.50245,0,0,1-8.72027,21.11472Z" transform="translate(-169.32858 -125.66032)" fill="#a0616a"/><path d="M320.86565,270.42666l16.157,10.64925,18.91194,4.43s14.99907,107.19866,22.36959,113.62868,1.16,7.03958,3.15737,15.909,6.61965,27.01462,6.61965,27.01462c-44.89679,27.82945-85.006,31.22731-119.22623,4.57454a8.05237,8.05237,0,0,1,3.76528-9.36653c5.786-3.59436-6.76409-17.67495-2.02758-20.495s-3.63329-23.91315-3.63329-23.91315l1.46049-45.29872-4.0663-6.94806,5.72828-44.60031,14.76652-6.56237,5.05588-14.98693Z" transform="translate(-169.32858 -125.66032)" fill="#251aff"/><path d="M327.8667,274.18712l26.64115,8.8716s29.47047-3.11176,30.66466,40.75832-4.29794,67.41167-4.29794,67.41167S442.17772,514.507,375.77221,478.097L356.3667,401.68712l-20-120Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path 
d="M354.5843,371.17746l13.81877-1.90862s1.33426,20.8342,7.94429,23.99172,4.38173,3.67716,5.16118,7.01962-5.8779,12.60375-.64184,14.90694,51,60,51,60l27-25c-1.85564-5.02813-10.78512-.93686-11.345-6.26714-.70457-8.05889-14.8379-30.07437-17.84566-32.89719s-1.81872-7.79907-3.37763-14.484-.4767-.48633-3.07488-11.62786-37.80825-67.91285-39.03245-78.2-15.55843-16.34242-15.55843-16.34242l-11.14153,2.59818Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><polygon points="68.581 615.998 84.845 620.268 109.058 559.567 85.054 553.264 68.581 615.998" fill="#a0616a"/><path d="M232.881,758.47391l50.014,13.13347.16612-.6325a20.12793,20.12793,0,0,0-14.35407-24.579l-.00121-.00032-7.31533-9.33008-18.86557,2.45516-4.36588-1.14648Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><polygon points="151.957 628.165 168.771 628.164 176.771 563.303 151.953 563.305 151.957 628.165" fill="#a0616a"/><path d="M320.69272,771.36718l51.70963-.00195v-.654a20.12793,20.12793,0,0,0-20.12682-20.12651h-.00125l-9.44543-7.1658-17.62308,7.16689-4.51391.00015Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M279.2272,443.4438c-4.062,5.07751-2.031,22.341-2.031,22.341s-7.10852,55.85264-4.062,60.93015-2.031,9.13953-5.07752,16.248-6.093,24.37206-6.093,24.37206c-17.26354,14.217-16.248,79.2092-16.248,79.2092l-6.093,57.88364c2.031,6.093,30.46508,7.10852,34.52709,6.093S317.8163,566.3196,317.8163,566.3196s-2.031,138.10834-2.031,144.20136,27.41857,3.0465,33.51158,3.0465S393.979,464.76932,393.979,464.76932v-14.217l-5.07751-7.10851S283.28921,438.36629,279.2272,443.4438Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M288.8667,277.18712,263.92876,293.0758,251.8667,303.18712l16,102s-24.30892,92.71612,14,87,36.16111-89.03846,36.16111-89.03846Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M322.23542,422.083a11.44044,11.44044,0,0,1-16.54431-5.83342l-40.64366.98165,11.20149-17.91273,36.87863,1.814A11.50245,11.50245,0,0,1,322.2354,422.083Z" transform="translate(-169.32858 -125.66032)" fill="#a0616a"/><circle cx="132.62957" cy="105.61497" r="32.11879" fill="#a0616a"/><path d="M265.52656,210.17108c3.60658-5.57961,10.37456-9.29825,16.93562-8.25331a23.12708,23.12708,0,0,1,38.75042-13.4894,7.19154,7.19154,0,0,1,7.03282-.70384,14.82856,14.82856,0,0,1,5.71367,4.58627,33.28016,33.28016,0,0,1,4.65684,33.85669c.80185-2.915-2.11948-5.65739-5.07118-6.31152-2.95144-.65414-6.04757-.03529-9.04074-.46126-3.83627-.54587-7.28476-2.77134-11.12709-3.27218-3.22918-.42092-6.47658.407-9.62557,1.23693-3.149.82981-6.388,1.67606-9.62178,1.29208-3.234-.384-7.39959,11.87152-7.34369,20.5035.01062,1.63584-.32631,3.73609-1.925,4.08314-1.96872.42749-3.10954-2.28971-4.9834-3.0301a3.46671,3.46671,0,0,0-4.25212,2.26113,5.652,5.652,0,0,0,1.07909,5.06131,16.70142,16.70142,0,0,0,4.04925,3.51261l-.77175.64187c-1.28373,1.69591-3.94655,1.7694-5.7969.72015a12.5112,12.5112,0,0,1-4.16206-4.739c-3.4232-5.69369-6.426-11.76023-7.63408-18.29308S261.92023,215.75069,265.52656,210.17108Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M276.8667,306.18712l-14.43307-12.3687s-16.85215,5.25847-19.70951,16.31358c-22.49139,28.75861-34.60418,62.34589-37.85742,100.05512,29.36371,11.09012,65.31909,15.02441,104,16l-8-30-55-10Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path 
d="M225.72794,757.227l-7.71-12.39-1.54,7.08008c-.27,1.24-.54,2.5-.79,3.75-2.18995-1.87012-4.52-3.6001-6.79981-5.26q-10.5-7.62012-20.99023-15.26l2.18994,12.7c1.3501,7.82007,2.76025,15.8,6.1001,22.94995.37011.81006.77,1.61011,1.20019,2.39014h32.54a10.48765,10.48765,0,0,0,.54-2.24011.77484.77484,0,0,0,.00976-.15C230.9882,765.947,228.308,761.37706,225.72794,757.227Z" transform="translate(-169.32858 -125.66032)" fill="#f2f2f2"/><path d="M513.29792,742.99305l14.58608-23.44,2.91353,13.39444c.51083,2.34587,1.02167,4.72962,1.49463,7.09443,4.143-3.538,8.55119-6.81084,12.8642-9.95114q19.86441-14.41611,39.71035-28.86963l-4.143,24.02639c-2.55418,14.79438-5.222,29.8913-11.54046,43.41783-.70021,1.53251-1.45676,3.04608-2.27059,4.52178H505.35179a19.84078,19.84078,0,0,1-1.02167-4.238,1.46446,1.46446,0,0,1-.01848-.28382C503.34632,759.48991,508.41681,750.84426,513.29792,742.99305Z" transform="translate(-169.32858 -125.66032)" fill="#f2f2f2"/><path d="M501.855,773.68712H434.87842A11.52467,11.52467,0,0,1,423.3667,762.1754V527.19884a11.52466,11.52466,0,0,1,11.51172-11.51172H501.855a11.52466,11.52466,0,0,1,11.51172,11.51172V762.1754A11.52467,11.52467,0,0,1,501.855,773.68712Z" transform="translate(-169.32858 -125.66032)" fill="#f2f2f2"/><path d="M486.04,521.68712H451.69336A4.332,4.332,0,0,1,447.3667,517.36v-12.3457a4.332,4.332,0,0,1,4.32666-4.32715H486.04a4.332,4.332,0,0,1,4.32666,4.32715V517.36A4.332,4.332,0,0,1,486.04,521.68712Z" transform="translate(-169.32858 -125.66032)" fill="#3f3d56"/><path d="M584.26926,774.03235l-413.75.30733a1.19069,1.19069,0,0,1,0-2.38137l413.75-.30733a1.19069,1.19069,0,0,1,0,2.38137Z" transform="translate(-169.32858 -125.66032)" fill="#cacaca"/></svg> --------------------------------------------------------------------------------