├── Backend ├── Backend │ ├── __init__.py │ ├── asgi.py │ ├── wsgi.py │ ├── urls.py │ └── settings.py ├── accounts │ ├── __init__.py │ ├── migrations │ │ ├── __init__.py │ │ ├── 0003_customuser_avator.py │ │ ├── 0004_auto_20211205_2048.py │ │ ├── 0002_auto_20211204_1703.py │ │ └── 0001_initial.py │ ├── urls.py │ ├── tests.py │ ├── admin.py │ ├── views.py │ ├── apps.py │ ├── serializers.py │ └── models.py ├── search_blogs │ ├── __init__.py │ ├── migrations │ │ └── __init__.py │ ├── tests.py │ ├── admin.py │ ├── apps.py │ ├── urls.py │ ├── models.py │ └── views.py ├── static │ └── avator │ │ └── default.jpg └── manage.py ├── Crawler ├── Crawler │ ├── __init__.py │ ├── models │ │ ├── tempCodeRunnerFile.py │ │ └── es_blogs.py │ ├── spiders │ │ ├── __init__.py │ │ └── blog1.py │ ├── utils │ │ └── common.py │ ├── settings.py │ ├── items.py │ ├── middlewares.py │ └── pipelines.py └── scrapy.cfg ├── frontend ├── src │ ├── components │ │ ├── Footer.vue │ │ ├── ResultList │ │ │ ├── RelatedSearch.vue │ │ │ ├── SomeTips.vue │ │ │ ├── List.vue │ │ │ ├── DetailNav.vue │ │ │ ├── PageIndex.vue │ │ │ └── SearchBoxDetail.vue │ │ └── Home │ │ │ ├── Logo.vue │ │ │ ├── SampleNav.vue │ │ │ └── SearchBox.vue │ ├── assets │ │ ├── logo.png │ │ ├── img │ │ │ └── ava.jpg │ │ ├── logo.svg │ │ ├── empty.svg │ │ ├── userprofile.svg │ │ └── register.svg │ ├── App.vue │ ├── main.js │ ├── api │ │ ├── request.js │ │ └── index.js │ ├── views │ │ ├── Home.vue │ │ ├── activate.vue │ │ ├── ResultList.vue │ │ ├── UserProfile.vue │ │ ├── Login.vue │ │ └── Register.vue │ ├── store │ │ └── index.js │ └── router │ │ └── index.js ├── .browserslistrc ├── public │ ├── logo.png │ ├── js │ │ └── rem.js │ └── index.html ├── babel.config.js └── package.json ├── Engine ├── gerapy_auto_extractor │ ├── schemas │ │ ├── __init__.py │ │ └── tag.py │ ├── utils │ │ ├── __init__.py │ │ ├── helper.py │ │ ├── lcs.py │ │ ├── similarity.py │ │ ├── cluster.py │ │ └── preprocess.py │ ├── patterns │ │ ├── __init__.py │ │ ├── title.py │ │ └── datetime.py │ ├── helpers.py │ ├── __version__.py │ ├── settings.py │ ├── classifiers │ │ ├── models │ │ │ ├── list_model.pkl │ │ │ └── list_scaler.pkl │ │ ├── __init__.py │ │ ├── detail.py │ │ ├── base.py │ │ └── list.py │ ├── __init__.py │ └── extractors │ │ ├── __init__.py │ │ ├── base.py │ │ ├── datetime.py │ │ ├── content.py │ │ ├── title.py │ │ └── list.py ├── url_parser.py ├── pagerank.py └── html_extractor.py ├── requirements.txt ├── config.py ├── requirements_.txt ├── .vscode └── launch.json ├── README.md ├── .gitignore ├── draw ├── 功能分析.drawio └── 系统架构.drawio └── LICENSE /Backend/Backend/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Backend/accounts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Crawler/Crawler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Backend/search_blogs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/components/Footer.vue: -------------------------------------------------------------------------------- 1 | 
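The Engine/ directory in the tree above ships pagerank.py, and the README at the end of this listing names PageRank as one of the project's ranking algorithms. The repository's own implementation is not reproduced in this dump; purely as an illustration of the idea, here is a minimal power-iteration sketch over a toy link graph (assumed parameters: damping factor 0.85, uniform teleport, dangling pages redistribute their score evenly).

```python
# Illustrative sketch only -- not Engine/pagerank.py. Assumptions: damping 0.85,
# uniform teleport term, dangling pages spread their rank over all pages.
def pagerank(links, damping=0.85, iterations=50):
    """links: dict mapping each page to the list of pages it links out to."""
    pages = set(links) | {p for outs in links.values() for p in outs}
    rank = {p: 1.0 / len(pages) for p in pages}
    for _ in range(iterations):
        new_rank = {p: (1.0 - damping) / len(pages) for p in pages}
        for page in pages:
            targets = links.get(page) or list(pages)  # dangling page: spread evenly
            share = damping * rank[page] / len(targets)
            for target in targets:
                new_rank[target] += share
        rank = new_rank
    return rank


if __name__ == '__main__':
    demo = {'a': ['b', 'c'], 'b': ['c'], 'c': ['a']}
    print(pagerank(demo))  # pages with more incoming links end up with higher scores
```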
-------------------------------------------------------------------------------- /Backend/accounts/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Backend/search_blogs/migrations/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/schemas/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/patterns/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/RelatedSearch.vue: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Crawler/Crawler/models/tempCodeRunnerFile.py: -------------------------------------------------------------------------------- 1 | Completion -------------------------------------------------------------------------------- /frontend/.browserslistrc: -------------------------------------------------------------------------------- 1 | > 1% 2 | last 2 versions 3 | not dead 4 | -------------------------------------------------------------------------------- /Backend/accounts/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path 2 | 3 | 4 | urlpatterns = [] -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/requirements.txt -------------------------------------------------------------------------------- /Backend/accounts/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 4 | -------------------------------------------------------------------------------- /Backend/accounts/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /Backend/accounts/views.py: -------------------------------------------------------------------------------- 1 | from django.shortcuts import render 2 | 3 | # Create your views here. 4 | -------------------------------------------------------------------------------- /Backend/search_blogs/tests.py: -------------------------------------------------------------------------------- 1 | from django.test import TestCase 2 | 3 | # Create your tests here. 
4 | -------------------------------------------------------------------------------- /Backend/search_blogs/admin.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | 3 | # Register your models here. 4 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/helpers.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.utils.helper import jsonify, content 2 | -------------------------------------------------------------------------------- /frontend/public/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/frontend/public/logo.png -------------------------------------------------------------------------------- /frontend/src/assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/frontend/src/assets/logo.png -------------------------------------------------------------------------------- /frontend/babel.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | presets: [ 3 | '@vue/cli-plugin-babel/preset' 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /frontend/src/assets/img/ava.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/frontend/src/assets/img/ava.jpg -------------------------------------------------------------------------------- /Backend/static/avator/default.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/Backend/static/avator/default.jpg -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/__version__.py: -------------------------------------------------------------------------------- 1 | VERSION = (0, 1, '2') 2 | 3 | version = __version__ = '.'.join(map(str, VERSION)) 4 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/schemas/tag.py: -------------------------------------------------------------------------------- 1 | SCRIPT = 'script' 2 | STYLE = 'style' 3 | P = 'p' 4 | BODY = 'body' 5 | HEAD = 'head' 6 | -------------------------------------------------------------------------------- /Backend/accounts/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class AccountsConfig(AppConfig): 5 | name = 'accounts' 6 | -------------------------------------------------------------------------------- /Backend/search_blogs/apps.py: -------------------------------------------------------------------------------- 1 | from django.apps import AppConfig 2 | 3 | 4 | class SearchBlogsConfig(AppConfig): 5 | name = 'search_blogs' 6 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/settings.py: -------------------------------------------------------------------------------- 1 | import environs 2 | 3 | env = environs.Env() 4 | env.read_env() 5 | 6 | APP_DEBUG = env.bool('APP_DEBUG', False) 7 | 
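APP_DEBUG above is read from the process environment (env.read_env() also loads a local .env file if present), and the package __init__ further down in this listing uses it to raise the priority of the custom 'inspect' log level. A minimal sketch of toggling it, assuming Engine/ is on PYTHONPATH:

```python
# Sketch only: the value could equally come from a line `APP_DEBUG=true` in a .env file.
import os

os.environ['APP_DEBUG'] = 'true'   # environs' env.bool() treats 'true'/'1'/'yes' as True

from gerapy_auto_extractor.settings import APP_DEBUG

print(APP_DEBUG)  # True -> the 'inspect' log level in gerapy_auto_extractor/__init__.py becomes active
```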
-------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/models/list_model.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/Engine/gerapy_auto_extractor/classifiers/models/list_model.pkl -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/models/list_scaler.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Justin3go/xiu-search/HEAD/Engine/gerapy_auto_extractor/classifiers/models/list_scaler.pkl -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | REDIS_HOST = "127.0.0.1" 2 | REDIS_PASSWORD = 123456 3 | ES_HOST = "xxxx" 4 | # ES_HOST = "localhost:9200" 5 | 6 | MYSQL_HOST = "xxxx" 7 | MYSQL_DBNAME = "xiusearch" 8 | MYSQL_USER = "justin3go" 9 | MYSQL_PASSWORD = "xxxx" -------------------------------------------------------------------------------- /Backend/search_blogs/urls.py: -------------------------------------------------------------------------------- 1 | from django.urls import path 2 | from .import views 3 | from django.conf.urls import url 4 | 5 | 6 | urlpatterns = [ 7 | url('search/$', views.SearchView.as_view()), 8 | url('search/suggest', views.SearchSuggest.as_view()) 9 | ] -------------------------------------------------------------------------------- /Crawler/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Crawler.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Crawler 12 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/patterns/title.py: -------------------------------------------------------------------------------- 1 | METAS = [ 2 | '//meta[starts-with(@property, "og:title")]/@content', 3 | '//meta[starts-with(@name, "og:title")]/@content', 4 | '//meta[starts-with(@property, "title")]/@content', 5 | '//meta[starts-with(@name, "title")]/@content', 6 | '//meta[starts-with(@property, "page:title")]/@content', 7 | ] 8 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/__init__.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.settings import APP_DEBUG 2 | from gerapy_auto_extractor.extractors.content import extract_content 3 | from gerapy_auto_extractor.extractors.title import extract_title 4 | from gerapy_auto_extractor.extractors.datetime import extract_datetime 5 | from gerapy_auto_extractor.extractors.list import extract_list 6 | 
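classifiers/__init__.py above pulls the extractor entry points into one place; combined with extract_detail and is_detail exported from the package root (both shown later in this listing), typical usage looks roughly like the sketch below. Assumptions: Engine/ is on PYTHONPATH and page.html is a placeholder name for a locally saved copy of a crawled blog page.

```python
# Usage sketch, not part of the repository; 'page.html' is a hypothetical input file.
from gerapy_auto_extractor import extract_detail, is_detail
from gerapy_auto_extractor.utils.helper import content

html = content('page.html')        # helper shown later in this listing: reads the file as utf-8 text

if is_detail(html):                # classifier that separates detail pages from list pages
    detail = extract_detail(html)  # -> {'title': ..., 'datetime': ..., 'content': ...}
    print(detail['title'])
```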
-------------------------------------------------------------------------------- /requirements_.txt: -------------------------------------------------------------------------------- 1 | django==3.1.13 2 | djangorestframework==3.12.4 3 | djoser==2.1.0 4 | drf-yasg==1.20.0 5 | redis==3.5.3 6 | scrapy==2.5.1 7 | elasticsearch==7.15.2 8 | elasticsearch-dsl==7.4.0 9 | fake-useragent==0.1.11 10 | coreapi==2.3.3 11 | django-cors-headers==3.10.0 12 | djangorestframework-simplejwt==4.8.0 13 | jieba 14 | pymysql 15 | lxml 16 | beautifulsoup4 17 | PyJWT==2.1.0 -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // 使用 IntelliSense 了解相关属性。 3 | // 悬停以查看现有属性的描述。 4 | // 欲了解更多信息,请访问: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: Django", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${workspaceFolder}\\Backend\\manage.py", 12 | "args": [ 13 | "runserver" 14 | ], 15 | "django": true 16 | } 17 | ] 18 | } -------------------------------------------------------------------------------- /Backend/Backend/asgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASGI config for Backend project. 3 | 4 | It exposes the ASGI callable as a module-level variable named ``application``. 5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.1/howto/deployment/asgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.asgi import get_asgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Backend.settings') 15 | 16 | application = get_asgi_application() 17 | -------------------------------------------------------------------------------- /Backend/Backend/wsgi.py: -------------------------------------------------------------------------------- 1 | """ 2 | WSGI config for Backend project. 3 | 4 | It exposes the WSGI callable as a module-level variable named ``application``. 
5 | 6 | For more information on this file, see 7 | https://docs.djangoproject.com/en/3.1/howto/deployment/wsgi/ 8 | """ 9 | 10 | import os 11 | 12 | from django.core.wsgi import get_wsgi_application 13 | 14 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Backend.settings') 15 | 16 | application = get_wsgi_application() 17 | -------------------------------------------------------------------------------- /Backend/accounts/serializers.py: -------------------------------------------------------------------------------- 1 | from djoser.serializers import UserCreateSerializer,UserSerializer 2 | from django.contrib.auth import get_user_model 3 | 4 | User = get_user_model() 5 | 6 | 7 | class MyUserCreateSerializer(UserCreateSerializer): 8 | class Meta(UserCreateSerializer.Meta): 9 | model = User 10 | fields = ("id", "email", "username", "password") 11 | 12 | class MyUserSerializer(UserSerializer): 13 | class Meta: 14 | model = User 15 | fields = ("id", "email", "username", "avator") -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/helper.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | 4 | def jsonify(data): 5 | """ 6 | format the output data 7 | :param data: 8 | :return: 9 | """ 10 | return json.dumps(data, indent=2, ensure_ascii=False, default=str) 11 | 12 | 13 | def content(file_path, encoding='utf-8'): 14 | """ 15 | get content of html file 16 | :param encoding: file encoding 17 | :param file_path: 18 | :return: 19 | """ 20 | with open(file_path, encoding=encoding) as f: 21 | return f.read() -------------------------------------------------------------------------------- /Backend/accounts/migrations/0003_customuser_avator.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-12-04 13:44 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('accounts', '0002_auto_20211204_1703'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AddField( 14 | model_name='customuser', 15 | name='avator', 16 | field=models.CharField(default='/static/avator/default.png', max_length=255), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /Backend/accounts/migrations/0004_auto_20211205_2048.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-12-05 12:48 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('accounts', '0003_customuser_avator'), 10 | ] 11 | 12 | operations = [ 13 | migrations.AlterField( 14 | model_name='customuser', 15 | name='avator', 16 | field=models.CharField(default='/static/avator/default.jpg', max_length=255), 17 | ), 18 | ] 19 | -------------------------------------------------------------------------------- /Backend/accounts/migrations/0002_auto_20211204_1703.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-12-04 09:03 2 | 3 | from django.db import migrations 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | dependencies = [ 9 | ('accounts', '0001_initial'), 10 | ] 11 | 12 | operations = [ 13 | migrations.RemoveField( 14 | model_name='customuser', 15 | name='first_name', 16 | ), 17 | migrations.RemoveField( 18 | 
model_name='customuser', 19 | name='last_name', 20 | ), 21 | ] 22 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.settings import APP_DEBUG 2 | from gerapy_auto_extractor.extractors import extract_detail, extract_list, extract_datetime, extract_content, \ 3 | extract_title 4 | from gerapy_auto_extractor.classifiers.list import is_list, probability_of_list 5 | from gerapy_auto_extractor.classifiers.detail import is_detail, probability_of_detail 6 | from loguru import logger 7 | 8 | try: 9 | logger.level('inspect', no=100000 if APP_DEBUG else 0, color='') 10 | except (ValueError, TypeError): 11 | pass 12 | -------------------------------------------------------------------------------- /Crawler/Crawler/utils/common.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import redis 3 | import re 4 | import hashlib 5 | import sys 6 | sys.path.append("C:\My_app\code\咻Search") 7 | from config import REDIS_HOST, REDIS_PASSWORD 8 | 9 | def real_time_count(key, init): 10 | redis_cli = redis.Redis(host=REDIS_HOST, password=REDIS_PASSWORD) 11 | if redis_cli.get(key): 12 | count = pickle.loads(redis_cli.get(key)) 13 | count = count + 1 14 | count = pickle.dumps(count) 15 | redis_cli.set(key, count) 16 | else: 17 | count = pickle.dumps(init) 18 | redis_cli.set(key, count) -------------------------------------------------------------------------------- /frontend/src/App.vue: -------------------------------------------------------------------------------- 1 | 12 | 13 | 30 | -------------------------------------------------------------------------------- /frontend/src/main.js: -------------------------------------------------------------------------------- 1 | import router from './router' 2 | import store from './store' 3 | import { createApp } from 'vue' 4 | import ElementPlus from 'element-plus' 5 | import 'element-plus/dist/index.css' 6 | import App from './App.vue' 7 | 8 | const app = createApp(App) 9 | 10 | app.use(store) 11 | app.use(router) 12 | app.use(ElementPlus) 13 | app.mount('#app') 14 | 15 | 16 | // 跳转界面后到顶部 17 | router.beforeEach((to, from, next) => { 18 | // chrome 19 | document.body.scrollTop = 0 20 | // firefox 21 | document.documentElement.scrollTop = 0 22 | // safari 23 | window.pageYOffset = 0 24 | next() 25 | }) 26 | 27 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.extractors.content import extract_content 2 | from gerapy_auto_extractor.extractors.title import extract_title 3 | from gerapy_auto_extractor.extractors.datetime import extract_datetime 4 | from gerapy_auto_extractor.extractors.list import extract_list 5 | 6 | 7 | def extract_detail(html): 8 | """ 9 | extract detail information 10 | :param html: 11 | :return: 12 | """ 13 | return { 14 | 'title': extract_title(html), 15 | 'datetime': extract_datetime(html), 16 | 'content': extract_content(html) 17 | } 18 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/lcs.py: -------------------------------------------------------------------------------- 1 | from difflib import SequenceMatcher 2 | 3 | 4 | def lcs_of_2(a, b): 5 | 
""" 6 | get longest common string 7 | :param a: 8 | :param b: 9 | :return: 10 | """ 11 | match = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b)) 12 | return a[match[0]: match[0] + match[2]] 13 | 14 | 15 | def lcs_of_list(*args): 16 | """ 17 | get longest common string of list 18 | :param args: 19 | :return: 20 | """ 21 | if len(args) == 2: 22 | return lcs_of_2(args[0], args[1]) 23 | first = args[0] 24 | remains = args[1:] 25 | return lcs_of_2(first, lcs_of_list(*remains)) 26 | -------------------------------------------------------------------------------- /frontend/src/api/request.js: -------------------------------------------------------------------------------- 1 | import axios from 'axios' 2 | import store from '@/store' 3 | 4 | axios.defaults.timeout = 10000; 5 | axios.defaults.headers.post['Content-Type'] = 'application/x-www-form-urlencoded;charset=UTF-8;multipart/form-data'; 6 | 7 | // 添加请求拦截器,在请求头中加token 8 | axios.interceptors.request.use( 9 | config => { 10 | console.log("store.state.Jwt: ", store.state.Jwt) 11 | if (store.state.Jwt != '') { 12 | console.log("将token添加进入请求头之中...") 13 | config.headers.Authorization = 'JWT ' + store.state.Jwt.access; 14 | } 15 | return config; 16 | }, 17 | error => { 18 | return Promise.reject(error); 19 | }); 20 | 21 | export default axios; -------------------------------------------------------------------------------- /frontend/public/js/rem.js: -------------------------------------------------------------------------------- 1 | function remSize(){ 2 | // 获取屏幕的宽度 3 | var deviceWidth = document.documentElement.clientWidth || window.innerWidth; 4 | // 限制屏幕的宽度 5 | if(deviceWidth >= 750){ 6 | deviceWidth = 750 7 | } 8 | if(deviceWidth <= 320){ 9 | deviceWidth = 320 10 | } 11 | document.documentElement.style.fontSize = (deviceWidth/7.5) + 'px' 12 | // 设计稿是750px 13 | // 设置一半的宽度,那么就是375px 14 | // 1rem == 100px的设计稿宽度 15 | // 表达一半的宽度就是3.75rem 16 | 17 | // 设置字体大小 18 | document.querySelector('body').style.fontSize = 0.16 + 'rem' 19 | } 20 | 21 | remSize(); 22 | // 当窗口发生变化时我们也调用一下这个函数 23 | window.onresize = function(){ 24 | remSize(); 25 | } -------------------------------------------------------------------------------- /frontend/src/components/ResultList/SomeTips.vue: -------------------------------------------------------------------------------- 1 | 9 | 10 | 19 | 20 | -------------------------------------------------------------------------------- /frontend/src/views/Home.vue: -------------------------------------------------------------------------------- 1 | 9 | 10 | 27 | 28 | 32 | -------------------------------------------------------------------------------- /Backend/manage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Django's command-line utility for administrative tasks.""" 3 | import os 4 | import sys 5 | 6 | 7 | def main(): 8 | """Run administrative tasks.""" 9 | os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Backend.settings') 10 | try: 11 | from django.core.management import execute_from_command_line 12 | except ImportError as exc: 13 | raise ImportError( 14 | "Couldn't import Django. Are you sure it's installed and " 15 | "available on your PYTHONPATH environment variable? Did you " 16 | "forget to activate a virtual environment?" 
17 | ) from exc 18 | execute_from_command_line(sys.argv) 19 | 20 | 21 | if __name__ == '__main__': 22 | main() 23 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/detail.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.classifiers.list import probability_of_list 2 | 3 | 4 | def probability_of_detail(html, **kwargs): 5 | """ 6 | get probability of detail page 7 | :param html: 8 | :param kwargs: other kwargs 9 | :return: 10 | """ 11 | return 1 - probability_of_list(html, **kwargs) 12 | 13 | 14 | def is_detail(html, threshold=0.5, **kwargs): 15 | """ 16 | judge if this page is detail page 17 | :param html: source of html 18 | :param threshold: 19 | :param kwargs: 20 | :return: 21 | """ 22 | _probability_of_detail = probability_of_detail(html, **kwargs) 23 | if _probability_of_detail > threshold: 24 | return True 25 | return False 26 | -------------------------------------------------------------------------------- /frontend/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "frontend", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "serve": "vue-cli-service serve", 7 | "build": "vue-cli-service build" 8 | }, 9 | "dependencies": { 10 | "@trevoreyre/autocomplete-vue": "^2.2.0", 11 | "axios": "^0.24.0", 12 | "core-js": "^3.6.5", 13 | "element-plus": "^1.2.0-beta.6", 14 | "vue": "^3.0.0", 15 | "vue-router": "^4.0.0-0", 16 | "vuex": "^4.0.0-0" 17 | }, 18 | "devDependencies": { 19 | "@vue/cli-plugin-babel": "~4.5.0", 20 | "@vue/cli-plugin-router": "~4.5.0", 21 | "@vue/cli-plugin-vuex": "~4.5.0", 22 | "@vue/cli-service": "~4.5.0", 23 | "@vue/compiler-sfc": "^3.0.0", 24 | "less": "^3.0.4", 25 | "less-loader": "^5.0.0" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /frontend/src/views/activate.vue: -------------------------------------------------------------------------------- 1 | 4 | 5 | 30 | -------------------------------------------------------------------------------- /Crawler/Crawler/models/es_blogs.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_dsl import connections, Document, Keyword, Text, Integer, Date, Completion, analyzer, Float 2 | import sys 3 | sys.path.append("C:\My_app\code\咻Search") 4 | from config import ES_HOST 5 | 6 | connections.create_connection(hosts=[ES_HOST]) 7 | 8 | my_analyzer = analyzer('ik_smart') 9 | 10 | 11 | class BlogsIndex(Document): 12 | suggest = Completion(analyzer=my_analyzer) 13 | page_url = Keyword() 14 | title = Text(analyzer="ik_max_word") 15 | keywords = Text(analyzer="ik_max_word") 16 | description = Text(analyzer="ik_max_word") 17 | content = Text(analyzer="ik_max_word") 18 | PR = Float() 19 | publish_time = Date() 20 | 21 | class Index: 22 | name = 'blogs' 23 | 24 | 25 | if __name__ == "__main__": 26 | BlogsIndex.init() 27 | -------------------------------------------------------------------------------- /Backend/search_blogs/models.py: -------------------------------------------------------------------------------- 1 | from elasticsearch_dsl import Text, Date, Keyword, Integer, Document, Completion, Double, Float 2 | from elasticsearch_dsl.connections import connections 3 | from elasticsearch_dsl import analyzer 4 | import sys 5 | sys.path.append("C:/My_app/code/咻Search") 6 | from config import ES_HOST 7 | 8 | 9 | # 
Create your models here. 10 | connections.create_connection(hosts=ES_HOST) 11 | 12 | my_analyzer = analyzer('ik_smart') 13 | 14 | class BlogsIndex(Document): 15 | suggest = Completion(analyzer=my_analyzer) 16 | title = Text(analyzer="ik_max_word") 17 | keywords = Text(analyzer="ik_max_word") 18 | description = Text(analyzer="ik_max_word") 19 | content = Text(analyzer="ik_max_word") 20 | PR = Float() 21 | publish_time = Date() 22 | 23 | class Index: 24 | name = 'blogs' 25 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/base.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from gerapy_auto_extractor.schemas.element import Element 3 | 4 | 5 | class BaseClassifier(object): 6 | 7 | def process(self, element: Element): 8 | """ 9 | you must implement this method in child class 10 | :param element: 11 | :return: 12 | """ 13 | raise NotImplementedError 14 | 15 | def classify(self, html, **kwargs): 16 | """ 17 | base extract method, firstly, it will convert html to WebElement, then it call 18 | process method that child class implements 19 | :param html: 20 | :return: 21 | """ 22 | self.kwargs = kwargs 23 | element = fromstring(html=html) 24 | element.__class__ = Element 25 | return self.process(element) 26 | -------------------------------------------------------------------------------- /Backend/Backend/urls.py: -------------------------------------------------------------------------------- 1 | from django.contrib import admin 2 | from django.urls import path 3 | from django.urls.conf import include, re_path 4 | 5 | # DRF YASG 6 | from rest_framework import permissions 7 | from drf_yasg.views import get_schema_view 8 | from drf_yasg import openapi 9 | 10 | schema_view = get_schema_view( 11 | openapi.Info( 12 | title="XiuSearch API", 13 | default_version="v1", 14 | description="XiuSearch的接口文档......", 15 | contact=openapi.Contact(email="justin3go@foxmail.com"), 16 | license=openapi.License(name="BSD License"), 17 | ), 18 | public=True, 19 | permission_classes=(permissions.AllowAny,), 20 | ) 21 | 22 | urlpatterns = [ 23 | path("admin/", admin.site.urls), 24 | re_path( 25 | r"^api/v1/docs/$", 26 | schema_view.with_ui("swagger", cache_timeout=0), 27 | name="schema-swagger-ui", 28 | ), 29 | path("api/v1/", include("accounts.urls")), 30 | path("api/v1/", include("djoser.urls")), 31 | path("api/v1/", include("djoser.urls.jwt")), 32 | path("api/v1/", include("search_blogs.urls")), 33 | ] -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/similarity.py: -------------------------------------------------------------------------------- 1 | import distance 2 | 3 | 4 | def similarity1(s1, s2): 5 | """ 6 | get similarity of two strings 7 | :param s1: 8 | :param s2: 9 | :return: 10 | """ 11 | if not s1 or not s2: 12 | return 0 13 | edit_distance = distance.levenshtein(s1, s2) 14 | similarity_score = 1 - edit_distance / (len(s1) + len(s2)) 15 | return similarity_score 16 | 17 | 18 | def similarity2(s1, s2): 19 | """ 20 | get similarity of two strings 21 | :param s1: 22 | :param s2: 23 | :return: 24 | """ 25 | if not s1 or not s2: 26 | return 0 27 | s1_set = set(list(s1)) 28 | s2_set = set(list(s2)) 29 | intersection = s1_set.intersection(s2_set) 30 | union = s1_set.union(s2_set) 31 | return len(intersection) / len(union) 32 | 33 | 34 | def similarity(s1, s2): 35 | """ 36 | get similarity of
two strings 37 | :param s1: 38 | :param s2: 39 | :return: 40 | """ 41 | return similarity2(s1, s2) 42 | 43 | 44 | if __name__ == '__main__': 45 | s1 = 'hello' 46 | s2 = 'world' 47 | print(similarity(s1, s2)) 48 | -------------------------------------------------------------------------------- /frontend/src/components/Home/Logo.vue: -------------------------------------------------------------------------------- 1 | 15 | 16 | 53 | -------------------------------------------------------------------------------- /frontend/public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | XiuSearch 10 | 11 | 12 |
13 | 14 | 15 | 36 | 37 | -------------------------------------------------------------------------------- /Crawler/Crawler/settings.py: -------------------------------------------------------------------------------- 1 | from fake_useragent import UserAgent 2 | import time 3 | import sys 4 | sys.path.append("C:/My_app/code/咻Search") 5 | 6 | 7 | BOT_NAME = 'Crawler' 8 | SPIDER_MODULES = ['Crawler.spiders'] 9 | NEWSPIDER_MODULE = 'Crawler.spiders' 10 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 11 | USER_AGENT = UserAgent().random 12 | # Obey robots.txt rules 13 | ROBOTSTXT_OBEY = True 14 | DOWNLOAD_DELAY = 0.5 15 | COOKIES_ENABLED = False 16 | ITEM_PIPELINES = { 17 | 'Crawler.pipelines.MysqlTwistedPipeline': 200, 18 | 'Crawler.pipelines.ElasticSearchPipeline': 300, 19 | } 20 | # Broad Crawls --广泛的爬取官网推荐的设置 21 | # 应用推荐的优先级队列 22 | SCHEDULER_PRIORITY_QUEUE = 'scrapy.pqueues.DownloaderAwarePriorityQueue' 23 | # 增加并发--Scrapy 下载器将执行的最大并发(即同时)请求数。 24 | CONCURRENT_REQUESTS = 100 25 | # 增加 Twisted IO 线程池最大大小 26 | REACTOR_THREADPOOL_MAXSIZE = 20 27 | # 降低日志级别 28 | LOG_LEVEL = 'INFO' 29 | # 禁用 cookie 30 | COOKIES_ENABLED = False 31 | # 禁用重试 32 | RETRY_ENABLED = False 33 | # 减少下载超时 34 | DOWNLOAD_TIMEOUT = 15 35 | # 禁用重定向 36 | REDIRECT_ENABLED = False 37 | # 启用“Ajax 可抓取页面”的抓取 38 | AJAXCRAWL_ENABLED = True 39 | 40 | # My Settings 41 | # 爬行深度 42 | DEPTH_LIMIT = 10 43 | # log 44 | LOG_FILE = "all.log" 45 | now_time = time.strftime("%Y_%m_%d_%H_%M_%S", time.localtime()) 46 | JOBDIR="breakpoints/" + str(now_time) 47 | 48 | 49 | -------------------------------------------------------------------------------- /frontend/src/store/index.js: -------------------------------------------------------------------------------- 1 | import { createStore } from 'vuex' 2 | // 接口引入 3 | import { login, authorization } from '@/api/index.js' 4 | 5 | export default createStore({ 6 | state: { 7 | SearchValue: '', 8 | SearchResult: { 9 | hitList: [], 10 | }, 11 | Jwt: JSON.parse(localStorage.getItem("jwt")) || '', 12 | UserProfile: JSON.parse(localStorage.getItem("user")) || '', 13 | }, 14 | mutations: { 15 | SetSearchValue(state, value) { 16 | state.SearchValue = value 17 | // 同时保存到本地 18 | let historyList = JSON.parse(localStorage.getItem("historyList")) || []; 19 | // 不添加重复值及空值 20 | if (historyList.indexOf(value) == -1 && value) { 21 | historyList.push(value) 22 | localStorage.setItem("historyList", JSON.stringify(historyList)); 23 | } 24 | 25 | }, 26 | SetSearchResult(state, value) { 27 | state.SearchResult = value 28 | }, 29 | SetJwt(state, value) { 30 | console.log("将jwt提交到了vuex") 31 | state.Jwt = value 32 | localStorage.setItem("jwt", JSON.stringify(value)); 33 | }, 34 | SetUserProfile(state, value) { 35 | state.UserProfile = value 36 | localStorage.setItem("user", JSON.stringify(value)); 37 | }, 38 | }, 39 | actions: { 40 | // TODO 这些方法放在vuex里面有什么作用,其他地方直接调用api里面的不好吗 41 | }, 42 | modules: { 43 | } 44 | }) 45 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/base.py: -------------------------------------------------------------------------------- 1 | from lxml.html import fromstring 2 | from loguru import logger 3 | from lxml.html import etree 4 | from gerapy_auto_extractor.schemas.element import Element 5 | 6 | 7 | class BaseExtractor(object): 8 | """ 9 | Base Extractor which provide common methods 10 | """ 11 | 12 | kwargs = None 13 | 14 | def to_string(self, element: Element, limit: int = None): 15 | """ 16 | 
convert element to string 17 | :param element: 18 | :param limit: 19 | :return: 20 | """ 21 | result = etree.tostring(element, pretty_print=True, encoding="utf-8", method='html').decode('utf-8') 22 | if limit: 23 | return result[:limit] 24 | return result 25 | 26 | def process(self, element: Element): 27 | """ 28 | process method that you should implement 29 | :param element: 30 | :return: 31 | """ 32 | logger.error('You must implement process method in your extractor.') 33 | return False # 随便返回一个,不然VSCODE无法识别语法,后面上线的时候再改,而且不改也不影响 34 | raise NotImplementedError 35 | 36 | def extract(self, html, **kwargs): 37 | """ 38 | base extract method, firstly, it will convert html to WebElement, then it call 39 | process method that child class implements 40 | :param html: 41 | :return: 42 | """ 43 | self.kwargs = kwargs 44 | element = fromstring(html=html) 45 | element.__class__ = Element 46 | return self.process(element) 47 | -------------------------------------------------------------------------------- /frontend/src/api/index.js: -------------------------------------------------------------------------------- 1 | import axios from './request.js' 2 | import store from '@/store' 3 | 4 | // let baseUrl = 'http://39.106.132.154:8000/api/v1' 5 | const ip = 'http://39.106.132.154:8000' 6 | // const ip = 'http://localhost:8000' 7 | const baseUrl = `${ip}/api/v1` 8 | 9 | 10 | export function getIP(){ 11 | return ip 12 | } 13 | 14 | //获取搜索结果 15 | export function getSearchResult(q, p) { 16 | return axios.get(`${baseUrl}/search?q=${q}&p=${p}`) 17 | } 18 | //根据输入的部分文本获取搜索建议 19 | export function getSearchSuggest(someText) { 20 | return axios.get(`${baseUrl}/search/suggest?input=${someText}`) 21 | } 22 | //登录 23 | export function login(email, password) { 24 | return axios.post(`${baseUrl}/jwt/create`, { 25 | "email": email, 26 | "password": password 27 | }) 28 | } 29 | //注册 30 | export function register(username, email, password, re_password) { 31 | return axios.post(`${baseUrl}/users/`, { 32 | "username": username, 33 | "email": email, 34 | "password": password, 35 | "re_password": re_password 36 | }) 37 | } 38 | //获取用户资料 39 | export function getUserProfile() { 40 | return axios.get(`${baseUrl}/users/me`) 41 | } 42 | //验证token是否失效 43 | export function authorization(token) { 44 | return axios.post(`${baseUrl}/jwt/verify`,{ 45 | "token": token, 46 | }) 47 | } 48 | //通过刷新token进行刷新 49 | export function refreshToken(refresh){ 50 | return axios.post(`${baseUrl}/jwt/refresh`,{ 51 | "refresh": refresh, 52 | }) 53 | } 54 | //激活账号 55 | export function activate(uid, token){ 56 | return axios.post(`${baseUrl}/users/activation/`,{ 57 | "uid": uid, 58 | "token": token 59 | }) 60 | } 61 | 62 | -------------------------------------------------------------------------------- /frontend/src/views/ResultList.vue: -------------------------------------------------------------------------------- 1 | 11 | 12 | 58 | 63 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/cluster.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.utils.similarity import similarity 2 | from collections import defaultdict 3 | 4 | 5 | def cluster(items, threshold=0.9): 6 | """ 7 | cluster names 8 | :param items: 9 | :param threshold: 10 | :return: cluster map, for example {"foo": 0, "bar": 1} 11 | """ 12 | number = -1 13 | clusters_map = {} 14 | clusters = [] 15 | for name in items: 16 | for c in clusters: 17 | if 
all(similarity(name, w) > threshold for w in c): 18 | c.append(name) 19 | clusters_map[name] = number 20 | break 21 | else: 22 | number += 1 23 | clusters.append([name]) 24 | clusters_map[name] = number 25 | return clusters_map 26 | 27 | 28 | def cluster_dict(data: dict, threshold=0.8): 29 | """ 30 | cluster dict, convert id key to cluster id key 31 | :param threshold: 32 | :param data: 33 | :return: 34 | """ 35 | ids = data.keys() 36 | clusters_map = cluster(ids, threshold) 37 | result = defaultdict(list) 38 | for k, v in data.items(): 39 | if isinstance(v, list): 40 | for i in v: 41 | result[clusters_map[k]].append(i) 42 | else: 43 | result[clusters_map[k]].append(v) 44 | return dict(result) 45 | 46 | 47 | if __name__ == '__main__': 48 | data = { 49 | '/html/body/div[@class="main"]/div[1]/ul': ['child1', 'child2', 'child3'], 50 | '/html/body/div[@class="main"]/div[2]/ul': ['child4', 'child5', 'child6'], 51 | '/html/body/div[@class="main"]/div[3]/ul': ['child7', 'child8', 'child9'], 52 | '/html/body/header/div[1]': ['child10', 'child11', 'child12'], 53 | '/html/body/header/div[2]': ['child13', 'child14', 'child15'], 54 | } 55 | print(cluster_dict(data, threshold=0.7)) 56 | -------------------------------------------------------------------------------- /Backend/accounts/models.py: -------------------------------------------------------------------------------- 1 | from django.db import models 2 | from django.contrib.auth.models import ( 3 | AbstractBaseUser, 4 | PermissionsMixin, 5 | BaseUserManager, 6 | ) 7 | 8 | 9 | class CustomUserManager(BaseUserManager): 10 | def create_user(self, email, username, password=None, **extra_fields): 11 | if not email: 12 | raise ValueError("User must have an email") 13 | email = self.normalize_email(email) 14 | user = self.model(email=email, username=username, **extra_fields) 15 | user.set_password(password) 16 | user.save(using=self._db) 17 | return user 18 | 19 | def create_superuser(self, username, email, password=None, **extra_fields): 20 | user = self.create_user( 21 | username, email, password=password, **extra_fields) 22 | user.is_active = True 23 | user.is_staff = True 24 | user.is_admin = True 25 | user.save(using=self._db) 26 | return user 27 | 28 | 29 | class CustomUser(AbstractBaseUser, PermissionsMixin): 30 | email = models.EmailField(max_length=255, unique=True) 31 | username = models.CharField(max_length=255, unique=True) 32 | # TODO 用户上传头像需要将图片路径修改为uid加头像 33 | avator = models.CharField( 34 | max_length=255, default='/static/avator/default.jpg') 35 | # first_name = models.CharField(max_length=255) 36 | # last_name = models.CharField(max_length=255) 37 | is_active = models.BooleanField(default=True) 38 | is_staff = models.BooleanField(default=False) 39 | is_admin = models.BooleanField(default=False) 40 | 41 | objects = CustomUserManager() 42 | 43 | USERNAME_FIELD = "email" 44 | REQUIRED_FIELDS = ["username"] 45 | 46 | def get_name(self): 47 | return self.username 48 | 49 | def has_perm(self, perm, obj=None): 50 | return True 51 | 52 | def has_module_perms(self, app_label): 53 | return True 54 | 55 | def __str__(self): 56 | return self.email 57 | -------------------------------------------------------------------------------- /Engine/url_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib import parse 3 | # 这类词加分减分 4 | # 可以把这些url分词排序然后认为判断加入消极还是积极 5 | NEG_WORDS = ['user', 'list', 'author', 'comment','writer'] 6 | POS_WORDS = ['article', 'blog', 'details', 'question'] 7 | # 
参考https://help.aliyun.com/document_detail/65096.html 8 | # 还要记住匹配的时候不能区分大小写,同时匹配的时候也仅仅需要匹配url的最后四位就可以了 9 | # 这类词一票否决 10 | FILE_WORDS = ['.gif','.png','.bmp','.jpeg','.jpg', '.svg', 11 | '.mp3','.wma','.flv','.mp4','.wmv','.ogg','.avi', 12 | '.doc','.docx','.xls','.xlsx','.ppt','.pptx','.txt','.pdf', 13 | '.zip','.exe','.tat','.ico','.css','.js','.swf','.apk','.m3u8','.ts'] 14 | 15 | # 还有就是如果包含很长一串数字的一般都是内容界面 16 | # 不应该是各种文件名的后缀 17 | def is_static_url(url): 18 | ''' 19 | 20 | ''' 21 | for w in FILE_WORDS: 22 | if w in url[-5:]: 23 | return True 24 | 25 | return False 26 | 27 | # 暂时不用 28 | # 这个有点麻烦,先用别人实现的,自己后面再参考着来实现在我这种应用场景下的判断 29 | def is_content_url(url, threshold=0.4): 30 | ''' 31 | 判断一个url是否为内容界面,而不是列表界面或者主页又或者用户页等; 32 | param: 33 | url:传入的url; 34 | threshold:决定是否为内容界面的阈值; 35 | return: 36 | bool; 37 | ''' 38 | suffix = re.findall('[a-z]+', (url[-5:]).lower()) 39 | if len(suffix) != 0: 40 | if suffix[-1] in FILE_WORDS: 41 | return False 42 | score = 0 43 | if re.match("[0-9]"*10, url, flags=0) != None: 44 | score += 30 45 | for w in NEG_WORDS: 46 | if w in url: 47 | score -= 10 48 | for w in POS_WORDS: 49 | if w in url: 50 | score += 15 51 | if(score/(len(url)*2) >= threshold): 52 | return True 53 | else: 54 | return False 55 | 56 | # 暂时不用 57 | STOP_WORD = "javascript:" 58 | def url_filter(urls): 59 | cleaned_urls = [] 60 | for url in urls: 61 | if is_static_url(url): 62 | continue 63 | if STOP_WORD in url.lower(): 64 | continue 65 | cleaned_urls.append(url) 66 | return cleaned_urls 67 | -------------------------------------------------------------------------------- /Backend/accounts/migrations/0001_initial.py: -------------------------------------------------------------------------------- 1 | # Generated by Django 3.1.13 on 2021-10-25 09:45 2 | 3 | from django.db import migrations, models 4 | 5 | 6 | class Migration(migrations.Migration): 7 | 8 | initial = True 9 | 10 | dependencies = [ 11 | ('auth', '0012_alter_user_first_name_max_length'), 12 | ] 13 | 14 | operations = [ 15 | migrations.CreateModel( 16 | name='CustomUser', 17 | fields=[ 18 | ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), 19 | ('password', models.CharField(max_length=128, verbose_name='password')), 20 | ('last_login', models.DateTimeField(blank=True, null=True, verbose_name='last login')), 21 | ('is_superuser', models.BooleanField(default=False, help_text='Designates that this user has all permissions without explicitly assigning them.', verbose_name='superuser status')), 22 | ('email', models.EmailField(max_length=255, unique=True)), 23 | ('username', models.CharField(max_length=255, unique=True)), 24 | ('first_name', models.CharField(max_length=255)), 25 | ('last_name', models.CharField(max_length=255)), 26 | ('is_active', models.BooleanField(default=True)), 27 | ('is_staff', models.BooleanField(default=False)), 28 | ('is_admin', models.BooleanField(default=False)), 29 | ('groups', models.ManyToManyField(blank=True, help_text='The groups this user belongs to. 
A user will get all permissions granted to each of their groups.', related_name='user_set', related_query_name='user', to='auth.Group', verbose_name='groups')), 30 | ('user_permissions', models.ManyToManyField(blank=True, help_text='Specific permissions for this user.', related_name='user_set', related_query_name='user', to='auth.Permission', verbose_name='user permissions')), 31 | ], 32 | options={ 33 | 'abstract': False, 34 | }, 35 | ), 36 | ] 37 | -------------------------------------------------------------------------------- /frontend/src/components/Home/SampleNav.vue: -------------------------------------------------------------------------------- 1 | 24 | 38 | 39 | 86 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/datetime.py: -------------------------------------------------------------------------------- 1 | import re 2 | from dateparser import parse 3 | from lxml.html import HtmlElement 4 | from gerapy_auto_extractor.patterns.datetime import METAS_CONTENT, REGEXES 5 | from loguru import logger 6 | from gerapy_auto_extractor.extractors.base import BaseExtractor 7 | 8 | 9 | class DatetimeExtractor(BaseExtractor): 10 | """ 11 | Datetime Extractor which auto extract datetime info. 12 | """ 13 | 14 | def extract_by_regex(self, element: HtmlElement) -> str: 15 | """ 16 | extract datetime according to predefined regex 17 | :param element: 18 | :return: 19 | """ 20 | text = ''.join(element.xpath('.//text()')) 21 | for regex in REGEXES: 22 | result = re.search(regex, text) 23 | if result: 24 | return result.group(1) 25 | 26 | def extract_by_meta(self, element: HtmlElement) -> str: 27 | """ 28 | extract according to meta 29 | :param element: 30 | :return: str 31 | """ 32 | for xpath in METAS_CONTENT: 33 | datetime = element.xpath(xpath) 34 | if datetime: 35 | return ''.join(datetime) 36 | 37 | 38 | def process(self, element: HtmlElement): 39 | """ 40 | extract datetime 41 | :param html: 42 | :return: 43 | """ 44 | return self.extract_by_meta(element) or \ 45 | self.extract_by_regex(element) 46 | 47 | 48 | datetime_extractor = DatetimeExtractor() 49 | 50 | 51 | def parse_datetime(datetime): 52 | """ 53 | parse datetime using dateparser lib 54 | :param datetime: 55 | :return: 56 | """ 57 | if not datetime: 58 | return None 59 | try: 60 | return parse(datetime) 61 | except TypeError: 62 | logger.exception(f'Error Occurred while parsing datetime extracted. 
datetime is {datetime}') 63 | 64 | 65 | def extract_datetime(html, parse=True): 66 | """ 67 | extract datetime from html 68 | :param parse: 69 | :param html: 70 | :return: 71 | """ 72 | result = datetime_extractor.extract(html) 73 | if not parse: 74 | return result 75 | return parse_datetime(result) 76 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/content.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gerapy_auto_extractor.schemas.element import Element 3 | from gerapy_auto_extractor.utils.preprocess import preprocess4content_extractor 4 | from gerapy_auto_extractor.extractors.base import BaseExtractor 5 | from gerapy_auto_extractor.utils.element import descendants_of_body 6 | 7 | 8 | class ContentExtractor(BaseExtractor): 9 | """ 10 | extract content from detail page 11 | """ 12 | 13 | def process(self, element: Element): 14 | """ 15 | extract content from html 16 | :param element: 17 | :return: 18 | """ 19 | # preprocess 20 | preprocess4content_extractor(element) 21 | 22 | # start to evaluate every child element 23 | element_infos = [] 24 | descendants = descendants_of_body(element) 25 | 26 | # get std of density_of_text among all elements 27 | density_of_text = [descendant.density_of_text for descendant in descendants] 28 | density_of_text_std = np.std(density_of_text, ddof=1) 29 | 30 | # get density_score of every element 31 | for descendant in descendants: 32 | score = np.log(density_of_text_std) * \ 33 | descendant.density_of_text * \ 34 | np.log10(descendant.number_of_p_descendants + 2) * \ 35 | np.log(descendant.density_of_punctuation) 36 | descendant.density_score = score 37 | 38 | # sort element info by density_score 39 | descendants = sorted(descendants, key=lambda x: x.density_score, reverse=True) 40 | descendant_first = descendants[0] if descendants else None 41 | if descendant_first is None: 42 | return None 43 | paragraphs = descendant_first.xpath('.//p//text()') 44 | paragraphs = [paragraph.strip() if paragraph else '' for paragraph in paragraphs] 45 | paragraphs = list(filter(lambda x: x, paragraphs)) 46 | text = '\n'.join(paragraphs) 47 | text = text.strip() 48 | return text 49 | 50 | 51 | content_extractor = ContentExtractor() 52 | 53 | 54 | def extract_content(html): 55 | """ 56 | extract content from detail html 57 | :return: 58 | """ 59 | return content_extractor.extract(html) 60 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/List.vue: -------------------------------------------------------------------------------- 1 | 28 | 29 | 60 | 61 | 98 | -------------------------------------------------------------------------------- /frontend/src/router/index.js: -------------------------------------------------------------------------------- 1 | import { createRouter, createWebHistory } from 'vue-router' 2 | import Home from '../views/Home.vue' 3 | import store from '@/store/index.js' 4 | import { authorization, refreshToken } from "@/api/index.js"; 5 | 6 | const routes = [ 7 | { 8 | path: '/', 9 | name: 'Home', 10 | component: Home 11 | }, 12 | { 13 | path: '/search', 14 | name: 'search', 15 | component: () => import('../views/ResultList.vue') 16 | }, 17 | { 18 | path: '/login', 19 | name: 'login', 20 | component: () => import('../views/Login.vue') 21 | }, 22 | { 23 | path: '/register', 24 | name: 'register', 25 | component: () => 
import('../views/Register.vue') 26 | }, 27 | { 28 | path: '/userprofile', 29 | name: 'userprofile', 30 | component: () => import('../views/UserProfile.vue') 31 | }, 32 | { 33 | path: '/activate', 34 | name: 'activate', 35 | component: () => import('../views/activate.vue') 36 | } 37 | ] 38 | 39 | const router = createRouter({ 40 | history: createWebHistory(process.env.BASE_URL), 41 | routes 42 | }) 43 | // 注册全局前置守卫 44 | router.beforeEach(async (to, from, next) => { 45 | // 动态设置title 46 | // to.meta && setTitle(to.meta.title) 47 | // 获取token 48 | console.log() 49 | const access = store.state.Jwt.access || '' 50 | const refresh = store.state.Jwt.refresh || '' 51 | 52 | if (access) { // 已登录 53 | console.log("已经登录:", access) 54 | // 调用接口判断access是否失效 55 | let res = await authorization(access).then((data) => data).catch((err) => err) 56 | let code = res.status || '' 57 | if (code == 200) { 58 | if (to.name === 'login') next({ name: 'Home' }) 59 | else next() 60 | } else { 61 | // 失效就使用刷新token 62 | console.log("使用刷新token") 63 | let res = await refreshToken(refresh).then((data) => data).catch((err) => err) 64 | let code = res.status || '' 65 | if (code == 200) { 66 | console.log("刷新成功...") 67 | store.commit('SetJwt', { "access": res.data.access, "refresh": refresh }) 68 | } else { 69 | store.commit('SetJwt', '') 70 | next({ name: 'login' }) 71 | } 72 | } 73 | } else { // 未登录 74 | // 如果去的页面是登陆页,直接跳到登陆页 75 | if (to.name != 'userprofile') next() 76 | // 如果不是登陆页,强行跳转到登陆页 77 | else next({ name: 'login' }) 78 | } 79 | }) 80 | 81 | export default router 82 | -------------------------------------------------------------------------------- /Crawler/Crawler/items.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from Crawler.models.es_blogs import BlogsIndex 3 | from Crawler.utils.common import real_time_count 4 | 5 | # 统计索引的数据 6 | COUNT_INIT = 0 7 | 8 | 9 | class DetailItem(scrapy.Item): 10 | page_url = scrapy.Field() # 当前网页的url 11 | 12 | encode = scrapy.Field() 13 | keywords = scrapy.Field() 14 | description = scrapy.Field() 15 | lang = scrapy.Field() 16 | 17 | title = scrapy.Field() 18 | content = scrapy.Field() 19 | publish_time = scrapy.Field() 20 | 21 | urls = scrapy.Field() # 包含的url 22 | 23 | def save_to_mysql(self): 24 | # 插入的sql语句 25 | insert_sql = """ 26 | insert into search_blogs(page_url, urls) 27 | VALUES (%s, %s) 28 | """ 29 | 30 | sql_params = ( 31 | str(self['page_url']) or 'NAN', str(self['urls']) or 'NAN' 32 | ) 33 | return insert_sql, sql_params 34 | 35 | def save_to_es(self): 36 | blogs = BlogsIndex() 37 | blogs.suggest = self['title'] # 为title建立建议字段 38 | blogs.page_url = self['page_url'] 39 | blogs.title = self['title'] 40 | blogs.keywords = self['keywords'] 41 | blogs.description = self['description'] 42 | blogs.content = self['content'] 43 | blogs.publish_time = self['publish_time'] 44 | 45 | real_time_count('view_count', COUNT_INIT) 46 | blogs.save() 47 | print("已建立索引到elasticsearch中......") 48 | 49 | def help_fields(self): 50 | for field in self.fields: 51 | print(field, "= scrapy.Field()") 52 | 53 | 54 | class ListItem(scrapy.Item): 55 | # 列表页不需要存内容以及标题,并且暂时不用建立索引,保存到数据库中就可以了 56 | page_url = scrapy.Field() # 当前网页的url 57 | 58 | encode = scrapy.Field() 59 | keywords = scrapy.Field() 60 | description = scrapy.Field() 61 | lang = scrapy.Field() 62 | 63 | publish_time = scrapy.Field() 64 | 65 | urls = scrapy.Field() # 包含的url 66 | 67 | def save_to_mysql(self): 68 | # 插入的sql语句 69 | insert_sql = """ 70 | insert into 
search_list(page_url, urls) 71 | VALUES (%s, %s) 72 | """ 73 | sql_params = ( 74 | str(self['page_url']) or 'NAN', str(self['urls']) or 'NAN' 75 | ) 76 | 77 | return insert_sql, sql_params 78 | 79 | 80 | def help_fields(self): 81 | for field in self.fields: 82 | print(field, "= scrapy.Field()") -------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
1 | # XiuSearch 2 | 3 | ### 简介 4 | XiuSearch是一款搜索技术博客的搜索引擎,当然,如果你将种子网址换成新闻网站,这就是一个新闻搜索引擎,它对于文章搜索来说是通用的。 5 | 6 | [演示链接](http://justin3go.cc/) 7 | [接口文档](http://justin3go.cc:8000/api/v1/docs/) 8 | 9 | > 如果网址失效,下方视频中也有演示的效果. 10 | 11 | [视频介绍链接](https://www.bilibili.com/video/BV16m4y1X78V) 12 | 13 | 项目架构图 14 | 15 | ![image-20220122123051493](https://webplus-cn-shenzhen-s-6130b804f968dd14cecc43e2.oss-cn-shenzhen.aliyuncs.com/blogs/image-20220122123051493.png) 16 | 17 | ### 功能 18 | + 历史记录与搜索建议 19 | + 检索使用elasticsearch→快 20 | + 倒排索引 21 | + 向量空间模型与布尔模型 22 | + 关键词高亮 23 | + Swagger文档(采用前后端分离开发) 24 | + 适合搜索引擎的爬虫 25 | + 断点续爬 26 | + 分页显示 27 | + JWT登录 28 | + 邮箱注册(重置密码、重置邮箱) 29 | + pagerank 30 | + 正文标题提取 31 | + 列表页详情页区分 32 | + redis统计实时爬取数量(没有展示在前端) 33 | 34 | 35 | ### 主要技术栈 36 | + Scrapy 2.5.1 37 | + ElasticSearch 7.15.2 38 | + Django 3.1 39 | + DjangoRestFramework 3.12 40 | + Vue3 41 | 42 | ### 相关算法 43 | + PageRank 44 | + 投票机制实现内容提取 45 | + SVM二分类模型区分列表页与详情页 46 | 47 | ### 安装教程 48 | 49 | ```shell 50 | # 这个是直接导出的完整环境 51 | pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 52 | # 这个是我印象中使用的技术栈,也可以直接安装这个 53 | pip install -r requirements_.txt -i https://pypi.tuna.tsinghua.edu.cn/simple 54 | ``` 55 | 56 | ### 使用说明 57 | 58 | 1. 修改根目录的config.py,其中包含elasticsearch,mysql,redis的配置(这里省略这三部分的安装,请自行百度google)。 59 | 60 | 2. 修改/Backend/Backend/settings.py 61 | 62 | ```python 63 | # 修改数据库配置 64 | DATABASES = { 65 | 'default': { 66 | 'ENGINE': 'django.db.backends.mysql', 67 | 'PASSWORD': 'xxxxxx', 68 | 'NAME': 'xxxx', 69 | 'USER': 'xxxx', 70 | } 71 | } 72 | # 修改邮箱配置 73 | # EMAIL CONFIG 74 | EMAIL_BACKEND = "django.core.mail.backends.smtp.EmailBackend" 75 | EMAIL_HOST = "smtp.qq.com" 76 | EMAIL_HOST_USER = "justin3go@qq.com" 77 | EMAIL_HOST_PASSWORD = "xxxxxxx" # 这个不是qq密码,需要自己去qq邮箱申请 78 | EMAIL_PORT = 25 79 | # 如果部署,则需要如下配置,原因是阿里云不支持25端口发邮件 80 | EMAIL_USE_TLS = True 81 | EMAIL_PORT = 465 82 | DEFAULT_FROM_EMAIL = EMAIL_HOST_USER 83 | ``` 84 | 85 | 3. 爬取数据 86 | 87 | ```shell 88 | cd ./Crawler 89 | scrapy crawl blog1 90 | ``` 91 | 92 | 4. 运行django 93 | 94 | ```shell 95 | cd ./Backend 96 | # 迁移数据库 97 | python manage.py makemigrations 98 | python manage.py migrate 99 | # 运行 100 | python manage.py runserver 101 | # 打开localhost:8000/api/v1/docs/ 可以看到swagger文档 102 | # 效果应该和 http://justin3go.cc:8000/api/v1/docs/ 一样 103 | ``` 104 | 105 | 5. 运行vue 106 | 107 | ```shell 108 | cd frontend 109 | npm run serve 110 | ``` 111 | **欢迎issue,感谢⭐star⭐** 112 | 113 |
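As a quick sanity check after step 4, the two endpoints implemented in `Backend/search_blogs/views.py` (`SearchView` and `SearchSuggest`) can be called directly. The sketch below is illustrative only and requires the `requests` package; the URL prefixes (`/api/v1/search/`, `/api/v1/suggest/`) are assumptions, since the actual routes are registered in `Backend/urls.py` / `search_blogs/urls.py`, so adjust the paths to whatever the Swagger page at `/api/v1/docs/` shows.

```python
# Minimal smoke test for the search backend (sketch; URL paths are assumptions).
import requests

BASE = "http://localhost:8000/api/v1"  # assumed prefix, check Backend/urls.py

# SearchView: `q` is the query string, `p` is the 1-based page number (10 hits per page).
data = requests.get(f"{BASE}/search/", params={"q": "python", "p": 1}).json()
print(data["totalNums"], "hits,", data["pageNums"], "pages,", data["searchCostTime"], "s")
for hit in data["hitList"]:
    # each hit carries the relevance score, the source URL and the (highlighted) title
    print(hit["score"], hit["page_url"], hit["title"])

# SearchSuggest: `input` is the partial query; the response is a plain list of suggested titles.
print(requests.get(f"{BASE}/suggest/", params={"input": "pyth"}).json())
```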
-------------------------------------------------------------------------------- /frontend/src/assets/logo.svg: --------------------------------------------------------------------------------
1 | 2 | 3 | 5 | 9 | 10 | 11 | 12 | 13 | 16 | 17 | 18 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 30 | 31 | 32 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 46 | 47 | 48 | 49 | 50 | -------------------------------------------------------------------------------- /.gitignore: --------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # pytype static type analyzer 135 | .pytype/ 136 | 137 | # Cython debug symbols 138 | cython_debug/ 139 | 140 | node_modules 141 | 142 | breakpoints -------------------------------------------------------------------------------- /draw/功能分析.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/DetailNav.vue: -------------------------------------------------------------------------------- 1 | 31 | 40 | 57 | 58 | 119 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/title.py: -------------------------------------------------------------------------------- 1 | from gerapy_auto_extractor.extractors.base import BaseExtractor 2 | from lxml.html import HtmlElement 3 | from gerapy_auto_extractor.patterns.title import METAS 4 | from gerapy_auto_extractor.utils.lcs import lcs_of_2 5 | from gerapy_auto_extractor.utils.similarity import similarity2 6 | 7 | 8 | class TitleExtractor(BaseExtractor): 9 | """ 10 | Title Extractor which extract title of page 11 | """ 12 | 13 | def extract_by_meta(self, element: HtmlElement) -> str: 14 | """ 15 | extract according to meta 16 | :param element: 17 | :return: str 18 | """ 19 | for xpath in METAS: 20 | title = element.xpath(xpath) 21 | if title: 22 | return ''.join(title) 23 | 24 | def extract_by_title(self, element: HtmlElement): 25 | """ 26 | get title from tag 27 | :param element: 28 | :return: 29 | """ 30 | return ''.join(element.xpath('//title//text()')).strip() 31 | 32 | def extract_by_hs(self, element: HtmlElement): 33 | """ 34 | get title from all h1-h3 tag 35 | :param element: 36 | :return: 37 | """ 38 | hs = element.xpath('//h1//text()|//h2//text()|//h3//text()') 39 | return hs or [] 40 | 41 | def extract_by_h(self, element: HtmlElement): 42 | """ 43 | extract by h tag, priority h1, h2, h3 44 | :param elemeent: 45 | :return: 46 | """ 47 | for xpath in ['//h1', '//h2', '//h3']: 48 | children = element.xpath(xpath) 49 | if not children: 50 | continue 51 | child = children[0] 52 | texts = child.xpath('./text()') 53 | if texts and len(texts): 54 | return texts[0].strip() 55 | 56 | def process(self, element: HtmlElement): 57 | """ 58 | extract title from element 59 | :param element: 60 | :return: 61 | """ 62 | title_extracted_by_meta = self.extract_by_meta(element) 63 | title_extracted_by_h = self.extract_by_h(element) 64 | title_extracted_by_hs = self.extract_by_hs(element) 65 | title_extracted_by_title = self.extract_by_title(element) 66 | 67 | # split logic to add more 68 | if title_extracted_by_meta: 69 | return title_extracted_by_meta 70 | 71 | # get most similar h 72 | title_extracted_by_hs = 
sorted(title_extracted_by_hs, 73 | key=lambda x: similarity2(x, title_extracted_by_title), 74 | reverse=True) 75 | if title_extracted_by_hs: 76 | return lcs_of_2(title_extracted_by_hs[0], title_extracted_by_title) 77 | 78 | if title_extracted_by_title: 79 | return title_extracted_by_title 80 | 81 | return title_extracted_by_h 82 | 83 | 84 | title_extractor = TitleExtractor() 85 | 86 | 87 | def extract_title(html): 88 | """ 89 | extract title from html 90 | :param html: 91 | :return: 92 | """ 93 | result = title_extractor.extract(html) 94 | return result 95 | -------------------------------------------------------------------------------- /Engine/pagerank.py: --------------------------------------------------------------------------------
1 | import numpy as np 2 | import random 3 | import pymysql 4 | import ast 5 | import sys 6 | sys.path.append("C:/My_app/code/咻Search") 7 | from config import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWORD 8 | 9 | 10 | class PageRank(): 11 | ''' 12 | G: 传入图的邻接矩阵 13 | T: 迭代计算次数上限 14 | eps: 误差上限 15 | beta: 公式里面的beta 16 | return: list 17 | 注:误差小于eps或者迭代次数大于T结束迭代计算 18 | ''' 19 | def __init__(self, G, T=300, eps=1e-6, beta=0.8) -> None: 20 | self.G = G 21 | self.N = len(G) 22 | self.T = T 23 | self.eps = eps 24 | self.beta = beta 25 | 26 | 27 | def GtoM(self, G): 28 | ''' 29 | 创建概率转换矩阵 30 | ''' 31 | M = np.zeros((self.N, self.N)) 32 | for i in range(self.N): 33 | D_i = sum(G[i]) 34 | if D_i == 0: 35 | continue 36 | for j in range(self.N): 37 | M[j][i] = G[i][j] / D_i #归一化并转置 38 | return M 39 | 40 | def computePR(self, M): 41 | ''' 42 | 计算PR值 43 | ''' 44 | R = np.ones(self.N) / self.N 45 | teleport = np.ones(self.N) / self.N 46 | for time in range(self.T): 47 | A = self.beta * M + (1-self.beta)*teleport 48 | R_new = np.dot(A, R) 49 | if np.linalg.norm(R_new - R) < self.eps: 50 | break 51 | R = R_new.copy() 52 | return np.around(R_new, 5) 53 | 54 | def getPR(self): 55 | M = self.GtoM(self.G) 56 | return self.computePR(M) 57 | 58 | def urls2G(): 59 | ''' 60 | 将数据库中urls的关系转化为图 61 | ''' 62 | # 连接数据库 63 | # 加上charset='utf8',避免 'latin-1' encoding 报错等问题 64 | conn = pymysql.connect(host=MYSQL_HOST, user=MYSQL_USER, passwd=MYSQL_PASSWORD, 65 | db=MYSQL_DBNAME, charset='utf8') 66 | # 创建cursor 67 | cursor_blogs = conn.cursor() 68 | cursor_list = conn.cursor() 69 | sql_blogs = 'SELECT page_url, urls FROM search_blogs;' 70 | sql_list = 'SELECT page_url, urls FROM search_list;' 71 | # 执行sql语句 72 | cursor_blogs.execute(sql_blogs) 73 | cursor_list.execute(sql_list) 74 | # 获取全部查询信息 75 | re_blogs = cursor_blogs.fetchall() 76 | re_list = cursor_list.fetchall() 77 | 78 | # 将获取的元组信息转换为图 79 | blogs_index = [url[0] for url in re_blogs] 80 | blogs_point = [ast.literal_eval(url[1]) for url in re_blogs] 81 | 82 | list_index = [url[0] for url in re_list] 83 | list_point = [ast.literal_eval(url[1]) for url in re_list] 84 | indexs = blogs_index + list_index 85 | points = blogs_point + list_point 86 | G = np.zeros((len(indexs), len(indexs))) 87 | for i, index in enumerate(indexs): 88 | # 依次判断包含的url是否在爬取过的列表中,有些广告之类的链接页会包含,但没爬取 89 | for p_url in points[i]: 90 | try: 91 | p_index = indexs.index(p_url) 92 | except: 93 | p_index = -1 94 | if p_index != -1: 95 | G[i][p_index] = 1 96 | 97 | return G 98 | 99 | if __name__ == "__main__": 100 | # def create_data(N, alpha=0.5): 101 | # G = np.zeros((N, N)) 102 | # for i in range(N): 103 | # for j in range(N): 104 | # if i == j: 105 | # continue 106 | # if random.random() < alpha: 107 | # G[i][j] = 1 108 | # return G 109 | # G = create_data(10) 
110 | # PR = PageRank(G) 111 | # print(PR.getPR()) 112 | G = urls2G() 113 | print(type(G)) 114 | PR = PageRank(G) 115 | print(PR.getPR()) -------------------------------------------------------------------------------- /frontend/src/views/UserProfile.vue: -------------------------------------------------------------------------------- 1 | <template> 2 | <div class="user-profile"> 3 | <div v-show="leftShow" class="left"></div> 4 | <div class="center"> 5 | <div class="user"> 6 | <img class="avator" :src="avator" alt="" /> 7 | <div class="username">{{ username }}</div> 8 | <div class="email">邮箱:{{ email }}</div> 9 | <div class="btns"> 10 | <div 11 | class="btn1" 12 | :class="{ active: IsActive1 }" 13 | @mouseenter="IsActive1 = true" 14 | @mouseleave="IsActive1 = false" 15 | @click="resetemail" 16 | > 17 | 重置邮箱 18 | </div> 19 | <div 20 | class="btn2" 21 | :class="{ active: IsActive2 }" 22 | @mouseenter="IsActive2 = true" 23 | @mouseleave="IsActive2 = false" 24 | @click="resetpassword" 25 | > 26 | 重置密码 27 | </div> 28 | </div> 29 | </div> 30 | <div class="content"> 31 | <img src="@/assets/userprofile.svg" alt="" /> 32 | </div> 33 | </div> 34 | <div v-show="rightShow" class="right"></div> 35 | </div> 36 | </template> 37 | <script> 38 | export default { 39 | data() { 40 | return { 41 | IsActive1: false, 42 | IsActive2: false, 43 | leftShow: false, 44 | rightShow: false, 45 | }; 46 | }, 47 | methods: { 48 | resetemail() { 49 | this.leftShow = !this.leftShow 50 | alert("别点了,不想写前端了...") 51 | }, 52 | resetpassword() { 53 | this.rightShow = !this.rightShow 54 | alert("别点了,不想写前端了...") 55 | }, 56 | }, 57 | }; 58 | </script> 59 | <script setup> 60 | import { onMounted } from "vue"; 61 | import { getIP } from "@/api/index.js"; 62 | import store from "@/store/index.js"; 63 | 64 | const avator = getIP() + store.state.UserProfile.avator || ""; 65 | const email = store.state.UserProfile.email; 66 | const id = store.state.UserProfile.id; 67 | const username = store.state.UserProfile.username; 68 | 69 | onMounted(async () => {}); 70 | </script> 71 | <style lang="less" scoped> 72 | .user-profile { 73 | margin: 0.3rem 0.3rem 0rem; 74 | .center { 75 | .user { 76 | margin: auto; 77 | padding: 0.2rem 0.2rem 0.3rem 0.2rem; 78 | background-color: #fff; 79 | width: 4.5rem; 80 | border-radius: 0.2rem; 81 | box-shadow: 0.1rem 0.1rem 0.1rem #999; 82 | .avator { 83 | margin-left: 1.55rem; 84 | height: 1rem; 85 | width: 1rem; 86 | border-radius: 0.5rem; 87 | } 88 | .username { 89 | text-align: center; 90 | font-size: 0.2rem; 91 | } 92 | .email { 93 | margin-top: 0.3rem; 94 | } 95 | .btns { 96 | margin-top: 0.2rem; 97 | color: #999; 98 | .btn1 { 99 | display: block; 100 | margin-top: 0.1rem; 101 | cursor: pointer; 102 | height: 0.3rem; 103 | line-height: 0.3rem; 104 | width: 0.75rem; 105 | text-align: center; 106 | border-radius: 0.05rem; 107 | } 108 | .btn2 { 109 | margin-top: 0.1rem; 110 | cursor: pointer; 111 | height: 0.3rem; 112 | line-height: 0.3rem; 113 | width: 0.75rem; 114 | text-align: center; 115 | border-radius: 0.05rem; 116 | } 117 | .active { 118 | color: #111; 119 | background-color: #999; 120 | } 121 | } 122 | } 123 | .content { 124 | z-index: 20; 125 | img { 126 | margin: 0.1rem 2.25rem 0; 127 | z-index: 1000; 128 | } 129 | } 130 | } 131 | } 132 | .left { 133 | position: absolute; 134 | top: 0.3rem; 135 | left: 0.3rem; 136 | margin: auto; 137 | padding: 0.2rem 0.2rem 0.3rem 0.2rem; 138 | background-color: #fff; 139 | width: 4.5rem; 140 | height: 2.5rem; 141 | border-radius: 0.2rem; 142 | box-shadow: 0.1rem 
0.1rem 0.1rem #999; 143 | } 144 | .right { 145 | position: absolute; 146 | top: 0.3rem; 147 | right: 0.3rem; 148 | margin: auto; 149 | padding: 0.2rem 0.2rem 0.3rem 0.2rem; 150 | background-color: #fff; 151 | width: 4.5rem; 152 | height: 2.5rem; 153 | border-radius: 0.2rem; 154 | box-shadow: 0.1rem 0.1rem 0.1rem #999; 155 | } 156 | </style> 157 | -------------------------------------------------------------------------------- /Crawler/Crawler/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class CrawlerSpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class CrawlerDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 
85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /frontend/src/assets/empty.svg: -------------------------------------------------------------------------------- 1 | <svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" width="797.5" height="834.5" viewBox="0 0 797.5 834.5" xmlns:xlink="http://www.w3.org/1999/xlink"><title>void -------------------------------------------------------------------------------- /Crawler/Crawler/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | from Crawler.items import DetailItem, ListItem 10 | import codecs,os,json 11 | import copy 12 | import pymysql 13 | import MySQLdb 14 | import MySQLdb.cursors 15 | from twisted.enterprise import adbapi 16 | from config import MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER, MYSQL_PASSWORD 17 | 18 | 19 | class ElasticSearchPipeline(object): 20 | """通用的ElasticSearch存储方法""" 21 | 22 | def process_item(self, item, spider): 23 | # 只有详情页才加索引 24 | if isinstance(item, DetailItem): 25 | item.save_to_es() 26 | return item 27 | 28 | class MysqlTwistedPipeline(object): 29 | 30 | def __init__(self, dbpool): 31 | self.dbpool = dbpool 32 | 33 | @classmethod 34 | def from_crawler(cls, crawler): 35 | # 读取settings中的配置 36 | params = dict( 37 | host=MYSQL_HOST, 38 | db=MYSQL_DBNAME, 39 | user=MYSQL_USER, 40 | passwd=MYSQL_PASSWORD, 41 | charset='utf8', 42 | cursorclass=pymysql.cursors.DictCursor, 43 | use_unicode=False 44 | ) 45 | # 创建连接池,pymysql为使用的连接模块 46 | dbpool = adbapi.ConnectionPool('pymysql', **params) 47 | return cls(dbpool) 48 | 49 | def process_item(self, item, spider): 50 | query = self.dbpool.runInteraction(self.do_insert, item) 51 | query.addErrback(self.handle_error, item, spider) 52 | print("已存入mysql中......") 53 | return item 54 | 55 | # 执行数据库操作的回调函数 56 | def do_insert(self, cursor, item): 57 | sql, params = item.save_to_mysql() 58 | cursor.execute(sql, params) 59 | 60 | # 当数据库操作失败的回调函数 61 | def handle_error(self, failue, item, spider): 62 | print(failue) 63 | 64 | # 使用json存到本地文件的代码 65 | # class CrawlerPipeline: 66 | # def __init__(self): 67 | # # 必须使用 w+ 模式打开文件,以便后续进行 读写操作(w+模式,意味既可读,亦可写) 68 | # # 注意:此处打开文件使用的不是 python 的 open 方法,而是 codecs 中的 open 方法 69 | # self.json_file = codecs.open('data.json', 'w+', encoding='UTF-8') 70 | 71 | # def open_spider(self, spider): 72 | # # 在爬虫开始时,首先写入一个 '[' 符号,构造一个 json 数组 73 | # # 为使得 Json 文件具有更高的易读性,我们辅助输出了 '\n'(换行符) 74 | # self.json_file.write('[\n') 75 | 76 | 77 | # def process_item(self, item, spider): 78 | # item_json = json.dumps(dict(item), 
ensure_ascii=False) 79 | # self.json_file.write('\t' + item_json + ',\n') 80 | # return item 81 | 82 | # if isinstance(item, DetailItem): 83 | # page_url = item['page_url'] 84 | # encode = item['encode'] 85 | # keywords = item['keywords'] 86 | # description = item['description'] 87 | # lang = item['lang'] 88 | # title = item['title'] 89 | # content = item['content'] 90 | # urls_cleaned = item['urls'] 91 | # publish_time = item['publish_time'] 92 | # f = open("./data.json", 'w+', encoding="utf-8") 93 | 94 | 95 | # if isinstance(item, ListItem): 96 | # page_url = item['page_url'] 97 | # encode = item['encode'] 98 | # keywords = item['keywords'] 99 | # description = item['description'] 100 | # lang = item['lang'] 101 | # urls_cleaned = item['urls'] 102 | # publish_time = item['publish_time'] 103 | 104 | # # 爬虫结束时执行的方法 105 | # def close_spider(self, spider): 106 | # # 在结束后,需要对 process_item 最后一次执行输出的 “逗号” 去除 107 | # # 当前文件指针处于文件尾,我们需要首先使用 SEEK 方法,定位到文件尾前的两个字符(一个','(逗号), 一个'\n'(换行符))的位置 108 | # self.json_file.seek(-2, os.SEEK_END) 109 | # # 使用 truncate() 方法,将后面的数据清空 110 | # self.json_file.truncate() 111 | # # 重新输出'\n',并输出']',与 open_spider(self, spider) 时输出的 '[' 相对应,构成一个完整的数组格式 112 | # self.json_file.write('\n]') 113 | # # 关闭文件 114 | # self.json_file.close() 115 | 116 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/patterns/datetime.py: -------------------------------------------------------------------------------- 1 | REGEXES = [ 2 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 3 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 4 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])", 5 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])", 6 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 7 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 8 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 9 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[0-1]?[0-9]:[0-5]?[0-9])", 10 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[2][0-3]:[0-5]?[0-9])", 11 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2}\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 12 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 13 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 14 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])", 15 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])", 16 | "(\d{4}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 17 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 18 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 19 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])", 20 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])", 21 | "(\d{2}年\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 22 | "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9]:[0-5]?[0-9])", 23 | "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9]:[0-5]?[0-9])", 24 | "(\d{1,2}月\d{1,2}日\s*?[0-1]?[0-9]:[0-5]?[0-9])", 25 | "(\d{1,2}月\d{1,2}日\s*?[2][0-3]:[0-5]?[0-9])", 26 | "(\d{1,2}月\d{1,2}日\s*?[1-24]\d时[0-60]\d分)([1-24]\d时)", 27 | "(\d{4}[-|/|.]\d{1,2}[-|/|.]\d{1,2})", 28 | "(\d{2}[-|/|.]\d{1,2}[-|/|.]\d{1,2})", 29 | "(\d{4}年\d{1,2}月\d{1,2}日)", 30 | "(\d{2}年\d{1,2}月\d{1,2}日)", 31 | "(\d{1,2}月\d{1,2}日)" 32 | ] 33 | 34 | METAS_CONTENT = [ 35 | '//meta[starts-with(@property, "rnews:datePublished")]/@content', 36 | '//meta[starts-with(@property, 
"article:published_time")]/@content', 37 | '//meta[starts-with(@property, "og:published_time")]/@content', 38 | '//meta[starts-with(@property, "og:release_date")]/@content', 39 | '//meta[starts-with(@itemprop, "datePublished")]/@content', 40 | '//meta[starts-with(@itemprop, "dateUpdate")]/@content', 41 | '//meta[starts-with(@name, "OriginalPublicationDate")]/@content', 42 | '//meta[starts-with(@name, "article_date_original")]/@content', 43 | '//meta[starts-with(@name, "og:time")]/@content', 44 | '//meta[starts-with(@name, "apub:time")]/@content', 45 | '//meta[starts-with(@name, "publication_date")]/@content', 46 | '//meta[starts-with(@name, "sailthru.date")]/@content', 47 | '//meta[starts-with(@name, "PublishDate")]/@content', 48 | '//meta[starts-with(@name, "publishdate")]/@content', 49 | '//meta[starts-with(@name, "PubDate")]/@content', 50 | '//meta[starts-with(@name, "pubtime")]/@content', 51 | '//meta[starts-with(@name, "_pubtime")]/@content', 52 | '//meta[starts-with(@name, "weibo: article:create_at")]/@content', 53 | '//meta[starts-with(@pubdate, "pubdate")]/@content', 54 | ] 55 | 56 | METAS_MATCH = [ 57 | '//meta[starts-with(@property, "rnews:datePublished")]', 58 | '//meta[starts-with(@property, "article:published_time")]', 59 | '//meta[starts-with(@property, "og:published_time")]', 60 | '//meta[starts-with(@property, "og:release_date")]', 61 | '//meta[starts-with(@itemprop, "datePublished")]', 62 | '//meta[starts-with(@itemprop, "dateUpdate")]', 63 | '//meta[starts-with(@name, "OriginalPublicationDate")]', 64 | '//meta[starts-with(@name, "article_date_original")]', 65 | '//meta[starts-with(@name, "og:time")]', 66 | '//meta[starts-with(@name, "apub:time")]', 67 | '//meta[starts-with(@name, "publication_date")]', 68 | '//meta[starts-with(@name, "sailthru.date")]', 69 | '//meta[starts-with(@name, "PublishDate")]', 70 | '//meta[starts-with(@name, "publishdate")]', 71 | '//meta[starts-with(@name, "PubDate")]', 72 | '//meta[starts-with(@name, "pubtime")]', 73 | '//meta[starts-with(@name, "_pubtime")]', 74 | '//meta[starts-with(@name, "weibo: article:create_at")]', 75 | '//meta[starts-with(@pubdate, "pubdate")]', 76 | ] 77 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/PageIndex.vue: -------------------------------------------------------------------------------- 1 | 39 | 40 | 115 | 116 | 169 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | from lxml.html import HtmlElement, etree 2 | 3 | from gerapy_auto_extractor.schemas.element import Element 4 | from gerapy_auto_extractor.utils.element import children, remove_element, remove_children 5 | 6 | CONTENT_EXTRACTOR_USELESS_TAGS = ['meta', 'style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 7 | 'path', 8 | 'symbol', 'img', 'footer', 'header'] 9 | CONTENT_EXTRACTOR_STRIP_TAGS = ['span', 'blockquote'] 10 | CONTENT_EXTRACTOR_NOISE_XPATHS = [ 11 | '//div[contains(@class, "comment")]', 12 | '//div[contains(@class, "advertisement")]', 13 | '//div[contains(@class, "advert")]', 14 | '//div[contains(@style, "display: none")]', 15 | ] 16 | 17 | 18 | def preprocess4content_extractor(element: HtmlElement): 19 | """ 20 | preprocess element for content extraction 21 | :param element: 22 | :return: 23 | """ 24 | # remove tag and its content 25 | etree.strip_elements(element, 
*CONTENT_EXTRACTOR_USELESS_TAGS) 26 | # only move tag pair 27 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 28 | 29 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATHS) 30 | 31 | for child in children(element): 32 | 33 | # merge text in span or strong to parent p tag 34 | if child.tag.lower() == 'p': 35 | etree.strip_tags(child, 'span') 36 | etree.strip_tags(child, 'strong') 37 | 38 | if not (child.text and child.text.strip()): 39 | remove_element(child) 40 | 41 | # if a div tag does not contain any sub node, it could be converted to p node. 42 | if child.tag.lower() == 'div' and not child.getchildren(): 43 | child.tag = 'p' 44 | 45 | 46 | LIST_EXTRACTOR_USELESS_TAGS = CONTENT_EXTRACTOR_USELESS_TAGS 47 | LIST_EXTRACTOR_STRIP_TAGS = CONTENT_EXTRACTOR_STRIP_TAGS 48 | LIST_EXTRACTOR_NOISE_XPATHS = CONTENT_EXTRACTOR_NOISE_XPATHS 49 | 50 | 51 | def preprocess4list_extractor(element: Element): 52 | """ 53 | preprocess element for list extraction 54 | :param element: 55 | :return: 56 | """ 57 | # remove tag and its content 58 | etree.strip_elements(element, *CONTENT_EXTRACTOR_USELESS_TAGS) 59 | # only move tag pair 60 | etree.strip_tags(element, *CONTENT_EXTRACTOR_STRIP_TAGS) 61 | 62 | remove_children(element, CONTENT_EXTRACTOR_NOISE_XPATHS) 63 | 64 | for child in children(element): 65 | 66 | # merge text in span or strong to parent p tag 67 | if child.tag.lower() == 'p': 68 | etree.strip_tags(child, 'span') 69 | etree.strip_tags(child, 'strong') 70 | 71 | if not (child.text and child.text.strip()): 72 | remove_element(child) 73 | 74 | # if a div tag does not contain any sub node, it could be converted to p node. 75 | if child.tag.lower() == 'div' and not child.getchildren(): 76 | child.tag = 'p' 77 | 78 | 79 | LIST_CLASSIFIER_USELESS_TAGS = ['style', 'script', 'link', 'video', 'audio', 'iframe', 'source', 'svg', 'path', 80 | 'symbol', 'footer', 'header'] 81 | LIST_CLASSIFIER_STRIP_TAGS = ['span', 'blockquote'] 82 | LIST_CLASSIFIER_NOISE_XPATHS = [ 83 | '//div[contains(@class, "comment")]', 84 | '//div[contains(@class, "advertisement")]', 85 | '//div[contains(@class, "advert")]', 86 | '//div[contains(@style, "display: none")]', 87 | ] 88 | 89 | 90 | def preprocess4list_classifier(element: HtmlElement): 91 | """ 92 | preprocess element for list classifier 93 | :param element: 94 | :return: 95 | """ 96 | # remove tag and its content 97 | etree.strip_elements(element, *LIST_CLASSIFIER_USELESS_TAGS) 98 | # only move tag pair 99 | etree.strip_tags(element, *LIST_CLASSIFIER_STRIP_TAGS) 100 | 101 | remove_children(element, LIST_CLASSIFIER_NOISE_XPATHS) 102 | 103 | for child in children(element): 104 | 105 | # merge text in span or strong to parent p tag 106 | if child.tag.lower() == 'p': 107 | etree.strip_tags(child, 'span') 108 | etree.strip_tags(child, 'strong') 109 | 110 | if not (child.text and child.text.strip()): 111 | remove_element(child) 112 | 113 | # if a div tag does not contain any sub node, it could be converted to p node. 
114 | if child.tag.lower() == 'div' and not child.getchildren(): 115 | child.tag = 'p' 116 | -------------------------------------------------------------------------------- /Crawler/Crawler/spiders/blog1.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("C:/My_app/code/咻Search/Engine") 3 | from Crawler.items import DetailItem, ListItem 4 | from urllib import parse 5 | import re 6 | import scrapy 7 | from url_parser import is_static_url 8 | from gerapy_auto_extractor.extractors.title import extract_title 9 | from gerapy_auto_extractor.extractors.datetime import extract_datetime 10 | from gerapy_auto_extractor.classifiers.detail import is_detail 11 | from gerapy_auto_extractor.classifiers.list import is_list 12 | from html_extractor import MainContent 13 | 14 | # from lxml.html.clean import Cleaner 15 | 16 | 17 | class Blog1Spider(scrapy.Spider): 18 | name = 'blog1' 19 | # allowed_domains = ['*'] 20 | start_urls = ['https://www.51cto.com/', 21 | 'https://www.iteye.com/', 'https://www.cnblogs.com/', 22 | 'http://www.blogjava.net/','https://blogread.cn//it/', 23 | 'http://blog.chinaunix.net/', 'https://www.oschina.net/', 24 | 'http://blog.itpub.net/', 'https://cuiqingcai.com/', 25 | 'http://blog.jobbole.com/', 'https://segmentfault.com/', 26 | 'https://www.infoq.cn/','https://www.v2ex.com/', 27 | 'https://www.jianshu.com/','https://blogs.360.cn/', 28 | 'https://tech.meituan.com/','http://www.ruanyifeng.com/blog/', 29 | 'http://it.deepinmind.com/','https://coolshell.cn/', 30 | 'https://imzl.com/','https://www.itzhai.com/', 31 | 'http://macshuo.com/','http://ifeve.com/', 32 | 'http://blog.zhaojie.me/','https://juejin.cn/', 33 | 'https://www.runoob.com/'] 34 | 35 | # 规则 36 | rule_encode = "//meta/@charset" 37 | rule_keywords = "//meta[@name='keywords']/@content" 38 | rule_description = "//meta[@name='description']/@content" 39 | rule_lang = "//@lang" 40 | rule_url = "//@href" # 简答地提取url的规则 41 | # 保留标签的 src 属性 42 | # safe_attrs = frozenset(['src']) 43 | # 删除 a 标签 44 | # remove_tags = frozenset(['script','style','link']) 45 | # 实例化 46 | # cleaner = Cleaner( 47 | # style=True, 48 | # scripts=True, 49 | # javascript=True, 50 | # meta=False, 51 | # # safe_attrs=safe_attrs, 52 | # # remove_tags=remove_tags, 53 | # ) 54 | 55 | def parse(self, response): 56 | page_url = response.request.url 57 | print("-"*100) 58 | print("开始爬取%s......" % page_url) 59 | if response.status == 200: 60 | # cleaned_html = self.cleaner.clean_html(response.body.decode('utf-8')) 61 | # with open("./test.html", 'w', encoding="utf-8") as f: 62 | # f.write(str(cleaned_html)) 63 | # sys.exit() 64 | # 获取内容 65 | encode = response.xpath(self.rule_encode).extract() 66 | keywords = response.xpath(self.rule_keywords).extract() 67 | description = response.xpath(self.rule_description).extract() 68 | lang = response.xpath(self.rule_lang).extract() 69 | # 这里代码检测有问题,实际没问题,只能说VS有点垃圾,继承关系都搞不懂 70 | publish_time = extract_datetime(response.body.decode('utf-8')) 71 | 72 | urls = response.xpath(self.rule_url).extract() 73 | urls_cleaned = [] 74 | for url in urls: 75 | if is_static_url(url) or "javascript:" in url.lower(): 76 | continue 77 | # 绝对链接不变,相对链接转换为绝对链接 78 | full_url = parse.urljoin(page_url, url) 79 | urls_cleaned.append(full_url) 80 | 81 | # 如果符合详情页规则,就下载该网页,提取其正文 82 | if is_detail(response.body, 0.3): 83 | print("该网页符合详情页规则.....") 84 | print("提取[ %s ]携带的正文标题中......" 
% page_url) 85 | 86 | extractor = MainContent() 87 | title, content = extractor.extract(page_url, response.body) 88 | 89 | # 保存... 90 | detail_item = DetailItem() 91 | detail_item['page_url'] = page_url 92 | detail_item['encode'] = encode 93 | detail_item['keywords'] = keywords 94 | detail_item['description'] = description 95 | detail_item['lang'] = lang 96 | detail_item['title'] = title 97 | detail_item['content'] = content 98 | detail_item['urls'] = urls_cleaned 99 | detail_item['publish_time'] = publish_time 100 | 101 | yield detail_item 102 | 103 | # 如果不符合详情页规则,就下载该网页,不提取其正文 104 | elif is_list(response.body, 0.9): 105 | print("该网页符合列表页规则.....") 106 | # 保存... 107 | list_item = ListItem() 108 | list_item['page_url'] = page_url 109 | list_item['encode'] = encode 110 | list_item['keywords'] = keywords 111 | list_item['description'] = description 112 | list_item['lang'] = lang 113 | list_item['urls'] = urls_cleaned 114 | list_item['publish_time'] = publish_time 115 | 116 | yield list_item 117 | else: 118 | print("跳过爬取!!!!!!") 119 | 120 | for url in urls_cleaned: 121 | yield scrapy.Request(url=url, callback=self.parse) 122 | 123 | else: 124 | print("[ %s ]未爬取成功......" % page_url) 125 | return 126 | -------------------------------------------------------------------------------- /frontend/src/views/Login.vue: -------------------------------------------------------------------------------- 1 | 54 | 55 | 162 | 211 | -------------------------------------------------------------------------------- /frontend/src/components/Home/SearchBox.vue: -------------------------------------------------------------------------------- 1 | 45 | 46 | 152 | 153 | 219 | -------------------------------------------------------------------------------- /draw/系统架构.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /Backend/Backend/settings.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from pathlib import Path 3 | import os 4 | 5 | # Build paths inside the project like this: BASE_DIR / 'subdir'. 6 | BASE_DIR = Path(__file__).resolve().parent.parent 7 | 8 | # Quick-start development settings - unsuitable for production 9 | # See https://docs.djangoproject.com/en/3.1/howto/deployment/checklist/ 10 | 11 | # SECURITY WARNING: keep the secret key used in production secret! 12 | SECRET_KEY = 'vk3^9_hs96iew8f%*$v_ir=_)3eq-=y#jw#e0^x1nq%as^c^3#' 13 | 14 | # SECURITY WARNING: don't run with debug turned on in production! 
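# A minimal hardening sketch (not wired into this project): the hard-coded values in this
# settings module (SECRET_KEY above, DEBUG below, and the database/email passwords further
# down) would normally be read from environment variables in a deployment, for example:
#   SECRET_KEY = os.environ.get("DJANGO_SECRET_KEY", SECRET_KEY)
#   DEBUG = os.environ.get("DJANGO_DEBUG", "false").lower() == "true"
# `os` is already imported at the top of this file, so only the assignments change.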
15 | DEBUG = True 16 | 17 | ALLOWED_HOSTS = [] 18 | 19 | # Installed Apps 20 | DJANGO_APPS = [ 21 | "django.contrib.admin", 22 | "django.contrib.auth", 23 | "django.contrib.contenttypes", 24 | "django.contrib.sessions", 25 | "django.contrib.messages", 26 | "django.contrib.staticfiles", 27 | ] 28 | 29 | PROJECT_APPS = [ 30 | "accounts", 31 | "search_blogs" 32 | ] 33 | 34 | THIRD_PARTY_APPS = [ 35 | "rest_framework", 36 | "drf_yasg", 37 | "djoser", 38 | "corsheaders", 39 | "rest_framework_simplejwt", 40 | "rest_framework_simplejwt.token_blacklist", 41 | ] 42 | 43 | INSTALLED_APPS = DJANGO_APPS + PROJECT_APPS + THIRD_PARTY_APPS 44 | 45 | MIDDLEWARE = [ 46 | "django.middleware.security.SecurityMiddleware", 47 | "django.contrib.sessions.middleware.SessionMiddleware", 48 | "corsheaders.middleware.CorsMiddleware", # middleware for cors-headers 49 | "django.middleware.common.CommonMiddleware", 50 | "django.middleware.csrf.CsrfViewMiddleware", 51 | "django.contrib.auth.middleware.AuthenticationMiddleware", 52 | "django.contrib.messages.middleware.MessageMiddleware", 53 | "django.middleware.clickjacking.XFrameOptionsMiddleware", 54 | ] 55 | 56 | ROOT_URLCONF = 'Backend.urls' 57 | 58 | TEMPLATES = [ 59 | { 60 | 'BACKEND': 'django.template.backends.django.DjangoTemplates', 61 | 'DIRS': [], 62 | 'APP_DIRS': True, 63 | 'OPTIONS': { 64 | 'context_processors': [ 65 | 'django.template.context_processors.debug', 66 | 'django.template.context_processors.request', 67 | 'django.contrib.auth.context_processors.auth', 68 | 'django.contrib.messages.context_processors.messages', 69 | ], 70 | }, 71 | }, 72 | ] 73 | 74 | WSGI_APPLICATION = 'Backend.wsgi.application' 75 | 76 | # Database 77 | # https://docs.djangoproject.com/en/3.1/ref/settings/#databases 78 | 79 | DATABASES = { 80 | 'default': { 81 | 'ENGINE': 'django.db.backends.mysql', 82 | 'PASSWORD': 'xxxx', 83 | 'NAME': 'xxxx', 84 | 'USER': 'root', 85 | } 86 | } 87 | 88 | # Password validation 89 | # https://docs.djangoproject.com/en/3.1/ref/settings/#auth-password-validators 90 | 91 | AUTH_PASSWORD_VALIDATORS = [ 92 | { 93 | 'NAME': 94 | 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', 95 | }, 96 | { 97 | 'NAME': 98 | 'django.contrib.auth.password_validation.MinimumLengthValidator', 99 | }, 100 | { 101 | 'NAME': 102 | 'django.contrib.auth.password_validation.CommonPasswordValidator', 103 | }, 104 | { 105 | 'NAME': 106 | 'django.contrib.auth.password_validation.NumericPasswordValidator', 107 | }, 108 | ] 109 | 110 | # Internationalization 111 | # https://docs.djangoproject.com/en/3.1/topics/i18n/ 112 | 113 | LANGUAGE_CODE = 'en-us' 114 | 115 | TIME_ZONE = 'UTC' 116 | 117 | USE_I18N = True 118 | 119 | USE_L10N = True 120 | 121 | USE_TZ = True 122 | 123 | # Static files (CSS, JavaScript, Images) 124 | # https://docs.djangoproject.com/en/3.1/howto/static-files/ 125 | 126 | STATIC_URL = '/static/' 127 | STATICFILES_DIRS=[ 128 | os.path.join(BASE_DIR,'static') 129 | ] 130 | 131 | 132 | # EMAIL CONFIG 133 | EMAIL_BACKEND = "django.core.mail.backends.smtp.EmailBackend" 134 | EMAIL_HOST = "smtp.qq.com" 135 | EMAIL_PORT = 25 136 | EMAIL_HOST_USER = "justin3go@qq.com" 137 | EMAIL_HOST_PASSWORD = "xxxxxxxxx" 138 | EMAIL_USE_TLS = True 139 | DEFAULT_FROM_EMAIL = EMAIL_HOST_USER 140 | 141 | REST_FRAMEWORK = { 142 | # "DEFAULT_PERMISSION_CLASSES": ["rest_framework.permissions.IsAuthenticated"], 143 | "DEFAULT_PERMISSION_CLASSES": ["rest_framework.permissions.AllowAny"], 144 | "DEFAULT_AUTHENTICATION_CLASSES": ( 145 | 
"rest_framework_simplejwt.authentication.JWTAuthentication", 146 | ), 147 | } 148 | 149 | 150 | SIMPLE_JWT = { 151 | "AUTH_HEADER_TYPES": ("JWT",), 152 | "ACCESS_TOKEN_LIFETIME": timedelta(minutes=60), 153 | "REFRESH_TOKEN_LIFETIME": timedelta(days=1), 154 | "AUTH_TOKEN_CLASSES": ("rest_framework_simplejwt.tokens.AccessToken",), 155 | } 156 | DOMAIN = ('localhost:8080') 157 | SITE_NAME = ('XiuSearch') 158 | # DJOSER CONFIG 159 | DJOSER = { 160 | "LOGIN_FIELD": "email", 161 | "USER_CREATE_PASSWORD_RETYPE": True, 162 | "USERNAME_CHANGED_EMAIL_CONFIRMATION": True, 163 | "PASSWORD_CHANGED_EMAIL_CONFIRMATION": True, 164 | "SEND_CONFIRMATION_EMAIL": True, 165 | "SET_USERNAME_RETYPE": True, 166 | "SET_PASSWORD_RETYPE": True, 167 | "USERNAME_RESET_CONFIRM_URL": "password/reset/confirm?uid={uid}&token={token}", 168 | "PASSWORD_RESET_CONFIRM_URL": "email/reset/confirm?uid={uid}&token={token}", 169 | "ACTIVATION_URL": "activate?uid={uid}&token={token}", 170 | "SEND_ACTIVATION_EMAIL": True, 171 | "SOCIAL_AUTH_TOKEN_STRATEGY": "djoser.social.token.jwt.TokenStrategy", 172 | # TODO 173 | "SOCIAL_AUTH_ALLOWED_REDIRECT_URIS": [ 174 | "your redirect url", 175 | "your redirect url", 176 | ], 177 | "SERIALIZERS": { 178 | "user_create": "accounts.serializers.MyUserCreateSerializer", # custom serializer 179 | "user": "accounts.serializers.MyUserSerializer", 180 | "current_user": "accounts.serializers.MyUserSerializer", 181 | "user_delete": "djoser.serializers.UserSerializer", 182 | }, 183 | } 184 | 185 | # CORS HEADERS 186 | CORS_ORIGIN_ALLOW_ALL = True 187 | CORS_ALLOW_CREDENTIALS = True 188 | 189 | # 覆盖django的用户类 190 | AUTH_USER_MODEL = 'accounts.CustomUser' -------------------------------------------------------------------------------- /Backend/search_blogs/views.py: -------------------------------------------------------------------------------- 1 | from config import ES_HOST 2 | from datetime import datetime 3 | 4 | from elasticsearch import Elasticsearch 5 | 6 | from django.utils.datastructures import OrderedSet 7 | from drf_yasg import openapi 8 | from drf_yasg.utils import swagger_auto_schema 9 | from rest_framework.views import APIView 10 | from rest_framework.response import Response 11 | from rest_framework import status 12 | from rest_framework.permissions import AllowAny 13 | 14 | from search_blogs.models import BlogsIndex 15 | import sys 16 | sys.path.append("C:/My_app/code/咻Search") 17 | # Create your views here. 
18 | # class IndexView(View): 19 | # pass 20 | 21 | client = Elasticsearch(hosts=[ES_HOST]) 22 | 23 | 24 | class SearchView(APIView): 25 | ''' 26 | 返回搜索结果的接口 27 | ''' 28 | permission_classes = [AllowAny] 29 | 30 | q = openapi.Parameter('q', 31 | openapi.IN_QUERY, 32 | description="查询语句", 33 | type=openapi.TYPE_STRING) 34 | p = openapi.Parameter('p', 35 | openapi.IN_QUERY, 36 | description="页码", 37 | type=openapi.TYPE_STRING) 38 | 39 | @swagger_auto_schema(manual_parameters=[q, p], responses={200: {}}) 40 | def get(self, request): 41 | # 获取参数 42 | key_words = request.query_params.get("q", "") 43 | page = request.query_params.get("p", "1") 44 | # key_words = q 45 | # s_type = ["title", "keywords", "description", "content"] 46 | # page = p 47 | 48 | try: 49 | page = int(page) 50 | except: 51 | page = 1 52 | try: 53 | start_time = datetime.now() # 计时 54 | response = client.search(index="blogs", 55 | body={ 56 | "query": { 57 | "multi_match": { 58 | "query": key_words, 59 | "fields": 60 | ["title", "content"] 61 | } 62 | }, 63 | "from": (page - 1) * 10, 64 | "size": 10, 65 | "highlight": { 66 | "pre_tags": 67 | [""], 68 | "post_tags": [""], 69 | "fields": { 70 | "title": {}, 71 | "content": {}, 72 | }, 73 | "fragment_size": 74 | 40 75 | } 76 | }) 77 | end_time = datetime.now() 78 | search_cost_time = (end_time - start_time).total_seconds() 79 | 80 | total_nums = response["hits"]["total"]["value"] 81 | 82 | if (total_nums % 10) > 0: 83 | page_nums = int(total_nums / 10) + 1 84 | else: 85 | page_nums = int(total_nums / 10) 86 | 87 | hit_list = [] 88 | # 这里封装的时候也可以重新排序-->不过elastic里面应该有,后面可以看看 89 | for hit in response["hits"]["hits"]: 90 | hit_dict = {} 91 | # title 92 | if "title" in hit["highlight"]: 93 | hit_dict["title"] = "".join(hit["highlight"].get( 94 | "title", "")) 95 | else: 96 | hit_dict["title"] = hit["_source"].get("title", "") 97 | 98 | # content 99 | if "content" in hit["highlight"]: 100 | hit_dict["content"] = "".join(hit["highlight"].get( 101 | "content", "")) # 取前五百个词 102 | else: 103 | hit_dict["content"] = hit["_source"].get("content", "") 104 | 105 | hit_dict["page_url"] = hit["_source"].get("page_url", "") 106 | hit_dict["score"] = hit["_score"] 107 | 108 | hit_list.append(hit_dict) 109 | 110 | result = { 111 | "page": page, 112 | "searchCostTime": search_cost_time, 113 | "totalNums": total_nums, 114 | "pageNums": page_nums, 115 | "hitList": hit_list, 116 | } 117 | return Response(result, status=status.HTTP_200_OK) 118 | except Exception as e: 119 | return Response(e, status=status.HTTP_500_INTERNAL_SERVER_ERROR) 120 | 121 | 122 | class SearchSuggest(APIView): 123 | ''' 124 | 根据输入返回搜索建议的接口 125 | ''' 126 | permission_classes = [AllowAny] 127 | 128 | input = openapi.Parameter('input', 129 | openapi.IN_QUERY, 130 | description="输入文本", 131 | type=openapi.TYPE_STRING) 132 | 133 | @swagger_auto_schema(manual_parameters=[input], responses={200: []}) 134 | def get(self, request): 135 | input_text = request.query_params.get("input", "") 136 | suggest_list = [] 137 | if input_text: 138 | s_ = BlogsIndex.search() 139 | s = s_.suggest('my_suggest', 140 | input_text, 141 | completion={ 142 | "field": "suggest", 143 | "fuzzy": { 144 | "fuzziness": 2 145 | }, 146 | "size": 8 147 | }) 148 | suggestions = s.execute() 149 | name_set = OrderedSet() 150 | for match in suggestions.suggest.my_suggest[0].options[:10]: 151 | source = match._source 152 | name_set.add(source["title"]) 153 | for name in name_set: 154 | suggest_list.append(name) 155 | return Response(suggest_list, 
status=status.HTTP_200_OK) 156 | -------------------------------------------------------------------------------- /frontend/src/views/Register.vue: -------------------------------------------------------------------------------- 1 | 67 | 68 | 202 | 253 | -------------------------------------------------------------------------------- /frontend/src/components/ResultList/SearchBoxDetail.vue: -------------------------------------------------------------------------------- 1 | 46 | 47 | 189 | 190 | 256 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 木兰宽松许可证, 第2版 2 | 3 | 木兰宽松许可证, 第2版 4 | 2020年1月 http://license.coscl.org.cn/MulanPSL2 5 | 6 | 7 | 您对“软件”的复制、使用、修改及分发受木兰宽松许可证,第2版(“本许可证”)的如下条款的约束: 8 | 9 | 0. 定义 10 | 11 | “软件”是指由“贡献”构成的许可在“本许可证”下的程序和相关文档的集合。 12 | 13 | “贡献”是指由任一“贡献者”许可在“本许可证”下的受版权法保护的作品。 14 | 15 | “贡献者”是指将受版权法保护的作品许可在“本许可证”下的自然人或“法人实体”。 16 | 17 | “法人实体”是指提交贡献的机构及其“关联实体”。 18 | 19 | “关联实体”是指,对“本许可证”下的行为方而言,控制、受控制或与其共同受控制的机构,此处的控制是指有受控方或共同受控方至少50%直接或间接的投票权、资金或其他有价证券。 20 | 21 | 1. 授予版权许可 22 | 23 | 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的版权许可,您可以复制、使用、修改、分发其“贡献”,不论修改与否。 24 | 25 | 2. 授予专利许可 26 | 27 | 每个“贡献者”根据“本许可证”授予您永久性的、全球性的、免费的、非独占的、不可撤销的(根据本条规定撤销除外)专利许可,供您制造、委托制造、使用、许诺销售、销售、进口其“贡献”或以其他方式转移其“贡献”。前述专利许可仅限于“贡献者”现在或将来拥有或控制的其“贡献”本身或其“贡献”与许可“贡献”时的“软件”结合而将必然会侵犯的专利权利要求,不包括对“贡献”的修改或包含“贡献”的其他结合。如果您或您的“关联实体”直接或间接地,就“软件”或其中的“贡献”对任何人发起专利侵权诉讼(包括反诉或交叉诉讼)或其他专利维权行动,指控其侵犯专利权,则“本许可证”授予您对“软件”的专利许可自您提起诉讼或发起维权行动之日终止。 28 | 29 | 3. 无商标许可 30 | 31 | “本许可证”不提供对“贡献者”的商品名称、商标、服务标志或产品名称的商标许可,但您为满足第4条规定的声明义务而必须使用除外。 32 | 33 | 4. 分发限制 34 | 35 | 您可以在任何媒介中将“软件”以源程序形式或可执行形式重新分发,不论修改与否,但您必须向接收者提供“本许可证”的副本,并保留“软件”中的版权、商标、专利及免责声明。 36 | 37 | 5. 免责声明与责任限制 38 | 39 | “软件”及其中的“贡献”在提供时不带任何明示或默示的担保。在任何情况下,“贡献者”或版权所有者不对任何人因使用“软件”或其中的“贡献”而引发的任何直接或间接损失承担责任,不论因何种原因导致或者基于何种法律理论,即使其曾被建议有此种损失的可能性。 40 | 41 | 6. 语言 42 | “本许可证”以中英文双语表述,中英文版本具有同等法律效力。如果中英文版本存在任何冲突不一致,以中文版为准。 43 | 44 | 条款结束 45 | 46 | 如何将木兰宽松许可证,第2版,应用到您的软件 47 | 48 | 如果您希望将木兰宽松许可证,第2版,应用到您的新软件,为了方便接收者查阅,建议您完成如下三步: 49 | 50 | 1, 请您补充如下声明中的空白,包括软件名、软件的首次发表年份以及您作为版权人的名字; 51 | 52 | 2, 请您在软件包的一级目录下创建以“LICENSE”为名的文件,将整个许可证文本放入该文件中; 53 | 54 | 3, 请将如下声明文本放入每个源文件的头部注释中。 55 | 56 | Copyright (c) [Year] [name of copyright holder] 57 | [Software Name] is licensed under Mulan PSL v2. 58 | You can use this software according to the terms and conditions of the Mulan PSL v2. 59 | You may obtain a copy of Mulan PSL v2 at: 60 | http://license.coscl.org.cn/MulanPSL2 61 | THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 62 | See the Mulan PSL v2 for more details. 63 | 64 | 65 | Mulan Permissive Software License,Version 2 66 | 67 | Mulan Permissive Software License,Version 2 (Mulan PSL v2) 68 | January 2020 http://license.coscl.org.cn/MulanPSL2 69 | 70 | Your reproduction, use, modification and distribution of the Software shall be subject to Mulan PSL v2 (this License) with the following terms and conditions: 71 | 72 | 0. Definition 73 | 74 | Software means the program and related documents which are licensed under this License and comprise all Contribution(s). 75 | 76 | Contribution means the copyrightable work licensed by a particular Contributor under this License. 77 | 78 | Contributor means the Individual or Legal Entity who licenses its copyrightable work under this License. 
79 | 80 | Legal Entity means the entity making a Contribution and all its Affiliates. 81 | 82 | Affiliates means entities that control, are controlled by, or are under common control with the acting entity under this License, ‘control’ means direct or indirect ownership of at least fifty percent (50%) of the voting power, capital or other securities of controlled or commonly controlled entity. 83 | 84 | 1. Grant of Copyright License 85 | 86 | Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable copyright license to reproduce, use, modify, or distribute its Contribution, with modification or not. 87 | 88 | 2. Grant of Patent License 89 | 90 | Subject to the terms and conditions of this License, each Contributor hereby grants to you a perpetual, worldwide, royalty-free, non-exclusive, irrevocable (except for revocation under this Section) patent license to make, have made, use, offer for sale, sell, import or otherwise transfer its Contribution, where such patent license is only limited to the patent claims owned or controlled by such Contributor now or in future which will be necessarily infringed by its Contribution alone, or by combination of the Contribution with the Software to which the Contribution was contributed. The patent license shall not apply to any modification of the Contribution, and any other combination which includes the Contribution. If you or your Affiliates directly or indirectly institute patent litigation (including a cross claim or counterclaim in a litigation) or other patent enforcement activities against any individual or entity by alleging that the Software or any Contribution in it infringes patents, then any patent license granted to you under this License for the Software shall terminate as of the date such litigation or activity is filed or taken. 91 | 92 | 3. No Trademark License 93 | 94 | No trademark license is granted to use the trade names, trademarks, service marks, or product names of Contributor, except as required to fulfill notice requirements in Section 4. 95 | 96 | 4. Distribution Restriction 97 | 98 | You may distribute the Software in any medium with or without modification, whether in source or executable forms, provided that you provide recipients with a copy of this License and retain copyright, patent, trademark and disclaimer statements in the Software. 99 | 100 | 5. Disclaimer of Warranty and Limitation of Liability 101 | 102 | THE SOFTWARE AND CONTRIBUTION IN IT ARE PROVIDED WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED. IN NO EVENT SHALL ANY CONTRIBUTOR OR COPYRIGHT HOLDER BE LIABLE TO YOU FOR ANY DAMAGES, INCLUDING, BUT NOT LIMITED TO ANY DIRECT, OR INDIRECT, SPECIAL OR CONSEQUENTIAL DAMAGES ARISING FROM YOUR USE OR INABILITY TO USE THE SOFTWARE OR THE CONTRIBUTION IN IT, NO MATTER HOW IT’S CAUSED OR BASED ON WHICH LEGAL THEORY, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 103 | 104 | 6. Language 105 | 106 | THIS LICENSE IS WRITTEN IN BOTH CHINESE AND ENGLISH, AND THE CHINESE VERSION AND ENGLISH VERSION SHALL HAVE THE SAME LEGAL EFFECT. IN THE CASE OF DIVERGENCE BETWEEN THE CHINESE AND ENGLISH VERSIONS, THE CHINESE VERSION SHALL PREVAIL. 
107 | 108 | END OF THE TERMS AND CONDITIONS 109 | 110 | How to Apply the Mulan Permissive Software License,Version 2 (Mulan PSL v2) to Your Software 111 | 112 | To apply the Mulan PSL v2 to your work, for easy identification by recipients, you are suggested to complete following three steps: 113 | 114 | i Fill in the blanks in following statement, including insert your software name, the year of the first publication of your software, and your name identified as the copyright owner; 115 | 116 | ii Create a file named “LICENSE” which contains the whole context of this License in the first directory of your software package; 117 | 118 | iii Attach the statement to the appropriate annotated syntax at the beginning of each source file. 119 | 120 | 121 | Copyright (c) [Year] [name of copyright holder] 122 | [Software Name] is licensed under Mulan PSL v2. 123 | You can use this software according to the terms and conditions of the Mulan PSL v2. 124 | You may obtain a copy of Mulan PSL v2 at: 125 | http://license.coscl.org.cn/MulanPSL2 126 | THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE. 127 | See the Mulan PSL v2 for more details. 128 | -------------------------------------------------------------------------------- /Engine/html_extractor.py: -------------------------------------------------------------------------------- 1 | import re 2 | import traceback 3 | 4 | import cchardet 5 | import lxml 6 | import lxml.html 7 | from lxml.html import HtmlComment 8 | 9 | REGEXES = { 10 | 'okMaybeItsACandidateRe': re.compile( 11 | 'and|article|artical|body|column|main|shadow', re.I), 12 | 'positiveRe': re.compile( 13 | ('article|arti|body|content|entry|hentry|main|page|' 14 | 'artical|zoom|arti|context|message|editor|' 15 | 'pagination|post|txt|text|blog|story'), re.I), 16 | 'negativeRe': re.compile( 17 | ('copyright|combx|comment|com-|contact|foot|footer|footnote|decl|copy|' 18 | 'notice|' 19 | 'masthead|media|meta|outbrain|promo|related|scroll|link|pagebottom|bottom|' 20 | 'other|shoutbox|sidebar|sponsor|shopping|tags|tool|widget'), re.I), 21 | } 22 | 23 | 24 | 25 | class MainContent: 26 | def __init__(self,): 27 | self.non_content_tag = set([ 28 | 'head', 29 | 'meta', 30 | 'script', 31 | 'style', 32 | 'object', 'embed', 33 | 'iframe', 34 | 'marquee', 35 | 'select', 36 | ]) 37 | self.title = '' 38 | self.p_space = re.compile(r'\s') 39 | self.p_html = re.compile(r'', re.IGNORECASE|re.DOTALL) 40 | self.p_content_stop = re.compile(r'正文.*结束|正文下|相关阅读|声明') 41 | self.p_clean_tree = re.compile(r'author|post-add|copyright') 42 | 43 | def get_title(self, doc): 44 | title = '' 45 | title_el = doc.xpath('//title') 46 | if title_el: 47 | title = title_el[0].text_content().strip() 48 | if len(title) < 7: 49 | tt = doc.xpath('//meta[@name="title"]') 50 | if tt: 51 | title = tt[0].get('content', '') 52 | if len(title) < 7: 53 | tt = doc.xpath('//*[contains(@id, "title") or contains(@class, "title")]') 54 | if not tt: 55 | tt = doc.xpath('//*[contains(@id, "font01") or contains(@class, "font01")]') 56 | for t in tt: 57 | ti = t.text_content().strip() 58 | if ti in title and len(ti)*2 > len(title): 59 | title = ti 60 | break 61 | if len(ti) > 20: continue 62 | if len(ti) > len(title) or len(ti) > 7: 63 | title = ti 64 | return title 65 | 66 | def shorten_title(self, title): 67 | spliters = [' - ', '–', '—', '-', '|', '::'] 68 | for s in spliters: 69 | if s not in title: 
70 | continue 71 | tts = title.split(s) 72 | if len(tts) < 2: 73 | continue 74 | title = tts[0] 75 | break 76 | return title 77 | 78 | def calc_node_weight(self, node): 79 | weight = 1 80 | attr = '%s %s %s' % ( 81 | node.get('class', ''), 82 | node.get('id', ''), 83 | node.get('style', '') 84 | ) 85 | if attr: 86 | mm = REGEXES['negativeRe'].findall(attr) 87 | weight -= 2 * len(mm) 88 | mm = REGEXES['positiveRe'].findall(attr) 89 | weight += 4 * len(mm) 90 | if node.tag in ['div', 'p', 'table']: 91 | weight += 2 92 | return weight 93 | 94 | def get_main_block(self, url, html, short_title=True): 95 | ''' return (title, etree_of_main_content_block) 96 | ''' 97 | if isinstance(html, bytes): 98 | encoding = cchardet.detect(html)['encoding'] 99 | if encoding is None: 100 | return None, None 101 | html = html.decode(encoding, 'ignore') 102 | try: 103 | doc = lxml.html.fromstring(html) 104 | doc.make_links_absolute(base_url=url) 105 | except : 106 | traceback.print_exc() 107 | return None, None 108 | self.title = self.get_title(doc) 109 | if short_title: 110 | self.title = self.shorten_title(self.title) 111 | body = doc.xpath('//body') 112 | if not body: 113 | return self.title, None 114 | candidates = [] 115 | nodes = body[0].getchildren() 116 | while nodes: 117 | node = nodes.pop(0) 118 | children = node.getchildren() 119 | tlen = 0 120 | for child in children: 121 | if isinstance(child, HtmlComment): 122 | continue 123 | if child.tag in self.non_content_tag: 124 | continue 125 | if child.tag == 'a': 126 | continue 127 | if child.tag == 'textarea': 128 | # FIXME: this tag is only part of content? 129 | continue 130 | attr = '%s%s%s' % (child.get('class', ''), 131 | child.get('id', ''), 132 | child.get('style')) 133 | if 'display' in attr and 'none' in attr: 134 | continue 135 | nodes.append(child) 136 | if child.tag == 'p': 137 | weight = 3 138 | else: 139 | weight = 1 140 | text = '' if not child.text else child.text.strip() 141 | tail = '' if not child.tail else child.tail.strip() 142 | tlen += (len(text) + len(tail)) * weight 143 | if tlen < 10: 144 | continue 145 | weight = self.calc_node_weight(node) 146 | candidates.append((node, tlen*weight)) 147 | if not candidates: 148 | return self.title, None 149 | candidates.sort(key=lambda a: a[1], reverse=True) 150 | good = candidates[0][0] 151 | if good.tag in ['p', 'pre', 'code', 'blockquote']: 152 | for i in range(5): 153 | good = good.getparent() 154 | if good.tag == 'div': 155 | break 156 | good = self.clean_etree(good, url) 157 | return self.title, good 158 | 159 | def clean_etree(self, tree, url=''): 160 | to_drop = [] 161 | drop_left = False 162 | for node in tree.iterdescendants(): 163 | if drop_left: 164 | to_drop.append(node) 165 | continue 166 | if isinstance(node, HtmlComment): 167 | to_drop.append(node) 168 | if self.p_content_stop.search(node.text): 169 | drop_left = True 170 | continue 171 | if node.tag in self.non_content_tag: 172 | to_drop.append(node) 173 | continue 174 | attr = '%s %s' % ( 175 | node.get('class', ''), 176 | node.get('id', '') 177 | ) 178 | if self.p_clean_tree.search(attr): 179 | to_drop.append(node) 180 | continue 181 | aa = node.xpath('.//a') 182 | if aa: 183 | text_node = len(self.p_space.sub('', node.text_content())) 184 | text_aa = 0 185 | for a in aa: 186 | alen = len(self.p_space.sub('', a.text_content())) 187 | if alen > 5: 188 | text_aa += alen 189 | if text_aa > text_node * 0.4: 190 | to_drop.append(node) 191 | for node in to_drop: 192 | try: 193 | node.drop_tree() 194 | except: 195 | pass 196 | 
return tree 197 | 198 | def get_text(self, doc): 199 | lxml.etree.strip_elements(doc, 'script') 200 | lxml.etree.strip_elements(doc, 'style') 201 | for ch in doc.iterdescendants(): 202 | if not isinstance(ch.tag, str): 203 | continue 204 | if ch.tag in ['div', 'h1', 'h2', 'h3', 'p', 'br', 'table', 'tr', 'dl']: 205 | if not ch.tail: 206 | ch.tail = '\n' 207 | else: 208 | ch.tail = '\n' + ch.tail.strip() + '\n' 209 | if ch.tag in ['th', 'td']: 210 | if not ch.text: 211 | ch.text = ' ' 212 | else: 213 | ch.text += ' ' 214 | # if ch.tail: 215 | # ch.tail = ch.tail.strip() 216 | lines = doc.text_content().split('\n') 217 | content = [] 218 | for l in lines: 219 | l = l.strip() 220 | if not l: 221 | continue 222 | content.append(l) 223 | return '\n'.join(content) 224 | 225 | def extract(self, url, html): 226 | '''return (title, content) 227 | ''' 228 | title, node = self.get_main_block(url, html) 229 | if node is None: 230 | print('\tno main block got !!!!!', url) 231 | return title, '', '' 232 | content = self.get_text(node) 233 | return title, content -------------------------------------------------------------------------------- /frontend/src/assets/userprofile.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/classifiers/list.py: -------------------------------------------------------------------------------- 1 | import joblib 2 | import numpy as np 3 | from glob import glob 4 | from loguru import logger 5 | from os.path import join, dirname, abspath 6 | from sklearn.metrics import classification_report 7 | from sklearn.model_selection import train_test_split, GridSearchCV 8 | from sklearn.svm import SVC 9 | from sklearn.preprocessing import StandardScaler 10 | from gerapy_auto_extractor.extractors.title import TitleExtractor 11 | from gerapy_auto_extractor.patterns.datetime import METAS_MATCH as DATETIME_METAS 12 | from gerapy_auto_extractor.schemas.element import Element 13 | from gerapy_auto_extractor.utils.element import number_of_p_descendants, \ 14 | number_of_a_descendants, number_of_punctuation, density_of_punctuation, density_of_text, number_of_clusters, \ 15 | file2element, number_of_a_char, number_of_char, number_of_p_children 16 | from gerapy_auto_extractor.utils.preprocess import preprocess4list_classifier 17 | from gerapy_auto_extractor.utils.similarity import similarity1 18 | from gerapy_auto_extractor.classifiers.base import BaseClassifier 19 | 20 | DATASETS_DIR = join(dirname(dirname(dirname(abspath(__file__)))), 'datasets') 21 | DATASETS_LIST_DIR = join(DATASETS_DIR, 'list') 22 | DATASETS_DETAIL_DIR = join(DATASETS_DIR, 'detail') 23 | 24 | MODELS_DIR = join(dirname(abspath(__file__)), 'models') 25 | 26 | 27 | class ListClassifier(BaseClassifier): 28 | 29 | def __init__(self, model_path=None, scaler_path=None): 30 | """ 31 | init features and extractors 32 | :param model_path: classifier model file 33 | """ 34 | self.model_path = model_path if model_path else join(MODELS_DIR, 'list_model.pkl') 35 | self.scaler_path = scaler_path if scaler_path else join(MODELS_DIR, 'list_scaler.pkl') 36 | self.title_extractor = TitleExtractor() 37 | self.feature_funcs = { 38 | 'number_of_a_char': number_of_a_char, 39 | 'number_of_a_char_log10': self._number_of_a_char_log10, 40 | 'number_of_char': number_of_char, 41 | 'number_of_char_log10': self._number_of_char_log10, 42 | 'rate_of_a_char': self._rate_of_a_char, 43 | 
'number_of_p_descendants': number_of_p_descendants, 44 | 'number_of_a_descendants': number_of_a_descendants, 45 | 'number_of_punctuation': number_of_punctuation, 46 | 'density_of_punctuation': density_of_punctuation, 47 | 'number_of_clusters': self._number_of_clusters, 48 | 'density_of_text': density_of_text, 49 | 'max_density_of_text': self._max_density_of_text, 50 | 'max_number_of_p_children': self._max_number_of_p_children, 51 | 'has_datetime_meta': self._has_datetime_mata, 52 | 'similarity_of_title': self._similarity_of_title, 53 | } 54 | self.feature_names = self.feature_funcs.keys() 55 | 56 | def _number_of_clusters(self, element: Element): 57 | """ 58 | get number of clusters like list 59 | :param element: 60 | :return: 61 | """ 62 | tags = ['div', 'li', 'ul'] 63 | return number_of_clusters(element, tags=tags) 64 | 65 | def _similarity_of_title(self, element: Element): 66 | """ 67 | get similarity of and (<h> or <meta>) 68 | :param element: 69 | :return: 70 | """ 71 | _title_extract_by_title = self.title_extractor.extract_by_title(element) 72 | _title_extract_by_meta = self.title_extractor.extract_by_meta(element) 73 | _title_extract_by_h = self.title_extractor.extract_by_h(element) 74 | 75 | _title_target = None 76 | if _title_extract_by_meta: 77 | _title_target = _title_extract_by_meta 78 | elif _title_extract_by_h: 79 | _title_target = _title_extract_by_h 80 | 81 | if not _title_target: 82 | return 2 83 | if not _title_extract_by_title: 84 | return 3 85 | return similarity1(_title_target, _title_extract_by_title) 86 | 87 | def _has_datetime_mata(self, element: Element): 88 | """ 89 | has datetime meta 90 | :param element: 91 | :return: 92 | """ 93 | for xpath in DATETIME_METAS: 94 | datetime = element.xpath(xpath) 95 | if datetime: 96 | return True 97 | return False 98 | 99 | def _max_number_of_p_children(self, element: Element): 100 | """ 101 | get max number of p children an element contains 102 | :param element: 103 | :return: 104 | """ 105 | _number_of_p_children_list = [] 106 | for descendant in element.descendants: 107 | _number_of_p_children = number_of_p_children(descendant) 108 | _number_of_p_children_list.append(_number_of_p_children) 109 | return max(_number_of_p_children_list) 110 | 111 | def _max_density_of_text(self, element: Element): 112 | """ 113 | get max density_of_text 114 | :param element: 115 | :return: 116 | """ 117 | _density_of_text_list = [] 118 | for descendant in element.descendants: 119 | _density_of_text = density_of_text(descendant) 120 | _density_of_text_list.append(_density_of_text) 121 | return np.max(_density_of_text_list) 122 | 123 | def _rate_of_a_char(self, element: Element): 124 | """ 125 | rate of a 126 | :param element: 127 | :return: 128 | """ 129 | _number_of_a_char = number_of_a_char(element) 130 | _number_of_char = number_of_char(element) 131 | if _number_of_char == 0: 132 | return 0 133 | return _number_of_a_char / _number_of_char 134 | 135 | def _number_of_char_log10(self, element: Element): 136 | """ 137 | log10 of number of char 138 | :param element: 139 | :return: 140 | """ 141 | if element is None: 142 | return 0 143 | return np.log10(number_of_char(element) + 1) 144 | 145 | def _number_of_a_char_log10(self, element: Element): 146 | """ 147 | log10 of number of a char 148 | :param element: 149 | :return: 150 | """ 151 | if element is None: 152 | return 0 153 | return np.log10(number_of_a_char(element) + 1) 154 | 155 | def features_to_list(self, features: dict): 156 | """ 157 | convert features to list 158 | :param features: 159 | 
:param label: 160 | :return: 161 | """ 162 | return [features.get(feature_name) for feature_name in self.feature_names] 163 | 164 | def features(self, element: Element): 165 | """ 166 | build feature map using element 167 | :param element: 168 | :return: 169 | """ 170 | features = {} 171 | for feature_name, feature_func in self.feature_funcs.items(): 172 | features[feature_name] = feature_func(element) 173 | return features 174 | 175 | def process(self, element: Element): 176 | """ 177 | get probability of list 178 | :param element: 179 | :return: 180 | """ 181 | preprocess4list_classifier(element) 182 | x = [self.features_to_list(self.features(element))] 183 | # scale 184 | ss = joblib.load(self.scaler_path) 185 | x = ss.transform(x) 186 | # load model 187 | clf = joblib.load(self.model_path) 188 | # predict 189 | result = clf.predict_proba(x) 190 | if result.any() and len(result) and len(result[0]): 191 | return result[0][1] 192 | return 0 193 | 194 | def train(self): 195 | """ 196 | build dataset 197 | :return: 198 | """ 199 | list_file_paths = list(glob(f'{DATASETS_LIST_DIR}/*.html')) 200 | detail_file_paths = list(glob(f'{DATASETS_DETAIL_DIR}/*.html')) 201 | 202 | x_data, y_data = [], [] 203 | 204 | for index, list_file_path in enumerate(list_file_paths): 205 | logger.log('inspect', f'list_file_path {list_file_path}') 206 | element = file2element(list_file_path) 207 | if element is None: 208 | continue 209 | preprocess4list_classifier(element) 210 | x = self.features_to_list(self.features(element)) 211 | x_data.append(x) 212 | y_data.append(1) 213 | 214 | for index, detail_file_path in enumerate(detail_file_paths): 215 | logger.log('inspect', f'detail_file_path {detail_file_path}') 216 | element = file2element(detail_file_path) 217 | if element is None: 218 | continue 219 | preprocess4list_classifier(element) 220 | x = self.features_to_list(self.features(element)) 221 | x_data.append(x) 222 | y_data.append(0) 223 | 224 | # preprocess data 225 | ss = StandardScaler() 226 | x_data = ss.fit_transform(x_data) 227 | joblib.dump(ss, self.scaler_path) 228 | x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=5) 229 | 230 | # set up grid search 231 | c_range = np.logspace(-5, 20, 5, base=2) 232 | gamma_range = np.logspace(-9, 10, 5, base=2) 233 | param_grid = [ 234 | {'kernel': ['rbf'], 'C': c_range, 'gamma': gamma_range}, 235 | {'kernel': ['linear'], 'C': c_range}, 236 | ] 237 | grid = GridSearchCV(SVC(probability=True), param_grid, cv=5, verbose=10, n_jobs=-1) 238 | clf = grid.fit(x_train, y_train) 239 | y_true, y_pred = y_test, clf.predict(x_test) 240 | logger.log('inspect', f'\n{classification_report(y_true, y_pred)}') 241 | score = grid.score(x_test, y_test) 242 | logger.log('inspect', f'test accuracy {score}') 243 | # save model 244 | joblib.dump(grid.best_estimator_, self.model_path) 245 | 246 | 247 | list_classifier = ListClassifier() 248 | 249 | 250 | def probability_of_list(html, **kwargs): 251 | """ 252 | get probability of list page 253 | :param html: 254 | :param kwargs: other kwargs 255 | :return: 256 | """ 257 | return list_classifier.classify(html, **kwargs) 258 | 259 | 260 | def is_list(html, threshold=0.5, **kwargs): 261 | """ 262 | judge if this page is list page 263 | :param html: source of html 264 | :param threshold: 265 | :param kwargs: 266 | :return: 267 | """ 268 | _probability_of_list = probability_of_list(html, **kwargs) 269 | if _probability_of_list > threshold: 270 | return True 271 | return False 272 | 
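# --- Illustrative usage sketch (editor-added for clarity; not part of the original file) ---
# The module-level helpers above wrap the pretrained SVM in models/list_model.pkl and its
# scaler, so classifying a page only needs its raw HTML. 'page.html' below is a hypothetical
# path, and classify() is assumed to be inherited from BaseClassifier (not shown here).
#
#     from gerapy_auto_extractor.classifiers.list import probability_of_list, is_list
#
#     with open('page.html', encoding='utf-8') as f:
#         html = f.read()
#     print(probability_of_list(html))       # probability that this is a list/index page
#     print(is_list(html, threshold=0.5))    # True when that probability exceeds the threshold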
-------------------------------------------------------------------------------- /Engine/gerapy_auto_extractor/extractors/list.py: -------------------------------------------------------------------------------- 1 | import math 2 | import operator 3 | from loguru import logger 4 | import numpy as np 5 | from collections import defaultdict 6 | from urllib.parse import urljoin 7 | from gerapy_auto_extractor.utils.cluster import cluster_dict 8 | from gerapy_auto_extractor.utils.preprocess import preprocess4list_extractor 9 | from gerapy_auto_extractor.extractors.base import BaseExtractor 10 | from gerapy_auto_extractor.utils.element import descendants_of_body 11 | from gerapy_auto_extractor.schemas.element import Element 12 | 13 | LIST_MIN_NUMBER = 5 14 | LIST_MIN_LENGTH = 8 15 | LIST_MAX_LENGTH = 44 16 | SIMILARITY_THRESHOLD = 0.8 17 | 18 | 19 | class ListExtractor(BaseExtractor): 20 | """ 21 | extract list from index page 22 | """ 23 | 24 | def __init__(self, min_number=LIST_MIN_NUMBER, min_length=LIST_MIN_LENGTH, max_length=LIST_MAX_LENGTH, 25 | similarity_threshold=SIMILARITY_THRESHOLD): 26 | """ 27 | init list extractor 28 | """ 29 | super(ListExtractor, self).__init__() 30 | self.min_number = min_number 31 | self.min_length = min_length 32 | self.max_length = max_length 33 | self.avg_length = (self.min_length + self.max_length) / 2 34 | self.similarity_threshold = similarity_threshold 35 | 36 | def _probability_of_title_with_length(self, length): 37 | """ 38 | get the probability of title according to length 39 | import matplotlib.pyplot as plt 40 | x = np.asarray(range(5, 40)) 41 | y = list_extractor.probability_of_title_with_length(x) 42 | plt.plot(x, y, 'g', label='m=0, sig=2') 43 | plt.show() 44 | :param length: 45 | :return: 46 | """ 47 | sigma = 6 48 | return np.exp(-1 * ((length - self.avg_length) ** 2) / (2 * (sigma ** 2))) / (math.sqrt(2 * np.pi) * sigma) 49 | 50 | def _build_clusters(self, element): 51 | """ 52 | build candidate clusters according to element 53 | :return: 54 | """ 55 | descendants_tree = defaultdict(list) 56 | descendants = descendants_of_body(element) 57 | for descendant in descendants: 58 | # if one element does not have enough siblings, it can not become a child of candidate element 59 | if descendant.number_of_siblings + 1 < self.min_number: 60 | continue 61 | # if min length is larger than specified max length, it can not become a child of candidate element 62 | if descendant.a_descendants_group_text_min_length > self.max_length: 63 | continue 64 | # if max length is smaller than specified min length, it can not become a child of candidate element 65 | if descendant.a_descendants_group_text_max_length < self.min_length: 66 | continue 67 | # descendant element must have same siblings which their similarity should not below similarity_threshold 68 | if descendant.similarity_with_siblings < self.similarity_threshold: 69 | continue 70 | descendants_tree[descendant.parent_selector].append(descendant) 71 | descendants_tree = dict(descendants_tree) 72 | 73 | # cut tree, remove parent block 74 | selectors = sorted(list(descendants_tree.keys())) 75 | last_selector = None 76 | for selector in selectors[::-1]: 77 | # if later selector 78 | if last_selector and selector and last_selector.startswith(selector): 79 | del descendants_tree[selector] 80 | last_selector = selector 81 | clusters = cluster_dict(descendants_tree) 82 | return clusters 83 | 84 | def _evaluate_cluster(self, cluster): 85 | """ 86 | calculate score of cluster using similarity, numbers, or other 
info 87 | :param cluster: 88 | :return: 89 | """ 90 | score = dict() 91 | 92 | # calculate avg_similarity_with_siblings 93 | score['avg_similarity_with_siblings'] = np.mean( 94 | [element.similarity_with_siblings for element in cluster]) 95 | 96 | # calculate number of elements 97 | score['number_of_elements'] = len(cluster) 98 | 99 | # calculate probability of it contains title 100 | # score['probability_of_title_with_length'] = np.mean([ 101 | # self._probability_of_title_with_length(len(a_descendant.text)) \ 102 | # for a_descendant in itertools.chain(*[element.a_descendants for element in cluster]) \ 103 | # ]) 104 | 105 | # TODO: add more quota to select best cluster 106 | score['clusters_score'] = \ 107 | score['avg_similarity_with_siblings'] \ 108 | * np.log10(score['number_of_elements'] + 1) \ 109 | # * clusters_score[cluster_id]['probability_of_title_with_length'] 110 | return score 111 | 112 | def _extend_cluster(self, cluster): 113 | """ 114 | extend cluster's elements by adding missed siblings 115 | :param cluster: 116 | :return: 117 | """ 118 | result = [element.selector for element in cluster] 119 | for element in cluster: 120 | path_raw = element.path_raw 121 | siblings = list(element.siblings) 122 | for sibling in siblings: 123 | # skip invalid element 124 | if not isinstance(sibling, Element): 125 | continue 126 | sibling_selector = sibling.selector 127 | sibling_path_raw = sibling.path_raw 128 | if sibling_path_raw != path_raw: 129 | continue 130 | # add missed sibling 131 | if sibling_selector not in result: 132 | cluster.append(sibling) 133 | result.append(sibling_selector) 134 | 135 | cluster = sorted(cluster, key=lambda x: x.nth) 136 | logger.log('inspect', f'cluster after extend {cluster}') 137 | return cluster 138 | 139 | def _best_cluster(self, clusters): 140 | """ 141 | use clustering algorithm to choose best cluster from candidate clusters 142 | :param clusters: 143 | :return: 144 | """ 145 | if not clusters: 146 | logger.log('inspect', 'there is no cluster, just return empty result') 147 | return [] 148 | if len(clusters) == 1: 149 | logger.log('inspect', 'there is only one cluster, just return first cluster') 150 | return clusters[0] 151 | # choose best cluster using score 152 | clusters_score = defaultdict(dict) 153 | clusters_score_arg_max = 0 154 | clusters_score_max = -1 155 | for cluster_id, cluster in clusters.items(): 156 | # calculate avg_similarity_with_siblings 157 | clusters_score[cluster_id] = self._evaluate_cluster(cluster) 158 | # get max score arg index 159 | if clusters_score[cluster_id]['clusters_score'] > clusters_score_max: 160 | clusters_score_max = clusters_score[cluster_id]['clusters_score'] 161 | clusters_score_arg_max = cluster_id 162 | logger.log('inspect', f'clusters_score {clusters_score}') 163 | best_cluster = clusters[clusters_score_arg_max] 164 | return best_cluster 165 | 166 | def _extract_cluster(self, cluster): 167 | """ 168 | extract title and href from best cluster 169 | :param cluster: 170 | :return: 171 | """ 172 | if not cluster: 173 | return None 174 | # get best tag path of title 175 | probabilities_of_title = defaultdict(list) 176 | for element in cluster: 177 | descendants = element.a_descendants 178 | for descendant in descendants: 179 | path = descendant.path 180 | descendant_text = descendant.text 181 | probability_of_title_with_length = self._probability_of_title_with_length(len(descendant_text)) 182 | # probability_of_title_with_descendants = self.probability_of_title_with_descendants(descendant) 183 | # TODO: add
more quota to calculate probability_of_title 184 | probability_of_title = probability_of_title_with_length 185 | probabilities_of_title[path].append(probability_of_title) 186 | 187 | # get most probable tag_path 188 | probabilities_of_title_avg = {k: np.mean(v) for k, v in probabilities_of_title.items()} 189 | if not probabilities_of_title_avg: 190 | return None 191 | best_path = max(probabilities_of_title_avg.items(), key=operator.itemgetter(1))[0] 192 | logger.log('inspect', f'best tag path {best_path}') 193 | 194 | # extract according to best tag path 195 | result = [] 196 | for element in cluster: 197 | descendants = element.a_descendants 198 | for descendant in descendants: 199 | path = descendant.path 200 | if path != best_path: 201 | continue 202 | title = descendant.text 203 | url = descendant.attrib.get('href') 204 | if not url: 205 | continue 206 | if url.startswith('//'): 207 | url = 'http:' + url 208 | base_url = self.kwargs.get('base_url') 209 | if base_url: 210 | url = urljoin(base_url, url) 211 | result.append({ 212 | 'title': title, 213 | 'url': url 214 | }) 215 | return result 216 | 217 | def process(self, element: Element): 218 | """ 219 | extract content from html 220 | :param element: 221 | :return: 222 | """ 223 | # preprocess 224 | preprocess4list_extractor(element) 225 | 226 | # build clusters 227 | clusters = self._build_clusters(element) 228 | logger.log('inspect', f'after build clusters {clusters}') 229 | 230 | # choose best cluster 231 | best_cluster = self._best_cluster(clusters) 232 | logger.log('inspect', f'best cluster {best_cluster}') 233 | 234 | extended_cluster = self._extend_cluster(best_cluster) 235 | logger.log('inspect', f'extended cluster {extended_cluster}') 236 | 237 | # extract result from best cluster 238 | return self._extract_cluster(best_cluster) 239 | 240 | 241 | list_extractor = ListExtractor() 242 | 243 | 244 | def extract_list(html, **kwargs): 245 | """ 246 | extract list from index html 247 | :param: base_url 248 | :return: 249 | """ 250 | return list_extractor.extract(html, **kwargs) 251 | -------------------------------------------------------------------------------- /frontend/src/assets/register.svg: -------------------------------------------------------------------------------- 1 | <svg xmlns="http://www.w3.org/2000/svg" data-name="Layer 1" width="861.34285" height="648.67936" viewBox="0 0 861.34285 648.67936" xmlns:xlink="http://www.w3.org/1999/xlink"><path 
d="M487.96668,298.31443c37.10109-23.94307,88.20046-14.64015,124.34793,10.72884a196.9347,196.9347,0,0,1,32.30722,28.82658c5.64318,6.18962,10.99283,12.68743,16.10981,19.409,1.26068,1.64909,2.51531,3.31471,3.74573,4.99679q1.26287,1.70574,2.50473,3.43526c10.37769,14.42492,19.89188,29.60015,29.20575,44.63751,23.25089,37.5382,47.10783,76.19386,82.53441,102.54908,7.29895,5.42842,17.482,10.31693,24.95986,5.13332,6.48912-4.50162,6.7671-13.96718,5.29023-21.72428-5.49032-28.74-25.29548-52.55662-46.33553-72.88759-21.03978-20.33084-44.332-38.97058-60.22963-63.5374-15.8979-24.56687-23.16127-57.36431-9.23631-83.09406,12.00085-22.17255,37.8208-34.60974,63.02364-35.189,25.20065-.57377,49.68543,9.22225,71.00676,22.6735,21.31941,13.45676,40.066,30.53133,59.63091,46.42986,11.79386,9.5917,28.26094,19.10907,41.11381,10.98808,10.85632-6.863,11.70313-23.33834,5.233-34.43131s-18.01809-18.0644-29.08878-24.56575q-51.5577-30.28842-103.11733-60.5713c-10.54549-6.18859-21.32085-12.575-29.10512-22.01161-7.78648-9.43112-12.10823-22.68629-7.81685-34.1389,4.23393-11.2862,16.05933-18.45281,28.02325-19.94843,11.95605-1.49217,24.0148,1.823,35.19936,6.31353,41.91245,16.83224,140.602,126.95133,166.90453,145.77839,16.90328,12.0919,34.65985,27.40012,36.36931,48.11155,2.1296,25.78425-23.526,47.18186-49.27763,49.68313-25.74808,2.50866-50.70386-8.68727-73.71019-20.52382s-46.48564-24.88724-72.30913-26.53029c-25.823-1.64282-54.79532,12.16074-60.91654,37.29958-5.32348,21.87428,8.20967,44.44617,25.83789,58.44235,13.79843,10.95842,29.99225,18.25607,45.63451,26.46205,4.3438,2.27173,8.64194,4.62083,12.8341,7.132a180.25086,180.25086,0,0,1,29.93459,22.5045,183.67679,183.67679,0,0,1,25.77414,29.64094,179.18159,179.18159,0,0,1,15.62068,27.17341q1.00641,2.1759,1.95446,4.4124c.59864,1.42551,1.17874,2.86947,1.72867,4.32782,7.17972,18.91307,9.97662,40.10751.8045,57.76316-9.14885,17.60784-28.96516,28.03412-48.75259,29.48a89.80071,89.80071,0,0,1-33.791-4.46408,137.2297,137.2297,0,0,1-23.12969-9.94084C794.52567,600.7,767.40355,571.54178,742.6339,541.566c-24.7637-29.9737-48.08255-61.5336-77.48153-86.97744-29.39734-25.44945-66.23669-44.69009-105.12049-44.39784-38.88188.2867-78.96477,23.97778-90.00792,61.25757-16.09646-27.73614-26.98109-59.35475-25.544-91.388S461.024,315.70525,487.96668,298.31443Z" transform="translate(-169.32858 -125.66032)" fill="#3f3d56"/><circle cx="469.48709" cy="272.01678" r="4.2718" fill="#251aff"/><circle cx="308.49926" cy="275.79175" r="6.60457" fill="#ff6584"/><circle cx="785.86508" cy="221.51745" r="2.44032" fill="#ff6584"/><circle cx="736.48895" cy="90.36211" r="2.44032" fill="#ff6584"/><path d="M843.02633,456.74236a26.07891,26.07891,0,0,1,2.78-5.68327c4.3438,2.27173,8.64194,4.62083,12.8341,7.132a180.25086,180.25086,0,0,1,29.93459,22.5045,26.02472,26.02472,0,0,1-45.54867-23.95319Z" transform="translate(-169.32858 -125.66032)" fill="#ff6584"/><circle cx="362.47378" cy="249.1678" r="2.49578" fill="#f0f0f0"/><circle cx="668.2498" cy="64.51306" r="2.49578" fill="#f0f0f0"/><circle cx="645.77831" cy="20.05675" r="2.49578" fill="#f0f0f0"/><circle cx="822.50254" cy="185.77006" r="2.49578" fill="#f0f0f0"/><circle cx="691.4035" cy="189.35566" r="2.49578" fill="#f0f0f0"/><circle cx="621.83508" cy="427.40632" r="2.49578" fill="#f0f0f0"/><circle cx="543.40782" cy="193.29038" r="2.49578" fill="#f0f0f0"/><circle cx="539.90427" cy="146.09722" r="2.49578" fill="#f0f0f0"/><path 
d="M866.14147,532.28381c-3.70615-.15179-7.30281-.16975-10.76983-.04989-21.35628.73815-35.58147,6.85689-39.02807,16.788s3.92974,23.54668,20.23724,37.35613q2.419,2.0484,5.04258,4.06347a80.0276,80.0276,0,0,1,24.51808-58.15768Z" transform="translate(-169.32858 -125.66032)" fill="none"/><path d="M846.04959,563.68214a79.49915,79.49915,0,0,1,20.09249-31.39921q1.80051-1.72372,3.7035-3.33106a80.08128,80.08128,0,0,1,44.50356-18.61538,179.18438,179.18438,0,0,1,15.62068,27.17341q1.00639,2.17594,1.95446,4.4124c.59864,1.42551,1.17874,2.86947,1.72867,4.32782,7.17972,18.91307,9.97662,40.10751.8045,57.76316-9.14885,17.60784-28.96517,28.03416-48.7526,29.48007a89.80131,89.80131,0,0,1-33.791-4.46411,79.90212,79.90212,0,0,1-10.1306-34.05964c-.09842-1.50666-.14666-3.01457-.16151-4.52952A79.68475,79.68475,0,0,1,846.04959,563.68214Z" transform="translate(-169.32858 -125.66032)" fill="#251aff"/><path d="M813.01622,547.86864c4.00872-11.55075,19.00577-18.34978,42.2361-19.15377,4.64842-.16374,9.76062,1.76955,14.59326.237,23.44495-7.43488,44.71546-2.57642,62.07868,12.9704.59863,1.42553,1.17876,2.86948,1.72869,4.32783-.23093-.08636-.46379-.16717-.69664-.248-23.462-8.14252-46.75216-12.8952-66.81423-13.7192-3.707-.14949-8.129-2.29424-10.76958-.04705-21.20758,18.04834-35.55151,10.486-39.02911,16.78748-5.07941,9.204,3.93167,23.5454,20.23837,37.35631q2.41941,2.05122,5.04,4.06041c.01485,1.51495.06309,3.02286.16151,4.52952q-3.94577-2.91643-7.47707-5.90033C816.56833,574.04719,809.00944,559.41384,813.01622,547.86864Z" transform="translate(-169.32858 -125.66032)" fill="#e4e4e4"/><circle cx="731.55933" cy="501.52259" r="2.49578" fill="#f0f0f0"/><circle cx="687.40279" cy="469.42248" r="2.49578" fill="#f0f0f0"/><circle cx="721.18621" cy="454.43059" r="24.27028" fill="#f0f0f0"/><path d="M509.19491,311.783c5.67345.26659,17.40774,3.08394,30.12317,6.72643,12.70782,3.64607,26.49182,8.09431,40.10029,12.4941q35.73268,11.54366,71.46211,23.08,4.91208,1.5836,9.85116,3.19521c1.26068,1.64909,2.51531,3.31471,3.74573,4.99679-2.89437-1.02316-5.8003-2.013-8.69749-2.95637-12.37844-4.02258-23.1198-6.67554-35.13327-10.33536-18.48821-5.62729-37.99248-13.0176-56.909-19.8995C544.823,322.19693,523.12131,314.78335,509.19491,311.783Z" transform="translate(-169.32858 -125.66032)" fill="#f0f0f0" opacity="0.3"/><path d="M932.75759,262.92414l53.58031,88.947-50.55816-90.71607a1.84948,1.84948,0,1,0-3.02215,1.76906Z" transform="translate(-169.32858 -125.66032)" fill="#f0f0f0" opacity="0.3"/><circle cx="737.2964" cy="397.26895" r="2.49578" fill="#f0f0f0"/><circle cx="299.53813" cy="375.5268" r="16" fill="#251aff"/><path d="M459.3879,492.40273a11.44042,11.44042,0,0,1-7.3141-15.94512L423.10256,447.935l20.66168-4.40845,24.344,27.76138a11.50245,11.50245,0,0,1-8.72027,21.11472Z" transform="translate(-169.32858 -125.66032)" fill="#a0616a"/><path d="M320.86565,270.42666l16.157,10.64925,18.91194,4.43s14.99907,107.19866,22.36959,113.62868,1.16,7.03958,3.15737,15.909,6.61965,27.01462,6.61965,27.01462c-44.89679,27.82945-85.006,31.22731-119.22623,4.57454a8.05237,8.05237,0,0,1,3.76528-9.36653c5.786-3.59436-6.76409-17.67495-2.02758-20.495s-3.63329-23.91315-3.63329-23.91315l1.46049-45.29872-4.0663-6.94806,5.72828-44.60031,14.76652-6.56237,5.05588-14.98693Z" transform="translate(-169.32858 -125.66032)" fill="#251aff"/><path d="M327.8667,274.18712l26.64115,8.8716s29.47047-3.11176,30.66466,40.75832-4.29794,67.41167-4.29794,67.41167S442.17772,514.507,375.77221,478.097L356.3667,401.68712l-20-120Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path 
d="M354.5843,371.17746l13.81877-1.90862s1.33426,20.8342,7.94429,23.99172,4.38173,3.67716,5.16118,7.01962-5.8779,12.60375-.64184,14.90694,51,60,51,60l27-25c-1.85564-5.02813-10.78512-.93686-11.345-6.26714-.70457-8.05889-14.8379-30.07437-17.84566-32.89719s-1.81872-7.79907-3.37763-14.484-.4767-.48633-3.07488-11.62786-37.80825-67.91285-39.03245-78.2-15.55843-16.34242-15.55843-16.34242l-11.14153,2.59818Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><polygon points="68.581 615.998 84.845 620.268 109.058 559.567 85.054 553.264 68.581 615.998" fill="#a0616a"/><path d="M232.881,758.47391l50.014,13.13347.16612-.6325a20.12793,20.12793,0,0,0-14.35407-24.579l-.00121-.00032-7.31533-9.33008-18.86557,2.45516-4.36588-1.14648Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><polygon points="151.957 628.165 168.771 628.164 176.771 563.303 151.953 563.305 151.957 628.165" fill="#a0616a"/><path d="M320.69272,771.36718l51.70963-.00195v-.654a20.12793,20.12793,0,0,0-20.12682-20.12651h-.00125l-9.44543-7.1658-17.62308,7.16689-4.51391.00015Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M279.2272,443.4438c-4.062,5.07751-2.031,22.341-2.031,22.341s-7.10852,55.85264-4.062,60.93015-2.031,9.13953-5.07752,16.248-6.093,24.37206-6.093,24.37206c-17.26354,14.217-16.248,79.2092-16.248,79.2092l-6.093,57.88364c2.031,6.093,30.46508,7.10852,34.52709,6.093S317.8163,566.3196,317.8163,566.3196s-2.031,138.10834-2.031,144.20136,27.41857,3.0465,33.51158,3.0465S393.979,464.76932,393.979,464.76932v-14.217l-5.07751-7.10851S283.28921,438.36629,279.2272,443.4438Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M288.8667,277.18712,263.92876,293.0758,251.8667,303.18712l16,102s-24.30892,92.71612,14,87,36.16111-89.03846,36.16111-89.03846Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M322.23542,422.083a11.44044,11.44044,0,0,1-16.54431-5.83342l-40.64366.98165,11.20149-17.91273,36.87863,1.814A11.50245,11.50245,0,0,1,322.2354,422.083Z" transform="translate(-169.32858 -125.66032)" fill="#a0616a"/><circle cx="132.62957" cy="105.61497" r="32.11879" fill="#a0616a"/><path d="M265.52656,210.17108c3.60658-5.57961,10.37456-9.29825,16.93562-8.25331a23.12708,23.12708,0,0,1,38.75042-13.4894,7.19154,7.19154,0,0,1,7.03282-.70384,14.82856,14.82856,0,0,1,5.71367,4.58627,33.28016,33.28016,0,0,1,4.65684,33.85669c.80185-2.915-2.11948-5.65739-5.07118-6.31152-2.95144-.65414-6.04757-.03529-9.04074-.46126-3.83627-.54587-7.28476-2.77134-11.12709-3.27218-3.22918-.42092-6.47658.407-9.62557,1.23693-3.149.82981-6.388,1.67606-9.62178,1.29208-3.234-.384-7.39959,11.87152-7.34369,20.5035.01062,1.63584-.32631,3.73609-1.925,4.08314-1.96872.42749-3.10954-2.28971-4.9834-3.0301a3.46671,3.46671,0,0,0-4.25212,2.26113,5.652,5.652,0,0,0,1.07909,5.06131,16.70142,16.70142,0,0,0,4.04925,3.51261l-.77175.64187c-1.28373,1.69591-3.94655,1.7694-5.7969.72015a12.5112,12.5112,0,0,1-4.16206-4.739c-3.4232-5.69369-6.426-11.76023-7.63408-18.29308S261.92023,215.75069,265.52656,210.17108Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path d="M276.8667,306.18712l-14.43307-12.3687s-16.85215,5.25847-19.70951,16.31358c-22.49139,28.75861-34.60418,62.34589-37.85742,100.05512,29.36371,11.09012,65.31909,15.02441,104,16l-8-30-55-10Z" transform="translate(-169.32858 -125.66032)" fill="#2f2e41"/><path 
d="M225.72794,757.227l-7.71-12.39-1.54,7.08008c-.27,1.24-.54,2.5-.79,3.75-2.18995-1.87012-4.52-3.6001-6.79981-5.26q-10.5-7.62012-20.99023-15.26l2.18994,12.7c1.3501,7.82007,2.76025,15.8,6.1001,22.94995.37011.81006.77,1.61011,1.20019,2.39014h32.54a10.48765,10.48765,0,0,0,.54-2.24011.77484.77484,0,0,0,.00976-.15C230.9882,765.947,228.308,761.37706,225.72794,757.227Z" transform="translate(-169.32858 -125.66032)" fill="#f2f2f2"/><path d="M513.29792,742.99305l14.58608-23.44,2.91353,13.39444c.51083,2.34587,1.02167,4.72962,1.49463,7.09443,4.143-3.538,8.55119-6.81084,12.8642-9.95114q19.86441-14.41611,39.71035-28.86963l-4.143,24.02639c-2.55418,14.79438-5.222,29.8913-11.54046,43.41783-.70021,1.53251-1.45676,3.04608-2.27059,4.52178H505.35179a19.84078,19.84078,0,0,1-1.02167-4.238,1.46446,1.46446,0,0,1-.01848-.28382C503.34632,759.48991,508.41681,750.84426,513.29792,742.99305Z" transform="translate(-169.32858 -125.66032)" fill="#f2f2f2"/><path d="M501.855,773.68712H434.87842A11.52467,11.52467,0,0,1,423.3667,762.1754V527.19884a11.52466,11.52466,0,0,1,11.51172-11.51172H501.855a11.52466,11.52466,0,0,1,11.51172,11.51172V762.1754A11.52467,11.52467,0,0,1,501.855,773.68712Z" transform="translate(-169.32858 -125.66032)" fill="#f2f2f2"/><path d="M486.04,521.68712H451.69336A4.332,4.332,0,0,1,447.3667,517.36v-12.3457a4.332,4.332,0,0,1,4.32666-4.32715H486.04a4.332,4.332,0,0,1,4.32666,4.32715V517.36A4.332,4.332,0,0,1,486.04,521.68712Z" transform="translate(-169.32858 -125.66032)" fill="#3f3d56"/><path d="M584.26926,774.03235l-413.75.30733a1.19069,1.19069,0,0,1,0-2.38137l413.75-.30733a1.19069,1.19069,0,0,1,0,2.38137Z" transform="translate(-169.32858 -125.66032)" fill="#cacaca"/></svg> --------------------------------------------------------------------------------