63 |
64 | {% endblock section %}
65 |
66 |
67 | {% block js %}
68 |
78 | {% endblock js %}
79 |
--------------------------------------------------------------------------------
/crawler/AutoPost.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import sys
3 |
4 | sys.path.append('../')
5 | import pymysql,time,os,random,shutil,platform
6 | from config import mysql_config
7 |
8 | dbhost = {
9 | "host": mysql_config['HOST'],
10 | "dbname": mysql_config['NAME'],
11 | "user": mysql_config['USER'],
12 | "password": mysql_config['PASSWORD']
13 | }
14 |
15 | def do_post(file_dir,sleep_time="0"):
16 | db = pymysql.connect(dbhost.get("host"),dbhost.get("user"), dbhost.get("password"),dbhost.get("dbname"))
17 | cursor = db.cursor()
18 | for files in os.walk(file_dir):
19 | tagidlist = []
20 | sysstr = platform.system()
21 | if sysstr == "Windows":
22 | title=files[0].split("\\")[-1]
23 | os_path=file_dir.split("\\")[-1]
24 | elif sysstr == "Linux":
25 | title = files[0].split("/")[-1]
26 | os_path = file_dir.split("/")[-1]
27 | if title != os_path:
28 | tags=['cosplay','萝莉','美腿','丝袜','少女']
29 | isExists = cursor.execute("SELECT * FROM images_page WHERE title = %s LIMIT 1", (title,))
30 | if isExists != 0:
31 | print("已存在:" + title)
32 | else:
33 | for tag in tags:
34 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag + "'" + " limit 1;"
35 | isExiststag = cursor.execute(sqltag)
36 | if isExiststag != 1:
37 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag)
38 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag + "'")
39 | for id in cursor.fetchall():
40 | tagidlist.append(id[0])
41 | p = (title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), "1", "1")
42 | cursor.execute("INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg) VALUES (%s,%s,%s,%s,%s)",
43 | p)
44 | pageid = cursor.lastrowid
45 | rpath = "".join(random.sample('abcdefghijklmnopqrstuvwxyz', 7))
46 | count = 1
47 | for name in files[2]:
48 | path=files[0]+"/"+name
49 | rename=str(count)+"."+name.split(".")[-1]
50 | path_isExists=os.path.exists("../static/images/"+rpath)
51 | if not path_isExists:
52 | os.makedirs("../static/images/"+rpath)
53 | try:
54 | shutil.move(path, "../static/images/"+rpath+"/"+rename)
55 | imgp = "/static/images/" + rpath+"/"+rename
56 | if count==1:
57 | cursor.execute(
58 | "UPDATE images_page SET firstimg = %s WHERE id=%s",(imgp,pageid))
59 | cursor.execute("INSERT INTO images_image (pageid,imageurl) VALUES (%s,%s)", (pageid,imgp))
60 |
61 | except Exception as e:
62 | print(e)
63 | break
64 | count+=1
65 | try:
66 | os.removedirs(files[0])
67 | except:
68 | print("目录不为空,无法删除")
69 | print("发布完成:" + title)
70 | time.sleep(int(sleep_time))
71 |
72 | # do_post("directory containing the image folders", "publish interval in seconds, default 0")
73 | if __name__ == "__main__":
74 | print("图片所在目录:")
75 | path=input("")
76 | print("自动发布间隔,0为全部发布,单位秒")
77 | send_time=input("")
78 | do_post(path,send_time)
79 |
80 |
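do_post() above walks the given directory and treats every sub-directory as one gallery: the sub-directory name becomes the images_page title, the files inside are renamed to 1.ext, 2.ext, ... and moved under static/images/<random>/, and matching rows are written to images_page / images_image. A minimal sketch of the expected layout and a non-interactive call (paths here are made-up examples):

    # /data/galleries/
    #     Some-Gallery-Title/   <- becomes the post title
    #         a.jpg  b.jpg ...
    #     Another-Gallery/
    from AutoPost import do_post
    do_post("/data/galleries", "5")   # publish one gallery every 5 seconds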
--------------------------------------------------------------------------------
/templates/zde/sort.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 |
4 | {% block title %}{% ifequal typeid "new" %}最新发布 - {{ siteName }}{% else %}最受欢迎 - {{ siteName }}{% endifequal %}{% endblock title %}
5 | {% block keywords %}{{ keyWord }}{% endblock keywords %}
6 | {% block description %}{{ description }}{% endblock description %}
7 |
8 |
9 | {% block menu %}
10 |
44 |
45 | {% load pagination_tags %}
46 | {% autopaginate data 10 %}
47 | {% for imglist in data %}
48 |
49 |
64 |
65 |
66 | {% endblock section %}
67 |
68 |
69 | {% block js %}
70 |
80 | {% endblock js %}
81 |
--------------------------------------------------------------------------------
/94imm.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat Premium Data Transfer
3 |
4 | Source Server : localhost
5 | Source Server Type : MySQL
6 | Source Server Version : 50629
7 | Source Host : localhost:3306
8 | Source Schema : 94imm
9 |
10 | Target Server Type : MySQL
11 | Target Server Version : 50629
12 | File Encoding : 65001
13 |
14 | Date: 15/04/2020 20:13:30
15 | */
16 |
17 | SET NAMES utf8mb4;
18 | SET FOREIGN_KEY_CHECKS = 0;
19 |
20 | -- ----------------------------
21 | -- Table structure for django_migrations
22 | -- ----------------------------
23 | DROP TABLE IF EXISTS `django_migrations`;
24 | CREATE TABLE `django_migrations` (
25 | `id` int(11) NOT NULL AUTO_INCREMENT,
26 | `app` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
27 | `name` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
28 | `applied` datetime(0) NOT NULL,
29 | PRIMARY KEY (`id`) USING BTREE
30 | ) ENGINE = MyISAM AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
31 |
32 | -- ----------------------------
33 | -- Table structure for images_image
34 | -- ----------------------------
35 | DROP TABLE IF EXISTS `images_image`;
36 | CREATE TABLE `images_image` (
37 | `id` int(11) NOT NULL AUTO_INCREMENT,
38 | `pageid` int(11) NOT NULL,
39 | `imageurl` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
40 | `originurl` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
41 | PRIMARY KEY (`id`) USING BTREE
42 | ) ENGINE = MyISAM AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
43 |
44 | -- ----------------------------
45 | -- Table structure for images_page
46 | -- ----------------------------
47 | DROP TABLE IF EXISTS `images_page`;
48 | CREATE TABLE `images_page` (
49 | `id` int(11) NOT NULL AUTO_INCREMENT,
50 | `typeid` int(11) NOT NULL,
51 | `sendtime` date NOT NULL,
52 | `title` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
53 | `firstimg` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
54 | `tagid` varchar(200) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL,
55 | `crawler` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
56 | `hot` int(10) NULL DEFAULT 0,
57 | PRIMARY KEY (`id`) USING BTREE
58 | ) ENGINE = MyISAM AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
59 |
60 | -- ----------------------------
61 | -- Table structure for images_tag
62 | -- ----------------------------
63 | DROP TABLE IF EXISTS `images_tag`;
64 | CREATE TABLE `images_tag` (
65 | `id` int(4) NOT NULL AUTO_INCREMENT,
66 | `tag` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
67 | `uid` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
68 | `date` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
69 | PRIMARY KEY (`id`) USING BTREE
70 | ) ENGINE = MyISAM AUTO_INCREMENT = 1 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
71 |
72 | -- ----------------------------
73 | -- Table structure for images_type
74 | -- ----------------------------
75 | DROP TABLE IF EXISTS `images_type`;
76 | CREATE TABLE `images_type` (
77 | `id` int(11) NOT NULL AUTO_INCREMENT,
78 | `type` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT NULL,
79 | PRIMARY KEY (`id`) USING BTREE
80 | ) ENGINE = MyISAM AUTO_INCREMENT = 7 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
81 |
82 | -- ----------------------------
83 | -- Table structure for images_video
84 | -- ----------------------------
85 | DROP TABLE IF EXISTS `images_video`;
86 | CREATE TABLE `images_video` (
87 | `id` int(11) NOT NULL AUTO_INCREMENT,
88 | `url` varchar(500) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
89 | `user_id` varchar(15) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
90 | `date_time` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
91 | `v_name` varchar(255) CHARACTER SET latin1 COLLATE latin1_swedish_ci NULL DEFAULT NULL,
92 | `v_path` varchar(50) CHARACTER SET latin1 COLLATE latin1_swedish_ci NULL DEFAULT NULL,
93 | `source` varchar(10) CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci NULL DEFAULT NULL,
94 | PRIMARY KEY (`id`) USING BTREE
95 | ) ENGINE = InnoDB AUTO_INCREMENT = 1 CHARACTER SET = latin1 COLLATE = latin1_swedish_ci ROW_FORMAT = Compact;
96 |
97 | SET FOREIGN_KEY_CHECKS = 1;
98 |
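Note the tagid column in images_page: AutoPost.py and the crawlers store it as the repr of a Python list of images_tag ids (str(tagidlist)), not as a join table. A minimal sketch of writing and reading that format (using ast.literal_eval to parse it back is an assumption; the web app's views are not shown in this listing):

    import ast

    tagidlist = [3679, 3700]
    stored = str(tagidlist)              # what ends up in images_page.tagid: "[3679, 3700]"
    tag_ids = ast.literal_eval(stored)   # recover the list of images_tag ids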
--------------------------------------------------------------------------------
/silumz/settings.py:
--------------------------------------------------------------------------------
1 | """
2 | Django settings for silumz project.
3 |
4 | Generated by 'django-admin startproject' using Django 1.10.6.
5 |
6 | For more information on this file, see
7 | https://docs.djangoproject.com/en/1.10/topics/settings/
8 |
9 | For the full list of settings and their values, see
10 | https://docs.djangoproject.com/en/1.10/ref/settings/
11 | """
12 |
13 | import os
14 | from config import mysql_config,allow_url,cache_time,templates,debug
15 |
16 | # Build paths inside the project like this: os.path.join(BASE_DIR, ...)
17 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
18 |
19 |
20 | # Quick-start development settings - unsuitable for production
21 | # See https://docs.djangoproject.com/en/1.10/howto/deployment/checklist/
22 |
23 | # SECURITY WARNING: keep the secret key used in production secret!
24 | SECRET_KEY = 'ge)(a+37gny_zn9c(+(kq+^yqw!jvblb67ck5allkpgv6(wi@^'
25 |
26 | # SECURITY WARNING: don't run with debug turned on in production!
27 | DEBUG = debug
28 |
29 | ALLOWED_HOSTS = allow_url
30 |
31 |
32 | # Application definition
33 |
34 | INSTALLED_APPS = [
35 | 'django.contrib.admin',
36 | 'django.contrib.auth',
37 | 'django.contrib.contenttypes',
38 | 'django.contrib.sessions',
39 | 'django.contrib.messages',
40 | 'django.contrib.staticfiles',
41 | 'images',
42 | 'dj_pagination'
43 | ]
44 |
45 | MIDDLEWARE = [
46 | 'django.middleware.cache.UpdateCacheMiddleware',
47 | 'django.middleware.security.SecurityMiddleware',
48 | 'django.middleware.gzip.GZipMiddleware',
49 | 'django.contrib.sessions.middleware.SessionMiddleware',
50 | 'django.middleware.common.CommonMiddleware',
51 | # 'django.middleware.csrf.CsrfViewMiddleware',
52 | 'django.contrib.auth.middleware.AuthenticationMiddleware',
53 | 'django.contrib.messages.middleware.MessageMiddleware',
54 | 'django.middleware.clickjacking.XFrameOptionsMiddleware',
55 | 'dj_pagination.middleware.PaginationMiddleware',
56 | 'django.middleware.cache.FetchFromCacheMiddleware',
57 | ]
58 |
59 | CACHES = {
60 | 'default': {
61 | 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
62 | 'LOCATION': 'cache',  # directory for the file-based cache
63 | 'TIMEOUT':cache_time,
64 | 'OPTIONS':{
65 | 'MAX_ENTRIES': 300,
66 | 'CULL_FREQUENCY': 3,
67 | }
68 | }
69 | }
70 |
71 | ROOT_URLCONF = 'silumz.urls'
72 |
73 | TEMPLATES = [
74 | {
75 | 'BACKEND': 'django.template.backends.django.DjangoTemplates',
76 | 'DIRS': [os.path.join(BASE_DIR, 'templates', templates)],
77 |
78 | 'APP_DIRS': True,
79 | 'OPTIONS': {
80 | 'context_processors': [
81 | 'django.template.context_processors.debug',
82 | 'django.template.context_processors.request',
83 | 'django.contrib.auth.context_processors.auth',
84 | 'django.contrib.messages.context_processors.messages',
85 | 'django.template.context_processors.i18n',
86 | 'django.template.context_processors.media',
88 | ],
89 | },
90 | },
91 | ]
92 |
93 | WSGI_APPLICATION = 'silumz.wsgi.application'
94 |
95 |
96 | # Database
97 | # https://docs.djangoproject.com/en/1.10/ref/settings/#databases
98 |
99 | DATABASES = {
100 | 'default': {
101 | 'ENGINE': 'django.db.backends.mysql',
102 | 'NAME': mysql_config['NAME'],
103 | 'USER': mysql_config['USER'],
104 | 'PASSWORD': mysql_config['PASSWORD'],
105 | 'HOST': mysql_config['HOST'],
106 | 'PORT': mysql_config['PORT'],
107 | }
108 | }
109 |
110 |
111 | # Password validation
112 | # https://docs.djangoproject.com/en/1.10/ref/settings/#auth-password-validators
113 |
114 | AUTH_PASSWORD_VALIDATORS = [
115 | {
116 | 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
117 | },
118 | {
119 | 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
120 | },
121 | {
122 | 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
123 | },
124 | {
125 | 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
126 | },
127 | ]
128 |
129 |
130 | # Internationalization
131 | # https://docs.djangoproject.com/en/1.10/topics/i18n/
132 |
133 | LANGUAGE_CODE = 'en-us'
134 |
135 | TIME_ZONE = 'UTC'
136 |
137 | USE_I18N = True
138 |
139 | USE_L10N = True
140 |
141 | USE_TZ = True
142 |
143 |
144 | # Static files (CSS, JavaScript, Images)
145 | # https://docs.djangoproject.com/en/1.10/howto/static-files/
146 |
147 | STATIC_URL = '/static/'
148 | STATICFILES_DIRS=(
149 | os.path.join(BASE_DIR,'static'),
150 | )
151 |
152 | # dj_pagination
153 | PAGINATION_DEFAULT_WINDOW=1
154 |
--------------------------------------------------------------------------------
/templates/zde/video.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 |
4 | {% block title %}视频小姐姐 - {{ siteName }}{% endblock title %}
5 | {% block keywords %}{% for t in keyword %}{{ t.tag }},{% endfor %}{% endblock keywords %}
6 | {% block description %}{{ description }}{% endblock description %}
7 |
8 | {% block js_head %}
9 |
10 | {% endblock js_head %}
11 | {% block focusbox %}
12 |
13 |
小姐姐福利视频
14 |
19 |
20 | 点击用户ID可以关注小姐姐哦~!如查询结果为空请尝试刷新查询页面
21 |
22 |
23 |
24 | {% endblock focusbox %}
25 |
26 | {% block menu %}
27 |
28 | - 随便看看
29 |
38 |
47 | - 标签云
48 |
50 |
51 | {% endblock menu %}
52 |
53 |
54 | {% block section %}
55 |
56 |
57 |
59 | 视频来源:[{{ source }}]
61 | 用户ID:[{{ user_id }}]
63 |
65 |
66 |
67 | {% endblock section %}
68 |
69 | {% block js %}
70 |
71 |
123 | {% endblock js %}
--------------------------------------------------------------------------------
/crawler/crawler_xmt.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | import sys
3 |
4 | sys.path.append('../')
5 | from bs4 import BeautifulSoup
6 | import threading, pymysql, time, requests, os, urllib3, re
7 | from config import mysql_config
8 |
9 | requests.packages.urllib3.disable_warnings()
10 | # database connection settings
11 | dbhost = {
12 | "host": mysql_config['HOST'],
13 | "dbname": mysql_config['NAME'],
14 | "user": mysql_config['USER'],
15 | "password": mysql_config['PASSWORD']
16 | }
17 |
18 |
19 | class Spider():
20 | headers = {
21 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
22 | 'Chrome/65.0.3325.181 Safari/537.36',
23 | 'Referer': "http://www.xgmmtk.com/"
24 | }
25 | page_url_list = []
26 | img_url_list = []
27 | rlock = threading.RLock()
28 | # s = requests.session()
29 |
30 | def __init__(self, img_path='imgdir', thread_number=5):
31 | self.spider_url = 'http://www.xgmmtk.com/'
32 | self.img_path = img_path
33 | self.thread_num = thread_number
34 |
35 | def get_url(self):
36 | page = requests.get("http://www.xgmmtk.com/")
37 | soup = BeautifulSoup(page.text, "html.parser")
38 | a_soup = soup.find_all("a")
39 | for a in a_soup:
40 | url = "http://www.xgmmtk.com/" + a.get("href")
41 | self.page_url_list.append(url)
42 |
43 | def get_img(self):
44 | db = pymysql.connect(dbhost.get("host"), dbhost.get("user"), dbhost.get("password"), dbhost.get("dbname"))
45 | cursor = db.cursor()
46 | while True:
47 | self.rlock.acquire()
48 | if len(self.page_url_list) == 0:
49 | self.rlock.release()
50 | break
51 | else:
52 | page_url = self.page_url_list.pop()
53 | self.rlock.release()
54 | page = requests.get(page_url)
55 | soup=BeautifulSoup(page.text,"html.parser")
56 | title=soup.title.string.replace("�","")
57 | isExists = cursor.execute(
58 | "SELECT title FROM images_page WHERE title = %s LIMIT 1", (title,))
59 | if isExists == 0:
60 | print("添加采集:",title)
61 | if "袜" in title or "丝" in title or "腿" in title:
62 | type_id = 2
63 | tagidlist=[3679,3700,3719,3628]
64 | elif "青春" in title or "清纯" in title or "萝莉" in title:
65 | tagidlist=[3694,3627,3635]
66 | type_id = 3
67 | else:
68 | tagidlist=[3630,3623,3618,3642]
69 | type_id = 1
70 | p = (
71 | title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), type_id,
72 | "1",
73 | page_url)
74 | cursor.execute(
75 | "INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg,crawler) VALUES (%s,%s,%s,%s,%s,%s)",
76 | p)
77 | pageid = cursor.lastrowid
78 | img = soup.find_all("img")
79 | i=0
80 | page_id=page_url[page_url.find("?id=")+4:-1]
81 | img_path = self.img_path + time.strftime('%Y%m%d', time.localtime(
82 | time.time())) + "/" +page_id + "/"
83 | for imgurl in img:
84 | imgsrc = "http://www.xgmmtk.com/" + imgurl.get("src")
85 | self.img_url_list.append(
86 | {"img_url": imgsrc, "Referer": page_url,
87 | "id": page_id})
88 | if i==0:
89 | cursor.execute(
90 | "UPDATE images_page SET firstimg = %s WHERE id = %s",
91 | (img_path + imgsrc.split("/")[-1], pageid))
92 | i+=1
93 | else:
94 | print("已采集")
95 | pass
96 | def down_img(self,imgsrc,Referer,id):
97 | headers = {
98 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
99 | "Referer": Referer
100 | }
101 | path = self.img_path + time.strftime('%Y%m%d', time.localtime(time.time())) + "/"
102 | page_id = id
103 | isdata = os.path.exists("../" + path + page_id)
104 | if not isdata:
105 | os.makedirs("../" + path + page_id)
106 | with open("../" + path + page_id + "/" + imgsrc.split("/")[-1], "wb") as f:
107 | print("已保存:" ,imgsrc)
108 | f.write(requests.get(imgsrc, headers=headers,verify=False).content)
109 |
110 |
111 |
112 | def run_img(self):
113 | while True:
114 | Spider.rlock.acquire()
115 | if len(self.img_url_list) == 0 :
116 | Spider.rlock.release()
117 | continue
118 | else:
119 | urls = self.img_url_list.pop()
120 | url = urls.get("img_url")
121 | Referer = urls.get("Referer")
122 | id = urls.get("id")
123 | Spider.rlock.release()
124 | try:
125 | self.down_img(url, Referer, id)
126 | except Exception as e:
127 | pass
128 |
129 | def run(self):
130 | # start thread_num threads to download the images
131 | for img_th in range(self.thread_num):
132 | download_t = threading.Thread(target=self.run_img)
133 | download_t.start()
134 |
135 | for img_th in range(self.thread_num):
136 | run_t = threading.Thread(target=self.get_img)
137 | run_t.start()
138 |
139 | if __name__ == "__main__":
140 | spider=Spider(img_path='/static/images/',thread_number=10)
141 | spider.get_url()
142 | spider.run()
143 |
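The crawlers share page_url_list and img_url_list across threads behind a class-level RLock, and run_img() above spins with continue whenever the list is momentarily empty. A minimal sketch of the same producer/consumer idea built on queue.Queue instead (an alternative pattern, not what this repo does):

    import queue, threading

    img_queue = queue.Queue()

    def download_worker():
        while True:
            item = img_queue.get()   # blocks instead of busy-waiting
            if item is None:         # sentinel: no more work
                break
            # item would be the {"img_url": ..., "Referer": ..., "id": ...} dict used above
            img_queue.task_done()

    workers = [threading.Thread(target=download_worker, daemon=True) for _ in range(10)]
    for w in workers:
        w.start()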
--------------------------------------------------------------------------------
/templates/zde/page.html:
--------------------------------------------------------------------------------
1 | {% extends 'base.html' %}
2 |
3 |
4 | {% block title %}{{ title }} - {{ siteName }}{% endblock title %}
5 | {% block keywords %}{% for t in tag %}{{ t.tname }},{% endfor %}{% endblock keywords %}
6 | {% block description %}由{{ siteName }}为您整理的高清《{{ title }}》图集{% endblock description %}
7 |
8 | {% block focusbox %}
9 |
10 |
{{ title }}
11 |
12 | {% for t in tag %}
13 |
{{ t.tname }}
14 | {% endfor %}
15 |
16 |
17 | 点击图片自动播放
18 |
19 |
20 |
21 | {% endblock focusbox %}
22 |
23 | {% block menu %}
24 |
50 | {% endblock menu %}
51 |
52 |
53 | {% block section %}
54 |
55 |
56 |
57 | {% for img in data %}
58 |
59 |
60 | {% endfor %}
61 |
62 |
63 | {% endblock section %}
64 |
65 | {% block recommend %}
66 |
相关推荐
67 |
68 |
69 | {% for items in similar %}
70 |
71 |
76 |
77 |
85 |
86 | {% endfor %}
87 |
88 |
89 | {% endblock recommend %}
90 |
91 | {% block js %}
92 |
147 | {% endblock js %}
148 |
--------------------------------------------------------------------------------
/crawler/crawle_mzt.py:
--------------------------------------------------------------------------------
1 | # coding='UTF-8'
2 | import sys
3 |
4 | sys.path.append('../')
5 | from bs4 import BeautifulSoup
6 | import threading, pymysql, time, requests, os, urllib3
7 | from config import mysql_config
8 |
9 | requests.packages.urllib3.disable_warnings()
10 |
11 |
12 | class Spider():
13 | headers = {
14 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
15 | 'Chrome/65.0.3325.181 Safari/537.36',
16 | 'Referer': "https://www.mzitu.com"
17 | }
18 | page_url_list = []
19 | img_url_list = []
20 | rlock = threading.RLock()
21 | s = requests.session()
22 | s.keep_alive = False
23 | dbhost = {
24 | "host": mysql_config['HOST'],
25 | "dbname": mysql_config['NAME'],
26 | "user": mysql_config['USER'],
27 | "password": mysql_config['PASSWORD']
28 | }
29 |
30 | def __init__(self, page_num=10, img_path='imgdir', thread_num=5, type="xinggan", type_id=1):
31 | self.spider_url = 'https://www.mzitu.com/'
32 | self.page_number = int(page_num)
33 | self.img_path = img_path
34 | self.thread_num = thread_num
35 | self.type = type
36 | self.type_id = type_id
37 |
38 | def get_url(self):
39 | for i in range(1, self.page_number + 1):
40 | # page 1 lives at /<type>, later pages at /<type>/page/<i>
41 | suffix = "" if i == 1 else "/page/" + str(i)
42 | page = self.s.get(self.spider_url + "/" + self.type + suffix, verify=False).text
43 | soup = BeautifulSoup(page, "html.parser")
44 | page_base_url = soup.find("div",class_="postlist").find_all("li")
45 | for page_url in page_base_url:
46 | url = page_url.find("a").get("href")
47 | self.page_url_list.append(url)
48 | i = i + 1
49 |
50 | def get_img_url(self):
51 | db = pymysql.connect(self.dbhost.get("host"), self.dbhost.get("user"), self.dbhost.get("password"),
52 | self.dbhost.get("dbname"))
53 | cursor = db.cursor()
54 | for img_base_url in self.page_url_list:
55 | tagidlist = []
56 | img_soup = BeautifulSoup(self.s.get(img_base_url,verify=False).text, "html.parser")
57 | img_num = img_soup.find("div", class_="pagenavi").text.split("…")[-1][0:-5]
58 | img_url = img_soup.find("div", class_="main-image").find("img").get("src").split("/")[0:-1]
59 | img_surl = "/".join(img_url)
60 | title = img_soup.find("h2", class_="main-title").text
61 | isExists = cursor.execute("SELECT * FROM images_page WHERE title = %s LIMIT 1", (title,))
62 | tag_list = img_soup.find("div", class_="main-tags").find_all("a")
63 | if isExists == 1:
64 | print("已采集:" + title)
65 | else:
66 | for tags in tag_list:
67 | tag=tags.text
68 | print(tag)
69 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag + "'" + " limit 1;"
70 | isExiststag = cursor.execute(sqltag)
71 | if isExiststag != 1:
72 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag)
73 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag + "'")
74 | for id in cursor.fetchall():
75 | tagidlist.append(id[0])
76 | p = (title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), self.type_id, "1")
77 | cursor.execute("INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg) VALUES (%s,%s,%s,%s,%s)",
78 | p)
79 | print("开始采集:" + title)
80 | pageid = cursor.lastrowid
81 | for i in range(1, int(img_num)):
82 | temp_url = img_soup.find("div", class_="main-image").find("img").get("src").split("/")
83 | path = temp_url[-1][0:3]
84 | new_url = img_surl + "/" + path + str("%02d" % i) + ".jpg"
85 | img_src = temp_url[-3] + "/" + temp_url[-2] + "/" + path + str("%02d" % i) + ".jpg"
86 | imgp = pageid, self.img_path + img_src
87 | cursor.execute("INSERT INTO images_image (pageid,imageurl) VALUES (%s,%s)", imgp)
88 | if i == 1:
89 | cursor.execute(
90 | "UPDATE images_page SET firstimg = %s WHERE title = %s", (self.img_path + img_src, title))
91 | self.img_url_list.append(new_url)
92 | i = i + 1
93 | db.close()
94 |
95 | def down_img(self, imgsrc):
96 | path = imgsrc.split("/")[-3] + "/" + imgsrc.split("/")[-2]
97 | isdata = os.path.exists("../" + self.img_path + path)
98 | if not isdata:
99 | os.makedirs("../" + self.img_path + path)
100 | with open("../" + self.img_path + path + "/" + imgsrc.split("/")[-1], "wb") as f:
101 | print("下载图片:" + self.img_path + path + "/" + imgsrc.split("/")[-1])
102 | f.write(requests.get(imgsrc, headers=self.headers, verify=False).content)
103 |
104 | def down_url(self):
105 | while True:
106 | Spider.rlock.acquire()
107 | if len(Spider.img_url_list) == 0:
108 | Spider.rlock.release()
109 | break
110 | else:
111 | img_url = Spider.img_url_list.pop()
112 | Spider.rlock.release()
113 | try:
114 | self.down_img(img_url)
115 | except Exception as e:
116 | pass
117 |
118 | def run(self):
119 | # start thread_num threads to download the images
120 | for img_th in range(self.thread_num):
121 | download_t = threading.Thread(target=self.down_url)
122 | download_t.start()
123 |
124 |
125 | if __name__ == '__main__':
126 | for i in [{"page": 1, "type": "xinggan", "type_id": 1},]:
127 | spider = Spider(page_num=i.get("page"), img_path='/static/images/', thread_num=10, type_id=i.get("type_id"),
128 | type=i.get("type"))
129 | spider.get_url()
130 | spider.get_img_url()
131 | spider.run()
132 |
--------------------------------------------------------------------------------
/crawler/crawler_mtl.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | import sys
3 |
4 | sys.path.append('../')
5 | from bs4 import BeautifulSoup
6 | import threading,pymysql,time,requests,os,urllib3
7 | from config import mysql_config
8 | requests.packages.urllib3.disable_warnings()
9 | requests.adapters.DEFAULT_RETRIES = 5
10 |
11 | class Spider():
12 | headers = {
13 | 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
14 | 'Chrome/65.0.3325.181 Safari/537.36',
15 | 'Referer': "https://www.meitulu.com"
16 | }
17 | page_url_list = []
18 | img_url_list = []
19 | rlock = threading.RLock()
20 | s=requests.session()
21 | s.keep_alive = False
22 | dbhost = {
23 | "host": mysql_config['HOST'],
24 | "dbname": mysql_config['NAME'],
25 | "user": mysql_config['USER'],
26 | "password": mysql_config['PASSWORD']
27 | }
28 |
29 | def __init__(self,page_number=10,img_path='imgdir',thread_number=5,type='xinggan',type_id=1):
30 | self.spider_url = 'https://www.meitulu.com/t/'+type
31 | self.page_number = int(page_number)
32 | self.img_path = img_path
33 | self.thread_num = thread_number
34 | self.type_id = type_id
35 |
36 | def get_url(self):
37 | db = pymysql.connect(self.dbhost.get("host"), self.dbhost.get("user"), self.dbhost.get("password"),
38 | self.dbhost.get("dbname"))
39 | cursor = db.cursor()
40 | for i in range(1, self.page_number+1):
41 | # page 1 lives at /t/<type>, later pages at /t/<type>/<i>.html
42 | list_url = self.spider_url if i == 1 else self.spider_url + "/" + str(i) + ".html"
43 | page_base_url = BeautifulSoup(requests.get(list_url).content.decode("utf-8"),
44 | "html.parser")
45 | img_ul = page_base_url.find("ul", class_="img").find_all("li")
46 | for img_li in img_ul:
47 | page_url = img_li.find("p", class_="p_title").find("a").get("href")
48 | self.page_url_list.append(page_url)
49 | db.close()
50 |
51 | def get_img_url(self):
52 | db = pymysql.connect(self.dbhost.get("host"), self.dbhost.get("user"), self.dbhost.get("password"), self.dbhost.get("dbname"))
53 | cursor = db.cursor()
54 | for page_url in reversed(self.page_url_list):
55 | tagidlist = []
56 | img_div_soup = BeautifulSoup(requests.get(page_url).content.decode("utf-8"), "html.parser")
57 | img_base_url = img_div_soup.find("img", class_="content_img").get("src").split("/")
58 | img_url = "/".join(img_base_url[0:-1])
59 | title = img_div_soup.find("div", class_="weizhi").find("h1").text.replace(" 萝莉丝袜写真套图","")
60 | isExists = cursor.execute("SELECT * FROM images_page WHERE title = %s LIMIT 1", (title,))
61 | if isExists != 0:
62 | print ("已采集:"+title)
63 | else:
64 | tag_list = img_div_soup.find("div", class_="fenxiang_l").find_all("a")
65 | for tag in tag_list:
66 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag.text + "'" + " limit 1;"
67 | isExiststag = cursor.execute(sqltag)
68 | if isExiststag == 0:
69 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag.text)
70 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag.text + "'")
71 | for id in cursor.fetchall():
72 | tagidlist.append(id[0])
73 | p = (
74 | title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), self.type_id, "1", page_url)
75 | cursor.execute(
76 | "INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg,crawler) VALUES (%s,%s,%s,%s,%s,%s)",
77 | p)
78 | pageid =cursor.lastrowid
79 | ima_num_tem = img_div_soup.find("div", id="pages").text
80 | img_num = ima_num_tem[-6:-4]
81 | i = 1
82 | for i in range(1, int(img_num)):
83 | img_src = img_url + "/" + str(i) + "." + img_base_url[-1].split(".")[-1]
84 | img_loc_path = self.img_path + img_base_url[-2]+"/"+ str(i) + "." + img_base_url[-1].split(".")[-1]
85 | imgp = pageid, img_loc_path,img_src
86 | if i == 1:
87 | cursor.execute(
88 | "UPDATE images_page SET firstimg = %s WHERE title = %s", (img_loc_path, title))
89 | i = i + 1
90 | cursor.execute("INSERT INTO images_image (pageid,imageurl,originurl) VALUES (%s,%s,%s)", imgp)
91 | self.img_url_list.append(img_src)
92 | print("添加:"+title)
93 | db.close()
94 |
95 | def down_img(self,imgsrc):
96 | path = imgsrc.split("/")[-2]
97 | isdata = os.path.exists("../" + self.img_path + path)
98 | if not isdata:
99 | os.makedirs("../" + self.img_path + path)
100 | with open("../" + self.img_path + path + "/" + imgsrc.split("/")[-1], "wb") as f:
101 | f.write(requests.get(imgsrc, headers=self.headers, verify=False).content)
102 |
103 | def down_url(self):
104 | while True:
105 | Spider.rlock.acquire()
106 | if len(Spider.img_url_list) == 0:
107 | Spider.rlock.release()
108 | break
109 | else:
110 | img_url = Spider.img_url_list.pop()
111 | Spider.rlock.release()
112 | try:
113 | self.down_img(img_url)
114 | except Exception as e:
115 | pass
116 |
117 |
118 | def run(self):
119 | # print("start downloading")
120 | # start thread_num threads to fetch the per-image urls
121 | for th in range(self.thread_num):
122 | add_pic_t = threading.Thread(target=self.get_img_url)
123 | add_pic_t.start()
124 |
125 | # start thread_num threads to download the images
126 | for img_th in range(self.thread_num):
127 | download_t = threading.Thread(target=self.down_url)
128 | download_t.start()
129 |
130 |
131 | if __name__ == '__main__':
132 | for i in [{"page": 3, "type": "1290", "type_id": 4}]:
133 | spider = Spider(page_number=i.get("page"), img_path='/static/images/', thread_number=10,type=i.get("type"),type_id=i.get("type_id"))
134 | spider.get_url()
135 | spider.get_img_url()
136 | spider.run()
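
Every crawler repeats the same look-up-or-insert sequence for tags (SELECT the tag, INSERT it if missing, SELECT its id). A small helper expressing that pattern with parameterized queries, as a sketch rather than code taken from the repo:

    def get_or_create_tag_id(cursor, tag):
        # return the images_tag id for tag, inserting the row first if it does not exist yet
        if cursor.execute("SELECT id FROM images_tag WHERE tag = %s LIMIT 1", (tag,)) == 0:
            cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", (tag,))
            return cursor.lastrowid
        return cursor.fetchone()[0]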
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | function install_mysql(){
4 | cd ~/$tmp
5 | yum install ncurses-devel libaio-devel cmake gcc gcc-c++ make autoconf -y
6 | wget http://dev.mysql.com/get/Downloads/MySQL-5.6/mysql-5.6.21.tar.gz
7 | tar -zxvf mysql-5.6.21.tar.gz
8 | cd mysql-5.6.21
9 |
10 | cmake .
11 | make
12 | sudo make install
13 |
14 |
15 | sudo groupadd mysql
16 | sudo useradd -r -g mysql mysql
17 |
18 | cd /usr/local/mysql/
19 | sudo chown -R root .
20 | sudo chown -R mysql data
21 |
22 |
23 | sudo yum install perl-Data-Dumper -y
24 |
25 | sudo scripts/mysql_install_db --user=mysql
26 | sudo cp support-files/my-default.cnf /etc/my.cnf
27 |
28 | sudo cp support-files/mysql.server /etc/init.d/mysql
29 | sudo chmod u+x /etc/init.d/mysql
30 | sudo chkconfig --add mysql
31 | # MySQL environment variables
32 | cd ~
33 | echo 'if [ -d "/usr/local/mysql/bin" ] ; then
34 | PATH=$PATH:/usr/local/mysql/bin
35 | export PATH
36 | fi' > env_mysql.sh
37 | sudo cp env_mysql.sh /etc/profile.d/env_mysql.sh
38 | touch /usr/local/vagrant.mysql.lock
39 | ln -s /usr/local/mysql/bin/mysql /usr/bin
40 | systemctl start mysql
41 | mysql -uroot -e "CREATE DATABASE $db_name;"
42 | echo "Mysql install successful"
43 | }
44 |
45 | function install_python(){
46 | # the version of python
47 | version="3.8.0"
48 | # the installation directory of python
49 | python3_install_dir="/usr/local/python3"
50 | cd ~/$tmp
51 | file_name="Python-$version.tgz"
52 | sudo yum -y install zlib-devel bzip2-devel openssl-devel ncurses-devel sqlite-devel readline-devel tk-devel gcc make libffi-devel
53 | rm `pwd`"/$file_name"
54 | wget "https://www.python.org/ftp/python/$version/$file_name"
55 | mkdir $tmp
56 | tar -xf $file_name -C $tmp
57 | make_dir="$tmp/Python-$version"
58 | cd $make_dir
59 | mkdir -p $python3_install_dir
60 | ./configure --prefix=$python3_install_dir --with-ssl
61 | sudo make
62 | sudo make install
63 | ln -s /usr/local/python3/bin/python3 /usr/bin/python3
64 | cd ~/tmp
65 | wget --no-check-certificate https://pypi.python.org/packages/source/s/setuptools/setuptools-19.6.tar.gz
66 | tar -zxvf setuptools-19.6.tar.gz
67 | cd setuptools-19.6
68 | python3 setup.py build
69 | python3 setup.py install
70 | ln -s /usr/local/python3/bin/pip3 /usr/bin/pip3
71 | rm -rf ~/$tmp
72 | echo "all in well !"
73 | }
74 |
75 | # ----------------------- install MySQL ------------------------------
76 | # compiling MySQL takes quite a long time
77 | read -p "Allow Url: " allow_url
78 | read -p "Site Name: " site_name
79 | read -p "Site Url: " site_url
80 | yum install wget git -y
81 | git clone https://github.com/Turnright-git/94imm.git
82 | yum install gcc mariadb-devel -y
83 | cd "94imm"
84 | path=$(pwd)
85 | yum install -y python3-devel
86 | tmp="tmp"
87 | mkdir ~/$tmp
88 | if ! [ -x "$(command -v python3)" ]; then
89 | echo "Start the Python3 installation process"
90 | install_python
91 | fi
92 | if ! [ -x "$(command -v mysql)" ]; then
93 | echo "编译MySQL时间比较长,需要等很长时间,可自安装。行输入n退出"
94 | read -p "(y , n):" isinstallmysql56
95 | case "$isinstallmysql56" in
96 | n|N|No|NO)
97 | exit
98 | ;;
99 | *)
100 | esac
101 | echo "Start the MySQL installation process"
102 | install_mysql
103 | systemctl start mysql
104 | read -p "Create databases : " db_name
105 | read -p "Create databases password: " db_pass
106 | create_db_sql="create database IF NOT EXISTS ${db_name} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;"
107 | create_user="update mysql.user set password=password('${db_pass}') where user='root';"
108 | mysql -uroot -e "${create_db_sql}"
109 | mysql -uroot -e "${create_user}"
110 | mysql -uroot -e "${grant_user}"
111 | mysql -uroot -e "flush privileges;"
112 | else
113 | read -p "Create databases : " db_name
114 | read -p "Password for root: " db_pass
115 | create_db_sql="create database IF NOT EXISTS ${db_name} DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;"
116 | create_user="update mysql.user set password=password('${db_pass}') where user='root';"
117 | mysql -uroot -p$db_pass -e "${create_db_sql}"
118 | fi
119 | if ! [ -x "$(command -v nginx)" ]; then
120 | cd ~/$tmp
121 | wget https://nginx.org/download/nginx-1.16.0.tar.gz
122 | tar zxvf nginx-1.16.0.tar.gz
123 | cd nginx-1.16.0
124 | ./configure --user=nobody --group=nobody --prefix=/usr/local/nginx --with-http_stub_status_module --with-http_gzip_static_module --with-http_realip_module --with-http_sub_module --with-http_ssl_module
125 | make && make install
126 | cd $path
127 | cat>/lib/systemd/system/nginx.service<
(truncated: the heredocs that write /usr/local/nginx/conf/nginx.conf, "$path/uwsgi.ini" and "$path/config.py" are missing from this listing)
--------------------------------------------------------------------------------
(mobile video template; file header missing from this listing)
--------------------------------------------------------------------------------
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 | 视频小姐姐-手机版-94iMM
19 |
70 |
71 |
72 |
73 |
76 | 视频来源:[{{ source }}]
77 | 用户ID:[{{ user_id}}]
78 | ←左滑切换视频
79 | 点击播放/暂停
80 | 视频采集自网络 | 返回94imm
81 |
82 |
83 |
84 |
85 |
178 |
182 |
183 |
194 |
--------------------------------------------------------------------------------
/crawler/crawler_mmjpg.py:
--------------------------------------------------------------------------------
1 | #coding='UTF-8'
2 |
3 | import sys
4 |
5 | sys.path.append('../')
6 | from bs4 import BeautifulSoup
7 | from requests.adapters import HTTPAdapter
8 | import threading,pymysql,time,requests,os,urllib3,re,random
9 | from config import mysql_config
10 |
11 | requests.packages.urllib3.disable_warnings()
12 | requests.adapters.DEFAULT_RETRIES = 5
13 | s = requests.session()
14 | s.keep_alive = False
15 | s.mount('http://', HTTPAdapter(max_retries=3))
16 | # database connection settings
17 | dbhost = {
18 | "host": mysql_config['HOST'],
19 | "dbname": mysql_config['NAME'],
20 | "user": mysql_config['USER'],
21 | "password": mysql_config['PASSWORD']
22 | }
23 |
24 | class Spider():
25 | rlock = threading.RLock()
26 | page_url_list=[]
27 | img_url_list=[]
28 | proxy_dict = ""
29 | base_url="http://www.mmmjpg.com/"
30 | def __init__(self,start_page_num,end_page_num,img_path,thread_num,type):
31 | self.start_page_num=start_page_num
32 | self.end_page_num=end_page_num
33 | self.img_path=img_path
34 | self.thread_num=thread_num
35 | self.type=type
36 |
37 | def get_url(self):
38 | for i in range(self.start_page_num,self.end_page_num+1):
39 | if i==0:
40 | page=s.get(self.base_url)
41 | else:
42 | page=s.get(self.base_url+self.type+"/"+str(i))
43 | soup=BeautifulSoup(page.text, "html.parser")
44 | url_soup=soup.find("div",class_="pic").find("ul").find_all("li")
45 | for li in url_soup:
46 | url=li.find("a").get("href")
47 | self.page_url_list.append(url)
48 |
49 | def get_img(self,url):
50 | db = pymysql.connect(dbhost.get("host"), dbhost.get("user"), dbhost.get("password"),dbhost.get("dbname"))
51 | cursor = db.cursor()
52 | tagidlist=[]
53 | page_id = url.split("/")[-1]
54 | page_url=self.base_url+"mm/"+page_id
55 | headers = {
56 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
57 | "Referer": page_url
58 | }
59 | info_page = s.get(self.base_url+"mm/" + page_id,headers=headers)
60 | info_page.encoding="utf-8"
61 | info_soup = BeautifulSoup(info_page.text,"html.parser")
62 | title=info_soup.find("div",class_="article").find("h1").text
63 | if "袜" in title or "丝" in title or "腿" in title:
64 | type_id = 2
65 | elif "青春" in title or "清纯" in title:
66 | type_id = 3
67 | elif "萝莉" in title:
68 | type_id = 4
69 | else:
70 | type_id = 1
71 | isExists = cursor.execute("SELECT title FROM images_page WHERE title = %s LIMIT 1", (title,))
72 | img_m_src=info_soup.find("div",class_="content").find("a").find("img").get("src").split("/")[-3]
73 | if isExists != 0:
74 | print("已采集:" + title)
75 | else:
76 | tags=info_soup.find("div",class_="tags").find_all("a")
77 | for tag_soup in tags:
78 | tag=tag_soup.text
79 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag + "'" + " limit 1;"
80 | isExiststag = cursor.execute(sqltag)
81 | if isExiststag == 0:
82 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag)
83 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag + "'")
84 | for id in cursor.fetchall():
85 | tagidlist.append(id[0])
86 | p = (title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), type_id, "1", page_url)
87 | cursor.execute("INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg,crawler) VALUES (%s,%s,%s,%s,%s,%s)", p)
88 | print("开始采集:"+title)
89 | pageid = cursor.lastrowid
90 | page=s.get(page_url,headers=headers)
91 | soup=BeautifulSoup(page.text,"html.parser")
92 | img_base=soup.find("div",class_="content").find("img").get("src").split("/")
93 | img_base_url="http://"+img_base[2]+"/"
94 | img_num=soup.find("div",class_="page").text.replace("全部图片下一页","").split("...")[-1]
95 | img_path = self.img_path + time.strftime('%Y%m%d', time.localtime(
96 | time.time())) + "/" + img_base[-2] +"/"
97 | for i in range(1,int(img_num)):
98 | img_loc_path=img_path+str(i)+".jpg"
99 | imgp = pageid, img_loc_path, img_base_url+img_base[-2]+"/"+str(i)+".jpg"
100 | cursor.execute("INSERT INTO images_image (pageid,imageurl,originurl) VALUES (%s,%s,%s)", imgp)
101 | if i == 1:
102 | cursor.execute(
103 | "UPDATE images_page SET firstimg = %s WHERE id = %s",
104 | (img_loc_path, pageid))
105 | self.img_url_list.append({"img_url": img_base_url+img_base[-2]+"/"+str(i)+".jpg", "Referer": url, "id": img_base[-2]})
106 | # print({"img_url": img_base_url+img_path+str(i)+".jpg", "Referer": img_base_url+img_base[-2], "id": img_base[-2]})
107 |
108 |
109 |
110 | def down_img(self,imgsrc,Referer,id):
111 | headers = {
112 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
113 | "Referer": Referer
114 | }
115 | path = self.img_path + time.strftime('%Y%m%d', time.localtime(time.time())) + "/"
116 | page_id = id
117 | isdata = os.path.exists("../" + path + page_id)
118 | if not isdata:
119 | os.makedirs("../" + path + page_id)
120 | with open("../" + path + page_id + "/" + imgsrc.split("/")[-1].split(".")[0] + ".jpg", "wb") as f:
121 | print("已保存:" + path + page_id + "/" + imgsrc.split("/")[-1].split(".")[0] + ".jpg")
122 | f.write(s.get(imgsrc, headers=headers,verify=False).content)
123 |
124 | def run_page(self):
125 | while True:
126 | Spider.rlock.acquire()
127 | if len(self.page_url_list) == 0:
128 | Spider.rlock.release()
129 | break
130 | else:
131 | page_url = self.page_url_list.pop()
132 | Spider.rlock.release()
133 | try:
134 | self.get_img(page_url)
135 | except Exception as e:
136 | pass
137 |
138 | def run_img(self):
139 | while True:
140 | Spider.rlock.acquire()
141 | if len(self.img_url_list) == 0 :
142 | Spider.rlock.release()
143 | break
144 | else:
145 | urls = self.img_url_list.pop()
146 | url = urls.get("img_url")
147 | Referer = urls.get("Referer")
148 | id = urls.get("id")
149 | Spider.rlock.release()
150 | try:
151 | self.down_img(url, Referer, id)
152 | except Exception as e:
153 | pass
154 |
155 | def run_1(self):
156 | # start thread_num threads to fetch the per-image urls
157 | url_threa_list=[]
158 | for th in range(self.thread_num):
159 | add_pic_t = threading.Thread(target=self.run_page)
160 | url_threa_list.append(add_pic_t)
161 |
162 | for t in url_threa_list:
163 | t.setDaemon(True)
164 | t.start()
165 |
166 | for t in url_threa_list:
167 | t.join()
168 |
169 | def run_2(self):
170 | # start thread_num threads to download the images
171 | for img_th in range(self.thread_num):
172 | download_t = threading.Thread(target=self.run_img)
173 | download_t.start()
174 |
175 | # start_page is the first page to crawl and end_page the last; type does not need changing, the category is detected automatically
176 | if __name__ == "__main__":
177 | for i in [{"start_page": 1,"end_page":1, "type": "home"}]:
178 | spider=Spider(start_page_num=i.get("start_page"),end_page_num=i.get("end_page"),img_path='/static/images/',thread_num=10,type=i.get("type"))
179 | spider.get_url()
180 | spider.run_1()
181 | spider.run_2()
--------------------------------------------------------------------------------
/crawler/crawler_mm131.py:
--------------------------------------------------------------------------------
1 | # coding='UTF-8'
2 |
3 | import sys
4 |
5 | sys.path.append('../')
6 | from bs4 import BeautifulSoup
7 | import threading, pymysql, time, requests, os, urllib3, re,random
8 | from config import mysql_config
9 |
10 | requests.packages.urllib3.disable_warnings()
11 | requests.adapters.DEFAULT_RETRIES = 5
12 | s = requests.session()
13 | s.keep_alive = False
14 | # database connection settings
15 | dbhost = {
16 | "host": mysql_config['HOST'],
17 | "dbname": mysql_config['NAME'],
18 | "user": mysql_config['USER'],
19 | "password": mysql_config['PASSWORD']
20 | }
21 | base_url="https://mm131.pro"
22 |
23 | class Spider():
24 | rlock = threading.RLock()
25 | page_url_list = []
26 | img_url_list = []
27 | headers = {
28 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36",
29 | "Referer": base_url
30 | }
31 |
32 | def __init__(self, page_num, img_path, thread_num, type_id=1, type="home",tagslist=["性感美女","诱惑美女","大胸美女","萌妹子"]):
33 | self.page_num = page_num
34 | self.img_path = img_path
35 | self.thread_num = thread_num
36 | self.type_id = type_id
37 | self.type = type
38 | self.tagslist= tagslist
39 |
40 | def get_url(self):
41 | for i in range(self.page_num):
42 | page = s.get(base_url+"/e/action/ListInfo/?classid="+str(self.type_id), headers=self.headers,verify=False)
43 | soup = BeautifulSoup(page.text, "html.parser")
44 | try:
45 | page_div = soup.find("dl", class_="list-left public-box").find_all("dd")
46 | except:
47 | print("采集错误,跳过本条")
48 | continue
49 | del page_div[-1]
50 | for dd in page_div:
51 | url = dd.find("a").get("href")
52 | self.page_url_list.append(base_url+url)
53 |
54 | def get_img(self,url):
55 | db = pymysql.connect(dbhost.get("host"), dbhost.get("user"), dbhost.get("password"), dbhost.get("dbname"))
56 | cursor = db.cursor()
57 | tagidlist = []
58 | page = s.get(url, headers=self.headers)
59 | page.encoding='UTF-8'
60 | soup = BeautifulSoup(page.text, "html.parser")
61 | # page_div = soup.find("div", class_="content-pic")
62 | title = soup.title.string.replace("_znns.com宅男钕神",'')
63 | isExists = cursor.execute("SELECT title FROM images_page WHERE title = %s LIMIT 1", (title,))
64 | if isExists != 0:
65 | print("isExists:" + title)
66 | else:
67 | tagslist = re.findall('', page.text)
68 | for tags in tagslist:
69 | for tag in tags.split(","):
70 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag + "'" + " limit 1;"
71 | isExiststag = cursor.execute(sqltag)
72 | if isExiststag == 0:
73 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag)
74 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag + "'")
75 | for id in cursor.fetchall():
76 | tagidlist.append(id[0])
77 | p = (
78 | title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), self.type_id, "1", url)
79 | cursor.execute(
80 | "INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg,crawler) VALUES (%s,%s,%s,%s,%s,%s)", p)
81 | print("down:" + title)
82 | pageid = cursor.lastrowid
83 | img_num_soup = soup.find("div", class_="content-page").find("span").text
84 | img_num = "".join(re.findall(r"\d", img_num_soup))
85 | for i in range(1, int(img_num)):
86 | headers = self.headers.copy()
87 | headers.update({"Referer":url})
88 | id = url.split("/")[-1].split(".")[0]
89 | if i==1:
90 | img_page_url=url
91 | else:
92 | img_page_url = "/".join(url.split("/")[0:-1]) + "/" + id + "_" + str(i) + ".html"
93 | img_page=s.get(img_page_url,headers=headers,verify=False)
94 | # page.encoding = 'utf-8'
95 | img_soup=BeautifulSoup(img_page.text,"html.parser")
96 | img_url = img_soup.find("div",class_="content-pic").find("img").get("src")
97 | img_name =img_url.split("/")[-1]
98 | id=url.split("/")[-1].split(".")[0]
99 | img_loc_path = self.img_path + time.strftime('%Y%m%d', time.localtime(
100 | time.time())) + "/" + id + "/" +img_name
101 | if i == 1:
102 | cursor.execute(
103 | "UPDATE images_page SET firstimg = %s WHERE id = %s",
104 | (img_loc_path, pageid))
105 | imgp = pageid, img_loc_path,img_url
106 | cursor.execute("INSERT INTO images_image (pageid,imageurl,originurl) VALUES (%s,%s,%s)", imgp)
107 | i += 1
108 | data={"img_url":img_url,"Referer":url,"id":id}
109 | if data in self.img_url_list:
110 | continue
111 | else:
112 | self.img_url_list.append(data)
113 |
114 | def down_img(self,imgsrc,Referer,id):
115 | headers = {
116 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
117 | "Referer": Referer
118 | }
119 | path = self.img_path + time.strftime('%Y%m%d', time.localtime(time.time())) + "/"
120 | page_id = id
121 | isdata = os.path.exists("../" + path + page_id)
122 | if not isdata:
123 | os.makedirs("../" + path + page_id)
124 | with open("../" + path + page_id + "/" + imgsrc.split("/")[-1].split(".")[0] + ".jpg", "wb") as f:
125 | print("已保存:" + path + page_id + "/" + imgsrc.split("/")[-1].split(".")[0] + ".jpg")
126 | f.write(s.get(imgsrc, headers=headers,verify=False).content)
127 |
128 | def run_page(self):
129 | while True:
130 | Spider.rlock.acquire()
131 | if len(self.page_url_list) == 0:
132 | Spider.rlock.release()
133 | break
134 | else:
135 | try:
136 | page_url = self.page_url_list.pop()
137 | except Exception as e:
138 | print(e)
139 | pass
140 | Spider.rlock.release()
141 | try:
142 | self.get_img(page_url)
143 | except Exception as e:
144 | print(e)
145 | pass
146 |
147 | def run_img(self):
148 | while True:
149 | Spider.rlock.acquire()
150 | if len(self.img_url_list) == 0 :
151 | Spider.rlock.release()
152 | break
153 | else:
154 | urls = self.img_url_list.pop()
155 | url = urls.get("img_url")
156 | Referer = urls.get("Referer")
157 | id = urls.get("id")
158 | Spider.rlock.release()
159 | try:
160 | self.down_img(url, Referer, id)
161 | except Exception as e:
162 | print(e)
163 | pass
164 |
165 | def run_1(self):
166 | # start thread_num threads to fetch the per-image urls
167 | url_threa_list=[]
168 | for th in range(self.thread_num):
169 | add_pic_t = threading.Thread(target=self.run_page)
170 | url_threa_list.append(add_pic_t)
171 |
172 | for t in url_threa_list:
173 | t.setDaemon(True)
174 | t.start()
175 |
176 | for t in url_threa_list:
177 | t.join()
178 |
179 | def run_2(self):
180 | # start thread_num threads to download the images
181 | for img_th in range(self.thread_num):
182 | download_t = threading.Thread(target=self.run_img)
183 | download_t.start()
184 |
185 |
186 | # page is the crawl depth, starting at 1 (page 1 is the latest posts); type is the source site's category and type_id the matching category id on this site
187 | if __name__ == "__main__":
188 | for i in [{"page": 1, "type": "xinggan", "type_id": 1},{"page":1,"type":"qingchun","type_id": 2}]:
189 | spider = Spider(page_num=i.get("page"), img_path='/static/images/', thread_num=10, type_id=i.get("type_id"),
190 | type=i.get("type"),tagslist=["性感美女","诱惑美女","大胸美女","萌妹子"])
191 | spider.get_url()
192 | spider.run_1()
193 | spider.run_2()
--------------------------------------------------------------------------------
/crawler/crawler_amn.py:
--------------------------------------------------------------------------------
1 | # coding='UTF-8'
2 | import sys
3 | sys.path.append('../')
4 | from bs4 import BeautifulSoup
5 | from requests.adapters import HTTPAdapter
6 | import threading,pymysql,time,requests,os,urllib3,re,random
7 | from config import mysql_config
8 |
9 | requests.packages.urllib3.disable_warnings()
10 | requests.adapters.DEFAULT_RETRIES = 5
11 | s = requests.session()
12 | s.keep_alive = False
13 | s.mount('http://', HTTPAdapter(max_retries=3))
14 | # database connection settings
15 | dbhost = {
16 | "host": mysql_config['HOST'],
17 | "dbname": mysql_config['NAME'],
18 | "user": mysql_config['USER'],
19 | "password": mysql_config['PASSWORD']
20 | }
21 |
22 | base_url="https://www.2meinv.com/"
23 | tag_url="https://www.2meinv.com/tags-{}-{}.html"
24 | index_url="https://www.2meinv.com/index-1.html"
25 | img_path='/static/images/'
26 |
27 | class Spider():
28 | page_url_list = []
29 | img_url_list = []
30 | rlock = threading.RLock()
31 | proxy_dict = ""
32 | def __init__(self, start_page_num, end_page_num,img_path, thread_num, type="home",type_id=0):
33 | self.start_page_num = start_page_num
34 | self.end_page_num=end_page_num
35 | self.img_path = img_path
36 | self.thread_num = thread_num
37 | self.type = type
38 | self.type_id=type_id
39 |
40 | def get_url(self):
41 | for i in range(self.start_page_num, self.end_page_num):
42 | if self.type_id==0:
43 | page = s.get(index_url.format(str(i)), verify=False).text
44 | else:
45 | page = s.get(tag_url.format(self.type,str(i)), verify=False).text
46 | # page = s.get(base_url + self.type+"-"+str(i)+".html", verify=False).text
47 | soup = BeautifulSoup(page, "html.parser")
48 | page_base_url = soup.find("ul", class_="detail-list").find_all("li")
49 | for page_url in page_base_url:
50 | url = page_url.find("a",class_="dl-pic").get("href")
51 | self.page_url_list.append(url)
52 |
53 | def get_img(self,url):
54 | tagidlist=[]
55 | db = pymysql.connect(dbhost.get("host"), dbhost.get("user"), dbhost.get("password"), dbhost.get("dbname"))
56 | cursor = db.cursor()
57 | page = s.get(url,verify=False)
58 | soup = BeautifulSoup(page.text, "html.parser")
59 | title=soup.title.string.replace("_爱美女","")
60 | if self.type_id == 0:
61 | if "袜" in title or "丝" in title or "腿" in title:
62 | self.type_id = 2
63 | elif "青春" in title or "清纯" in title:
64 | self.type_id = 3
65 | elif "萝莉" in title:
66 | self.type_id = 4
67 | else:
68 | self.type_id = 1
69 | isExists = cursor.execute("SELECT title FROM images_page WHERE title = %s LIMIT 1", (title,))
70 | if isExists != 0:
71 | print("已采集:" , title)
72 | else:
73 | print("正在采集:", title)
74 | tags=soup.find(attrs={"name":"Keywords"})['content'].split(",")
75 | for tag in tags:
76 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag + "'" + " limit 1;"
77 | isExiststag = cursor.execute(sqltag)
78 | if isExiststag == 0:
79 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag)
80 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag + "'")
81 | for id in cursor.fetchall():
82 | tagidlist.append(id[0])
83 | p = (title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), self.type_id, "1",url)
84 | cursor.execute("INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg,crawler) VALUES (%s,%s,%s,%s,%s,%s)", p)
85 | pageid = cursor.lastrowid
86 | img_soup=soup.find("div",class_="page-show").text
87 | img_nums=re.sub("\D", "", img_soup)
88 | if len(img_nums)==6:
89 | img_num=img_nums[-2:]
90 | elif len(img_nums)<6:
91 | img_num = img_nums[-1]
92 | elif len(img_nums)>6:
93 | img_num = img_nums[-3:]
94 | id=url.split("-")[-1].split(".")[0]
95 | for i in range(1,int(img_num)+1):
96 | img_page_url=base_url+"article-"+id+"-"+str(i)+".html"
97 | img_page=s.get(img_page_url)
98 | img_soup=BeautifulSoup(img_page.text, "html.parser")
99 | img_url=img_soup.find("div",class_="pp hh").find("img").get("src")
100 | img_name = img_url.split("/")[-1]
101 | img_loc_path = self.img_path + time.strftime('%Y%m%d', time.localtime(
102 | time.time())) + "/" + id + "/" + img_name
103 | imgp = pageid, img_loc_path,img_url
104 | cursor.execute("INSERT INTO images_image (pageid,imageurl,originurl) VALUES (%s,%s,%s)", imgp)
105 | if i==1:
106 | cursor.execute(
107 | "UPDATE images_page SET firstimg = %s WHERE id = %s",
108 | (img_loc_path, pageid))
109 | self.img_url_list.append({"img_url":img_url,"Referer":url,"id":id})
110 |
111 | def down_img(self,imgsrc,Referer,id):
112 | headers = {
113 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
114 | "Referer": Referer
115 | }
116 | path = img_path + time.strftime('%Y%m%d', time.localtime(time.time())) + "/"
117 | page_id = id
118 | isdata = os.path.exists("../" + path + page_id)
119 | if not isdata:
120 | os.makedirs("../" + path + page_id)
121 | with open("../" + path + page_id + "/" + imgsrc.split("/")[-1].split(".")[0] + ".jpg", "wb") as f:
122 | print("已保存:" + path + page_id + "/" + imgsrc.split("/")[-1].split(".")[0] + ".jpg")
123 | f.write(s.get(imgsrc, headers=headers,verify=False).content)
124 |
125 | def run_page(self):
126 | while True:
127 | Spider.rlock.acquire()
128 | if len(self.page_url_list) == 0:
129 | Spider.rlock.release()
130 | break
131 | else:
132 | page_url = self.page_url_list.pop()
133 | Spider.rlock.release()
134 | try:
135 | self.get_img(page_url)
136 | except Exception as e:
137 | print(e)
138 | pass
139 |
140 | def run_img(self):
141 | while True:
142 | Spider.rlock.acquire()
143 | if len(self.img_url_list) == 0 :
144 | Spider.rlock.release()
145 | break
146 | else:
147 | urls = self.img_url_list.pop()
148 | url = urls.get("img_url")
149 | Referer = urls.get("Referer")
150 | id = urls.get("id")
151 | Spider.rlock.release()
152 | try:
153 | self.down_img(url, Referer, id)
154 | except Exception as e:
155 | print(e)
156 | pass
157 |
158 | def run_1(self):
159 | # start thread_num threads to fetch the per-image urls
160 | url_threa_list=[]
161 | for th in range(self.thread_num):
162 | add_pic_t = threading.Thread(target=self.run_page)
163 | url_threa_list.append(add_pic_t)
164 |
165 | for t in url_threa_list:
166 | t.setDaemon(True)
167 | t.start()
168 |
169 | for t in url_threa_list:
170 | t.join()
171 |
172 | def run_2(self):
173 | # start thread_num threads to download the images
174 | for img_th in range(self.thread_num):
175 | download_t = threading.Thread(target=self.run_img)
176 | download_t.start()
177 |
178 | # start_page is the first page to crawl and end_page the last; type does not need changing, classification is automatic; the first page is 1
179 | if __name__ == "__main__":
180 | cl_list=[{"start_page": 1,"end_page":17, "type": "Cosplay", "type_id":6},
181 | {"start_page": 1,"end_page":17, "type": "性感", "type_id":1},
182 | {"start_page": 1, "end_page": 17, "type": "丝袜", "type_id": 2},
183 | {"start_page": 1, "end_page": 17, "type": "美腿", "type_id": 2},
184 | {"start_page": 1, "end_page": 17, "type": "美胸", "type_id": 1},
185 | {"start_page": 1, "end_page": 17, "type": "制服诱惑", "type_id": 2}
186 | ]
187 |
188 |
189 | for i in cl_list:
190 | spider = Spider(start_page_num=i.get("start_page"),end_page_num=i.get("end_page"), img_path='/static/images/', thread_num=10,
191 | type=i.get("type"),type_id=i.get("type_id"))
192 | spider.get_url()
193 | spider.run_1()
194 | spider.run_2()
--------------------------------------------------------------------------------
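A side note on the crawler above: its INSERT statements pass values as parameters, but the firstimg UPDATE splices img_loc_path and pageid into the SQL string by concatenation. pymysql escapes bound parameters, so the parameterized form is both consistent with the rest of the file and immune to quotes inside a value. A minimal sketch reusing the same names:

    # parameterized equivalent of the concatenated firstimg UPDATE
    cursor.execute("UPDATE images_page SET firstimg = %s WHERE id = %s",
                   (img_loc_path, pageid))
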
/crawler/crawler_nsg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import sys
4 |
5 | sys.path.append('../')
6 | from bs4 import BeautifulSoup
7 | from requests.adapters import HTTPAdapter
8 | import threading,pymysql,time,requests,os,urllib3,re,random
9 | from config import mysql_config
10 |
11 | requests.packages.urllib3.disable_warnings()
12 | requests.adapters.DEFAULT_RETRIES = 5
13 | s = requests.session()
14 | s.keep_alive = False
15 | s.mount('http://', HTTPAdapter(max_retries=3))
16 | # Database connection settings
17 | dbhost = {
18 | "host": mysql_config['HOST'],
19 | "dbname": mysql_config['NAME'],
20 | "user": mysql_config['USER'],
21 | "password": mysql_config['PASSWORD']
22 | }
23 |
24 | # dbhost = {
25 | # "host": "192.168.1.67",
26 | # "dbname": "silumz",
27 | # "user": "silumz",
28 | # "password": "fendou2009"
29 | # }
30 |
31 | base_url="http://www.nvshenge.com/mntp/"
32 | img_path='/static/images/'
33 |
34 | class Spider():
35 | page_url_list = []
36 | img_url_list = []
37 | rlock = threading.RLock()
38 | proxy_dict = ""
39 | def __init__(self, start_page_num, end_page_num,img_path, thread_num, type="home"):
40 | self.start_page_num = start_page_num
41 | self.end_page_num=end_page_num
42 | self.img_path = img_path
43 | self.thread_num = thread_num
44 | self.type = type
45 |
46 | def get_url(self):
47 | for i in range(self.start_page_num -1, self.end_page_num -1):
48 | if i==0:
49 | page=s.get(base_url, verify=False).text
50 | else:
51 | page = s.get(base_url + "list_"+str(i)+".html", verify=False).text
52 | soup = BeautifulSoup(page, "html.parser")
53 | all_list = soup.find_all("a", class_="PicTxt")
54 | i = 0
55 | for info_soup in all_list:
56 | url=info_soup.get("href")
57 | title=info_soup.text
58 | self.page_url_list.append({"url":url,"title":title})
59 | i += 1
60 |
61 |
62 | def get_img(self):
63 | db = pymysql.connect(dbhost.get("host"), dbhost.get("user"), dbhost.get("password"),
64 | dbhost.get("dbname"))
65 | cursor = db.cursor()
66 | while True:
67 | self.rlock.acquire()
68 | if len(self.page_url_list) == 0:
69 | self.rlock.release()
70 | break
71 | else:
72 | page_info= self.page_url_list.pop()
73 | page_url = page_info.get("url")
74 | title = page_info.get("title")
75 | if "袜" in title or "丝" in title or "腿" in title:
76 | type_id = 2
77 | elif "青春" in title or "清纯" in title:
78 | type_id = 3
79 | elif "萝莉" in title:
80 | type_id = 4
81 | else:
82 | type_id = 1
83 | self.rlock.release()
84 | try:
85 | tagidlist = []
86 | page = s.get(page_url, verify=False).text
87 | soup = BeautifulSoup(page, "html.parser")
88 | img_num_soup = soup.find("div", class_="articleTop yh").find("h1").text
89 | img_num = int(img_num_soup[img_num_soup.find("(1/") + 3:img_num_soup.find(")")])
90 |                     isExists = cursor.execute(
91 |                         "SELECT title FROM images_page WHERE title = %s limit 1;", (title,))
92 | if isExists != 0:
93 | print("已采集:" + title)
94 | else:
95 | taglist = soup.find("div",class_="articleTag l").find_all("dd")
96 | for tag_soup in taglist:
97 | tag=tag_soup.text
98 | sqltag = "SELECT * FROM images_tag WHERE tag =" + "'" + tag + "'" + " limit 1;"
99 | isExiststag = cursor.execute(sqltag)
100 | if isExiststag == 0:
101 | cursor.execute("INSERT INTO images_tag (tag) VALUES (%s)", tag)
102 | cursor.execute("SELECT id FROM images_tag WHERE tag =" + "'" + tag + "'")
103 | for id in cursor.fetchall():
104 | tagidlist.append(id[0])
105 | p = (
106 | title, str(tagidlist), time.strftime('%Y-%m-%d', time.localtime(time.time())), type_id,
107 | "1", page_url)
108 | cursor.execute(
109 | "INSERT INTO images_page (title,tagid,sendtime,typeid,firstimg,crawler) VALUES (%s,%s,%s,%s,%s,%s)",
110 | p)
111 | print("开始采集:" + title)
112 | pageid = cursor.lastrowid
113 | for i in range(0, int(img_num)):
114 | img_id = page_url.split("/")[-1].split(".")[0]
115 | if i==0:
116 | url=page_url
117 | else:
118 | url = "/".join(page_url.split("/")[0:-1])+"/"+img_id+"_"+str(i)+".html"
119 | img_page=s.get(url, verify=False).text
120 | img_soup= BeautifulSoup(img_page, "html.parser")
121 | img_src=img_soup.find("div",id="ArticlePicBox1").find("img").get("src")
122 | img_loc_path = self.img_path + time.strftime('%Y%m%d', time.localtime(
123 | time.time())) + "/"+img_id+"/"+img_src.split("/")[-1]
124 | if i == 0:
125 |                             cursor.execute(
126 |                                 "UPDATE images_page SET firstimg = %s WHERE title = %s", (img_loc_path, title))
127 | imgp = pageid, img_loc_path, img_src
128 | cursor.execute("INSERT INTO images_image (pageid,imageurl,originurl) VALUES (%s,%s,%s)",
129 | imgp)
130 | self.img_url_list.append({"url": img_src, "path": img_loc_path, "referer": page_url})
131 | except Exception as e:
132 |                 cursor.execute("DELETE FROM images_page WHERE title = %s", (title,))
133 | print("采集失败(已删除):",title)
134 | print("连接地址:", page_url)
135 | print("错误信息:", e)
136 | db.close()
137 |
138 | def down_img(self, imgsrc, imgpath, referer):
139 | headers = {
140 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
141 | "Referer": referer
142 | }
143 | isdata = os.path.exists(".." +"/".join(imgpath.split("/")[0:-1]))
144 | if not isdata:
145 | os.makedirs(".."+"/".join(imgpath.split("/")[0:-1]))
146 | with open(".."+ imgpath, "wb")as f:
147 | f.write(requests.get(imgsrc, headers=headers, verify=False).content)
148 | print("下载图片:" + imgpath)
149 |
150 | def down_url(self):
151 | while True:
152 | Spider.rlock.acquire()
153 | if len(Spider.img_url_list) == 0:
154 | Spider.rlock.release()
155 | break
156 | else:
157 | img_url = Spider.img_url_list.pop()
158 | Spider.rlock.release()
159 | try:
160 | url = img_url.get("url")
161 | path = img_url.get("path")
162 | referer = img_url.get("referer")
163 | self.down_img(url, path, referer)
164 | except Exception as e:
165 | print(e)
166 | self.img_url_list.append(
167 | {"url": img_url.get("url"), "path": img_url.get("path"), "referer": img_url.get("referer")})
168 | pass
169 |
170 | def run_1(self):
171 |         # Start thread_num threads to crawl the individual image-page URLs
172 | url_threa_list = []
173 | for th in range(self.thread_num):
174 | add_pic_t = threading.Thread(target=self.get_img)
175 | url_threa_list.append(add_pic_t)
176 |
177 | for t in url_threa_list:
178 |             t.daemon = True  # setDaemon() is deprecated; set the daemon attribute instead
179 | t.start()
180 |
181 | for t in url_threa_list:
182 | t.join()
183 |
184 | def run_2(self):
185 |         # Start thread_num threads to download the images
186 | for img_th in range(self.thread_num):
187 | download_t = threading.Thread(target=self.down_url)
188 | download_t.start()
189 |
190 |
191 | # start_page is the first page to crawl and end_page is the last; type does not need to be changed (categorisation is automatic); page numbering starts at 1
192 | if __name__ == "__main__":
193 | for i in [{"start_page": 1,"end_page":2, "type": "index"}]:
194 | spider = Spider(start_page_num=i.get("start_page"),end_page_num=i.get("end_page"), img_path='/static/images/', thread_num=10,
195 | type=i.get("type"))
196 | spider.get_url()
197 | spider.run_1()
198 | spider.run_2()
--------------------------------------------------------------------------------
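A note on persistence in crawler_nsg.py: the connection is opened with positional arguments and db.commit() is never called, while pymysql connections do not autocommit by default, so the INSERT/UPDATE statements above are only persisted if the tables ignore transactions (e.g. MyISAM) or autocommit is enabled elsewhere. A minimal sketch of a more defensive connection, reusing the dbhost dict from the top of the file (recent pymysql releases also expect keyword arguments):

    # explicit keyword arguments plus autocommit, so the crawler's writes do not
    # depend on the server or table configuration; utf8mb4 is an assumption made
    # here because the titles and tags are Chinese text
    db = pymysql.connect(host=dbhost.get("host"),
                         user=dbhost.get("user"),
                         password=dbhost.get("password"),
                         database=dbhost.get("dbname"),
                         charset="utf8mb4",
                         autocommit=True)
    cursor = db.cursor()
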
/static/zde/css/fonts/iconfont.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
93 |
--------------------------------------------------------------------------------
/images/views.py:
--------------------------------------------------------------------------------
1 | from django.shortcuts import render
2 | from images.models import *
3 | import random, json
4 | from django.http import HttpResponse
5 | from config import site_name, site_url, key_word, description, email,friendly_link
6 |
7 |
8 |
9 | def index(request):
10 | if request.method == "GET":
11 | imgs = []
12 | page_list = Page.objects.all().order_by('?')[:50]
13 | typedict, typelist = type_list()
14 | for pid in page_list:
15 | id = pid.id
16 | title = pid.title
17 | firstimg = pid.firstimg
18 | sendtime = pid.sendtime
19 | hot = pid.hot
20 | type_id = pid.typeid
21 | imgs.append({"pid": id, "firstimg": firstimg, "title": title, "sendtime": sendtime, "hot": hot,
22 | "type": typedict[type_id], "type_id": type_id})
23 | return render(request, 'index.html',
24 | {"data": imgs, "typelist": typelist, "siteName": site_name, "keyWord": key_word,
25 | "description": description, "siteUrl": site_url, "email": email})
26 |
27 |
28 | def page(request, i_id):
29 |
30 | page_arr = Page.objects.get(id=i_id)
31 | imgs = []
32 | tags = []
33 | typedict, typelist = type_list()
34 | page_hot = page_arr.hot
35 | page_arr.hot = page_hot + 1
36 | page_arr.save()
37 | time = page_arr.sendtime
38 | typeid = page_arr.typeid
39 | pagetype = Type.objects.get(id=typeid).type
40 | title = page_arr.title
41 | taglist = page_arr.tagid
42 | tag_arr = taglist.replace("[", "").replace("]", "").split(",")
43 | for t_id in tag_arr:
44 | tagid = t_id.strip(" ")
45 | tag = Tag.objects.get(id=tagid).tag
46 | tags.append({"tname": tag, "tid": tagid})
47 | imglist = Image.objects.filter(pageid=i_id)
48 | for img_arr in imglist:
49 | img = img_arr.imageurl
50 | imgs.append(img)
51 | if len(tags) > 4:
52 | tags = random.sample(tags, 4)
53 | typename = typedict[typeid]
54 | return render(request, 'page.html',
55 | {"data": imgs, "tag": tags, "title": title, "type": pagetype, "typeid": str(typeid), "time": time,
56 |                    "similar": page_similar(typeid, i_id), "typelist": typelist, "pageid": i_id, "siteName": site_name,
57 | "keyWord": key_word, "description": description, "typeName": typename, "siteUrl": site_url,
58 | "email": email,"friendly_link":friendly_link})
59 |
60 |
61 | def tag(request, tid):
62 | if request.method == "GET":
63 | imgs = []
64 | page_list = Page.objects.all().order_by("-id")
65 | typedict, typelist = type_list()
66 | for pid in page_list:
67 | if tid in pid.tagid:
68 | id = pid.id
69 | title = pid.title
70 | firstimg = pid.firstimg
71 | type_id = pid.typeid
72 | sendtime = pid.sendtime
73 | hot = pid.hot
74 | imgs.append({"pid": id, "firstimg": firstimg, "title": title, "sendtime": sendtime, "hot": hot,
75 | "type": typedict[type_id], "type_id": type_id})
76 | return render(request, 'index.html',
77 | {"data": imgs, "typelist": typelist, "siteName": site_name, "keyWord": key_word,
78 | "description": description, "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
79 |
80 |
81 | def type(request, typeid):
82 | if request.method == "GET":
83 | imgs = []
84 | typedict, typelist = type_list()
85 | page_list = Page.objects.filter(typeid=typeid).order_by("-id")
86 | for pid in page_list:
87 | title = pid.title
88 | firstimg = pid.firstimg
89 | id = pid.id
90 | hot = pid.hot
91 | type_id = pid.typeid
92 | sendtime = pid.sendtime
93 | imgs.append({"pid": id, "firstimg": firstimg, "title": title, "sendtime": sendtime, "hot": hot,
94 | "type": typedict[type_id], "type_id": type_id})
95 | return render(request, 'category.html',
96 | {"data": imgs, "typelist": typelist, "typeid": str(typeid), "siteName": site_name,
97 | "keyWord": key_word, "description": description, "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
98 |
99 |
100 | def page_similar(id, current_id=None):  # id is the typeid; current_id is the page being viewed
101 | similarlist = []
102 | sidlist = Page.objects.filter(typeid=id).order_by("?")
103 | type = Type.objects.get(id=id).type
104 | i = 0
105 | for s in sidlist:
106 | if i < 20:
107 | stitle = s.title
108 | pid = s.id
109 | tid = s.typeid
110 | firstimg = s.firstimg
111 | sendtime = s.sendtime
112 | hot = s.hot
113 |             if str(pid) != str(current_id):  # exclude the page currently being viewed
114 | similarlist.append(
115 | {"stitle": stitle, "tid": tid, "pid": pid, "firstimg": firstimg, "sendtime": sendtime, "hot": hot,
116 | "type": type, "type_id": tid
117 | })
118 | i += 1
119 | return similarlist
120 |
121 |
122 | def search(request):
123 | if "s" in request.GET:
124 | imgs = []
125 | typedict, typelist = type_list()
126 | context = request.GET['s']
127 | pagelist = Page.objects.filter(title__contains=context).order_by("-id")
128 | for pid in pagelist:
129 | title = pid.title
130 | firstimg = pid.firstimg
131 | id = pid.id
132 | hot = pid.hot
133 | type_id = pid.typeid
134 | sendtime = pid.sendtime
135 | imgs.append({"pid": id, "firstimg": firstimg, "title": title, "sendtime": sendtime, "hot": hot,
136 | "type": typedict[type_id], "type_id": type_id})
137 | return render(request, 'index.html',
138 | {"data": imgs, "typelist": typelist, "siteName": site_name, "keyWord": key_word,
139 | "description": description, "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
140 |
141 |
142 | def HotTag(request):
143 | tag_sql = Tag.objects.all().order_by("?")
144 | tag_dict = {}
145 | tag_id_list = []
146 | page_sql = Page.objects.all()
147 | page_dict = {}
148 | return_list = []
149 | typedict, typelist = type_list()
150 | for alltag in tag_sql:
151 | tag_dict.update({str(alltag.id).strip(): alltag.tag})
152 | for page in page_sql:
153 | title = page.title
154 | pid = page.id
155 | tag_id = page.tagid.replace("[", "").replace("]", "").split(",")
156 | for t in tag_id:
157 | if str(t).strip() == '':
158 | pass
159 | else:
160 | if str(t).strip() not in tag_id_list:
161 | page_dict.update({str(t).strip(): 1})
162 | tag_id_list.append(str(t).strip())
163 | else:
164 | view = page_dict[str(t).strip()]
165 | page_dict.update({str(t).strip(): view + 1})
166 |
167 | page_dict_sort = sorted(page_dict.items(), key=lambda d: d[1], reverse=True)
168 | for i in page_dict_sort:
169 | if page_dict[str(i[0])] > 20:
170 | return_list.append(
171 | {"tid": i[0], "tag": tag_dict[str(i[0]).strip()], "viwe": page_dict[str(i[0].strip())]}
172 | )
173 | return render(request, 'tag.html',
174 | {"data": return_list, "typelist": typelist, "keyword": return_list[0:10], "siteName": site_name,
175 | "keyWord": key_word, "description": description, "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
176 |
177 |
178 | def SortBy(request, method):
179 | if request.method == "GET":
180 | if method == "new":
181 | page_list = Page.objects.all().order_by("-id")[:100]
182 | else:
183 | page_list = Page.objects.all().order_by("-hot")[:100]
184 | imgs = []
185 | type_dict, typelist = type_list()
186 | for pid in page_list:
187 | title = pid.title
188 | firstimg = pid.firstimg
189 | id = pid.id
190 | hot = pid.hot
191 | type_id = pid.typeid
192 | sendtime = pid.sendtime
193 | imgs.append({"pid": id, "firstimg": firstimg, "title": title, "sendtime": sendtime, "hot": hot,
194 | "type": type_dict[type_id], "type_id": type_id})
195 |
196 | return render(request, 'sort.html',
197 | {"data": imgs, "typelist": typelist, "method": method, "siteName": site_name, "keyWord": key_word,
198 | "description": description, "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
199 |
200 |
201 | def getVideo(request):
202 | count = Video.objects.count()
203 | video_info = ''
204 | while True:
205 | vid = random.randint(1, count)
206 | try:
207 | video_info = Video.objects.get(id=vid)
208 | break
209 | except:
210 | continue
211 | url = video_info.url
212 | user_id = video_info.user_id
213 | source = video_info.source
214 | return HttpResponse(json.dumps({"url": url, "user_id": user_id, "source": source}))
215 |
216 |
217 | def mVideo(request):
218 | if request.method == "GET":
219 | count = Video.objects.count()
220 | video_info = ''
221 | while True:
222 | vid = random.randint(1, count)
223 | try:
224 | video_info = Video.objects.get(id=vid)
225 | break
226 | except:
227 | continue
228 | url = "https:"+video_info.url
229 | return render(request, 'mVideo.html', {
230 | "url": url,
231 | "user_id": video_info.user_id,
232 | "date_time": video_info.date_time,
233 | "v_name": video_info.v_name,
234 | "source": video_info.source, "siteName": site_name, "keyWord": key_word, "description": description,
235 | "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
236 |
237 |
238 | def pVideo(request):
239 | if request.method == "GET":
240 | typedict, typelist = type_list()
241 | count = Video.objects.count()
242 | video_info = ''
243 | while True:
244 | vid = random.randint(1, count)
245 | try:
246 | video_info = Video.objects.get(id=vid)
247 | break
248 | except:
249 | continue
250 | url="https:"+video_info.url
251 | return render(request, 'video.html', {
252 | "url": url,
253 | "user_id": video_info.user_id,
254 | "date_time": video_info.date_time,
255 | "v_name": video_info.v_name,
256 | "source": video_info.source,
257 | "typelist": typelist, "siteName": site_name, "keyWord": key_word, "description": description,
258 | "siteUrl": site_url, "email": email,"friendly_link":friendly_link})
259 |
260 |
261 | def type_list():
262 | typelist = []
263 |     type_qs = Type.objects.all().order_by("id")  # avoid shadowing the function name
264 | type_dict = {}
265 |     for type_arr in type_qs:
266 | type = type_arr.type
267 | type_id = type_arr.id
268 | typelist.append({"type": type, "type_id": str(type_id)})
269 | type_dict.update({type_id: type})
270 | return type_dict, typelist
271 |
--------------------------------------------------------------------------------
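A note on views.py: the tagid column holds a stringified Python list (e.g. "[3, 7, 12]", as written by the crawlers via str(tagidlist)), and both page() and HotTag() re-implement the same replace/split/strip parsing inline. A small hypothetical helper (not part of the project) that both call sites could share:

    # hypothetical helper: turn the stored "[3, 7, 12]" string into ["3", "7", "12"],
    # dropping the empty entries an empty list would otherwise produce
    def parse_tagids(tagid_field):
        return [t.strip() for t in
                tagid_field.replace("[", "").replace("]", "").split(",")
                if t.strip()]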