├── README.md ├── javlibrary.sql └── javlibrary.py /README.md: -------------------------------------------------------------------------------- 1 | # Javlibrary Bot 2 | 3 | Use this poorly written Python bot to build your local Javlibrary. 4 | 5 | ## How to use 6 | 7 | - Import the SQL file provided and modify the MySQL section in javlibrary.py 8 | - Run javlibrary.py 9 | 10 | ## What you need 11 | 12 | - Python 3 13 | - A MySQL server 14 | - 15 GB of free space to store images 15 | - [PyMySQL](https://github.com/PyMySQL/PyMySQL) 16 | - [BeautifulSoup4](http://www.crummy.com/software/BeautifulSoup) 17 | 18 | If you have both Python 2 and Python 3 installed make sure you use the correct pip. 19 | ```bash 20 | dpkg -L python3-pip | tail -n 1 21 | ``` 22 | Then solve dependencies using the correct one. 23 | 24 | ## How it work 25 | 26 | ``` 27 | +-------------+ +----------+ 28 | | MAIN THREAD | | PROGESS | 29 | +-------------+ | SAVEFILE | <--+ 30 | | +----------+ | 31 | +-----------------+ RESTORE PROGRESS ON START 32 | | LOTS OF WORKERS | SAVE PROGRESS EVERY 60SEC 33 | +--+--------------+ | 34 | | | 35 | +-> Get URL from pool <----------+ +----------+ | 36 | | + | | | 37 | | +-> Mark scanned +--------+ | URL POOL | <--+ 38 | | | | | | 39 | | +------> +----------+ | 40 | +-> Extract all links | | | 41 | | + + | +----------+ | 42 | | +-> Select what we need | | | | 43 | | | | SCANNED | | 44 | | +-> | URL POOL | <--+ 45 | +-> Extract all contents | | 46 | + +----------+ 47 | +-> Save in database 48 | | 49 | +-> Download images +-------> +--------+-> AAA-001 50 | | IMAGES | 51 | +----------> AAA-002 52 | | 53 | +-> ABC-123 54 | | 55 | +-> ... 56 | ``` 57 | 58 | ## Bugs 59 | 60 | Need to suppress lots of database related errors: 61 | - pymysql.err.InternalError: (1205, 'Lock wait timeout exceeded; try restarting transaction') -------------------------------------------------------------------------------- /javlibrary.sql: -------------------------------------------------------------------------------- 1 | -- phpMyAdmin SQL Dump 2 | -- version 4.2.12deb2 3 | -- http://www.phpmyadmin.net 4 | -- 5 | -- Host: localhost 6 | -- Generation Time: Jul 08, 2015 at 11:05 PM 7 | -- Server version: 5.5.44-0+deb8u1 8 | -- PHP Version: 5.6.9-0+deb8u1 9 | 10 | SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO"; 11 | SET time_zone = "+00:00"; 12 | 13 | 14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 17 | /*!40101 SET NAMES utf8 */; 18 | 19 | -- 20 | -- Database: `javlibrary` 21 | -- 22 | 23 | -- -------------------------------------------------------- 24 | 25 | -- 26 | -- Table structure for table `artists` 27 | -- 28 | 29 | CREATE TABLE IF NOT EXISTS `artists` ( 30 | `alink` char(6) NOT NULL COMMENT 'Primary Key: Artist shortlink.', 31 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.' 32 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 33 | 34 | -- -------------------------------------------------------- 35 | 36 | -- 37 | -- Table structure for table `directors` 38 | -- 39 | 40 | CREATE TABLE IF NOT EXISTS `directors` ( 41 | `dlink` char(4) NOT NULL COMMENT 'Primary Key: Director shortlink.', 42 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.' 43 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 44 | 45 | -- -------------------------------------------------------- 46 | 47 | -- 48 | -- Table structure for table `genres` 49 | -- 50 | 51 | CREATE TABLE IF NOT EXISTS `genres` ( 52 | `glink` char(4) NOT NULL COMMENT 'Primary Key: Genres shortlink.', 53 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.' 54 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 55 | 56 | -- -------------------------------------------------------- 57 | 58 | -- 59 | -- Table structure for table `issuers` 60 | -- 61 | 62 | CREATE TABLE IF NOT EXISTS `issuers` ( 63 | `ilink` char(4) NOT NULL COMMENT 'Primary Key: Issuer shortlink.', 64 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.' 65 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 66 | 67 | -- -------------------------------------------------------- 68 | 69 | -- 70 | -- Table structure for table `list_artists` 71 | -- 72 | 73 | CREATE TABLE IF NOT EXISTS `list_artists` ( 74 | `alink` char(6) NOT NULL COMMENT 'Primary key: Unique artist shortlink.', 75 | `name` varchar(32) NOT NULL COMMENT 'Artist name.' 76 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 77 | 78 | -- -------------------------------------------------------- 79 | 80 | -- 81 | -- Table structure for table `list_directors` 82 | -- 83 | 84 | CREATE TABLE IF NOT EXISTS `list_directors` ( 85 | `dlink` char(4) NOT NULL COMMENT 'Primary key: Unique director shortlink.', 86 | `name` varchar(32) NOT NULL COMMENT 'Director name.' 87 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 88 | 89 | -- -------------------------------------------------------- 90 | 91 | -- 92 | -- Table structure for table `list_genres` 93 | -- 94 | 95 | CREATE TABLE IF NOT EXISTS `list_genres` ( 96 | `glink` char(4) NOT NULL COMMENT 'Primary key: Unique genres shortlink.', 97 | `name` varchar(16) NOT NULL COMMENT 'Primary key: Genres name.' 98 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 99 | 100 | -- -------------------------------------------------------- 101 | 102 | -- 103 | -- Table structure for table `list_issuers` 104 | -- 105 | 106 | CREATE TABLE IF NOT EXISTS `list_issuers` ( 107 | `ilink` char(4) NOT NULL COMMENT 'Primary key: Unique issuer shortlink.', 108 | `name` varchar(32) NOT NULL COMMENT 'Issuer name.' 109 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 110 | 111 | -- -------------------------------------------------------- 112 | 113 | -- 114 | -- Table structure for table `list_makers` 115 | -- 116 | 117 | CREATE TABLE IF NOT EXISTS `list_makers` ( 118 | `mlink` char(4) NOT NULL COMMENT 'Primary key: Unique maker shortlink.', 119 | `name` varchar(32) NOT NULL COMMENT 'Maker name.' 120 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 121 | 122 | -- -------------------------------------------------------- 123 | 124 | -- 125 | -- Table structure for table `makers` 126 | -- 127 | 128 | CREATE TABLE IF NOT EXISTS `makers` ( 129 | `mlink` char(4) NOT NULL COMMENT 'Primary Key: Maker shortlink.', 130 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.' 131 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 132 | 133 | -- -------------------------------------------------------- 134 | 135 | -- 136 | -- Table structure for table `previews` 137 | -- 138 | 139 | CREATE TABLE IF NOT EXISTS `previews` ( 140 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.', 141 | `url` varchar(128) NOT NULL COMMENT 'Primary Key: Image URL.' 142 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 143 | 144 | -- -------------------------------------------------------- 145 | 146 | -- 147 | -- Table structure for table `videos` 148 | -- 149 | 150 | CREATE TABLE IF NOT EXISTS `videos` ( 151 | `id` varchar(16) NOT NULL COMMENT 'Primary Key: Video identifier.', 152 | `vlink` char(10) NOT NULL COMMENT 'Primary Key: Video shortlink.', 153 | `title` varchar(192) NOT NULL COMMENT 'Primary Key: Video title.', 154 | `date` date DEFAULT NULL COMMENT 'Release date.', 155 | `length` int(3) DEFAULT NULL COMMENT 'Video length.', 156 | `score` int(3) DEFAULT NULL COMMENT 'User rating.', 157 | `thumbnail` varchar(128) DEFAULT NULL COMMENT 'Thumbnail image URL.', 158 | `cover` varchar(128) DEFAULT NULL COMMENT 'Cover image URL.', 159 | `media` varchar(64) DEFAULT NULL COMMENT 'Media file URL.' 160 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 161 | 162 | -- 163 | -- Indexes for dumped tables 164 | -- 165 | 166 | -- 167 | -- Indexes for table `artists` 168 | -- 169 | ALTER TABLE `artists` 170 | ADD PRIMARY KEY (`alink`,`vlink`); 171 | 172 | -- 173 | -- Indexes for table `directors` 174 | -- 175 | ALTER TABLE `directors` 176 | ADD PRIMARY KEY (`dlink`,`vlink`); 177 | 178 | -- 179 | -- Indexes for table `genres` 180 | -- 181 | ALTER TABLE `genres` 182 | ADD PRIMARY KEY (`glink`,`vlink`); 183 | 184 | -- 185 | -- Indexes for table `issuers` 186 | -- 187 | ALTER TABLE `issuers` 188 | ADD PRIMARY KEY (`ilink`,`vlink`); 189 | 190 | -- 191 | -- Indexes for table `list_artists` 192 | -- 193 | ALTER TABLE `list_artists` 194 | ADD PRIMARY KEY (`alink`); 195 | 196 | -- 197 | -- Indexes for table `list_directors` 198 | -- 199 | ALTER TABLE `list_directors` 200 | ADD PRIMARY KEY (`dlink`); 201 | 202 | -- 203 | -- Indexes for table `list_genres` 204 | -- 205 | ALTER TABLE `list_genres` 206 | ADD PRIMARY KEY (`glink`,`name`); 207 | 208 | -- 209 | -- Indexes for table `list_issuers` 210 | -- 211 | ALTER TABLE `list_issuers` 212 | ADD PRIMARY KEY (`ilink`); 213 | 214 | -- 215 | -- Indexes for table `list_makers` 216 | -- 217 | ALTER TABLE `list_makers` 218 | ADD PRIMARY KEY (`mlink`); 219 | 220 | -- 221 | -- Indexes for table `makers` 222 | -- 223 | ALTER TABLE `makers` 224 | ADD PRIMARY KEY (`mlink`,`vlink`); 225 | 226 | -- 227 | -- Indexes for table `previews` 228 | -- 229 | ALTER TABLE `previews` 230 | ADD PRIMARY KEY (`vlink`,`url`); 231 | 232 | -- 233 | -- Indexes for table `videos` 234 | -- 235 | ALTER TABLE `videos` 236 | ADD PRIMARY KEY (`id`,`vlink`,`title`); 237 | 238 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 239 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 240 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 241 | -------------------------------------------------------------------------------- /javlibrary.py: -------------------------------------------------------------------------------- 1 | #!/#!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | __author__ = 'dommyet' 4 | 5 | import os 6 | import shutil 7 | import pickle 8 | import threading 9 | import re 10 | import urllib.request 11 | import urllib.error 12 | import gzip 13 | import time 14 | from collections import deque 15 | import pymysql 16 | from bs4 import BeautifulSoup 17 | 18 | 19 | class Spider: 20 | def __init__(self): 21 | self.url_seed = ('genres.php', 'star_list.php?prefix=A', 22 | 'vl_update.php', 'vl_newentries.php', 'vl_newrelease.php?&mode=2&', 23 | 'vl_mostwanted.php', 'vl_mostwanted.php?&mode=2&', 24 | 'vl_bestrated.php', 'vl_bestrated.php?&mode=2&') 25 | 26 | self.url_pool = deque() 27 | self.url_scanned_pool = deque() 28 | for i in self.url_seed: 29 | self.url_pool.append(i) 30 | 31 | # Load previous from files 32 | try: 33 | print('[INFO] Restoring scan progress') 34 | f = open('url_pool.bin', 'rb') 35 | self.url_pool = pickle.load(f) 36 | f.close() 37 | f = open('url_scanned_pool.bin', 'rb') 38 | self.url_scanned_pool = pickle.load(f) 39 | f.close() 40 | print('[INFO] Scan progress restored') 41 | except FileNotFoundError: 42 | print('[INFO] File not found') 43 | 44 | # Test database 45 | try: 46 | self.mysql() 47 | self.mysql().close() 48 | except: 49 | print('[CRIT] Database error please check settings') 50 | time.sleep(9999) 51 | 52 | if not os.path.exists('images'): # Check and create folder 53 | os.makedirs('images') 54 | 55 | self.re_detail = re.compile(r'\./\?v=\w{10}') 56 | self.re_genres = re.compile(r'genres\.php') 57 | self.re_artist = re.compile(r'star_list\.php\?*') 58 | self.re_update_list = re.compile(r'vl_update\.php\?list*') 59 | self.re_overview = re.compile( 60 | r'vl_genre|vl_star|vl_director|vl_maker|vl_label|vl_bestrated|vl_mostwanted|vl_newentries|vl_newrelease|vl_update|star_list|\.php\?*') 61 | 62 | self.flag = True # Worker control flag 63 | 64 | def get_url(self): 65 | while len(self.url_pool) > 0: 66 | url = self.url_pool.popleft() 67 | if url not in self.url_scanned_pool: 68 | self.url_scanned_pool.append(url) 69 | return url 70 | else: 71 | time.sleep(5) 72 | 73 | def soup(self, url): 74 | thread_name = threading.current_thread().getName() 75 | req = urllib.request.Request('http://www.javlibrary.com/cn/' + url) 76 | req.add_header('Host', 'www.javlibrary.com') 77 | req.add_header('Accept', 'text/html, application/xhtml+xml, */*') 78 | req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko') 79 | req.add_header('Referer', 'http://www.javlibrary.com/') 80 | req.add_header('Accept-Encoding', 'gzip, deflate') 81 | req.add_header('Accept-Language', 'en-US,en;q=0.8,zh;q=0.6,zh-CN;q=0.4') 82 | 83 | while True: 84 | try: # Open URL 85 | response = urllib.request.urlopen(req).read() 86 | try: # Decompress 87 | html = gzip.decompress(response).decode('utf-8', 'ignore') 88 | except OSError: # Not compressed 89 | html = response.decode('utf-8', 'ignore') 90 | finally: 91 | break # Jump out from the loop 92 | except urllib.error.HTTPError as error: # HTTP error 93 | print('[WARN] %s # Error %s while opening %s' % (thread_name, error.code, url)) 94 | except urllib.error.URLError: # URL error 95 | print('[WARN] %s # URL error while opening %s' % (thread_name, url)) 96 | finally: # Sleep and retry 97 | time.sleep(5) 98 | return BeautifulSoup(html, 'html.parser') 99 | 100 | def mysql(self): 101 | mysql_host = 'localhost' 102 | # mysql_sock = 'var/run/mysqld/mysqld.sock' 103 | mysql_user = 'username' 104 | mysql_pass = 'password' 105 | conn = pymysql.connect( 106 | user=mysql_user, passwd=mysql_pass, host=mysql_host, db='javlibrary', charset='utf8') 107 | # conn = pymysql.connect( 108 | # user=mysql_user, passwd=mysql_pass, host=mysql_host, unix_socket=mysql_sock, db='javlibrary', charset='utf8') 109 | return conn 110 | 111 | def download(self, url, identity): 112 | thread_name = threading.current_thread().getName() 113 | req = urllib.request.Request(url) 114 | req.add_header('Host', 'pics.dmm.co.jp') 115 | req.add_header('Accept', 'text/html, application/xhtml+xml, */*') 116 | req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko') 117 | req.add_header('Referer', 'http://www.javlibrary.com/') 118 | req.add_header('Accept-Encoding', 'gzip, deflate') 119 | req.add_header('Accept-Language', 'en-US,en;q=0.8,zh;q=0.6,zh-CN;q=0.4') 120 | req.add_header('Cache-Control', 'no-cache') 121 | 122 | if not os.path.exists('images' + os.sep + identity): # Test and create folder 123 | os.makedirs('images' + os.sep + identity) 124 | 125 | while True: 126 | try: # Open URL 127 | response = urllib.request.urlopen(req) 128 | output = open('images' + os.sep + identity + os.sep + os.path.split(url)[1], 'wb') 129 | shutil.copyfileobj(response, output) 130 | break # Jump out from the loop 131 | except urllib.error.HTTPError as error: # HTTP error 132 | print('[WARN] %s # Error %s while downloading %s' % (thread_name, error.code, url)) 133 | except urllib.error.URLError: # URL error 134 | print('[WARN] %s # URL error while downloading %s' % (thread_name, url)) 135 | except TimeoutError: 136 | print('[WARN] %s # Timeout error while downloading %s' % (thread_name, url)) 137 | except ConnectionResetError: 138 | print('[WARN] %s # Connection reset error while downloading %s' % (thread_name, url)) 139 | finally: # Sleep and retry 140 | time.sleep(5) 141 | 142 | def dispatch_url(self, url): 143 | if url not in self.url_scanned_pool: # URL not scanned 144 | if self.re_detail.match(url): # URL matching detail page 145 | self.url_pool.append(url) 146 | elif self.re_overview.match(url) and not self.re_update_list.match(url): 147 | self.url_pool.append(url) # URL matching overview page, avoid vl_update list view 148 | 149 | def parse_url(self, url, soup): 150 | if self.re_detail.match(url): # URL matching detail page 151 | self.detail_parser(soup) 152 | elif self.re_genres.match(url): # URL matching genres page 153 | self.genres_parser(soup) 154 | elif self.re_artist.match(url): # URL matching artist page 155 | self.artist_parser(soup) 156 | else: # URL matching overview page 157 | self.overview_parser(soup) 158 | 159 | def detail_parser(self, soup): 160 | video_identity = soup.find('div', id='video_id', class_='item').find('td', class_='text').get_text() 161 | 162 | video_shortlink = soup.find('link', rel='shortlink').get('href')[-10:] 163 | 164 | video_title = soup.title.string.strip(video_identity + ' ').strip(' - JAVLibrary') 165 | 166 | video_date = soup.find('div', id='video_date', class_='item').find('td', class_='text').get_text() 167 | 168 | video_length = soup.find('div', id='video_length', class_='item').find('span', class_='text').get_text() 169 | 170 | try: # Some video doesn't have score field 171 | video_score = soup.find('div', id='video_review', class_='item').find('span', class_='score').get_text() 172 | try: # Some video doesn't have score 173 | video_score = str(int(float(video_score.strip('()')) * 100)) 174 | except ValueError: 175 | video_score = '0' 176 | except AttributeError: 177 | video_score = '0' 178 | 179 | video_cover = soup.find('div', id='video_jacket').find('img', id='video_jacket_img').get('src') 180 | 181 | video_preview = [] 182 | try: # Some video doesn't have preview images 183 | for i in soup.find('div', class_='previewthumbs').findAll('img'): 184 | video_preview.append(i.get('src')) 185 | except AttributeError: 186 | pass 187 | 188 | conn = self.mysql() 189 | cursor = conn.cursor() 190 | 191 | # Table videos 192 | cursor.execute('SELECT vlink FROM videos WHERE vlink = %s', video_shortlink) 193 | if not cursor.fetchall(): # If entry not exist insert entry 194 | item = [video_identity, video_shortlink, video_title, video_date, video_length, video_score, video_cover] 195 | while True: 196 | try: 197 | cursor.execute('INSERT INTO videos (id, vlink, title, date, length, score, cover) VALUES (%s, %s, %s, %s, %s, %s, %s)', item) 198 | break 199 | except pymysql.err.InternalError: # Suppress lock wait timeout exceeded error 200 | time.sleep(1) 201 | else: # If entry exist update entry 202 | item = [video_date, video_length, video_score, video_cover, video_shortlink] 203 | cursor.execute('UPDATE videos SET date = %s, length = %s, score = %s, cover = %s WHERE vlink = %s', item) 204 | 205 | # Table previews 206 | for i in video_preview: 207 | cursor.execute('SELECT vlink FROM previews WHERE vlink = %s AND url = %s', (video_shortlink, i)) 208 | if not cursor.fetchall(): # If entry not exist insert entry 209 | cursor.execute('INSERT INTO previews (vlink, url) VALUES (%s, %s)', (video_shortlink, i)) 210 | 211 | # Table list_directors, directors 212 | for i in soup.find('div', id='video_director').findAll('a', rel='tag'): 213 | video_director = [i.get('href')[18:], i.get_text()] 214 | if video_director[1] != '----': # Some video doesn't have director 215 | cursor.execute('SELECT dlink FROM list_directors WHERE dlink = %s', video_director[0]) 216 | if not cursor.fetchall(): # If entry not exist insert entry 217 | cursor.execute('INSERT INTO list_directors (dlink, name) VALUES (%s, %s)', video_director) 218 | cursor.execute('SELECT dlink FROM directors WHERE dlink = %s AND vlink = %s', (video_director[0], video_shortlink)) 219 | if not cursor.fetchall(): # If entry not exist insert entry 220 | cursor.execute('INSERT INTO directors (dlink, vlink) VALUES (%s, %s)', (video_director[0], video_shortlink)) 221 | 222 | # Table list_makers, makers 223 | for i in soup.find('div', id='video_maker').findAll('a', rel='tag'): 224 | video_maker = [i.get('href')[15:], i.get_text()] 225 | if video_maker[1] != '----': # Some video doesn't have maker 226 | cursor.execute('SELECT mlink FROM list_makers WHERE mlink = %s', video_maker[0]) 227 | if not cursor.fetchall(): # If entry not exist insert entry 228 | cursor.execute('INSERT INTO list_makers (mlink, name) VALUES (%s, %s)', video_maker) 229 | cursor.execute('SELECT mlink FROM makers WHERE mlink = %s AND vlink = %s', (video_maker[0], video_shortlink)) 230 | if not cursor.fetchall(): # If entry not exist insert entry 231 | cursor.execute('INSERT INTO makers (mlink, vlink) VALUES (%s, %s)', (video_maker[0], video_shortlink)) 232 | 233 | # Table list_issuers, issuers 234 | for i in soup.find('div', id='video_label').findAll('a', rel='tag'): 235 | video_issuer = [i.get('href')[15:], i.get_text()] 236 | if video_issuer[1] != '----': # Some video doesn't have issuer 237 | cursor.execute('SELECT ilink FROM list_issuers WHERE ilink = %s', video_issuer[0]) 238 | if not cursor.fetchall(): # If entry not exist insert entry 239 | cursor.execute('INSERT INTO list_issuers (ilink, name) VALUES (%s, %s)', video_issuer) 240 | cursor.execute('SELECT ilink FROM issuers WHERE ilink = %s AND vlink = %s', (video_issuer[0], video_shortlink)) 241 | if not cursor.fetchall(): # If entry not exist insert entry 242 | cursor.execute('INSERT INTO issuers (ilink, vlink) VALUES (%s, %s)', (video_issuer[0], video_shortlink)) 243 | 244 | # Table list_genres, genres 245 | for i in soup.find('div', id='video_genres').findAll('a', rel='category tag'): 246 | video_genres = [i.get('href')[15:], i.get_text()] 247 | cursor.execute('SELECT glink FROM list_genres WHERE glink = %s AND name = %s', video_genres) 248 | if not cursor.fetchall(): # If entry not exist insert entry 249 | cursor.execute('INSERT INTO list_genres (glink, name) VALUES (%s, %s)', video_genres) 250 | cursor.execute('SELECT glink FROM genres WHERE glink = %s AND vlink = %s', (video_genres[0], video_shortlink)) 251 | if not cursor.fetchall(): # If entry not exist insert entry 252 | cursor.execute('INSERT INTO genres (glink, vlink) VALUES (%s, %s)', (video_genres[0], video_shortlink)) 253 | 254 | # Table list_artists, artists 255 | for i in soup.find('div', id='video_cast').findAll('a', rel='tag'): 256 | video_artist = [i.get('href')[14:], i.get_text()] 257 | cursor.execute('SELECT alink FROM list_artists WHERE alink = %s', video_artist[0]) 258 | if not cursor.fetchall(): # If entry not exist insert entry 259 | cursor.execute('INSERT INTO list_artists (alink, name) VALUES (%s, %s)', video_artist) 260 | cursor.execute('SELECT alink FROM artists WHERE alink = %s AND vlink = %s', (video_artist[0], video_shortlink)) 261 | if not cursor.fetchall(): # If entry not exist insert entry 262 | cursor.execute('INSERT INTO artists (alink, vlink) VALUES (%s, %s)', (video_artist[0], video_shortlink)) 263 | 264 | conn.commit() # Commit together 265 | cursor.close() 266 | conn.close() 267 | 268 | # Download images 269 | self.download(video_cover, video_identity) # Download cover 270 | for i in video_preview: 271 | self.download(i, video_identity) # Download previews 272 | 273 | def genres_parser(self, soup): 274 | conn = self.mysql() 275 | cursor = conn.cursor() 276 | for a in soup.findAll('div', class_='genreitem'): # Assemble item 277 | item = [] 278 | item.append(a.find('a', href=True).get('href')[15:]) # Genres shortlink 279 | item.append(a.find('a', href=True).get_text()) # Genres name 280 | cursor.execute('SELECT glink, name FROM list_genres WHERE glink = %s and name = %s', item) 281 | if not cursor.fetchall(): # If entry not exist insert entry 282 | cursor.execute('INSERT INTO list_genres (glink, name) VALUES (%s, %s)', item) 283 | conn.commit() # Commit together 284 | cursor.close() 285 | conn.close() 286 | 287 | def artist_parser(self, soup): 288 | conn = self.mysql() 289 | cursor = conn.cursor() 290 | for a in soup.find('div', class_='starbox').findAll('div', class_='searchitem'): # Assemble item 291 | item = [] 292 | item.append(a.get('id')) # Artist shortlink 293 | item.append(a.find('a').get_text()) # Artist name 294 | cursor.execute('SELECT alink FROM list_artists WHERE alink = %s', item[0]) 295 | if not cursor.fetchall(): # If entry not exist insert entry 296 | cursor.execute('INSERT INTO list_artists (alink, name) VALUES (%s, %s)', item) 297 | conn.commit() # Commit together 298 | cursor.close() 299 | conn.close() 300 | pass 301 | 302 | def overview_parser(self, soup): 303 | conn = self.mysql() 304 | cursor = conn.cursor() 305 | 306 | if not soup.find('em'): # Check empty page 307 | for a in soup.find('div', class_='videos').findAll('div', class_='video', id=True): 308 | video_identity = a.find('div', class_='id').get_text() 309 | video_shortlink = a.find('a', class_='icn_want', id=True).get('id') 310 | video_title = a.find('div', class_='title').get_text().encode('gbk', 'ignore').decode('gbk') 311 | video_thumbnail = a.find('img', src=True).get('src') 312 | cursor.execute('SELECT vlink FROM videos WHERE vlink = %s', video_shortlink) 313 | if not cursor.fetchall(): # If entry not exist insert entry 314 | item = [video_identity, video_shortlink, video_title, video_thumbnail] 315 | while True: 316 | try: 317 | cursor.execute('INSERT INTO videos (id, vlink, title, thumbnail) VALUES (%s, %s, %s, %s)', item) 318 | break 319 | except pymysql.err.InternalError: # Avoid 'Lock wait timeout exceeded'? 320 | time.sleep(1) 321 | except pymysql.err.IntegrityError: # Avoid 'Duplicated entry'? 322 | break 323 | 324 | else: # If entry exist update entry 325 | item = [video_title, video_thumbnail] 326 | while True: 327 | try: 328 | cursor.execute('UPDATE videos SET title = %s, thumbnail = %s', item) 329 | break 330 | except pymysql.err.InternalError: 331 | time.sleep(1) # Avoid strange deadlock found when trying to get lock? 332 | 333 | # Download images 334 | if video_thumbnail != '../img/noimageps.gif': # Avoid no thumbnail error 335 | self.download(video_thumbnail, video_identity) # Download thumbnail 336 | 337 | conn.commit() # Commit together 338 | cursor.close() 339 | conn.close() 340 | 341 | def worker(self): 342 | thread_name = threading.current_thread().name 343 | print('[INFO] %s # Started' % thread_name) 344 | 345 | while self.flag: # Worker control flag True 346 | url = self.get_url() 347 | if url: 348 | time_start = time.time() 349 | print('[INFO] %s # Parsing %s' % (thread_name, url)) 350 | 351 | # Read contents 352 | soup = self.soup(url) 353 | 354 | # Extract and parse all URLs 355 | for i in soup.findAll('a', href=True): 356 | self.dispatch_url(i.get('href')) 357 | 358 | # Parse myself 359 | self.parse_url(url, soup) 360 | 361 | time_end = time.time() 362 | print('[INFO] %s # Job finished in %0.1f seconds' % (thread_name, time_end - time_start)) 363 | 364 | # Control flag False terminate self 365 | print('[INFO] %s # Terminated' % thread_name) 366 | 367 | def main(self, threads): 368 | time.sleep(5) 369 | for i in range(threads): 370 | t = threading.Thread(target=self.worker) 371 | t.start() 372 | time.sleep(5) 373 | 374 | while len(self.url_pool) > 0: 375 | print('[INFO] %s URLs in pool, %s URLs scanned' % (len(self.url_pool), len(self.url_scanned_pool))) 376 | 377 | try: 378 | f = open('url_pool.bin', 'wb') 379 | pickle.dump(self.url_pool, f) 380 | f.close() 381 | f = open('url_scanned_pool.bin', 'wb') 382 | pickle.dump(self.url_pool, f) 383 | f.close() 384 | # print('[INFO] Scan progress saved') 385 | except: 386 | print('[WARN] Save scan progress failed') 387 | 388 | time.sleep(60) 389 | 390 | self.flag = False # Set termination flag 391 | t.join() 392 | 393 | if __name__ == '__main__': 394 | Spider().main(8) 395 | --------------------------------------------------------------------------------