├── README.md
├── javlib.sh
├── bestrated.py
├── mostwanted.py
├── star_mostfav.py
├── star.py
└── javlibrary.py

/README.md:
--------------------------------------------------------------------------------
# javlibrary
Scraper for javlibrary.

I still haven't written proper documentation; too lazy.

Required packages: cloudscraper, beautifulsoup4 and lxml (the scripts use the lxml parser). If you need to go through a proxy, you also need pysocks.

Edit javlibrary.py and change key_word to whatever you want to search for, for example ssni.

When going through a proxy you also need to set proxies on the scraper; a minimal sketch follows below.

Then just run:

python3 javlibrary.py
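A sketch of that proxies setup, assuming a local SOCKS5 proxy on 127.0.0.1:1080 (the address and port are placeholders); a cloudscraper session accepts the same proxies mapping as a requests session:

import cloudscraper

scraper = cloudscraper.create_scraper()
# Placeholder proxy endpoint; point this at your own SOCKS5 proxy.
# The socks5h scheme resolves DNS through the proxy and needs pysocks installed.
scraper.proxies = {
    'http': 'socks5h://127.0.0.1:1080',
    'https': 'socks5h://127.0.0.1:1080',
}
print(scraper.get('https://www.javlibrary.com/cn/').status_code)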
--------------------------------------------------------------------------------
/javlib.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Date stamp used in the output file names, e.g. 20240101.
today=$(date '+%Y%m%d')

python3 ~/javlibrary/bestrated.py | tee ~/javlibrary/bestrated-${today}.txt
python3 ~/javlibrary/mostwanted.py | tee ~/javlibrary/mostwanted-${today}.txt
python3 ~/javlibrary/star_mostfav.py | tee ~/javlibrary/star_mostfav-${today}.txt
--------------------------------------------------------------------------------
/bestrated.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

# Scrape the "best rated" listing page.
scraper = cloudscraper.create_scraper()
scrape_url = 'http://www.javlibrary.com/cn/vl_bestrated.php'
web_data = scraper.get(scrape_url).content.decode('utf-8')
soup = BeautifulSoup(web_data, 'lxml')

t_title = soup.find('title').text
print(t_title)

localtime = time.asctime(time.localtime(time.time()))
print("Local time:", localtime)

# Each result is a <div> whose id matches 'vid_javli...'; the <a> inside it
# carries the title and a relative href.
pattern = re.compile('vid_javli')
t_list = soup.find_all(name='div', attrs={'id': pattern})
# print(t_list)
for item in t_list:
    name = item.a['title']
    address = item.a['href'].replace('.', '')
    link = 'http://www.javlibrary.com/cn' + address
    print('name: ' + name)
    print('link: ' + link)
    print('******************************')
--------------------------------------------------------------------------------
/mostwanted.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

# Same as bestrated.py, but for the "most wanted" listing page.
scraper = cloudscraper.create_scraper()
scrape_url = 'http://www.javlibrary.com/cn/vl_mostwanted.php'
web_data = scraper.get(scrape_url).content.decode('utf-8')
soup = BeautifulSoup(web_data, 'lxml')

t_title = soup.find('title').text
print(t_title)

localtime = time.asctime(time.localtime(time.time()))
print("Local time:", localtime)

pattern = re.compile('vid_javli')
t_list = soup.find_all(name='div', attrs={'id': pattern})
# print(t_list)
for item in t_list:
    name = item.a['title']
    address = item.a['href'].replace('.', '')
    link = 'http://www.javlibrary.com/cn' + address
    print('name: ' + name)
    print('link: ' + link)
    print('******************************')
--------------------------------------------------------------------------------
/star_mostfav.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

# Scrape the "most favourited stars" ranking page.
scraper = cloudscraper.create_scraper()
scrape_url = 'http://www.javlibrary.com/cn/star_mostfav.php'
web_data = scraper.get(scrape_url).content.decode('utf-8')
soup = BeautifulSoup(web_data, 'lxml')

t_title = soup.find('title').text
print(t_title)

localtime = time.asctime(time.localtime(time.time()))
print("Local time:", localtime)

# Each star is a <div class="searchitem">; its id doubles as the star id
# accepted by vl_star.php?s=...
star_list = soup.find_all(name='div', attrs={'class': 'searchitem'})
for star in star_list:
    sequence = star.h3.get_text()
    print(sequence)
    name = star.find(name='img')['title']
    print(name)
    star_id = star['id']
    star_link = 'https://www.javlibrary.com/cn/vl_star.php?s=' + star_id
    print(star_link)
    print('******************************')
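bestrated.py, mostwanted.py and the per-page loop in star.py below all repeat the same listing extraction: find the divs whose id matches 'vid_javli', then read the title and relative href from the anchor inside each one. A minimal sketch of a shared helper they could call instead; the module name javcommon.py and the function name list_videos are hypothetical and not part of this repo:

# javcommon.py (hypothetical shared module, not in this repo)
import re
import cloudscraper
from bs4 import BeautifulSoup

BASE_URL = 'http://www.javlibrary.com/cn'

def list_videos(url):
    # Fetch one listing page and return (name, link) pairs,
    # mirroring the loop used in bestrated.py / mostwanted.py / star.py.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    results = []
    for item in soup.find_all(name='div', attrs={'id': re.compile('vid_javli')}):
        name = item.a['title']
        link = BASE_URL + item.a['href'].replace('.', '')
        results.append((name, link))
    return results

bestrated.py would then shrink to a call such as list_videos('http://www.javlibrary.com/cn/vl_bestrated.php') plus the printing loop.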
--------------------------------------------------------------------------------
/star.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

def get_star_zuopin(url):
    # Print every title/link on one page of a star's filmography
    # (zuopin = 作品, "works").
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')

    t_title = soup.find('title').text
    print(t_title)

    pattern = re.compile('vid_javli')
    t_list = soup.find_all(name='div', attrs={'id': pattern})
    for item in t_list:
        name = item.a['title']
        address = item.a['href'].replace('.', '')
        link = 'http://www.javlibrary.com/cn' + address
        print('name: ' + name)
        print('link: ' + link)
        print('******************************')

def get_star_pagenum(url):
    # Read the page number out of the "last page" link.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    page_url = soup.find(name='a', attrs={'class': 'page last'})
    if page_url is None:
        # Only one page of results, so there is no "last page" link.
        return '1'
    pattern = re.compile(r'=(\d+)$')
    page_num = re.findall(pattern, page_url['href'])
    return page_num[0]

def main():
    # Star id taken from the vl_star.php?s=... URL; change it to scrape a different star.
    url = 'https://www.javlibrary.com/cn/vl_star.php?s=oqjq'
    page_num = int(get_star_pagenum(url))
    for i in range(1, page_num + 1):
        i_url = url + '&page=' + str(i)
        get_star_zuopin(i_url)
        print('================page %s end=================' % i)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/javlibrary.py:
--------------------------------------------------------------------------------
import cloudscraper
from bs4 import BeautifulSoup
import time
import re
import csv
import os

# Where the raw detail pages and the resulting CSV are written.
html_dir = '/root/javlib/jav_html/'
csv_dir = '/root/javlib/jav_csv/'
key_word = 'jul'
search_url = 'https://www.javlibrary.com/cn/vl_searchbyid.php?keyword=%s' % key_word


def get_maxpage(url):
    # Number of result pages for the keyword, taken from the "last page" link.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    page_num_area = soup.find(attrs={'class': 'page last'})
    if page_num_area is None:
        # Only one page of results, so there is no "last page" link.
        return 1
    num_pattern = re.compile(r'page=(\d+)')
    page_num = re.findall(num_pattern, page_num_area['href'])[0]
    return int(page_num)


def get_urls(url):
    # Collect the detail-page links from one page of search results.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    pattern = re.compile('vid_javli')
    t_list = soup.find_all(name='div', attrs={'id': pattern})
    url_list = []
    for item in t_list:
        address = item.a['href'].replace('.', '')
        link = 'http://www.javlibrary.com/cn' + address
        url_list.append(link)
    return url_list


def get_html(url):
    # Save the raw detail page to html_dir, named after the video id plus the ?v= token.
    htmlname_pattern = re.compile(r'=(\w+)')
    html_name = re.findall(htmlname_pattern, url)[0]
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    video_id_area = soup.find(attrs={'id': 'video_id'})
    video_id = video_id_area.find(attrs={'class': 'text'}).get_text()
    with open(html_dir + video_id + html_name + '.html', 'w', encoding='utf-8') as f:
        f.write(web_data)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + ' **** %s html saved ****' % video_id)


def read_html(file):
    # Parse one saved detail page into a flat dict of fields.
    with open(file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    video_title_area = soup.find(attrs={'id': 'video_title'})
    video_title = video_title_area.h3.get_text()
    video_id_area = soup.find(attrs={'id': 'video_id'})
    video_id = video_id_area.find(attrs={'class': 'text'}).get_text()
    video_imgurl = 'https:' + soup.find(attrs={'id': 'video_jacket'}).img['src']
    video_date_area = soup.find(attrs={'id': 'video_date'})
    video_date = video_date_area.find(attrs={'class': 'text'}).get_text()
    video_director_area = soup.find(attrs={'id': 'video_director'})
    video_director = video_director_area.find(attrs={'class': 'text'}).get_text().strip()
    video_maker_area = soup.find(attrs={'id': 'video_maker'})
    video_maker = video_maker_area.find(attrs={'class': 'text'}).get_text().strip()
    video_label_area = soup.find(attrs={'id': 'video_label'})
    video_label = video_label_area.find(attrs={'class': 'text'}).get_text().strip()
    # The review score is optional, so fall back to an empty string.
    video_review_area = soup.find(attrs={'id': 'video_review'})
    if video_review_area:
        video_review_text = video_review_area.find(attrs={'class': 'score'}).get_text().strip()
        review_pattern = re.compile(r'\d+\.\d+')
        if review_pattern.search(video_review_text):
            video_review = review_pattern.search(video_review_text).group(0)
        else:
            video_review = ''
    else:
        video_review = ''
    video_cast_area = soup.find(attrs={'id': 'video_cast'})
    video_cast = video_cast_area.find(attrs={'class': 'text'}).get_text().strip()
    video_genres_area = soup.find(attrs={'id': 'video_genres'})
    genres_list = video_genres_area.find_all(attrs={'class': 'genre'})
    video_genres = ','.join(genre.get_text() for genre in genres_list)
    video_dict = {
        'video_title': video_title,
        'video_id': video_id,
        'video_imgurl': video_imgurl,
        'video_date': video_date,
        'video_director': video_director,
        'video_maker': video_maker,
        'video_label': video_label,
        'video_review': video_review,
        'video_cast': video_cast,
        'video_genres': video_genres
    }
    return video_dict


def write_csv(rows):
    headers = ['video_title', 'video_id', 'video_imgurl', 'video_date', 'video_director', 'video_maker',
               'video_label', 'video_review', 'video_cast', 'video_genres']
    # newline='' keeps csv from inserting blank rows; utf-8 covers the Chinese fields.
    with open(csv_dir + key_word + '.csv', 'w', newline='', encoding='utf-8') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)


def main(page):
    # Download every detail page linked from one page of search results.
    scrape_url = search_url + "&page=" + str(page)
    jav_list = get_urls(scrape_url)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + ' **** %d urls on this page ****' % len(jav_list))
    for url in jav_list:
        try:
            get_html(url)
        except Exception:
            localtime = time.asctime(time.localtime(time.time()))
            print(localtime + ' **** Error occurred, waiting 10 seconds before retrying ****')
            time.sleep(10)
            get_html(url)


if __name__ == '__main__':
    page_max = get_maxpage(search_url)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + ' **** max page number is %d ****' % page_max)
    for i in range(1, page_max + 1):
        localtime = time.asctime(time.localtime(time.time()))
        print(localtime + " #### page %d start ####" % i)
        main(i)
        localtime = time.asctime(time.localtime(time.time()))
        print(localtime + " #### page %d completed ####" % i)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + " #### Saving html completed ####")
    # Re-read every saved page and write one CSV row per video.
    jav_dict = []
    for filename in os.listdir(html_dir):
        jav_dict.append(read_html(html_dir + filename))
    write_csv(jav_dict)
--------------------------------------------------------------------------------
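Once javlibrary.py finishes, the CSV can be inspected with the standard csv module. A small sketch, assuming the csv_dir and key_word defaults above (so the file is /root/javlib/jav_csv/jul.csv), that prints the ten highest review scores:

import csv

# Path follows the csv_dir + key_word defaults in javlibrary.py; adjust to your own setup.
with open('/root/javlib/jav_csv/jul.csv', encoding='utf-8') as f:
    rows = list(csv.DictReader(f))

# Rows without a review score fall back to 0 and end up last.
rows.sort(key=lambda r: float(r['video_review'] or 0), reverse=True)
for r in rows[:10]:
    print(r['video_review'], r['video_id'], r['video_title'])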