├── README.md
├── javlib.sh
├── bestrated.py
├── mostwanted.py
├── star_mostfav.py
├── star.py
└── javlibrary.py

/README.md:
--------------------------------------------------------------------------------
# javlibrary
Scraper for javlibrary.

I still haven't written proper documentation; too lazy.

Required packages: cloudscraper, beautifulsoup4 and lxml (the scripts use the lxml parser). If you need to go through a proxy, you also need pysocks.

Edit javlibrary.py and change key_word to whatever you want to search for, for example ssni.

When going through a proxy you also need to set proxies on the scraper; a minimal sketch follows below.

Then just run:

python3 javlibrary.py
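A sketch of that proxies setup, assuming a local SOCKS5 proxy on 127.0.0.1:1080 (the address and port are placeholders); a cloudscraper session accepts the same proxies mapping as a requests session:

import cloudscraper

scraper = cloudscraper.create_scraper()
# Placeholder proxy endpoint; point this at your own SOCKS5 proxy.
# The socks5h scheme resolves DNS through the proxy and needs pysocks installed.
scraper.proxies = {
    'http': 'socks5h://127.0.0.1:1080',
    'https': 'socks5h://127.0.0.1:1080',
}
print(scraper.get('https://www.javlibrary.com/cn/').status_code)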
--------------------------------------------------------------------------------
/javlib.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Date stamp used in the output file names, e.g. 20240101.
today=$(date '+%Y%m%d')

python3 ~/javlibrary/bestrated.py | tee ~/javlibrary/bestrated-${today}.txt
python3 ~/javlibrary/mostwanted.py | tee ~/javlibrary/mostwanted-${today}.txt
python3 ~/javlibrary/star_mostfav.py | tee ~/javlibrary/star_mostfav-${today}.txt
--------------------------------------------------------------------------------
/bestrated.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

# Scrape the "best rated" listing page.
scraper = cloudscraper.create_scraper()
scrape_url = 'http://www.javlibrary.com/cn/vl_bestrated.php'
web_data = scraper.get(scrape_url).content.decode('utf-8')
soup = BeautifulSoup(web_data, 'lxml')

t_title = soup.find('title').text
print(t_title)

localtime = time.asctime(time.localtime(time.time()))
print("Local time:", localtime)

# Each result is a <div> whose id matches 'vid_javli...'; the <a> inside it
# carries the title and a relative href.
pattern = re.compile('vid_javli')
t_list = soup.find_all(name='div', attrs={'id': pattern})
# print(t_list)
for item in t_list:
    name = item.a['title']
    address = item.a['href'].replace('.', '')
    link = 'http://www.javlibrary.com/cn' + address
    print('name: ' + name)
    print('link: ' + link)
    print('******************************')
--------------------------------------------------------------------------------
/mostwanted.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

# Same as bestrated.py, but for the "most wanted" listing page.
scraper = cloudscraper.create_scraper()
scrape_url = 'http://www.javlibrary.com/cn/vl_mostwanted.php'
web_data = scraper.get(scrape_url).content.decode('utf-8')
soup = BeautifulSoup(web_data, 'lxml')

t_title = soup.find('title').text
print(t_title)

localtime = time.asctime(time.localtime(time.time()))
print("Local time:", localtime)

pattern = re.compile('vid_javli')
t_list = soup.find_all(name='div', attrs={'id': pattern})
# print(t_list)
for item in t_list:
    name = item.a['title']
    address = item.a['href'].replace('.', '')
    link = 'http://www.javlibrary.com/cn' + address
    print('name: ' + name)
    print('link: ' + link)
    print('******************************')
--------------------------------------------------------------------------------
/star_mostfav.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

# Scrape the "most favourited stars" ranking page.
scraper = cloudscraper.create_scraper()
scrape_url = 'http://www.javlibrary.com/cn/star_mostfav.php'
web_data = scraper.get(scrape_url).content.decode('utf-8')
soup = BeautifulSoup(web_data, 'lxml')

t_title = soup.find('title').text
print(t_title)

localtime = time.asctime(time.localtime(time.time()))
print("Local time:", localtime)

# Each star is a <div class="searchitem">; its id doubles as the star id
# accepted by vl_star.php?s=...
star_list = soup.find_all(name='div', attrs={'class': 'searchitem'})
for star in star_list:
    sequence = star.h3.get_text()
    print(sequence)
    name = star.find(name='img')['title']
    print(name)
    star_id = star['id']
    star_link = 'https://www.javlibrary.com/cn/vl_star.php?s=' + star_id
    print(star_link)
    print('******************************')
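bestrated.py, mostwanted.py and the per-page loop in star.py below all repeat the same listing extraction: find the divs whose id matches 'vid_javli', then read the title and relative href from the anchor inside each one. A minimal sketch of a shared helper they could call instead; the module name javcommon.py and the function name list_videos are hypothetical and not part of this repo:

# javcommon.py (hypothetical shared module, not in this repo)
import re
import cloudscraper
from bs4 import BeautifulSoup

BASE_URL = 'http://www.javlibrary.com/cn'

def list_videos(url):
    # Fetch one listing page and return (name, link) pairs,
    # mirroring the loop used in bestrated.py / mostwanted.py / star.py.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    results = []
    for item in soup.find_all(name='div', attrs={'id': re.compile('vid_javli')}):
        name = item.a['title']
        link = BASE_URL + item.a['href'].replace('.', '')
        results.append((name, link))
    return results

bestrated.py would then shrink to a call such as list_videos('http://www.javlibrary.com/cn/vl_bestrated.php') plus the printing loop.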
--------------------------------------------------------------------------------
/star.py:
--------------------------------------------------------------------------------
from bs4 import BeautifulSoup
import re
import time
import cloudscraper

def get_star_zuopin(url):
    # Print every title/link on one page of a star's filmography
    # (zuopin = 作品, "works").
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')

    t_title = soup.find('title').text
    print(t_title)

    pattern = re.compile('vid_javli')
    t_list = soup.find_all(name='div', attrs={'id': pattern})
    for item in t_list:
        name = item.a['title']
        address = item.a['href'].replace('.', '')
        link = 'http://www.javlibrary.com/cn' + address
        print('name: ' + name)
        print('link: ' + link)
        print('******************************')

def get_star_pagenum(url):
    # Read the page number out of the "last page" link.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    page_url = soup.find(name='a', attrs={'class': 'page last'})
    if page_url is None:
        # Only one page of results, so there is no "last page" link.
        return '1'
    pattern = re.compile(r'=(\d+)$')
    page_num = re.findall(pattern, page_url['href'])
    return page_num[0]

def main():
    # Star id taken from the vl_star.php?s=... URL; change it to scrape a different star.
    url = 'https://www.javlibrary.com/cn/vl_star.php?s=oqjq'
    page_num = int(get_star_pagenum(url))
    for i in range(1, page_num + 1):
        i_url = url + '&page=' + str(i)
        get_star_zuopin(i_url)
        print('================page %s end=================' % i)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/javlibrary.py:
--------------------------------------------------------------------------------
import cloudscraper
from bs4 import BeautifulSoup
import time
import re
import csv
import os

# Where the raw detail pages and the resulting CSV are written.
html_dir = '/root/javlib/jav_html/'
csv_dir = '/root/javlib/jav_csv/'
key_word = 'jul'
search_url = 'https://www.javlibrary.com/cn/vl_searchbyid.php?keyword=%s' % key_word


def get_maxpage(url):
    # Number of result pages for the keyword, taken from the "last page" link.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    page_num_area = soup.find(attrs={'class': 'page last'})
    if page_num_area is None:
        # Only one page of results, so there is no "last page" link.
        return 1
    num_pattern = re.compile(r'page=(\d+)')
    page_num = re.findall(num_pattern, page_num_area['href'])[0]
    return int(page_num)


def get_urls(url):
    # Collect the detail-page links from one page of search results.
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    pattern = re.compile('vid_javli')
    t_list = soup.find_all(name='div', attrs={'id': pattern})
    url_list = []
    for item in t_list:
        address = item.a['href'].replace('.', '')
        link = 'http://www.javlibrary.com/cn' + address
        url_list.append(link)
    return url_list


def get_html(url):
    # Save the raw detail page to html_dir, named after the video id plus the ?v= token.
    htmlname_pattern = re.compile(r'=(\w+)')
    html_name = re.findall(htmlname_pattern, url)[0]
    scraper = cloudscraper.create_scraper()
    web_data = scraper.get(url).content.decode('utf-8')
    soup = BeautifulSoup(web_data, 'lxml')
    video_id_area = soup.find(attrs={'id': 'video_id'})
    video_id = video_id_area.find(attrs={'class': 'text'}).get_text()
    with open(html_dir + video_id + html_name + '.html', 'w', encoding='utf-8') as f:
        f.write(web_data)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + ' **** %s html saved ****' % video_id)


def read_html(file):
    # Parse one saved detail page into a flat dict of fields.
    with open(file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), 'lxml')
    video_title_area = soup.find(attrs={'id': 'video_title'})
    video_title = video_title_area.h3.get_text()
    video_id_area = soup.find(attrs={'id': 'video_id'})
    video_id = video_id_area.find(attrs={'class': 'text'}).get_text()
    video_imgurl = 'https:' + soup.find(attrs={'id': 'video_jacket'}).img['src']
    video_date_area = soup.find(attrs={'id': 'video_date'})
    video_date = video_date_area.find(attrs={'class': 'text'}).get_text()
    video_director_area = soup.find(attrs={'id': 'video_director'})
    video_director = video_director_area.find(attrs={'class': 'text'}).get_text().strip()
    video_maker_area = soup.find(attrs={'id': 'video_maker'})
    video_maker = video_maker_area.find(attrs={'class': 'text'}).get_text().strip()
    video_label_area = soup.find(attrs={'id': 'video_label'})
    video_label = video_label_area.find(attrs={'class': 'text'}).get_text().strip()
    # The review score is optional, so fall back to an empty string.
    video_review_area = soup.find(attrs={'id': 'video_review'})
    if video_review_area:
        video_review_text = video_review_area.find(attrs={'class': 'score'}).get_text().strip()
        review_pattern = re.compile(r'\d+\.\d+')
        if review_pattern.search(video_review_text):
            video_review = review_pattern.search(video_review_text).group(0)
        else:
            video_review = ''
    else:
        video_review = ''
    video_cast_area = soup.find(attrs={'id': 'video_cast'})
    video_cast = video_cast_area.find(attrs={'class': 'text'}).get_text().strip()
    video_genres_area = soup.find(attrs={'id': 'video_genres'})
    genres_list = video_genres_area.find_all(attrs={'class': 'genre'})
    video_genres = ','.join(genre.get_text() for genre in genres_list)
    video_dict = {
        'video_title': video_title,
        'video_id': video_id,
        'video_imgurl': video_imgurl,
        'video_date': video_date,
        'video_director': video_director,
        'video_maker': video_maker,
        'video_label': video_label,
        'video_review': video_review,
        'video_cast': video_cast,
        'video_genres': video_genres
    }
    return video_dict


def write_csv(rows):
    headers = ['video_title', 'video_id', 'video_imgurl', 'video_date', 'video_director', 'video_maker',
               'video_label', 'video_review', 'video_cast', 'video_genres']
    # newline='' keeps csv from inserting blank rows; utf-8 covers the Chinese fields.
    with open(csv_dir + key_word + '.csv', 'w', newline='', encoding='utf-8') as f:
        f_csv = csv.DictWriter(f, headers)
        f_csv.writeheader()
        f_csv.writerows(rows)


def main(page):
    # Download every detail page linked from one page of search results.
    scrape_url = search_url + "&page=" + str(page)
    jav_list = get_urls(scrape_url)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + ' **** %d urls on this page ****' % len(jav_list))
    for url in jav_list:
        try:
            get_html(url)
        except Exception:
            localtime = time.asctime(time.localtime(time.time()))
            print(localtime + ' **** Error occurred, waiting 10 seconds before retrying ****')
            time.sleep(10)
            get_html(url)


if __name__ == '__main__':
    page_max = get_maxpage(search_url)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + ' **** max page number is %d ****' % page_max)
    for i in range(1, page_max + 1):
        localtime = time.asctime(time.localtime(time.time()))
        print(localtime + " #### page %d start ####" % i)
        main(i)
        localtime = time.asctime(time.localtime(time.time()))
        print(localtime + " #### page %d completed ####" % i)
    localtime = time.asctime(time.localtime(time.time()))
    print(localtime + " #### Saving html completed ####")
    # Re-read every saved page and write one CSV row per video.
    jav_dict = []
    for filename in os.listdir(html_dir):
        jav_dict.append(read_html(html_dir + filename))
    write_csv(jav_dict)
--------------------------------------------------------------------------------
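Once javlibrary.py finishes, the CSV can be inspected with the standard csv module. A small sketch, assuming the csv_dir and key_word defaults above (so the file is /root/javlib/jav_csv/jul.csv), that prints the ten highest review scores:

import csv

# Path follows the csv_dir + key_word defaults in javlibrary.py; adjust to your own setup.
with open('/root/javlib/jav_csv/jul.csv', encoding='utf-8') as f:
    rows = list(csv.DictReader(f))

# Rows without a review score fall back to 0 and end up last.
rows.sort(key=lambda r: float(r['video_review'] or 0), reverse=True)
for r in rows[:10]:
    print(r['video_review'], r['video_id'], r['video_title'])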