├── README.md
├── controler.py
├── crawler.py
├── downloader.py
└── pageparser.py


/README.md:
--------------------------------------------------------------------------------
1 | # Javbus_crawler
2 | 打开打开crawler.py即可开始爬取
3 | 


--------------------------------------------------------------------------------
/controler.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #-*-coding:utf-8-*-
 3 | 
 4 | import sqlite3
 5 | 
 6 | 
 7 | #用来处理用Python的sqlite3操作数据库要插入的字符串中含有中文字符的时候报错处理，配合map
 8 | def _decode_utf8(aStr):
 9 |     return aStr.encode('utf-8','ignore').decode('utf-8')
10 | 
11 | def create_db():
12 |     '''create a db and table if not exists'''
13 |     conn = sqlite3.connect("javbus.sqlite3.db")
14 |     cursor = conn.cursor()
15 | 
16 |     cursor.execute('''
17 |         CREATE TABLE IF NOT EXISTS JAVBUS_DATA(
18 |             URL       TEXT PRIMARY KEY,
19 |             識別碼    TEXT,
20 |             發行日期  TEXT,
21 |             長度      TEXT,
22 |             導演      TEXT,
23 |             製作商    TEXT,
24 |             發行商    TEXT,
25 |             系列      TEXT,
26 |             演員      TEXT,
27 |             類別      TEXT,
28 |             磁力链接  TEXT,
29 |             无码      INTEGER);''')
30 | 
31 |     print("Table created successfully")
32 |     cursor.close()
33 |     conn.commit()
34 |     conn.close()
35 | 
36 | def write_data(dict_jav, uncensored):
37 |     '''write_data(dict_jav, uncensored)'''
38 | 
39 |     conn = sqlite3.connect("javbus.sqlite3.db")
40 |     cursor = conn.cursor()
41 |     #对数据解码为unicode
42 |     insert_data = map(_decode_utf8, (dict_jav['URL'], dict_jav['識別碼'], dict_jav['發行日期'], dict_jav['長度'], dict_jav['導演'], dict_jav['製作商'], dict_jav['發行商'], dict_jav['系列'], dict_jav['演員'], dict_jav['類別'], dict_jav['磁力链接']))
43 |     insert_data.append(uncensored)
44 |     #插入数据
45 |     cursor.execute('''
46 |     INSERT INTO JAVBUS_DATA (URL, 識別碼, 發行日期, 長度, 導演, 製作商, 發行商, 系列, 演員, 類別, 磁力链接, 无码)
47 |     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
48 |     ''', insert_data)
49 |     cursor.close()
50 |     conn.commit()
51 |     conn.close()
52 | 
53 | def check_url_not_in_table(url):
54 |     """check_url_in_db(url),if the url isn't in the table it will return True, otherwise return False"""
55 | 
56 |     conn = sqlite3.connect("javbus.sqlite3.db")
57 |     cursor = conn.cursor()
58 | 
59 |     cursor.execute('select URL from JAVBUS_DATA where URL=?', (url.decode('utf-8'),))
60 |     check = cursor.fetchall()
61 |     cursor.close()
62 |     conn.close()
63 |     if check:
64 |         return False
65 |     return True


--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #-*-coding:utf-8-*-
 3 | 
 4 | import controler
 5 | import downloader
 6 | import pageparser
 7 | import time
 8 | 
 9 | def get_dict(url):
10 |     """get the dict of the detail page and yield the dict"""
11 | 
12 |     url_html = downloader.get_html(url)
13 |     for detail_url in pageparser.parser_homeurl(url_html):
14 |         try:
15 |             detail_page_html = downloader.get_html(detail_url)
16 |             dict_jav = pageparser.parser_content(detail_page_html)
17 |         except:
18 |             with open('fail_url.txt', 'a') as fd:
19 |                 fd.write('%s\n' % detail_url)
20 |             print("Fail to crawl %s\ncrawl next detail page......" % detail_url)
21 |             continue
22 |         yield dict_jav, detail_url
23 | 
24 | 
25 | def join_db(url,is_uncensored):
26 |     """the detail_dict of the url join the db"""
27 | 
28 |     for dict_jav_data, detail_url in get_dict(url):
29 |         if controler.check_url_not_in_table(url):
30 |             controler.write_data(dict_jav_data, is_uncensored)
31 |             print("Crawled %s" % detail_url)
32 |         else:
33 |             print("it has updated over...window will be closed after 60s")
34 |             time.sleep(60)
35 |             exit()
36 | 
37 | 
38 | 
39 | def main(entrance):
40 |     #创建数据表
41 |     controler.create_db()
42 |     #无码为1，有码为0
43 |     is_uncensored = 1 if 'uncensored' in entrance else 0
44 |     join_db(entrance, is_uncensored)
45 | 
46 |     entrance_html = downloader.get_html(entrance)
47 |     next_page_url = pageparser.get_next_page_url(entrance, entrance_html)
48 |     while True:
49 |         if next_page_url:
50 |             join_db(next_page_url,is_uncensored)
51 |         next_page_html = downloader.get_html(next_page_url)
52 |         next_page_url = pageparser.get_next_page_url(entrance, next_page_html)
53 |         if next_page_url == None:
54 |             break
55 | 
56 | 
57 | if __name__ == '__main__':
58 |     main('https://www.javbus5.com')
59 |     main('https://www.javbus5.com/uncensored')


--------------------------------------------------------------------------------
/downloader.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | #-*-coding:utf-8-*-
 3 | 
 4 | import requests
 5 | 
 6 | headers = {
 7 |     'User-Agent	' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0', 
 8 | }
 9 | 
10 | def get_html(url, Referer_url=None):
11 |     '''get_html(url),download and return html'''
12 |     if Referer_url:
13 |         headers['Referer'] = Referer_url
14 |     req = requests.get(url, headers=headers)
15 |     return req.content
16 | 


--------------------------------------------------------------------------------
/pageparser.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | #-*-coding:utf-8-*-
  3 | 
  4 | from bs4 import BeautifulSoup
  5 | import downloader
  6 | 
  7 | def _get_cili_url(soup):
  8 |     """get_cili(soup).get the ajax url and Referer url of request"""
  9 | 
 10 |     ajax_get_cili_url = 'https://www.javbus5.com/ajax/uncledatoolsbyajax.php?lang=zh'
 11 |     ajax_data = soup.select('script')[8].text
 12 |     for l in ajax_data.split(';')[:-1]:
 13 |         ajax_get_cili_url += '&%s' % l[7:].replace("'","").replace(' ','')
 14 |     return ajax_get_cili_url
 15 | 
 16 | 
 17 | def _parser_magnet(html):
 18 |     """parser_magnet(html),get all magnets from a html and return the str of magnet"""
 19 | 
 20 |     #存放磁力的字符串
 21 |     magnet = ''
 22 |     soup = BeautifulSoup(html,"html.parser")
 23 |     for td in soup.select('td[width="70%"]'):
 24 |         magnet += td.a['href'] + '\n'
 25 |     return magnet
 26 | 
 27 | def get_next_page_url(entrance, html):
 28 |     """get_next_page_url(entrance, html),return the url of next page if exist"""
 29 |     print("done the page.......")
 30 |     soup = BeautifulSoup(html, "html.parser")
 31 |     next_page = soup.select('a[id="next"]')
 32 |     if next_page:
 33 |         next_page_link = next_page[0]['href'].split('/')[-2:]
 34 |         next_page_link = '/'+'/'.join(next_page_link)
 35 |         next_page_url = entrance + next_page_link
 36 |         return next_page_url
 37 |     return None
 38 | 
 39 | 
 40 | def parser_homeurl(html):
 41 |     """parser_homeurl(html),parser every url on every page and yield the url"""
 42 | 
 43 |     soup = BeautifulSoup(html,"html.parser")
 44 |     for url in soup.select('a[class="movie-box"]'):
 45 |         yield url['href']
 46 | 
 47 | 
 48 | def parser_content(html):
 49 |     """parser_content(html),parser page's content of every url and yield the dict of content"""
 50 | 
 51 |     soup = BeautifulSoup(html, "html.parser")
 52 | 
 53 |     categories = {}
 54 | 
 55 |     code_name_doc = soup.find('span', text="識別碼:")
 56 |     code_name = code_name_doc.parent.contents[2].text if code_name_doc else ''
 57 |     categories['識別碼'] = code_name
 58 |     #code_name = soup.find('span', text="識別碼:").parent.contents[2].text if soup.find('span', text="識別碼:") else ''
 59 | 
 60 |     date_issue_doc = soup.find('span', text="發行日期:")
 61 |     date_issue = date_issue_doc.parent.contents[1].strip() if date_issue_doc else ''
 62 |     categories['發行日期'] = date_issue
 63 |     #date_issue = soup.find('span', text="發行日期:").parent.contents[1].strip() if soup.find('span', text="發行日期:") else ''
 64 | 
 65 |     duration_doc = soup.find('span', text="長度:")
 66 |     duration = duration_doc.parent.contents[1].strip() if duration_doc else ''
 67 |     categories['長度'] = duration
 68 |     #duration = soup.find('span', text="長度:").parent.contents[1].strip() if soup.find('span', text="長度:") else ''
 69 | 
 70 |     director_doc = soup.find('span', text="導演:")
 71 |     director = director_doc.parent.contents[2].text if director_doc else ''
 72 |     categories['導演'] = director
 73 |     #director = soup.find('span', text="導演:").parent.contents[2].text if soup.find('span', text="導演:") else ''
 74 | 
 75 |     manufacturer_doc = soup.find('span', text="製作商:")
 76 |     manufacturer = manufacturer_doc.parent.contents[2].text if manufacturer_doc else ''
 77 |     categories['製作商'] = manufacturer
 78 |     #manufacturer = soup.find('span', text="製作商:").parent.contents[2].text if soup.find('span', text="製作商:") else ''
 79 | 
 80 |     publisher_doc = soup.find('span', text="發行商:")
 81 |     publisher = publisher_doc.parent.contents[2].text if publisher_doc else ''
 82 |     categories['發行商'] = publisher
 83 |     #publisher = soup.find('span', text="發行商:").parent.contents[2].text if soup.find('span', text="發行商:") else ''
 84 | 
 85 |     series_doc = soup.find('span', text="系列:")
 86 |     series = series_doc.parent.contents[2].text if series_doc else ''
 87 |     categories['系列'] = series
 88 |     #series = soup.find('span', text="系列:").parent.contents[2].text if soup.find('span', text="系列:") else ''
 89 | 
 90 |     genre_doc = soup.find('p', text="類別:")
 91 |     genre =(i.text.strip() for i in genre_doc.find_next('p').select('span')) if genre_doc else ''
 92 |     #genre =(i.text.strip() for i in soup.find('p', text="類別:").find_next('p').select('span')) if soup.find('p', text="類別:") else ''
 93 |     genre_text = ''
 94 |     for tex in genre:
 95 |         genre_text += '%s   ' % tex 
 96 |     categories['類別'] = genre_text
 97 | 
 98 |     actor_doc = soup.select('span[onmouseover^="hoverdiv"]')
 99 |     actor = (i.text.strip() for i in actor_doc) if actor_doc else ''
100 |     #actor = (i.text.strip() for i in soup.select('span[onmouseover^="hoverdiv"]')) if soup.select('span[onmouseover^="hoverdiv"]') else ''
101 |     actor_text = ''
102 |     for tex in actor:
103 |         actor_text += '%s   ' % tex 
104 |     categories['演員'] = actor_text
105 |     
106 |     #网址加入字典
107 |     url = soup.select('link[hreflang="zh"]')[0]['href']
108 |     categories['URL'] = url
109 | 
110 |     #将磁力链接加入字典
111 |     magnet_html = downloader.get_html(_get_cili_url(soup), Referer_url=url)
112 |     magnet = _parser_magnet(magnet_html)
113 |     categories['磁力链接'] = magnet
114 | 
115 |     return categories
116 | 
117 | 
118 | 


--------------------------------------------------------------------------------