├── README.md ├── controler.py ├── crawler.py ├── downloader.py └── pageparser.py /README.md: -------------------------------------------------------------------------------- 1 | # Javbus_crawler 2 | 打开打开crawler.py即可开始爬取 3 | -------------------------------------------------------------------------------- /controler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | 4 | import sqlite3 5 | 6 | 7 | #用来处理用Python的sqlite3操作数据库要插入的字符串中含有中文字符的时候报错处理,配合map 8 | def _decode_utf8(aStr): 9 | return aStr.encode('utf-8','ignore').decode('utf-8') 10 | 11 | def create_db(): 12 | '''create a db and table if not exists''' 13 | conn = sqlite3.connect("javbus.sqlite3.db") 14 | cursor = conn.cursor() 15 | 16 | cursor.execute(''' 17 | CREATE TABLE IF NOT EXISTS JAVBUS_DATA( 18 | URL TEXT PRIMARY KEY, 19 | 識別碼 TEXT, 20 | 發行日期 TEXT, 21 | 長度 TEXT, 22 | 導演 TEXT, 23 | 製作商 TEXT, 24 | 發行商 TEXT, 25 | 系列 TEXT, 26 | 演員 TEXT, 27 | 類別 TEXT, 28 | 磁力链接 TEXT, 29 | 无码 INTEGER);''') 30 | 31 | print("Table created successfully") 32 | cursor.close() 33 | conn.commit() 34 | conn.close() 35 | 36 | def write_data(dict_jav, uncensored): 37 | '''write_data(dict_jav, uncensored)''' 38 | 39 | conn = sqlite3.connect("javbus.sqlite3.db") 40 | cursor = conn.cursor() 41 | #对数据解码为unicode 42 | insert_data = map(_decode_utf8, (dict_jav['URL'], dict_jav['識別碼'], dict_jav['發行日期'], dict_jav['長度'], dict_jav['導演'], dict_jav['製作商'], dict_jav['發行商'], dict_jav['系列'], dict_jav['演員'], dict_jav['類別'], dict_jav['磁力链接'])) 43 | insert_data.append(uncensored) 44 | #插入数据 45 | cursor.execute(''' 46 | INSERT INTO JAVBUS_DATA (URL, 識別碼, 發行日期, 長度, 導演, 製作商, 發行商, 系列, 演員, 類別, 磁力链接, 无码) 47 | VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 48 | ''', insert_data) 49 | cursor.close() 50 | conn.commit() 51 | conn.close() 52 | 53 | def check_url_not_in_table(url): 54 | """check_url_in_db(url),if the url isn't in the table it will return True, otherwise return False""" 55 | 56 | conn = sqlite3.connect("javbus.sqlite3.db") 57 | cursor = conn.cursor() 58 | 59 | cursor.execute('select URL from JAVBUS_DATA where URL=?', (url.decode('utf-8'),)) 60 | check = cursor.fetchall() 61 | cursor.close() 62 | conn.close() 63 | if check: 64 | return False 65 | return True -------------------------------------------------------------------------------- /crawler.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | 4 | import controler 5 | import downloader 6 | import pageparser 7 | import time 8 | 9 | def get_dict(url): 10 | """get the dict of the detail page and yield the dict""" 11 | 12 | url_html = downloader.get_html(url) 13 | for detail_url in pageparser.parser_homeurl(url_html): 14 | try: 15 | detail_page_html = downloader.get_html(detail_url) 16 | dict_jav = pageparser.parser_content(detail_page_html) 17 | except: 18 | with open('fail_url.txt', 'a') as fd: 19 | fd.write('%s\n' % detail_url) 20 | print("Fail to crawl %s\ncrawl next detail page......" % detail_url) 21 | continue 22 | yield dict_jav, detail_url 23 | 24 | 25 | def join_db(url,is_uncensored): 26 | """the detail_dict of the url join the db""" 27 | 28 | for dict_jav_data, detail_url in get_dict(url): 29 | if controler.check_url_not_in_table(url): 30 | controler.write_data(dict_jav_data, is_uncensored) 31 | print("Crawled %s" % detail_url) 32 | else: 33 | print("it has updated over...window will be closed after 60s") 34 | time.sleep(60) 35 | exit() 36 | 37 | 38 | 39 | def main(entrance): 40 | #创建数据表 41 | controler.create_db() 42 | #无码为1,有码为0 43 | is_uncensored = 1 if 'uncensored' in entrance else 0 44 | join_db(entrance, is_uncensored) 45 | 46 | entrance_html = downloader.get_html(entrance) 47 | next_page_url = pageparser.get_next_page_url(entrance, entrance_html) 48 | while True: 49 | if next_page_url: 50 | join_db(next_page_url,is_uncensored) 51 | next_page_html = downloader.get_html(next_page_url) 52 | next_page_url = pageparser.get_next_page_url(entrance, next_page_html) 53 | if next_page_url == None: 54 | break 55 | 56 | 57 | if __name__ == '__main__': 58 | main('https://www.javbus5.com') 59 | main('https://www.javbus5.com/uncensored') -------------------------------------------------------------------------------- /downloader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | 4 | import requests 5 | 6 | headers = { 7 | 'User-Agent ' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0', 8 | } 9 | 10 | def get_html(url, Referer_url=None): 11 | '''get_html(url),download and return html''' 12 | if Referer_url: 13 | headers['Referer'] = Referer_url 14 | req = requests.get(url, headers=headers) 15 | return req.content 16 | -------------------------------------------------------------------------------- /pageparser.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | #-*-coding:utf-8-*- 3 | 4 | from bs4 import BeautifulSoup 5 | import downloader 6 | 7 | def _get_cili_url(soup): 8 | """get_cili(soup).get the ajax url and Referer url of request""" 9 | 10 | ajax_get_cili_url = 'https://www.javbus5.com/ajax/uncledatoolsbyajax.php?lang=zh' 11 | ajax_data = soup.select('script')[8].text 12 | for l in ajax_data.split(';')[:-1]: 13 | ajax_get_cili_url += '&%s' % l[7:].replace("'","").replace(' ','') 14 | return ajax_get_cili_url 15 | 16 | 17 | def _parser_magnet(html): 18 | """parser_magnet(html),get all magnets from a html and return the str of magnet""" 19 | 20 | #存放磁力的字符串 21 | magnet = '' 22 | soup = BeautifulSoup(html,"html.parser") 23 | for td in soup.select('td[width="70%"]'): 24 | magnet += td.a['href'] + '\n' 25 | return magnet 26 | 27 | def get_next_page_url(entrance, html): 28 | """get_next_page_url(entrance, html),return the url of next page if exist""" 29 | print("done the page.......") 30 | soup = BeautifulSoup(html, "html.parser") 31 | next_page = soup.select('a[id="next"]') 32 | if next_page: 33 | next_page_link = next_page[0]['href'].split('/')[-2:] 34 | next_page_link = '/'+'/'.join(next_page_link) 35 | next_page_url = entrance + next_page_link 36 | return next_page_url 37 | return None 38 | 39 | 40 | def parser_homeurl(html): 41 | """parser_homeurl(html),parser every url on every page and yield the url""" 42 | 43 | soup = BeautifulSoup(html,"html.parser") 44 | for url in soup.select('a[class="movie-box"]'): 45 | yield url['href'] 46 | 47 | 48 | def parser_content(html): 49 | """parser_content(html),parser page's content of every url and yield the dict of content""" 50 | 51 | soup = BeautifulSoup(html, "html.parser") 52 | 53 | categories = {} 54 | 55 | code_name_doc = soup.find('span', text="識別碼:") 56 | code_name = code_name_doc.parent.contents[2].text if code_name_doc else '' 57 | categories['識別碼'] = code_name 58 | #code_name = soup.find('span', text="識別碼:").parent.contents[2].text if soup.find('span', text="識別碼:") else '' 59 | 60 | date_issue_doc = soup.find('span', text="發行日期:") 61 | date_issue = date_issue_doc.parent.contents[1].strip() if date_issue_doc else '' 62 | categories['發行日期'] = date_issue 63 | #date_issue = soup.find('span', text="發行日期:").parent.contents[1].strip() if soup.find('span', text="發行日期:") else '' 64 | 65 | duration_doc = soup.find('span', text="長度:") 66 | duration = duration_doc.parent.contents[1].strip() if duration_doc else '' 67 | categories['長度'] = duration 68 | #duration = soup.find('span', text="長度:").parent.contents[1].strip() if soup.find('span', text="長度:") else '' 69 | 70 | director_doc = soup.find('span', text="導演:") 71 | director = director_doc.parent.contents[2].text if director_doc else '' 72 | categories['導演'] = director 73 | #director = soup.find('span', text="導演:").parent.contents[2].text if soup.find('span', text="導演:") else '' 74 | 75 | manufacturer_doc = soup.find('span', text="製作商:") 76 | manufacturer = manufacturer_doc.parent.contents[2].text if manufacturer_doc else '' 77 | categories['製作商'] = manufacturer 78 | #manufacturer = soup.find('span', text="製作商:").parent.contents[2].text if soup.find('span', text="製作商:") else '' 79 | 80 | publisher_doc = soup.find('span', text="發行商:") 81 | publisher = publisher_doc.parent.contents[2].text if publisher_doc else '' 82 | categories['發行商'] = publisher 83 | #publisher = soup.find('span', text="發行商:").parent.contents[2].text if soup.find('span', text="發行商:") else '' 84 | 85 | series_doc = soup.find('span', text="系列:") 86 | series = series_doc.parent.contents[2].text if series_doc else '' 87 | categories['系列'] = series 88 | #series = soup.find('span', text="系列:").parent.contents[2].text if soup.find('span', text="系列:") else '' 89 | 90 | genre_doc = soup.find('p', text="類別:") 91 | genre =(i.text.strip() for i in genre_doc.find_next('p').select('span')) if genre_doc else '' 92 | #genre =(i.text.strip() for i in soup.find('p', text="類別:").find_next('p').select('span')) if soup.find('p', text="類別:") else '' 93 | genre_text = '' 94 | for tex in genre: 95 | genre_text += '%s ' % tex 96 | categories['類別'] = genre_text 97 | 98 | actor_doc = soup.select('span[onmouseover^="hoverdiv"]') 99 | actor = (i.text.strip() for i in actor_doc) if actor_doc else '' 100 | #actor = (i.text.strip() for i in soup.select('span[onmouseover^="hoverdiv"]')) if soup.select('span[onmouseover^="hoverdiv"]') else '' 101 | actor_text = '' 102 | for tex in actor: 103 | actor_text += '%s ' % tex 104 | categories['演員'] = actor_text 105 | 106 | #网址加入字典 107 | url = soup.select('link[hreflang="zh"]')[0]['href'] 108 | categories['URL'] = url 109 | 110 | #将磁力链接加入字典 111 | magnet_html = downloader.get_html(_get_cili_url(soup), Referer_url=url) 112 | magnet = _parser_magnet(magnet_html) 113 | categories['磁力链接'] = magnet 114 | 115 | return categories 116 | 117 | 118 | --------------------------------------------------------------------------------