├── .gitignore ├── .python-version ├── 80s_daily.py ├── 80s_dm.py ├── 80s_ju.py ├── 80s_movie.py ├── 80s_zy.py ├── LICENCE ├── README.md ├── config.json ├── data └── .gitkeep ├── model ├── __init__.py └── resource.py ├── requirements.txt └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | /data/task.db 2 | /data/scheduler.all 3 | /data/scheduler.1h 4 | /data/scheduler.1d 5 | /data/result.db 6 | /data/project.db 7 | -------------------------------------------------------------------------------- /.python-version: -------------------------------------------------------------------------------- 1 | 3.6.1 2 | -------------------------------------------------------------------------------- /80s_daily.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2017-11-21 10:21:37 4 | # Project: 80s_daily 5 | 6 | import re 7 | from pyspider.libs.base_handler import * 8 | from utils import * 9 | 10 | 11 | UPDATE = { 12 | "movie": "https://www.80s.tw/top/last_update_list/1", 13 | "ju": "https://www.80s.tw/top/last_update_list/tv", 14 | "dm": "https://www.80s.tw/top/last_update_list/14", 15 | "zy": "https://www.80s.tw/top/last_update_list/4" 16 | } 17 | WRITE_MONGODB = True 18 | 19 | class Handler(BaseHandler): 20 | crawl_config = {} 21 | 22 | # 一天 23 | @every(minutes=24 * 60) 24 | def on_start(self): 25 | for rtype, url in UPDATE.items(): 26 | self.crawl( 27 | url, 28 | validate_cert=False, 29 | callback=self.list_page, 30 | save={'rtype': rtype}) 31 | 32 | @config(age=10 * 24 * 60 * 60, priority=1, retries=1) 33 | def list_page(self, response): 34 | for i in response.doc('.tpul1line a').items(): 35 | if i.attr.href.split('/')[-2:-1] == [response.save['rtype']]: 36 | print(i.attr.href) 37 | self.crawl( 38 | i.attr.href, 39 | validate_cert=False, 40 | callback=self.detail_page, 41 | save={'rtype': response.save['rtype']}) 42 | 43 | # age 一天内认为页面没有改变,不会再重新爬取 44 | # 详情页 45 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=2, retries=1) 46 | def detail_page(self, response): 47 | resource_item = {} 48 | final_json = {} 49 | resource_item["url"] = response.url 50 | resource_item["title"] = response.doc('title').text() 51 | # 构建两块信息 52 | brief_info = format_brief_info(response, response.save['rtype']) 53 | resource_brief = construct_brief_json(brief_info) 54 | detail_info = format_detail_info(response) 55 | resource_detail = construct_detail_json(detail_info) 56 | resource_item = {**resource_brief, **resource_detail} 57 | 58 | if WRITE_MONGODB: 59 | # 两块信息先构建好 60 | final_json = construct_final_json(resource_item, response.url) 61 | 62 | # 第一个 tab bt 直接能解析,其他的 tab 需要爬单独的 html 再解析 63 | # http://www.80s.tw/movie/1173/bt-1 bd-1 hd-1 64 | mark = get_mark(response.doc('.dlselected > span').text()) 65 | final_json["url_has_downlaod"] = [] 66 | final_json["url_has_downlaod"].append(mark) 67 | 68 | if mark: 69 | if WRITE_MONGODB: 70 | download_json_final = get_download_info( 71 | response, response.save['rtype'], mark) 72 | final_json = {**final_json, **download_json_final} 73 | write_to_mongodb(final_json, mark) 74 | 75 | # 另外两种大小,可有可无 76 | tab_text = response.doc('.cpage').text() 77 | bt_re = re.search(r"电视", tab_text) 78 | bd_re = re.search(r"平板", tab_text) 79 | hd_re = re.search(r"手机", tab_text) 80 | pt_re = re.search(r"小MP4", tab_text) 81 | if bt_re and mark != 'bt': 82 | self.crawl( 83 | response.url + "/bt-1", 84 | validate_cert=False, 85 | 
callback=self.get_bt_info, 86 | save={ 87 | 'resource_item': resource_item, 88 | 'rtype': response.save['rtype'] 89 | }) 90 | if bd_re and mark != 'bd': 91 | self.crawl( 92 | response.url + "/bd-1", 93 | validate_cert=False, 94 | callback=self.get_bd_info, 95 | save={ 96 | 'resource_item': resource_item, 97 | 'rtype': response.save['rtype'] 98 | }) 99 | elif hd_re and mark != 'hd': 100 | self.crawl( 101 | response.url + "/hd-1", 102 | validate_cert=False, 103 | callback=self.get_hd_info, 104 | save={ 105 | 'resource_item': resource_item, 106 | 'rtype': response.save['rtype'] 107 | }) 108 | elif pt_re and mark != 'pt': 109 | self.crawl( 110 | response.url + "/mp4-1", 111 | validate_cert=False, 112 | callback=self.get_pt_info, 113 | save={ 114 | 'resource_item': resource_item, 115 | 'rtype': response.save['rtype'] 116 | }) 117 | return { 118 | "url": response.url, 119 | "title": response.doc('.font14w').text(), 120 | } 121 | else: 122 | print('========== 处理错误,没有得到下载信息 ==========') 123 | 124 | # age 一天内认为页面没有改变,不会再重新爬取 125 | # 爬取 bt 126 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 127 | def get_bt_info(self, response): 128 | self.crawl_download_info(response, 'bt', 129 | response.save['resource_item']) 130 | 131 | # age 一天内认为页面没有改变,不会再重新爬取 132 | # 爬取 bd 133 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 134 | def get_bd_info(self, response): 135 | self.crawl_download_info(response, 'bd', 136 | response.save['resource_item']) 137 | 138 | # age 一天内认为页面没有改变,不会再重新爬取 139 | # 爬取 hd 140 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 141 | def get_hd_info(self, response): 142 | self.crawl_download_info(response, 'hd', 143 | response.save['resource_item']) 144 | 145 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 146 | def get_pt_info(self, response): 147 | self.crawl_download_info(response, 'pt', 148 | response.save['resource_item']) 149 | 150 | def crawl_download_info(self, response, mark, resource_item): 151 | if WRITE_MONGODB: 152 | download_json_final = get_download_info( 153 | response, response.save['rtype'], mark) 154 | url_source = response.url 155 | update_download_info_to_mongodb(download_json_final, mark, 156 | url_source) 157 | -------------------------------------------------------------------------------- /80s_dm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2017-11-01 14:44:52 4 | # Project: 80s_dm 5 | 6 | import re 7 | from pyspider.libs.base_handler import * 8 | from utils import * 9 | 10 | 11 | FIRST_PAGE = 'https://www.80s.tw/dm/list' 12 | START_PAGE = 'https://www.80s.tw/dm/list/----14--p' 13 | PAGE_NUM = 1 14 | PAGE_TOTAL = 61 15 | WRITE_MONGODB = True 16 | 17 | 18 | class Handler(BaseHandler): 19 | crawl_config = {} 20 | 21 | def __init__(self): 22 | self.first_page = FIRST_PAGE 23 | self.start_page = START_PAGE 24 | self.page_num = PAGE_NUM 25 | self.page_total = PAGE_TOTAL 26 | 27 | # 每五天重爬 28 | @every(minutes=24 * 60 * 5) 29 | def on_start(self): 30 | self.crawl( 31 | self.first_page, 32 | validate_cert=False, 33 | headers=generate_random_headers(), 34 | callback=self.get_page_num) 35 | 36 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 37 | # 获取页数 38 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 39 | def get_page_num(self, response): 40 | pager_list = [i.attr.href for i in response.doc('.pager > a').items()] 41 | page_total_url = pager_list[-1:][0] 42 | 
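# The last link in .pager is assumed to end in "p<total>", e.g. ".../dm/list/----14--p61",
# so the search on the next line pulls the page count out of that URL; group(0) is then "p61"
# and the [1:] slice in the following branch strips the leading "p" (group(1) would capture
# the digits directly).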
page_total_re = re.search(r"p(\d+)", page_total_url) 43 | if page_total_re: 44 | page_total = page_total_re.group(0)[1:] 45 | else: 46 | page_total = self.page_total 47 | print('总页数 ========== ' + str(page_total)) 48 | while self.page_num <= int(page_total): 49 | crawl_url = self.start_page + str(self.page_num) 50 | print(crawl_url) 51 | self.crawl( 52 | crawl_url, 53 | validate_cert=False, 54 | headers=generate_random_headers(), 55 | callback=self.index_page) 56 | self.page_num += 1 57 | 58 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 59 | # 列表页 60 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 61 | def index_page(self, response): 62 | for each in response.doc('.h3 > a').items(): 63 | if each.attr.href.split('/')[-2:-1] == ['dm']: 64 | print(each.attr.href) 65 | self.crawl( 66 | each.attr.href, 67 | validate_cert=False, 68 | headers=generate_random_headers(), 69 | callback=self.detail_page) 70 | 71 | # age 一天内认为页面没有改变,不会再重新爬取 72 | # 详情页 73 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=2, retries=1) 74 | def detail_page(self, response): 75 | resource_item = {} 76 | final_json = {} 77 | resource_item["url"] = response.url 78 | resource_item["title"] = response.doc('title').text() 79 | # 构建两块信息 80 | brief_info = format_brief_info(response, 'dm') 81 | resource_brief = construct_brief_json(brief_info) 82 | detail_info = format_detail_info(response) 83 | resource_detail = construct_detail_json(detail_info) 84 | resource_item = {**resource_brief, **resource_detail} 85 | 86 | if WRITE_MONGODB: 87 | # 两块信息先构建好 88 | final_json = construct_final_json(resource_item, response.url) 89 | 90 | # 第一个 tab bt 直接能解析,其他的 tab 需要爬单独的 html 再解析 91 | # http://www.80s.tw/movie/1173/bt-1 bd-1 hd-1 92 | mark = get_mark(response.doc('.dlselected > span').text()) 93 | final_json["url_has_downlaod"] = [] 94 | final_json["url_has_downlaod"].append(mark) 95 | 96 | if mark: 97 | if WRITE_MONGODB: 98 | download_json_final = get_download_info(response, 'dm', mark) 99 | final_json = {**final_json, **download_json_final} 100 | write_to_mongodb(final_json, mark) 101 | 102 | # 另外两种大小,可有可无 103 | tab_text = response.doc('.cpage').text() 104 | bt_re = re.search(r"电视", tab_text) 105 | bd_re = re.search(r"平板", tab_text) 106 | hd_re = re.search(r"手机", tab_text) 107 | pt_re = re.search(r"小MP4", tab_text) 108 | if bt_re and mark != 'bt': 109 | self.crawl( 110 | response.url + "/bt-1", 111 | validate_cert=False, 112 | headers=generate_random_headers(), 113 | callback=self.get_bt_info, 114 | save={'resource_item': resource_item}) 115 | if bd_re and mark != 'bd': 116 | self.crawl( 117 | response.url + "/bd-1", 118 | validate_cert=False, 119 | headers=generate_random_headers(), 120 | callback=self.get_bd_info, 121 | save={'resource_item': resource_item}) 122 | elif hd_re and mark != 'hd': 123 | self.crawl( 124 | response.url + "/hd-1", 125 | validate_cert=False, 126 | headers=generate_random_headers(), 127 | callback=self.get_hd_info, 128 | save={'resource_item': resource_item}) 129 | elif pt_re and mark != 'pt': 130 | self.crawl( 131 | response.url + "/mp4-1", 132 | validate_cert=False, 133 | headers=generate_random_headers(), 134 | callback=self.get_pt_info, 135 | save={'resource_item': resource_item}) 136 | return { 137 | "url": response.url, 138 | "title": response.doc('.font14w').text(), 139 | } 140 | else: 141 | print('========== 处理错误,没有得到下载信息 ==========') 142 | 143 | # age 一天内认为页面没有改变,不会再重新爬取 144 | # 爬取 bt 145 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 146 | def get_bt_info(self, 
response): 147 | self.crawl_download_info(response, 'bt', 148 | response.save['resource_item']) 149 | 150 | # age 一天内认为页面没有改变,不会再重新爬取 151 | # 爬取 bd 152 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 153 | def get_bd_info(self, response): 154 | self.crawl_download_info(response, 'bd', 155 | response.save['resource_item']) 156 | 157 | # age 一天内认为页面没有改变,不会再重新爬取 158 | # 爬取 hd 159 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 160 | def get_hd_info(self, response): 161 | self.crawl_download_info(response, 'hd', 162 | response.save['resource_item']) 163 | 164 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 165 | def get_pt_info(self, response): 166 | self.crawl_download_info(response, 'pt', 167 | response.save['resource_item']) 168 | 169 | def crawl_download_info(self, response, mark, resource_item): 170 | if WRITE_MONGODB: 171 | download_json_final = get_download_info(response, 'dm', mark) 172 | url_source = response.url 173 | update_download_info_to_mongodb(download_json_final, mark, 174 | url_source) 175 | -------------------------------------------------------------------------------- /80s_ju.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2017-11-01 14:44:52 4 | # Project: 80s_ju 5 | 6 | import re 7 | from pyspider.libs.base_handler import * 8 | from utils import * 9 | 10 | FIRST_PAGE = 'https://www.80s.tw/ju/list' 11 | START_PAGE = 'https://www.80s.tw/ju/list/----0--p' 12 | PAGE_NUM = 1 13 | PAGE_TOTAL = 130 14 | WRITE_MONGODB = True 15 | 16 | 17 | class Handler(BaseHandler): 18 | crawl_config = {} 19 | 20 | def __init__(self): 21 | self.first_page = FIRST_PAGE 22 | self.start_page = START_PAGE 23 | self.page_num = PAGE_NUM 24 | self.page_total = PAGE_TOTAL 25 | 26 | # 每五天重爬 27 | @every(minutes=24 * 60 * 5) 28 | def on_start(self): 29 | self.crawl( 30 | self.first_page, 31 | validate_cert=False, 32 | headers=generate_random_headers(), 33 | callback=self.get_page_num) 34 | 35 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 36 | # 获取页数 37 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 38 | def get_page_num(self, response): 39 | pager_list = [i.attr.href for i in response.doc('.pager > a').items()] 40 | page_total_url = pager_list[-1:][0] 41 | page_total_re = re.search(r"p(\d+)", page_total_url) 42 | if page_total_re: 43 | page_total = page_total_re.group(0)[1:] 44 | else: 45 | page_total = self.page_total 46 | print('总页数 ========== ' + str(page_total)) 47 | while self.page_num <= int(page_total): 48 | crawl_url = self.start_page + str(self.page_num) 49 | print(crawl_url) 50 | self.crawl( 51 | crawl_url, 52 | validate_cert=False, 53 | headers=generate_random_headers(), 54 | callback=self.index_page) 55 | self.page_num += 1 56 | 57 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 58 | # 列表页 59 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 60 | def index_page(self, response): 61 | for each in response.doc('.h3 > a').items(): 62 | if each.attr.href.split('/')[-2:-1] == ['ju']: 63 | print(each.attr.href) 64 | self.crawl( 65 | each.attr.href, 66 | validate_cert=False, 67 | headers=generate_random_headers(), 68 | callback=self.detail_page) 69 | 70 | # age 一天内认为页面没有改变,不会再重新爬取 71 | # 详情页 72 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=2, retries=1) 73 | def detail_page(self, response): 74 | resource_item = {} 75 | final_json = {} 76 | resource_item["url"] = response.url 77 | 
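# Note: resource_item is reassigned a few lines below to the merge of the brief and
# detail dicts ({**resource_brief, **resource_detail}), so the url/title set here are
# discarded by that reassignment rather than carried into the merged dict.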
resource_item["title"] = response.doc('title').text() 78 | # 构建两块信息 79 | brief_info = format_brief_info(response, 'ju') 80 | resource_brief = construct_brief_json(brief_info) 81 | detail_info = format_detail_info(response) 82 | resource_detail = construct_detail_json(detail_info) 83 | resource_item = {**resource_brief, **resource_detail} 84 | 85 | if WRITE_MONGODB: 86 | # 两块信息先构建好 87 | final_json = construct_final_json(resource_item, response.url) 88 | 89 | # 第一个 tab bt 直接能解析,其他的 tab 需要爬单独的 html 再解析 90 | # http://www.80s.tw/movie/1173/bt-1 bd-1 hd-1 91 | mark = get_mark(response.doc('.dlselected > span').text()) 92 | final_json["url_has_downlaod"] = [] 93 | final_json["url_has_downlaod"].append(mark) 94 | 95 | if mark: 96 | if WRITE_MONGODB: 97 | download_json_final = get_download_info(response, 'ju', mark) 98 | final_json = {**final_json, **download_json_final} 99 | write_to_mongodb(final_json, mark) 100 | 101 | # 另外两种大小,可有可无 102 | tab_text = response.doc('.cpage').text() 103 | bt_re = re.search(r"电视", tab_text) 104 | bd_re = re.search(r"平板", tab_text) 105 | hd_re = re.search(r"手机", tab_text) 106 | pt_re = re.search(r"小MP4", tab_text) 107 | if bt_re and mark != 'bt': 108 | self.crawl( 109 | response.url + "/bt-1", 110 | validate_cert=False, 111 | headers=generate_random_headers(), 112 | callback=self.get_bt_info, 113 | save={'resource_item': resource_item}) 114 | if bd_re and mark != 'bd': 115 | self.crawl( 116 | response.url + "/bd-1", 117 | validate_cert=False, 118 | headers=generate_random_headers(), 119 | callback=self.get_bd_info, 120 | save={'resource_item': resource_item}) 121 | elif hd_re and mark != 'hd': 122 | self.crawl( 123 | response.url + "/hd-1", 124 | validate_cert=False, 125 | headers=generate_random_headers(), 126 | callback=self.get_hd_info, 127 | save={'resource_item': resource_item}) 128 | elif pt_re and mark != 'pt': 129 | self.crawl( 130 | response.url + "/mp4-1", 131 | validate_cert=False, 132 | headers=generate_random_headers(), 133 | callback=self.get_pt_info, 134 | save={'resource_item': resource_item}) 135 | return { 136 | "url": response.url, 137 | "title": response.doc('.font14w').text(), 138 | } 139 | else: 140 | print('========== 处理错误,没有得到下载信息 ==========') 141 | 142 | # age 一天内认为页面没有改变,不会再重新爬取 143 | # 爬取 bt 144 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 145 | def get_bt_info(self, response): 146 | self.crawl_download_info(response, 'bt', 147 | response.save['resource_item']) 148 | 149 | # age 一天内认为页面没有改变,不会再重新爬取 150 | # 爬取 bd 151 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 152 | def get_bd_info(self, response): 153 | self.crawl_download_info(response, 'bd', 154 | response.save['resource_item']) 155 | 156 | # age 一天内认为页面没有改变,不会再重新爬取 157 | # 爬取 hd 158 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 159 | def get_hd_info(self, response): 160 | self.crawl_download_info(response, 'hd', 161 | response.save['resource_item']) 162 | 163 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 164 | def get_pt_info(self, response): 165 | self.crawl_download_info(response, 'pt', 166 | response.save['resource_item']) 167 | 168 | def crawl_download_info(self, response, mark, resource_item): 169 | if WRITE_MONGODB: 170 | download_json_final = get_download_info(response, 'ju', mark) 171 | url_source = response.url 172 | update_download_info_to_mongodb(download_json_final, mark, 173 | url_source) 174 | -------------------------------------------------------------------------------- 
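The four section spiders (80s_movie.py, 80s_ju.py, 80s_dm.py, 80s_zy.py) are copies of one another apart from their list URLs, their fallback page totals, and the rtype string they pass to the helpers in utils.py. A minimal sketch of how the shared list-crawling flow could be parameterised instead; the SECTIONS mapping is illustrative and this module does not exist in the repository:

```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Hypothetical sketch, not part of the repository: one handler parameterised by section.

from pyspider.libs.base_handler import *
from utils import generate_random_headers

SECTIONS = {
    # rtype: (first list page, list-page prefix that takes a page number)
    'movie': ('https://www.80s.tw/movie/list', 'https://www.80s.tw/movie/list/-----p'),
    'ju': ('https://www.80s.tw/ju/list', 'https://www.80s.tw/ju/list/----0--p'),
    'dm': ('https://www.80s.tw/dm/list', 'https://www.80s.tw/dm/list/----14--p'),
    'zy': ('https://www.80s.tw/zy/list', 'https://www.80s.tw/zy/list/----4--p'),
}


class Handler(BaseHandler):
    crawl_config = {}

    # Re-crawl every five days, like the existing section spiders.
    @every(minutes=24 * 60 * 5)
    def on_start(self):
        for rtype, (first_page, _) in SECTIONS.items():
            self.crawl(
                first_page,
                validate_cert=False,
                headers=generate_random_headers(),
                callback=self.get_page_num,
                save={'rtype': rtype})

    @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1)
    def get_page_num(self, response):
        # The page-count, list-page and detail-page steps would follow the existing
        # files, reading response.save['rtype'] instead of a hard-coded section
        # string (80s_daily.py already passes rtype through save this way).
        pass
```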
/80s_movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2017-11-03 10:56:36 4 | # Project: 80s_movie 5 | 6 | import re 7 | from pyspider.libs.base_handler import * 8 | from utils import * 9 | 10 | 11 | FIRST_PAGE = 'https://www.80s.tw/movie/list' 12 | START_PAGE = 'https://www.80s.tw/movie/list/-----p' 13 | PAGE_NUM = 1 14 | PAGE_TOTAL = 408 15 | WRITE_MONGODB = True 16 | 17 | 18 | class Handler(BaseHandler): 19 | crawl_config = {} 20 | 21 | def __init__(self): 22 | self.first_page = FIRST_PAGE 23 | self.start_page = START_PAGE 24 | self.page_num = PAGE_NUM 25 | self.page_total = PAGE_TOTAL 26 | 27 | # 每五天重爬 28 | @every(minutes=24 * 60 * 5) 29 | def on_start(self): 30 | self.crawl( 31 | self.first_page, 32 | validate_cert=False, 33 | headers=generate_random_headers(), 34 | callback=self.get_page_num) 35 | 36 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 37 | # 获取页数 38 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 39 | def get_page_num(self, response): 40 | pager_list = [i.attr.href for i in response.doc('.pager > a').items()] 41 | page_total_url = pager_list[-1:][0] 42 | page_total_re = re.search(r"p(\d+)", page_total_url) 43 | if page_total_re: 44 | page_total = page_total_re.group(0)[1:] 45 | else: 46 | page_total = self.page_total 47 | print('总页数 ========== ' + str(page_total)) 48 | while self.page_num <= int(page_total): 49 | crawl_url = self.start_page + str(self.page_num) 50 | print(crawl_url) 51 | self.crawl( 52 | crawl_url, 53 | validate_cert=False, 54 | headers=generate_random_headers(), 55 | callback=self.index_page) 56 | self.page_num += 1 57 | 58 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 59 | # 列表页 60 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 61 | def index_page(self, response): 62 | for each in response.doc('.h3 > a').items(): 63 | if each.attr.href.split('/')[-2:-1] == ['movie']: 64 | print(each.attr.href) 65 | self.crawl( 66 | each.attr.href, 67 | validate_cert=False, 68 | headers=generate_random_headers(), 69 | callback=self.detail_page) 70 | 71 | # age 一天内认为页面没有改变,不会再重新爬取 72 | # 详情页 73 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=2, retries=1) 74 | def detail_page(self, response): 75 | resource_item = {} 76 | final_json = {} 77 | resource_item["url"] = response.url 78 | resource_item["title"] = response.doc('title').text() 79 | # 构建两块信息 80 | brief_info = format_brief_info(response, 'movie') 81 | resource_brief = construct_brief_json(brief_info) 82 | detail_info = format_detail_info(response) 83 | resource_detail = construct_detail_json(detail_info) 84 | resource_item = {**resource_brief, **resource_detail} 85 | 86 | if WRITE_MONGODB: 87 | # 两块信息先构建好 88 | final_json = construct_final_json(resource_item, response.url) 89 | 90 | # 第一个 tab bt 直接能解析,其他的 tab 需要爬单独的 html 再解析 91 | # http://www.80s.tw/movie/1173/bt-1 bd-1 hd-1 92 | mark = get_mark(response.doc('.dlselected > span').text()) 93 | final_json["url_has_downlaod"] = [] 94 | final_json["url_has_downlaod"].append(mark) 95 | 96 | if mark: 97 | if WRITE_MONGODB: 98 | download_json_final = get_download_info(response, 'movie', mark) 99 | final_json = {**final_json, **download_json_final} 100 | write_to_mongodb(final_json, mark) 101 | 102 | # 另外两种大小,可有可无 103 | tab_text = response.doc('.cpage').text() 104 | bt_re = re.search(r"电视", tab_text) 105 | bd_re = re.search(r"平板", tab_text) 106 | hd_re = re.search(r"手机", tab_text) 107 | pt_re = re.search(r"小MP4", tab_text) 108 | if 
bt_re and mark != 'bt': 109 | self.crawl( 110 | response.url + "/bt-1", 111 | validate_cert=False, 112 | headers=generate_random_headers(), 113 | callback=self.get_bt_info, 114 | save={'resource_item': resource_item}) 115 | if bd_re and mark != 'bd': 116 | self.crawl( 117 | response.url + "/bd-1", 118 | validate_cert=False, 119 | headers=generate_random_headers(), 120 | callback=self.get_bd_info, 121 | save={'resource_item': resource_item}) 122 | elif hd_re and mark != 'hd': 123 | self.crawl( 124 | response.url + "/hd-1", 125 | validate_cert=False, 126 | headers=generate_random_headers(), 127 | callback=self.get_hd_info, 128 | save={'resource_item': resource_item}) 129 | elif pt_re and mark != 'pt': 130 | self.crawl( 131 | response.url + "/mp4-1", 132 | validate_cert=False, 133 | headers=generate_random_headers(), 134 | callback=self.get_pt_info, 135 | save={'resource_item': resource_item}) 136 | return { 137 | "url": response.url, 138 | "title": response.doc('.font14w').text(), 139 | } 140 | else: 141 | print('========== 处理错误,没有得到下载信息 ==========') 142 | 143 | # age 一天内认为页面没有改变,不会再重新爬取 144 | # 爬取 bt 145 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 146 | def get_bt_info(self, response): 147 | self.crawl_download_info(response, 'bt', 148 | response.save['resource_item']) 149 | 150 | # age 一天内认为页面没有改变,不会再重新爬取 151 | # 爬取 bd 152 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 153 | def get_bd_info(self, response): 154 | self.crawl_download_info(response, 'bd', 155 | response.save['resource_item']) 156 | 157 | # age 一天内认为页面没有改变,不会再重新爬取 158 | # 爬取 hd 159 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 160 | def get_hd_info(self, response): 161 | self.crawl_download_info(response, 'hd', 162 | response.save['resource_item']) 163 | 164 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 165 | def get_pt_info(self, response): 166 | self.crawl_download_info(response, 'pt', 167 | response.save['resource_item']) 168 | 169 | def crawl_download_info(self, response, mark, resource_item): 170 | if WRITE_MONGODB: 171 | download_json_final = get_download_info(response, 'movie', mark) 172 | url_source = response.url 173 | update_download_info_to_mongodb(download_json_final, mark, 174 | url_source) 175 | -------------------------------------------------------------------------------- /80s_zy.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | # Created on 2017-11-01 14:44:52 4 | # Project: 80s_zy 5 | 6 | import re 7 | from pyspider.libs.base_handler import * 8 | from utils import * 9 | 10 | 11 | FIRST_PAGE = 'https://www.80s.tw/zy/list' 12 | START_PAGE = 'https://www.80s.tw/zy/list/----4--p' 13 | PAGE_NUM = 1 14 | PAGE_TOTAL = 17 15 | WRITE_MONGODB = True 16 | 17 | 18 | class Handler(BaseHandler): 19 | crawl_config = {} 20 | 21 | def __init__(self): 22 | self.first_page = FIRST_PAGE 23 | self.start_page = START_PAGE 24 | self.page_num = PAGE_NUM 25 | self.page_total = PAGE_TOTAL 26 | 27 | # 每五天重爬 28 | @every(minutes=24 * 60 * 5) 29 | def on_start(self): 30 | self.crawl( 31 | self.first_page, 32 | validate_cert=False, 33 | headers=generate_random_headers(), 34 | callback=self.get_page_num) 35 | 36 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 37 | # 获取页数 38 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 39 | def get_page_num(self, response): 40 | pager_list = [i.attr.href for i in response.doc('.pager > a').items()] 41 | 
page_total_url = pager_list[-1:][0] 42 | page_total_re = re.search(r"p(\d+)", page_total_url) 43 | if page_total_re: 44 | page_total = page_total_re.group(0)[1:] 45 | else: 46 | page_total = self.page_total 47 | print('总页数 ========== ' + str(page_total)) 48 | while self.page_num <= int(page_total): 49 | crawl_url = self.start_page + str(self.page_num) 50 | print(crawl_url) 51 | self.crawl( 52 | crawl_url, 53 | validate_cert=False, 54 | headers=generate_random_headers(), 55 | callback=self.index_page) 56 | self.page_num += 1 57 | 58 | # age 一天内认为页面没有改变,不会再重新爬取,每天自动重爬 59 | # 列表页 60 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=1, retries=1) 61 | def index_page(self, response): 62 | for each in response.doc('.h3 > a').items(): 63 | if each.attr.href.split('/')[-2:-1] == ['zy']: 64 | print(each.attr.href) 65 | self.crawl( 66 | each.attr.href, 67 | validate_cert=False, 68 | headers=generate_random_headers(), 69 | callback=self.detail_page) 70 | 71 | # age 一天内认为页面没有改变,不会再重新爬取 72 | # 详情页 73 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=2, retries=1) 74 | def detail_page(self, response): 75 | resource_item = {} 76 | final_json = {} 77 | resource_item["url"] = response.url 78 | resource_item["title"] = response.doc('title').text() 79 | # 构建两块信息 80 | brief_info = format_brief_info(response, 'zy') 81 | resource_brief = construct_brief_json(brief_info) 82 | detail_info = format_detail_info(response) 83 | resource_detail = construct_detail_json(detail_info) 84 | resource_item = {**resource_brief, **resource_detail} 85 | 86 | if WRITE_MONGODB: 87 | # 两块信息先构建好 88 | final_json = construct_final_json(resource_item, response.url) 89 | 90 | # 第一个 tab bt 直接能解析,其他的 tab 需要爬单独的 html 再解析 91 | # http://www.80s.tw/movie/1173/bt-1 bd-1 hd-1 92 | mark = get_mark(response.doc('.dlselected > span').text()) 93 | final_json["url_has_downlaod"] = [] 94 | final_json["url_has_downlaod"].append(mark) 95 | 96 | if mark: 97 | if WRITE_MONGODB: 98 | download_json_final = get_download_info(response, 'zy', mark) 99 | final_json = {**final_json, **download_json_final} 100 | write_to_mongodb(final_json, mark) 101 | 102 | # 另外两种大小,可有可无 103 | tab_text = response.doc('.cpage').text() 104 | bt_re = re.search(r"电视", tab_text) 105 | bd_re = re.search(r"平板", tab_text) 106 | hd_re = re.search(r"手机", tab_text) 107 | pt_re = re.search(r"小MP4", tab_text) 108 | if bt_re and mark != 'bt': 109 | self.crawl( 110 | response.url + "/bt-1", 111 | validate_cert=False, 112 | headers=generate_random_headers(), 113 | callback=self.get_bt_info, 114 | save={'resource_item': resource_item}) 115 | if bd_re and mark != 'bd': 116 | self.crawl( 117 | response.url + "/bd-1", 118 | validate_cert=False, 119 | headers=generate_random_headers(), 120 | callback=self.get_bd_info, 121 | save={'resource_item': resource_item}) 122 | elif hd_re and mark != 'hd': 123 | self.crawl( 124 | response.url + "/hd-1", 125 | validate_cert=False, 126 | headers=generate_random_headers(), 127 | callback=self.get_hd_info, 128 | save={'resource_item': resource_item}) 129 | elif pt_re and mark != 'pt': 130 | self.crawl( 131 | response.url + "/mp4-1", 132 | validate_cert=False, 133 | headers=generate_random_headers(), 134 | callback=self.get_pt_info, 135 | save={'resource_item': resource_item}) 136 | return { 137 | "url": response.url, 138 | "title": response.doc('.font14w').text(), 139 | } 140 | else: 141 | print('========== 处理错误,没有得到下载信息 ==========') 142 | 143 | # age 一天内认为页面没有改变,不会再重新爬取 144 | # 爬取 bt 145 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, 
retries=1) 146 | def get_bt_info(self, response): 147 | self.crawl_download_info(response, 'bt', 148 | response.save['resource_item']) 149 | 150 | # age 一天内认为页面没有改变,不会再重新爬取 151 | # 爬取 bd 152 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 153 | def get_bd_info(self, response): 154 | self.crawl_download_info(response, 'bd', 155 | response.save['resource_item']) 156 | 157 | # age 一天内认为页面没有改变,不会再重新爬取 158 | # 爬取 hd 159 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 160 | def get_hd_info(self, response): 161 | self.crawl_download_info(response, 'hd', 162 | response.save['resource_item']) 163 | 164 | @config(age=24 * 60 * 60, auto_recrawl=True, priority=3, retries=1) 165 | def get_pt_info(self, response): 166 | self.crawl_download_info(response, 'pt', 167 | response.save['resource_item']) 168 | 169 | def crawl_download_info(self, response, mark, resource_item): 170 | if WRITE_MONGODB: 171 | download_json_final = get_download_info(response, 'zy', mark) 172 | url_source = response.url 173 | update_download_info_to_mongodb(download_json_final, mark, 174 | url_source) 175 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2016 Chen Jian 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the "Software"), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be included in all 12 | copies or substantial portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 17 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 18 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 19 | OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE 20 | OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 80s_spider 2 | 3 | www.80s.tw 爬虫,用 [pyspider](https://github.com/binux/pyspider), 4 | 只爬电影、电视剧、动漫、综艺,爬取后存储至 MongoDB 5 | 6 | 7 | ## model 8 | 9 | model 见 `model/resource.py`,数据清洗和保存更新操作都放在 `utils.py` 中。 10 | 11 | 12 | ## 运行 13 | 14 | ``` 15 | pyspider --config config.json 16 | ``` 17 | 18 | 19 | ![](http://breakwire.oss-cn-shanghai.aliyuncs.com/Screen%20Shot%202018-01-15%20at%2002.30.24.png) 20 | 21 | 先爬一遍整站的话成功率在 94% 左右,电影、电视剧、综艺基本都爬下来了,动漫的失败率最高,应该是数据解析处理没有完全考虑到位。 22 | 23 | 24 | # LICENCE 25 | MIT 26 | -------------------------------------------------------------------------------- /config.json: -------------------------------------------------------------------------------- 1 | { 2 | "webui": { 3 | "username": "80spyspider", 4 | "password": "80spyspider", 5 | "need-auth": true 6 | }, 7 | "data-path": "data" 8 | } 9 | -------------------------------------------------------------------------------- /data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lsdlab/80s_spider/6bcac48e87587796718038897dac03feb91f8e9f/data/.gitkeep -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | class BaseEnum(object): 2 | 3 | # meta属性,这个字典里存储通用方法可以返回的数据或配置相关的属性 4 | __meta__ = {'title_kv': {}, 'small_title': {}} 5 | 6 | @classmethod 7 | def title_with_value(cls, value): 8 | title_kv = cls.__meta__['title_kv'] 9 | return title_kv.get(value, None) 10 | 11 | @classmethod 12 | def title_list(cls): 13 | title_kv = cls.__meta__['title_kv'] 14 | return list(title_kv.values()) 15 | 16 | @classmethod 17 | def title_kv(cls): 18 | title_kv = cls.__meta__['title_kv'] 19 | return title_kv 20 | 21 | @classmethod 22 | def get_small_title(cls, order_type): 23 | small_title = cls.__meta__['small_title'] 24 | return small_title.get(order_type, None) 25 | 26 | @classmethod 27 | def find_enum_with_title(cls, title): 28 | for k, v in cls.__meta__.items(): 29 | if v == title: 30 | return k 31 | return None 32 | -------------------------------------------------------------------------------- /model/resource.py: -------------------------------------------------------------------------------- 1 | 2 | from mongoengine import * 3 | from model import BaseEnum 4 | 5 | 6 | class ResourceSource(BaseEnum): 7 | _80s = "www.80s.tw" 8 | 9 | 10 | class ResourceDownloadItem(EmbeddedDocument): 11 | title = StringField(required=True, default="") 12 | url = StringField(required=True, default="") 13 | url_backup = StringField(required=True, default="") 14 | size = StringField(required=True, default="") 15 | 16 | 17 | class ResourceTagItem(EmbeddedDocument): 18 | title = StringField(required=True, default="") 19 | item_list = EmbeddedDocumentListField( 20 | "ResourceDownloadItem", required=False, default=[]) 21 | 22 | 23 | class ResourceRecord(Document): 24 | # 类型 25 | rtype = StringField(required=True) 26 | # 唯一标识,是否有这个必要呢? 
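# (utils.construct_final_json fills this with the MD5 hex digest of
# "www.80s.tw/" + the resource id taken from the source URL)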
27 | hash = StringField(required=True) 28 | # 抓取数据的来源网址 29 | url_source = StringField(required=True) 30 | # 标题 31 | title = StringField(required=True, default="") 32 | # 副标题 33 | sub_title = StringField(required=True, default="") 34 | # 简介 35 | summery = StringField(required=True, default="") 36 | # 最新更新描述 37 | last_update_desc = StringField(required=True, default="") 38 | # 更新周期 39 | update_cycle = StringField(required=True, default="") 40 | # 缩略图 41 | url_image_thumb = StringField(required=True, default="") 42 | # 原图 43 | url_image = StringField(required=True, default="") 44 | # 展示图 45 | url_image_list = ListField( 46 | StringField(required=False, default=""), required=True, default=[]) 47 | # 发布时间 48 | show_release_time = StringField(required=True, default="") 49 | # 更新时间 50 | show_update_time = StringField(required=True, default="") 51 | # 年份 52 | year = StringField(required=True, default="") 53 | # 评分 54 | score = StringField(required=True, default="") 55 | # 演员 56 | actors = StringField(required=True, default="") 57 | # 导演 58 | directors = StringField(required=True, default="") 59 | # 区域 60 | areas = StringField(required=True, default="") 61 | # 标签 62 | tags = StringField(required=True, default="") 63 | # 语言 64 | langs = StringField(required=True, default="") 65 | # 时长 66 | time_length = StringField(required=True, default="") 67 | # 一共多少集 68 | total = IntField(required=True, default=0) 69 | # 当前第几集 70 | current = IntField(required=True, default=0) 71 | # 下载地址 72 | # 不同清晰度的片源列表,如["hd", "bd", "bt"]代表有三种片源,取的时候也是对应去取就好了 73 | url_has_downlaod = ListField( 74 | StringField(required=False, default=""), required=True, default=[]) 75 | url_bt_download = EmbeddedDocumentListField( 76 | "ResourceTagItem", required=False, default=[]) 77 | url_bd_download = EmbeddedDocumentListField( 78 | "ResourceTagItem", required=False, default=[]) 79 | url_hd_download = EmbeddedDocumentListField( 80 | "ResourceTagItem", required=False, default=[]) 81 | url_pt_download = EmbeddedDocumentListField( 82 | "ResourceTagItem", required=False, default=[]) 83 | 84 | # 资源来源, 如80s 85 | source = StringField(required=True, default=ResourceSource._80s) 86 | 87 | # 创建时间 88 | create_time = DateTimeField(required=True) 89 | # 更新时间 90 | update_time = DateTimeField(required=False) 91 | 92 | meta = {'db_alias': 'movie'} 93 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mongoengine==0.14.3 2 | pyspider==0.3.9 3 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import hashlib 3 | from datetime import datetime 4 | from model.resource import ResourceRecord 5 | from model.resource import ResourceSource 6 | from model.resource import ResourceDownloadItem 7 | from model.resource import ResourceTagItem 8 | from fake_useragent import UserAgent 9 | from mongoengine import connect 10 | 11 | databases = [ 12 | dict(host="127.0.0.1", port=27017, name='movie'), 13 | ] 14 | 15 | for database in databases: 16 | connect( 17 | alias=database['name'], 18 | db=database['name'], 19 | host=database['host'], 20 | port=database['port']) 21 | 22 | 23 | # 处理上半部分信息 24 | def format_brief_info(res, rtype): 25 | title, year, latest_update, current, total, update_period, special_list, other_names, actors, header_img_link, screenshot_link = '', '', '', '', '', '', '', '', '', '', 
'' 26 | info = res.doc('.info').text() 27 | # 名称 28 | title = res.doc('.font14w').text() 29 | # 年份 30 | year_re = re.search(r"(\d{4})", info) 31 | if year_re: 32 | year = year_re.group(0) 33 | # 题图 34 | header_img_link = res.doc('.img > img').attr.src or '' 35 | print('header_img') 36 | print(header_img_link) 37 | # 截图 38 | screenshot_link = res.doc('.noborder > img').attr.src or '' 39 | print('screenshot_link') 40 | print(screenshot_link) 41 | # 80s 的描述,专题,又名,演员 字符串列表和对应的链接 42 | info_span = [i.text() for i in res.doc('.info > span').items()] 43 | # 第一个是80s的一句话描述,可为空,后面都是演员链接 44 | info_span_link = [i.attr.href for i in res.doc('.info > span > a').items()] 45 | 46 | print(info_span) 47 | print(info_span_link) 48 | 49 | if rtype == 'movie' or rtype == 'zy': 50 | current = 1 51 | total = 1 52 | 53 | # info_span 第一个可能为空,有的电视剧名称前面有个国语粤语标识,这五个信息排版混乱,用正则找 54 | for i in info_span: 55 | if re.search(r"最近更新", i): 56 | if ":" in i: 57 | raw_latest_update = i.split(':') 58 | if raw_latest_update[1]: 59 | latest_update = raw_latest_update[1].strip() 60 | # 处理 total current 61 | total, current = 0, 0 62 | total_current_re = re.search(r"/", latest_update) 63 | current_re = re.search(r"第(\d+)集", latest_update) 64 | total_re = re.search(r"共(\d+)集", latest_update) 65 | # 有总数有当前集数 66 | if total_current_re: 67 | current_re = re.search(r"第(\d+)集", 68 | latest_update.split('/')[0]) 69 | total_re = re.search(r"共(\d+)集", 70 | latest_update.split('/')[1]) 71 | if current_re: 72 | current = current_re.group(0)[1:-1] 73 | if total_re: 74 | total = total_re.group(0)[1:-1] 75 | # 只有当前集数 76 | elif current_re: 77 | current = current_re.group(0)[1:-1] 78 | # 只有总技术 79 | elif total_re: 80 | total = total_re.group(0)[1:-1] 81 | elif re.search(r"更新周期", i): 82 | if ":" in i: 83 | raw_update_period = i.split(':') 84 | if raw_update_period[1]: 85 | update_period = raw_update_period[1].strip() 86 | elif re.search(r"又名", i): 87 | # 又名可有可无 88 | if ":" in i: 89 | raw_other_names = i.split(':') 90 | if raw_other_names[1]: 91 | other_names = raw_other_names[1].strip() 92 | # 如果能详细处理 93 | if ' , ' in other_names: 94 | other_names = '|'.join(other_names.split(' , ')) 95 | elif re.search(r"专题", i): 96 | # 专题也是可有可无 97 | if ":" in i: 98 | raw_special_list = i.split(':') 99 | if raw_special_list[1]: 100 | special_list = raw_special_list[1].strip() 101 | elif re.search(r"演员", i): 102 | if ":" in i: 103 | raw_actors = i.split(':') 104 | if raw_actors[1]: 105 | actors = raw_actors[1].strip() 106 | # 如果能详细处理 107 | if ' ' in actors: 108 | actors = '|'.join(actors.split(' ')) 109 | 110 | return title, year, latest_update, current, total, update_period, special_list, other_names, actors, header_img_link, screenshot_link 111 | 112 | 113 | # 处理下半部份信息 114 | def format_detail_info(res): 115 | type, region, language, directors, created_at, updated_at, item_length, douban_rate, douban_comment_link, movie_content = '', '', '', '', '', '', '', '', '', '' 116 | # 类型、地区、语言、剧情介绍等信息的字符串列表和链接 117 | span_block, span_block_link = [], [] 118 | span_block = [i.text() for i in res.doc('.span_block').items()] 119 | span_block_link = [ 120 | i.attr.href or '' for i in res.doc('.span_block > a').items() 121 | ] 122 | 123 | print(span_block) 124 | print(span_block_link) 125 | 126 | if span_block: 127 | for i in span_block: 128 | if re.search(r"类型", i): 129 | if ":" in i: 130 | raw_type = i.split(':') 131 | if raw_type[1]: 132 | type = raw_type[1].strip() 133 | # 如果能详细处理 134 | if " " in type: 135 | type = '|'.join(type.split(' ')) 136 | elif re.search(r"导演", i): 137 | 
# 导演可有可无 138 | if ":" in i: 139 | raw_directors = i.split(':') 140 | if raw_directors[1]: 141 | directors = raw_directors[1].strip() 142 | # 如果能详细处理 143 | if " " in directors: 144 | directors = '|'.join(directors.split(' ')) 145 | elif re.search(r"地区", i): 146 | if len(i) > 10: 147 | region = re.sub('[\s+]', ' ', i) 148 | if ":" in region: 149 | raw_region = region.split(':') 150 | if raw_region[1]: 151 | region = raw_region[1].strip() 152 | # 如果能详细处理 153 | region = "|".join(region.split()) 154 | else: 155 | if ":" in i: 156 | raw_region = i.split(':') 157 | if raw_region[1]: 158 | region = raw_region[1].strip() 159 | # 如果能详细处理 160 | if " " in region: 161 | region = '|'.join(region.split(' ')) 162 | elif re.search(r"语言", i): 163 | if ":" in i: 164 | raw_language = i.split(':') 165 | if raw_language[1]: 166 | language = raw_language[1].strip() 167 | # 如果能详细处理 168 | if " " in language: 169 | language = '|'.join(language.split(' ')) 170 | elif re.search(r"上映日期", i): 171 | if ":" in i: 172 | raw_created_at = i.split(':') 173 | if raw_created_at[1]: 174 | created_at = raw_created_at[1].strip() 175 | elif re.search(r"片长", i): 176 | if ":" in i: 177 | raw_item_length = i.split(':') 178 | if raw_item_length[1]: 179 | item_length = raw_item_length[1].strip() 180 | elif re.search(r"更新日期", i): 181 | if ":" in i: 182 | raw_created_at = i.split(':') 183 | if raw_created_at[1]: 184 | updated_at = raw_created_at[1].strip() 185 | elif re.search(r"豆瓣评分", i): 186 | if ":" in i: 187 | raw_douban_rate = i.split(':') 188 | if raw_douban_rate[1]: 189 | douban_rate = raw_douban_rate[1].strip() 190 | if douban_rate == '暂无': 191 | douban_rate = '0' 192 | 193 | if span_block_link: 194 | douban_comment_link = span_block_link[-1] 195 | if len(res.doc('#movie_content').text()) == 5: 196 | movie_content = '' 197 | else: 198 | movie_content = res.doc('#movie_content').text().split(': ')[1].strip() 199 | return type, region, language, directors, created_at, updated_at, item_length, douban_rate, douban_comment_link, movie_content 200 | 201 | 202 | def construct_brief_json(*args, **kwargs): 203 | brief_info = {} 204 | brief_info["title"] = args[0][0] 205 | brief_info["year"] = args[0][1] 206 | brief_info["latest_update"] = args[0][2] 207 | brief_info["current"] = args[0][3] 208 | brief_info["total"] = args[0][4] 209 | brief_info["update_period"] = args[0][5] 210 | brief_info["special_list"] = args[0][6] 211 | brief_info["other_names"] = args[0][7] 212 | brief_info["actors"] = args[0][8] 213 | brief_info["header_img_link"] = args[0][9] 214 | brief_info["screenshot_link"] = args[0][10] 215 | return brief_info 216 | 217 | 218 | def construct_detail_json(*args, **kwargs): 219 | detail_info = {} 220 | detail_info["type"] = args[0][0] 221 | detail_info["region"] = args[0][1] 222 | detail_info["language"] = args[0][2] 223 | detail_info["directors"] = args[0][3] 224 | detail_info["created_at"] = args[0][4] 225 | detail_info["updated_at"] = args[0][5] 226 | detail_info["item_length"] = args[0][6] 227 | detail_info["douban_rate"] = args[0][7] 228 | detail_info["douban_comment_link"] = args[0][8] 229 | detail_info["movie_content"] = args[0][9] 230 | return detail_info 231 | 232 | 233 | def construct_final_json(resource_item, url_source): 234 | final_json = {} 235 | m = hashlib.md5() 236 | md5_string = ResourceSource._80s + '/' + url_source.split('/')[-1] 237 | m.update(md5_string.encode('utf-8')) 238 | 239 | if url_source.split('/')[-2] == 'movie': 240 | rtype = '电影' 241 | elif url_source.split('/')[-2] == 'ju': 242 | rtype = '电视剧' 
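# rtype labels used in this chain: 电影 = movie, 电视剧 = TV series, 动漫 = anime, 综艺 = variety show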
243 | elif url_source.split('/')[-2] == 'dm': 244 | rtype = '动漫' 245 | elif url_source.split('/')[-2] == 'zy': 246 | rtype = '综艺' 247 | else: 248 | rtype = '未知' 249 | final_json["rtype"] = rtype 250 | final_json["hash"] = m.hexdigest() 251 | final_json["url_source"] = url_source 252 | final_json["title"] = resource_item["title"] 253 | final_json["sub_title"] = resource_item["latest_update"] 254 | final_json["year"] = resource_item["year"] 255 | final_json["last_update_desc"] = resource_item["latest_update"] 256 | final_json["update_cycle"] = resource_item["update_period"] 257 | final_json["url_image_thumb"] = resource_item["header_img_link"] 258 | final_json["url_image"] = resource_item["header_img_link"] 259 | final_json["show_release_time"] = resource_item["created_at"] 260 | final_json["show_update_time"] = resource_item["updated_at"] 261 | final_json["score"] = resource_item["douban_rate"] 262 | final_json["actors"] = resource_item["actors"] 263 | final_json["directors"] = resource_item["directors"] 264 | final_json["areas"] = resource_item["region"] 265 | final_json["tags"] = resource_item["type"] 266 | final_json["langs"] = '' 267 | final_json["time_length"] = resource_item["item_length"] 268 | final_json["total"] = resource_item["total"] 269 | final_json["current"] = resource_item["current"] 270 | final_json["source"] = ResourceSource._80s 271 | final_json["summery"] = resource_item["movie_content"] 272 | if resource_item["screenshot_link"] == '': 273 | final_json["url_image_list"] = [''] 274 | else: 275 | final_json["url_image_list"] = [resource_item["screenshot_link"]] 276 | final_json["create_time"] = datetime.now().strftime('%Y-%m-%d %H:%M:%S') 277 | final_json["update_time"] = final_json["create_time"] 278 | 279 | return final_json 280 | 281 | 282 | def get_download_info(res, rtype, mark): 283 | # 多语言处理 284 | # 电影原始名称 电视 赛车总动员 4.2 G 需要处理 285 | row_title = [i.text() for i in res.doc('.nm > span').items()] 286 | center_title = [i.text() for i in res.doc('.nm a').items()] 287 | center_download_link = [i.attr.href for i in res.doc('.nm a').items()] 288 | # 格式化后的名称列表 289 | # 格式化后的剧集名称列表,有国语粤语之分,size, download_link 一起放在 for 里面处理 290 | resource_tag_item = {} 291 | download_list = [] 292 | for k, i in enumerate(row_title): 293 | size_re = re.search(r"\d*\.\d*.?[G|GB|M|MB]", i) 294 | episode_language_re = re.search(r"国语|粤语|日语|泰语|英语|韩语|德语|法语", i) 295 | if episode_language_re: 296 | episode_language = episode_language_re.group(0) 297 | else: 298 | episode_language = '正片' 299 | resource_tag_item = {} 300 | resource_tag_item["title"] = episode_language 301 | resource_tag_item["item_list"] = [] 302 | 303 | if rtype == 'dm' or rtype == 'ju': 304 | episode_number_re = re.search(r"第(\d+.*-*_*\d*)集", i) 305 | if episode_number_re: 306 | title = episode_number_re.group(0)[1:-1] 307 | else: 308 | title = center_title[k] 309 | elif rtype == 'movie' or rtype == 'zy': 310 | title = center_title[k] 311 | else: 312 | title = "" 313 | 314 | if size_re: 315 | size_re_group = size_re.group(0) 316 | if ' ' in size_re_group: 317 | size = ''.join(size_re_group.split(' ')) 318 | else: 319 | size = size_re_group 320 | else: 321 | size = '' 322 | 323 | link = center_download_link[k] 324 | 325 | item_list_item = {} 326 | item_list_item['title'] = title 327 | item_list_item['size'] = size 328 | item_list_item['url'] = link 329 | resource_tag_item["item_list"].append(item_list_item) 330 | download_list.append(resource_tag_item) 331 | 332 | download_list_final = [] 333 | language = [i['title'] for i in 
download_list] 334 | language = list(set(language)) 335 | for i in language: 336 | resource_tag_item = {} 337 | resource_tag_item['title'] = i 338 | resource_tag_item['item_list'] = [] 339 | download_list_final.append(resource_tag_item) 340 | 341 | for j in download_list: 342 | for i in download_list_final: 343 | if j['title'] == i['title']: 344 | item = {} 345 | item['title'] = j['item_list'][0]['title'] 346 | item['size'] = j['item_list'][0]['size'] 347 | item['url'] = j['item_list'][0]['url'] 348 | i['item_list'].append(item) 349 | 350 | # print('download_list_final') 351 | # print(download_list_final) 352 | # return download_list_final 353 | 354 | download_item_key = "url" + "_" + mark + "_download" 355 | download_json_final = {} 356 | download_json_final[download_item_key] = download_list_final 357 | print('download_json_final--------------') 358 | print(download_json_final) 359 | return download_json_final 360 | 361 | 362 | def write_to_mongodb(final_json, mark): 363 | print('========== final_json 只带有第一个下载信息 ==========') 364 | print(final_json) 365 | exist_record = ResourceRecord.objects( 366 | url_source=final_json["url_source"]).first() 367 | if not exist_record: 368 | # 新建 369 | record = create_or_update_record(exist_record, final_json) 370 | else: 371 | # 只保存有可能更新的信息和第一页下载信息 372 | exist_record = update_detail_download_info_to_mongodb(exist_record, 373 | mark, final_json) 374 | exist_record['sub_title'] = final_json['sub_title'] 375 | exist_record['last_update_desc'] = final_json['last_update_desc'] 376 | exist_record['current'] = final_json['current'] 377 | exist_record['total'] = final_json['total'] 378 | exist_record['update_time'] = datetime.now() 379 | exist_record.save() 380 | print( 381 | '========== 资源已存在,更新基本信息(副标题和最新更新描述 current 和 total)和第一页的剧集更新(如果有更新的话) ' 382 | + final_json['url_source'] + ' ==========') 383 | 384 | # 更新新的剧集下载信息进去 385 | def update_download_info_to_mongodb(final_json, mark, url): 386 | print('final_json 只带有下载信息的 final_json') 387 | print(final_json) 388 | print('mark ' + mark) 389 | url_source = url[:-5] 390 | print('url ' + url) 391 | exist_record = ResourceRecord.objects(url_source=url_source).first() 392 | if exist_record: 393 | url_has_downlaod = list(exist_record['url_has_downlaod']) 394 | if mark not in url_has_downlaod: 395 | exist_record['url_has_downlaod'].append(mark) 396 | exist_record['update_time'] = datetime.now() 397 | exist_record = update_detail_download_info_to_mongodb(exist_record, 398 | mark, final_json) 399 | 400 | 401 | # 更新下载信息 402 | def update_detail_download_info_to_mongodb(exist_record, mark, final_json): 403 | download_item_key = 'url' + '_' + mark + '_download' 404 | print('---------------------') 405 | print(final_json[download_item_key]) 406 | 407 | _list = [] 408 | for item1 in final_json[download_item_key]: 409 | tt_items = item1['item_list'] 410 | it1 = ResourceTagItem() 411 | it1.title = item1['title'] 412 | _list2 = [] 413 | for item2 in tt_items: 414 | _item = ResourceDownloadItem() 415 | for _k1, _v1 in item2.items(): 416 | _item.__setitem__(_k1, _v1) 417 | _list2.append(_item) 418 | it1.item_list = _list2 419 | _list.append(it1) 420 | exist_record.__setitem__(download_item_key, _list) 421 | exist_record.save() 422 | return exist_record 423 | 424 | 425 | # 生成随机 user-agent headers 426 | def generate_random_headers(): 427 | ua = UserAgent() 428 | headers = {"User-Agent": ua.random, "Host": "www.80s.tw"} 429 | return headers 430 | 431 | 432 | def get_mark(mark_text): 433 | mark_re = re.search(r"电视|平板|手机|小MP4", mark_text) 
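# The tab text names a device/size tier; it maps to a quality mark in the branches below:
# 电视 (TV) -> 'bt', 平板 (tablet) -> 'bd', 手机 (phone) -> 'hd', 小MP4 (small MP4) -> 'pt'.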
434 | mark = '' 435 | if mark_re: 436 | if mark_re.group(0) == '电视': 437 | mark = 'bt' 438 | elif mark_re.group(0) == '平板': 439 | mark = 'bd' 440 | elif mark_re.group(0) == '手机': 441 | mark = 'hd' 442 | elif mark_re.group(0) == '小MP4': 443 | mark = 'pt' 444 | return mark 445 | 446 | 447 | def create_or_update_record(record, resource): 448 | if not record: 449 | record = ResourceRecord() 450 | record.hash = resource['hash'] 451 | record.url_source = resource['url_source'] 452 | 453 | if not isinstance(record, ResourceRecord): 454 | return None 455 | if not isinstance(resource, dict): 456 | return None 457 | 458 | for key, value in resource.items(): 459 | if key in [ 460 | '_id', 'blocked', 'deleted', 'count_browser', 'count_download', 461 | 'feedback', 'hash', 'url_source' 462 | ]: 463 | continue 464 | if key in [ 465 | 'url_bt_download', 'url_bd_download', 'url_hd_download', 466 | 'url_pt_download' 467 | ]: 468 | _list = [] 469 | for item1 in value: 470 | tt_items = item1['item_list'] 471 | it1 = ResourceTagItem() 472 | it1.title = item1['title'] 473 | _list2 = [] 474 | for item2 in tt_items: 475 | _item = ResourceDownloadItem() 476 | for _k1, _v1 in item2.items(): 477 | _item.__setitem__(_k1, _v1) 478 | _list2.append(_item) 479 | it1.item_list = _list2 480 | _list.append(it1) 481 | record.__setitem__(key, _list) 482 | continue 483 | if key in ['create_time', 'update_time']: 484 | dt = datetime.strptime(value, '%Y-%m-%d %H:%M:%S') 485 | record.__setitem__(key, dt) 486 | continue 487 | record.__setitem__(key, value) 488 | record.save() 489 | return record 490 | --------------------------------------------------------------------------------
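As a closing usage note: the records written by write_to_mongodb / update_download_info_to_mongodb can be read back with mongoengine through the same "movie" alias that utils.py registers. A minimal read-side sketch, assuming the same local MongoDB settings as the `databases` list in utils.py; the query values and the ten-record limit are illustrative:

```python
from mongoengine import connect
from model.resource import ResourceRecord

# Same connection settings as the `databases` list in utils.py; ResourceRecord
# declares db_alias 'movie', so the alias must be registered before querying.
connect(alias='movie', db='movie', host='127.0.0.1', port=27017)

# Ten most recently updated movies ("电影" is the rtype that construct_final_json stores).
for record in ResourceRecord.objects(rtype='电影').order_by('-update_time')[:10]:
    print(record.title, record.score, record.url_source)
    # url_has_downlaod (sic, the field name as spelled in model/resource.py) lists
    # which quality marks were scraped, e.g. ['bt', 'hd'].
    print(record.url_has_downlaod)
    for tag in record.url_bt_download:
        # tag.title is the language/edition label; item_list holds the files/episodes.
        for item in tag.item_list:
            print(' ', tag.title, item.title, item.size, item.url)
```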