# -*- coding: utf-8 -*-
# @Author  : Huangcc
"""Breadth-first crawler for the file listing served at linux.linuxidc.com.

Starting from the site root, every listing page is fetched, its table rows
are split into folders (queued for a later visit) and files (appended to
``result.csv``).  The module runs unchanged on Python 2.7 and Python 3.
"""

import csv
import sys
from collections import deque

import requests
from lxml import etree


def _to_native_str(text):
    """Return *text* as the interpreter's native ``str`` type.

    On Python 2 a ``unicode`` value is encoded to UTF-8 bytes (what the
    original code did for the entry name); on Python 3 text is already
    ``str`` and is returned untouched.
    """
    if isinstance(text, str):
        return text
    return text.encode('utf-8')


def _open_csv_for_append(path):
    """Open *path* for appending in the mode the ``csv`` module expects."""
    if sys.version_info[0] >= 3:
        # Python 3: text mode, csv handles newlines itself, explicit UTF-8 so
        # non-ASCII names do not depend on the platform's locale encoding.
        return open(path, 'a', newline='', encoding='utf-8')
    # Python 2: csv writes byte strings and wants a binary file.
    return open(path, 'ab')


class LinuxidcSpider:
    """Crawl the folder tree under ``self.url`` and record every file found.

    Each queued/returned entry is a 5-tuple
    ``(url, name, image_src, file_type, update_time)`` where ``file_type`` is
    ``'folder'`` or ``'file'``.
    """

    def __init__(self):
        self.url = 'https://linux.linuxidc.com/'
        self.queues = deque()
        self.session = requests.Session()
        # Seed the queue with a synthetic "folder" entry for the site root.
        self.queues.append((self.url, 'init url', 'folder.png', 'folder', '2018-01-01 00:00'))

    def get_all_files(self):
        """Visit every reachable folder and append the files to ``result.csv``.

        Folders are processed in FIFO order.  A folder URL is queued at most
        once, so a listing that links back to an already-seen folder cannot
        make the crawl loop forever.
        """
        seen = set(entry[0] for entry in self.queues)
        while self.queues:
            url, name, _, _, _ = self.queues.popleft()
            print('searching {} with url {}'.format(name, url))
            lines = self.get_files_from_url(url)
            print(lines)

            file_lines = [line for line in lines if line[3] == 'file']
            folder_lines = []
            for line in lines:
                if line[3] != 'folder' or line[0] in seen:
                    continue
                seen.add(line[0])
                folder_lines.append(line)

            self.queues.extend(folder_lines)
            self.write_result_to_file(file_lines)

    @staticmethod
    def write_result_to_file(lines):
        """Append *lines* (an iterable of entry tuples) to ``result.csv``.

        Rows are written with ``csv.writer`` so that a field containing a
        comma or a quote is quoted instead of corrupting the row.  Rows are
        terminated with ``'\\n'``, as before.
        """
        with _open_csv_for_append('result.csv') as f:
            csv.writer(f, lineterminator='\n').writerows(lines)

    def get_files_from_url(self, url):
        """Fetch the listing page at *url* and return its entries.

        Returns a list of ``(url, name, image_src, file_type, update_time)``
        tuples, one per table row that contains exactly one ``td/div/a``
        link.  If the page cannot be fetched or parsed, a failure message is
        printed and an empty list is returned so the crawl can carry on.
        """
        print('getting files...')
        results = []

        try:
            resp = self.session.get(url, timeout=1)
        except requests.exceptions.RequestException:
            # Covers read/connect timeouts and connection errors alike; one
            # unreachable page must not abort the whole crawl.
            print('failed with url {}'.format(url))
            return results

        try:
            html = etree.HTML(resp.text)
        except etree.LxmlError:
            html = None
        if html is None:
            # Nothing parseable came back (e.g. an empty body).
            print('failed with url {}'.format(url))
            return results

        for tr in html.xpath('//tr'):
            entry = self._parse_row(tr)
            if entry is not None:
                results.append(entry)
        print('finishing getting files...')
        return results

    def _parse_row(self, tr):
        """Turn one ``<tr>`` element into an entry tuple, or ``None`` to skip it.

        A row is skipped when it does not hold exactly one ``td/div/a`` link
        or when any of the expected pieces (href, link text, icon, last-cell
        text) is missing.
        """
        anchors = tr.xpath('td/div/a')
        if len(anchors) != 1:
            return None
        divs = tr.xpath('td/div')
        try:
            link = _to_native_str(anchors[0].xpath('@href')[0])
            name = _to_native_str(anchors[0].xpath('text()')[0])
            image_src = _to_native_str(divs[0].xpath('img/@src')[0])
            update_time = _to_native_str(tr.xpath('td[last()]/text()')[0])
        except IndexError:
            return None

        # Links that are not absolute are taken to be relative to the site root.
        if not link.startswith('http'):
            link = self.url + link
        # The row's icon is what distinguishes a folder from a file.
        file_type = 'folder' if image_src.endswith('folder.png') else 'file'
        return (link, name, image_src, file_type, update_time)


if __name__ == '__main__':
    spider = LinuxidcSpider()
    spider.get_all_files()