"""GetNumber: get all the Chinese phone numbers from the web.

This is the ``pachong.py`` script of the GetNumber repository.  A walkthrough
of the code is available on the author's blog:
https://www.itnote.tech/2019/11/01/%E7%88%AC%E5%8F%96%E4%B8%8A%E6%B5%B7%E5%B8%82%E7%9A%84%E6%89%80%E6%9C%89%E7%94%B5%E8%AF%9D%E5%8F%B7%E7%A0%81/

The URLs used below point at the Shanghai pages of so.qqdna.com.  The segment
index page is scraped first, then one page per segment, and the numbers found
for each segment are written to ``<segment>.txt`` (relative path).
"""
import re

# Browser-like request headers shared by both scraper classes.
_DEFAULT_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/66.0.3359.117 Safari/537.36'
}

# Seconds to wait for the server before giving up; without a timeout
# ``requests.get`` can block forever on a stalled connection.
_REQUEST_TIMEOUT = 10

# The pages are decoded as GB2312 instead of trusting the response headers.
_PAGE_ENCODING = 'gb2312'

# A '1' followed by ten more digits (eleven digits in total).
_PHONE_NUMBER_RE = re.compile(r'[1][0-9][0-9]{9}')

# Any run of digits that starts with '1'.
_SEGMENT_RE = re.compile(r'[1][0-9]+')


def _fetch_soup(url, headers):
    """Download *url* and return it parsed as a ``BeautifulSoup`` document.

    Args:
        url: Address of the page to download.
        headers: HTTP headers to send with the request.

    Returns:
        The page parsed with the ``lxml`` parser.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # Imported here rather than at module level so that the regex helpers
    # stay importable (and testable) without the scraping dependencies.
    import requests
    from bs4 import BeautifulSoup

    res = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    res.encoding = _PAGE_ENCODING
    return BeautifulSoup(res.text, 'lxml')


class GetPhoneNumber:
    """Scrape the phone numbers listed for one number segment."""

    def __init__(self):
        # Copied so that changing one instance's headers does not affect others.
        self.headers = dict(_DEFAULT_HEADERS)

    def get_phone_num(self, seg_phone_num: str) -> str:
        """Return the numbers listed for *seg_phone_num*, one per line.

        Args:
            seg_phone_num: Segment identifier, inserted verbatim into the
                page URL.

        Returns:
            The numbers found on the segment page, separated by newlines.
        """
        url = 'http://so.qqdna.com/mobile/shanghai_' + seg_phone_num + '.html'
        soup = _fetch_soup(url, self.headers)
        # Only the first cell of the first row of the first table is scanned.
        return self.filter_data(soup.body.table.tr.td)

    @staticmethod
    def filter_data(original_data: object) -> str:
        """Extract phone numbers from the string form of *original_data*.

        Args:
            original_data: Any object; it is converted with ``str`` before
                being searched.

        Returns:
            Every eleven-digit match after the first two, joined with
            newlines, or an empty string when fewer than three matches exist.
        """
        matches = _PHONE_NUMBER_RE.findall(str(original_data))
        # The first two matches are skipped -- presumably they are not part
        # of the listing itself; verify against the live page.
        return '\n'.join(matches[2:])


class GetPhoneSegment:
    """Scrape the list of number segments from the index page."""

    def __init__(self):
        # Copied so that changing one instance's headers does not affect others.
        self.headers = dict(_DEFAULT_HEADERS)
        self.url = 'http://so.qqdna.com/city/shanghai/shanghai.php'

    def get_seg_num(self) -> list:
        """Return the unique segments found in the ``<li>`` items of the page."""
        soup = _fetch_soup(self.url, self.headers)
        return self.filter_data(soup.find_all('li'))

    @staticmethod
    def filter_data(original_data: object) -> list:
        """Extract unique segment numbers from the string form of *original_data*.

        Args:
            original_data: Any object; it is converted with ``str`` before
                being searched.

        Returns:
            The matched digit strings without duplicates, in order of first
            appearance.
        """
        seen = set()
        unique_segments = []
        for segment in _SEGMENT_RE.findall(str(original_data)):
            if segment not in seen:
                seen.add(segment)
                unique_segments.append(segment)
        return unique_segments


def main():
    """Fetch every segment and write its numbers to ``<segment>.txt``."""
    seg_num_list = GetPhoneSegment().get_seg_num()
    get_num = GetPhoneNumber()
    print("号码段数量:" + str(len(seg_num_list)))
    for seg_num in seg_num_list:
        data = get_num.get_phone_num(seg_num)
        # 'w' overwrites an existing file ('a' would append instead).
        with open(seg_num + '.txt', 'w', encoding='utf-8') as fo:
            fo.write(data)


if __name__ == '__main__':
    main()