├── 抓药品数据 ├── Readme.md ├── symptom.py ├── disease.py ├── yaopintong.py ├── medicamentANDorgThird.py ├── yaopintong2.py └── NL.py ├── Dict.py ├── SetBGPhoto.py ├── netspeed.py ├── tieba.py ├── netStat.py └── downMp4OfJiKeXueYuan.py /抓药品数据/Readme.md: -------------------------------------------------------------------------------- 1 | 抓取药品时,数据有几十万,创建了一个线程池,始终容纳固定量的线程 2 | 若某一个线程超时未完成任务,则自己退出,下一个线程进来 3 | 4 | 这些都是一个模式,要仿照,请参照最成熟版本yaopintong2.py -------------------------------------------------------------------------------- /Dict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #coding:utf-8 3 | import urllib 4 | import sys 5 | import re 6 | 7 | if len(sys.argv) == 1: #没有单词就提示用法 8 | print "用法:./Dict.py 要查找的单词" 9 | sys.exit() 10 | 11 | word = "" 12 | for x in range(len(sys.argv) - 1): #查找的可能是短语,中间有空格,如"join in",这里拼接单词 13 | word += " " + sys.argv[x + 1] 14 | print "单词:" + word 15 | 16 | searchUrl = "http://dict.youdao.com/search?q=" + word + "&keyfrom=dict.index" #查找的地址 17 | response = urllib.urlopen(searchUrl).read() #获得查找到的网页源码 18 | 19 | #从网页源码提取出单词释义那一部分 20 | searchSuccess = re.search(r"(?s)
(.*?)<') 20 | # 匹配公司名称 21 | companyNameRegex = re.compile(u'(?s)企业名称:.*?
(.*?)<') 22 | # 匹配公司地址 23 | companyAddressRegex = re .compile(u'(?s)生产地址:.*?
(.*?)<') 24 | # 电话 25 | phoneNumRegex = re.compile(u'(?s)联系电话:.*?
(.*?)<') 26 | 27 | # 处理抓取任务 28 | def loop(pageUrl): 29 | global i,thirdCount,medicamentNameRegex,symptomRegex,companyAddressRegex,companyNameRegex 30 | i += 1 31 | # 文件名用数字 32 | fNum = i; 33 | # 表示新线程启动了 34 | thirdCount += 1 35 | 36 | try: 37 | request = urllib.urlopen(pageUrl) 38 | except Exception, e: 39 | # 减少一个线程 40 | thirdCount -= 1 41 | return 42 | 43 | try: 44 | # 获得网页源码 45 | html = request.read().decode('gbk') 46 | except Exception, e: 47 | # 关闭请求 48 | request.close() 49 | # 减少一个线程 50 | thirdCount -= 1 51 | return 52 | 53 | # 正则匹配需要的数据 54 | medicamentName = medicamentNameRegex.search(html) 55 | symptom = symptomRegex.search(html) 56 | companyName = companyNameRegex.search(html) 57 | companyAddress = companyAddressRegex.search(html) 58 | phoneNum = phoneNumRegex.search(html) 59 | 60 | if medicamentName or symptom or companyName or companyAddress or phoneNum: 61 | f = open('/home/geekgao/data/' + str(fNum),'w') 62 | if medicamentName: 63 | f.write(medicamentName.group(1).encode('utf-8') + '\n') 64 | if symptom: 65 | f.write(symptom.group(1).encode('utf-8') + '\n') 66 | if companyName: 67 | f.write(companyName.group(1).encode('utf-8') + '\n') 68 | if companyAddress: 69 | f.write(companyAddress.group(1).encode('utf-8') + '\n') 70 | if phoneNum: 71 | f.write(phoneNum.group(1).encode('utf-8') + '\n') 72 | f.close() 73 | print pageUrl + '抓取成功!' 74 | else: 75 | print pageUrl + '抓取失败!' 76 | 77 | # 关闭请求 78 | request.close() 79 | # 减少一个线程 80 | thirdCount -= 1 81 | 82 | startTime = time.time() 83 | # 打开存储有需要抓取的网页链接的文件 84 | f = open('/home/geekgao/1','r') 85 | while True: 86 | num = i + 1 87 | # 线程要始终保持在50个 88 | if thirdCount <= 50: 89 | pageUrl = f.readline() 90 | # 读完了就退出循环 91 | if pageUrl == '': 92 | break 93 | print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) 94 | t = threading.Thread(target = loop, name = str(num) + " loopThird",args=(pageUrl,)) 95 | t.start() 96 | time.sleep(0.001) 97 | 98 | thisStartTime = time.time() 99 | while thirdCount != 0: 100 | # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) 101 | if time.time() - thisStartTime > 10: 102 | print "等待时间到,强行退出." 103 | break 104 | print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" 105 | time.sleep(0.010) 106 | endTime = time.time() 107 | 108 | allTime = endTime - startTime 109 | f.close() 110 | print "完成!花费时间:" + str(allTime) + "s" -------------------------------------------------------------------------------- /抓药品数据/NL.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import urllib 3 | import re 4 | import threading 5 | import time 6 | import socket 7 | 8 | # 设置这么长时间超时 9 | socket.setdefaulttimeout(10) 10 | 11 | # 抓网页的地址数字 12 | i = 0 13 | # 存储线程的个数 14 | thirdCount = 0 15 | # 获取title的正则表达式 16 | titleRegex = re.compile(r"(?s)