├── 抓药品数据
│   ├── Readme.md
│   ├── symptom.py
│   ├── disease.py
│   ├── yaopintong.py
│   ├── medicamentANDorgThird.py
│   ├── yaopintong2.py
│   └── NL.py
├── Dict.py
├── SetBGPhoto.py
├── netspeed.py
├── tieba.py
├── netStat.py
└── downMp4OfJiKeXueYuan.py

/抓药品数据/Readme.md:
--------------------------------------------------------------------------------
1 | 抓取药品时,数据有几十万,创建了一个线程池,始终容纳固定量的线程
2 | 若某一个线程超时未完成任务,则自己退出,下一个线程进来
3 | 
4 | 这几个脚本都是同一个模式,要仿照请参照最成熟版本yaopintong2.py(文末另附一段用标准库线程池改写的示意代码)
--------------------------------------------------------------------------------
/Dict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #coding:utf-8
3 | import urllib
4 | import sys
5 | import re
6 | 
7 | if len(sys.argv) == 1: #没有单词就提示用法
8 |     print "用法:./Dict.py 要查找的单词"
9 |     sys.exit()
10 | 
11 | word = ""
12 | for x in range(len(sys.argv) - 1): #查找的可能是短语,中间有空格,如"join in",这里拼接单词
13 |     word += " " + sys.argv[x + 1]
14 | print "单词:" + word
15 | 
16 | searchUrl = "http://dict.youdao.com/search?q=" + word + "&keyfrom=dict.index" #查找的地址
17 | response = urllib.urlopen(searchUrl).read() #获得查找到的网页源码
18 | 
19 | #从网页源码提取出单词释义那一部分
20 | searchSuccess = re.search(r"(?s)<div class=\"trans-container\">\s*<ul>.*?</ul>",response)
21 | 
22 | if searchSuccess:
23 |     means = re.findall(r"(?m)<li>(.*?)
  • ",searchSuccess.group()) #获取我们想提取的核心单词释义 24 | print "释义:" 25 | for mean in means: 26 | print "\t" + mean #输出释义 27 | else: 28 | print "未查找到释义." -------------------------------------------------------------------------------- /SetBGPhoto.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | #encoding:utf-8 3 | import time 4 | import os 5 | import urllib 6 | import re 7 | import os 8 | 9 | html = urllib.urlopen("http://cn.bing.com/").read() 10 | 11 | imgAddress = re.search(r'g_img={url: "(.*?)"',html).group(1).replace('\\','') 12 | imgAddress = "http://cn.bing.com" + imgAddress 13 | 14 | 15 | if imgAddress: 16 | path = os.path.expanduser('~') + "/BingImg/" 17 | if os.path.exists(path) == False: 18 | os.makedirs(path) 19 | 20 | fileName = path + time.strftime("%Y-%m-%d") + ".jpg" 21 | print "今天Bing图片的地址是:" + imgAddress 22 | print "正在下载……" 23 | urllib.urlretrieve(imgAddress, fileName) 24 | print "下载完毕!" + "存储为" + fileName 25 | orderStr = "gsettings set org.gnome.desktop.background picture-uri \"file:" + fileName + "\"" 26 | os.system(orderStr) 27 | else: 28 | print "今天貌似出问题了……" 29 | -------------------------------------------------------------------------------- /netspeed.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | #coding:utf-8 3 | 4 | import urllib2,re,sys 5 | 6 | if len(sys.argv) < 3: 7 | print "用法:./netspeed 主机号 下载速度(kB/S)" 8 | sys.exit() 9 | 10 | ip = "192.168.1." + sys.argv[1] 11 | downSpeed = int(sys.argv[2]) * 8 12 | 13 | request = urllib2.Request('http://192.168.1.1/userRpm/QoSCfgRpm.htm?enable=true&start_ip_addr=' + ip + '&end_ip_addr=' + ip + '&min_up_band_width=0&max_up_band_width=0&min_down_band_width=0&max_down_band_width=' + str(downSpeed) + '&Save=%B1%A3+%B4%E6&curEditId=0&Page=1') 14 | request.add_header('Cookie','Authorization=Basic%20YWRtaW46aHVxaWFuZ3hp; ChgPwdSubTag=') 15 | request.add_header('Referer','http://192.168.1.1/userRpm/SystemStatisticRpm.htm?contType=1&sortType=4&Num_per_page=100&Goto_page=1') 16 | request.add_header('Upgrade-Insecure-Requests','1') 17 | request.add_header('User-Agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36') 18 | 19 | urllib2.urlopen(request) 20 | -------------------------------------------------------------------------------- /抓药品数据/symptom.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import urllib 3 | import re 4 | import threading 5 | 6 | i = 0 7 | def loop(): 8 | global i 9 | i += 1 10 | pageUrl = "http://zzk.xywy.com/" + str(i) + "_gaishu.html" 11 | request = urllib.urlopen(pageUrl) 12 | 13 | # 获得网页源码 14 | html = request.read() 15 | # 如果是404就退出 16 | if html == "404": 17 | print "404! 
url:" + pageUrl 18 | return 19 | # 获得title 20 | symptom = re.search(r"(.*?)",html) 21 | # 如果匹配到了title 22 | if symptom: 23 | # 打印症状和链接 24 | print symptom.group(1).decode("gbk").split(u"怎么办")[0] + " url:" + pageUrl 25 | # 写入文件 26 | f.write((symptom.group(1).decode("gbk").split(u"怎么办")[0] + " @f Nesymptom\n").encode("utf-8")) 27 | # 关闭请求 28 | request.close() 29 | 30 | f = open("/home/geekgao/symptom1",'w') 31 | 32 | while i < 6911: 33 | # 存储线程引用 34 | thirdList = [] 35 | # = 线程计数 36 | count = 0 37 | # 每次同时启用200个线程 38 | while count < 200: 39 | count += 1 40 | t = threading.Thread(target = loop, name = str(i)) 41 | t.start() 42 | thirdList.append(t) 43 | for t in thirdList: 44 | t.join() 45 | 46 | f.close() 47 | print "完成" -------------------------------------------------------------------------------- /抓药品数据/disease.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import urllib 3 | import re 4 | import threading 5 | 6 | i = 0 7 | def loop(): 8 | global i 9 | i += 1 10 | pageUrl = "http://jib.xywy.com/il_sii_" + str(i + 1) + ".htm" 11 | request = urllib.urlopen(pageUrl) 12 | 13 | # 获得网页源码 14 | html = request.read() 15 | # 获得title 16 | disease = re.search(r"(.*?)",html) 17 | # 如果匹配到了title 18 | if disease: 19 | # 打印病名和链接 20 | print disease.group(1).decode("gbk").split(",")[0] + " url:" + pageUrl 21 | # 如果是404就退出 22 | if re.match("^404",disease.group(1).decode("gbk").split(",")[0]): 23 | return 24 | # 写入文件 25 | f.write((disease.group(1).decode("gbk").split(",")[0] + " @f NeDisease\n").encode("utf-8")) 26 | # 关闭请求 27 | request.close() 28 | 29 | f = open("/home/geekgao/disease1",'w') 30 | 31 | while i < 10136: 32 | # 存储线程引用 33 | thirdList = [] 34 | # = 线程计数 35 | count = 0 36 | # 每次同时启用100个线程 37 | while count < 200: 38 | count += 1 39 | t = threading.Thread(target = loop, name = str(i)) 40 | t.start() 41 | thirdList.append(t) 42 | for t in thirdList: 43 | t.join() 44 | 45 | f.close() 46 | print "完成" -------------------------------------------------------------------------------- /tieba.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | # 百度贴吧签到,需要填充cookie才能使用 3 | from urllib.request import * 4 | import urllib.parse 5 | from time import * 6 | import re 7 | 8 | url = "http://tieba.baidu.com/f/like/mylike?v=" + str(int(time() * 1000)) 9 | request = Request(url) 10 | # cookie 11 | request.add_header("cookie","") 12 | 13 | response = urlopen(request) 14 | html = str(response.read(),'gbk') 15 | 16 | # 关注的贴吧html的table部分 17 | tableStr = re.search('.*?
    ',html).group(0) 18 | # 贴吧的所有url 19 | urls = re.findall('href="(/f\?kw.*?)"',tableStr) 20 | 21 | # # 遍历百度贴吧,发送签到请求 22 | for url in urls: 23 | tiebaName = urllib.parse.unquote(re.search('kw=(.*)',url).group(1),encoding = 'gbk') 24 | print(tiebaName) 25 | url = 'http://tieba.baidu.com' + url 26 | print(url) 27 | 28 | # 获取post需要的tbs参数 29 | thisHtml = str(urlopen(Request(url)).read(),'utf-8') 30 | tbs = re.search("tbs': \"(.*?)\"",thisHtml).group(1) 31 | print('tbs:' +tbs) 32 | 33 | data = { 34 | 'ie':'utf-8', 35 | 'kw':tiebaName, 36 | 'tbs':tbs 37 | } 38 | request = Request('http://tieba.baidu.com/sign/add',data = urllib.parse.urlencode(data).encode(),method = 'POST') 39 | print('POST的数据:' + urllib.parse.urlencode(data)) 40 | # cookie 41 | request.add_header("cookie","") 42 | response = urlopen(request) 43 | print('statusCode:' + str(response.status)) 44 | print() 45 | -------------------------------------------------------------------------------- /抓药品数据/yaopintong.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 抓取药品通的网站需要的数据,这个代码是检查哪些网页不是404.存储起来,在yaopintong2.py中进行抓取 3 | import urllib 4 | import re 5 | import threading 6 | import time 7 | import socket 8 | 9 | # 设置这么长时间超时 10 | socket.setdefaulttimeout(8) 11 | 12 | # 抓网页的地址起始数字 13 | i = 800000 14 | # 存储线程的个数 15 | thirdCount = 0 16 | 17 | # 处理抓取任务 18 | def loop(): 19 | global i,thirdCount,titleRegex,NLRegex 20 | i += 1 21 | # 当前网页的编号 22 | pageNum = i 23 | # 表示新线程启动了 24 | thirdCount += 1 25 | 26 | pageUrl = "http://wapypk.39.net/manual/" + str(pageNum) 27 | try: 28 | request = urllib.urlopen(pageUrl) 29 | except Exception, e: 30 | # 减少一个线程 31 | thirdCount -= 1 32 | return 33 | 34 | # 不正常就退出 35 | if request.getcode() != 200: 36 | print "不正常的页面:" + str(pageNum) + " 返回值:" + str(request.getcode()) 37 | # 关闭请求 38 | request.close() 39 | # 减少一个线程 40 | thirdCount -= 1 41 | return 42 | print "正常的页面:" + str(pageNum) 43 | 44 | f.write(pageUrl + '\n') 45 | # 关闭请求 46 | request.close() 47 | # 减少一个线程 48 | thirdCount -= 1 49 | 50 | startTime = time.time() 51 | f = open('/home/geekgao/1','a+') 52 | while i < 830000: 53 | num = i + 1 54 | # 线程要始终保持在50个 55 | if thirdCount < 50: 56 | print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) 57 | t = threading.Thread(target = loop, name = str(num) + "loopThird") 58 | t.start() 59 | time.sleep(0.001) 60 | 61 | thisStartTime = time.time() 62 | while thirdCount != 0: 63 | # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) 64 | if time.time() - thisStartTime > 10: 65 | print "等待时间到,强行退出." 66 | break 67 | print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" 68 | time.sleep(0.010) 69 | endTime = time.time() 70 | 71 | allTime = endTime - startTime 72 | f.close() 73 | print "完成!花费时间:" + str(allTime) + "s" -------------------------------------------------------------------------------- /netStat.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | #coding:utf-8 3 | 4 | import urllib2,re 5 | 6 | request = urllib2.Request('http://192.168.1.1/userRpm/SystemStatisticRpm.htm?contType=1&sortType=4&autoRefresh=2&Num_per_page=100&Goto_page=1') 7 | request.add_header('Cookie','Authorization=Basic%20YWRtaW46aHVxaWFuZ3hp; ChgPwdSubTag=') 8 | request.add_header('Referer','http://192.168.1.1/userRpm/SystemStatisticRpm.htm?contType=1&sortType=4&Num_per_page=100&Goto_page=1') 9 | request.add_header('Upgrade-Insecure-Requests','1') 10 | request.add_header('User-Agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36') 11 | 12 | htmlCode = urllib2.urlopen(request).read() 13 | resultStr = re.search(r'(?s)new Array\(\n(.*?)\n0,0 \);',htmlCode).group(1) 14 | 15 | # 字符串数组,每一项是一项主机记录 16 | computers = re.findall(r'(?m)^(.*?)$',resultStr) 17 | # 分别输出没一行记录 18 | i = 0 19 | for c in computers: 20 | # 计数 21 | i += 1 22 | print ('%3d:'%i), 23 | 24 | # ip 25 | ip = c.split('"')[1].split('"')[0] 26 | print ip + ' ', 27 | # front代表紧接着的下次分割时的字符串 28 | front = ip + '","' 29 | 30 | # mac 31 | mac = c.split(front)[1].split('"')[0] 32 | print mac + ' ', 33 | front = front + mac + '",' 34 | 35 | # 上传量(B) 36 | upSize = c.split(front)[1].split(',')[0] 37 | print ("[↓%8.2fMB "%(float(upSize) / 1024 / 1024)), 38 | front = front + upSize + ',' 39 | 40 | # 下载量(B) 41 | downSize = c.split(front)[1].split(',')[0] 42 | print ("↑%8.2fMB]\t"%(float(downSize) / 1024 / 1024)), 43 | front = front + downSize + ',' 44 | 45 | # 上传速度(B/s) 46 | up = c.split(front)[1].split(',')[0] 47 | print ("[↓%8.2fKB/s "%(float(up) / 1024)), 48 | front = front + up + ',' 49 | 50 | # 上传速度(B/s) 51 | down = c.split(front)[1].split(',')[0] 52 | print ("↑%8.2fKB/s]\t"%(float(down) / 1024)) 53 | -------------------------------------------------------------------------------- /抓药品数据/medicamentANDorgThird.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | # 最大453482 3 | import urllib 4 | import re 5 | import threading 6 | import time 7 | import socket 8 | 9 | # 设置这么长时间超时 10 | socket.setdefaulttimeout(10) 11 | 12 | # 抓网页的地址数字 13 | i = 30000 14 | # 存储线程的dict[序号:线程引用] 15 | thirdDict = {} 16 | 17 | # 处理抓取任务 18 | def loop(): 19 | global i,thirdDict 20 | i += 1 21 | key = i 22 | # 放入当前进程的引用 23 | thirdDict[key] = threading.current_thread() 24 | 25 | pageUrl = "http://yao.xywy.com/goods/" + str(i + 1) + ".htm" 26 | try: 27 | request = urllib.urlopen(pageUrl) 28 | except Exception, e: 29 | # 删除key-value 30 | thirdDict.pop(key) 31 | return 32 | 33 | try: 34 | # 获得网页源码 35 | html = request.read() 36 | except Exception, e: 37 | # 关闭请求 38 | request.close() 39 | # 删除key-value 40 | thirdDict.pop(key) 41 | return 42 | 43 | # 获得title 44 | medicament = re.search(r"(.*)?",html) 45 | org = re.search(r'生产企业.*?">(.*?)',html) 46 | # 如果匹配到了title和企业信息 47 | if medicament and org: 48 | # 如果是404就退出 49 | if medicament.group(1) == "": 50 | print "404! 
url:" + pageUrl 51 | # 关闭请求 52 | request.close() 53 | # 删除key-value 54 | thirdDict.pop(key) 55 | return 56 | # 打印药名和链接 57 | print medicament.group(1).decode("utf-8").split("(")[0] + " url:" + pageUrl 58 | # 写入文件 59 | medicamentF.write((medicament.group(1).decode("utf-8").split("(")[0] + " @f NeMedicament\n").encode("utf-8")) 60 | orgF.write((org.group(1).decode("utf-8") + " @f NeOrg\n").encode("utf-8")) 61 | # 关闭请求 62 | request.close() 63 | # 删除key-value 64 | thirdDict.pop(key) 65 | 66 | medicamentF = open("/home/geekgao/medicament",'w') 67 | orgF = open("/home/geekgao/org",'w') 68 | 69 | thisStartTime = time.time() 70 | while i < 453482: 71 | num = i 72 | # 线程要始终保持在50个 73 | if len(thirdDict) < 50: 74 | # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) 75 | if time.time() - thisStartTime > 10: 76 | print "等待时间到,强行退出." 77 | break 78 | print '新进程:' + str(num) + "loopThird" + "进程总数:" + str(len(thirdDict)) 79 | t = threading.Thread(target = loop, name = str(num) + "loopThird") 80 | # t = threading.Thread(target = thirdMonitor, name = str(num) + "thirdMonitor",args=(num,)) 81 | t.start() 82 | time.sleep(0.001) 83 | 84 | while len(thirdDict) != 0: 85 | time.sleep(0.001) 86 | medicamentF.close() 87 | orgF.close() 88 | print "完成" -------------------------------------------------------------------------------- /抓药品数据/yaopintong2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # 抓取药品通网站的数据,这里的链接是经yaopintong.py过滤后确实可用的链接 3 | import urllib 4 | import re 5 | import threading 6 | import time 7 | import socket 8 | 9 | # 设置这么长时间超时 10 | socket.setdefaulttimeout(8) 11 | 12 | # 进程计数,存储文件计数 13 | i = 0 14 | # 存储线程的个数 15 | thirdCount = 0 16 | # 匹配药品名称 17 | medicamentNameRegex = re.compile(u'(?s)通用名称:(.*?)<') 18 | # 匹配适应症状 19 | symptomRegex = re.compile(u'(?s)适应症:.*?

(.*?)<') 20 | # 匹配公司名称 21 | companyNameRegex = re.compile(u'(?s)企业名称:.*?<br>(.*?)<') 22 | # 匹配公司地址 23 | companyAddressRegex = re.compile(u'(?s)生产地址:.*?<br>(.*?)<') 24 | # 电话 25 | phoneNumRegex = re.compile(u'(?s)联系电话:.*?

    (.*?)<') 26 | 27 | # 处理抓取任务 28 | def loop(pageUrl): 29 | global i,thirdCount,medicamentNameRegex,symptomRegex,companyAddressRegex,companyNameRegex 30 | i += 1 31 | # 文件名用数字 32 | fNum = i; 33 | # 表示新线程启动了 34 | thirdCount += 1 35 | 36 | try: 37 | request = urllib.urlopen(pageUrl) 38 | except Exception, e: 39 | # 减少一个线程 40 | thirdCount -= 1 41 | return 42 | 43 | try: 44 | # 获得网页源码 45 | html = request.read().decode('gbk') 46 | except Exception, e: 47 | # 关闭请求 48 | request.close() 49 | # 减少一个线程 50 | thirdCount -= 1 51 | return 52 | 53 | # 正则匹配需要的数据 54 | medicamentName = medicamentNameRegex.search(html) 55 | symptom = symptomRegex.search(html) 56 | companyName = companyNameRegex.search(html) 57 | companyAddress = companyAddressRegex.search(html) 58 | phoneNum = phoneNumRegex.search(html) 59 | 60 | if medicamentName or symptom or companyName or companyAddress or phoneNum: 61 | f = open('/home/geekgao/data/' + str(fNum),'w') 62 | if medicamentName: 63 | f.write(medicamentName.group(1).encode('utf-8') + '\n') 64 | if symptom: 65 | f.write(symptom.group(1).encode('utf-8') + '\n') 66 | if companyName: 67 | f.write(companyName.group(1).encode('utf-8') + '\n') 68 | if companyAddress: 69 | f.write(companyAddress.group(1).encode('utf-8') + '\n') 70 | if phoneNum: 71 | f.write(phoneNum.group(1).encode('utf-8') + '\n') 72 | f.close() 73 | print pageUrl + '抓取成功!' 74 | else: 75 | print pageUrl + '抓取失败!' 76 | 77 | # 关闭请求 78 | request.close() 79 | # 减少一个线程 80 | thirdCount -= 1 81 | 82 | startTime = time.time() 83 | # 打开存储有需要抓取的网页链接的文件 84 | f = open('/home/geekgao/1','r') 85 | while True: 86 | num = i + 1 87 | # 线程要始终保持在50个 88 | if thirdCount <= 50: 89 | pageUrl = f.readline() 90 | # 读完了就退出循环 91 | if pageUrl == '': 92 | break 93 | print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) 94 | t = threading.Thread(target = loop, name = str(num) + " loopThird",args=(pageUrl,)) 95 | t.start() 96 | time.sleep(0.001) 97 | 98 | thisStartTime = time.time() 99 | while thirdCount != 0: 100 | # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) 101 | if time.time() - thisStartTime > 10: 102 | print "等待时间到,强行退出." 
103 | break 104 | print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" 105 | time.sleep(0.010) 106 | endTime = time.time() 107 | 108 | allTime = endTime - startTime 109 | f.close() 110 | print "完成!花费时间:" + str(allTime) + "s" -------------------------------------------------------------------------------- /抓药品数据/NL.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | import urllib 3 | import re 4 | import threading 5 | import time 6 | import socket 7 | 8 | # 设置这么长时间超时 9 | socket.setdefaulttimeout(10) 10 | 11 | # 抓网页的地址数字 12 | i = 0 13 | # 存储线程的个数 14 | thirdCount = 0 15 | # 获取title的正则表达式 16 | titleRegex = re.compile(r"(?s)(.*?)_") 17 | # 获取自然语言的正则表达式(中间会有<br>,在最后写入文件之前去掉) 18 | NLRegex = re.compile(r'(?s)<div class="pt15 f14 graydeep\s*pl20 pr20">(.*?)</div>') 19 | # 获取大概的问题,里面会有html标签 20 | generalQuestionRegex = re.compile(r'(?s)<div class="graydeep User_quecol pt10 mt10" id="qdetailc"(.*?)/div>') 21 | # 获取大概的问题中的文字,去除html标签 22 | accurateQuestionRegex = re.compile(r'(?s)>(.*?)<') 23 | # 删除字符串中的空白字符 24 | deleteSpaceRegex = re.compile(r'\s') 25 | # 删除<br> 26 | deleteBrRegex = re.compile(r'<br>') 27 | 28 | # 处理抓取任务 29 | def loop(): 30 | global i,thirdCount,titleRegex,NLRegex 31 | i += 1 32 | # 表示新线程启动了 33 | thirdCount += 1 34 | 35 | pageUrl = "http://club.xywy.com/static/1/" + str(i) + ".htm" 36 | try: 37 | request = urllib.urlopen(pageUrl) 38 | except Exception, e: 39 | # 减少一个线程 40 | thirdCount -= 1 41 | return 42 | 43 | try: 44 | # 获得网页源码 45 | html = request.read() 46 | except Exception, e: 47 | # 关闭请求 48 | request.close() 49 | # 减少一个线程 50 | thirdCount -= 1 51 | return 52 | 53 | # 获取title 54 | title = titleRegex.search(html) 55 | # 获取自然语言 56 | NL = NLRegex.findall(html) 57 | # 获取大概的问题,里面会有html标签 58 | generalQuestion = generalQuestionRegex.search(html) 59 | 60 | # 没有找到title就退出 61 | if title == None: 62 | # 关闭请求 63 | request.close() 64 | # 减少一个线程 65 | thirdCount -= 1 66 | return 67 | # 如果是404页面就退出 68 | if title.group(1).decode("gbk") == u"404页面": 69 | # 关闭请求 70 | request.close() 71 | # 减少一个线程 72 | thirdCount -= 1 73 | return 74 | print "url: " + pageUrl + " title:" + title.group(1).decode("gbk") 75 | 76 | # 获取大概的问题中的文字,去除html标签 77 | accurateQuestion = accurateQuestionRegex.findall(generalQuestion.group(1)) 78 | 79 | # 如果有人说的话 80 | if NL: 81 | # 打开文件 82 | NLFile = open('/home/geekgao/data/' + repr(time.time()),'w') 83 | # 写入文件的结果字符串(问题和回答) 84 | result = '' 85 | for x in accurateQuestion: 86 | result += x 87 | for x in NL: 88 | result += x 89 | # 删除空白字符 90 | result = deleteSpaceRegex.sub('',result) 91 | # 删除<br> 92 | result = deleteBrRegex.sub('',result) 93 | 94 | NLFile.write(result.decode("gbk").encode("utf-8")) 95 | # 关闭文件 96 | NLFile.close() 97 | # 关闭请求 98 | request.close() 99 | # 减少一个线程 100 | thirdCount -= 1 101 | 102 | 103 | 104 | startTime = time.time() 105 | while i < 100000: 106 | num = i 107 | # 线程要始终保持在50个 108 | if thirdCount < 50: 109 | print '【新进程】:' + str(num) + "loopThird" + "进程总数:" + str(thirdCount) 110 | t = threading.Thread(target = loop, name = str(num) + "loopThird") 111 | t.start() 112 | time.sleep(0.001) 113 | 114 | thisStartTime = time.time() 115 | while thirdCount != 0: 116 | # 等待超时就退出(没有这个有时候线程并不能全部退出,看资源管理器,说“等候频道 poll_scheme_time”) 117 | if time.time() - thisStartTime > 10: 118 | print "等待时间到,强行退出." 
119 | break 120 | print "等待线程全部结束!还有" + str(thirdCount) + "个线程在工作" 121 | time.sleep(0.010) 122 | endTime = time.time() 123 | 124 | allTime = endTime - startTime 125 | print "完成!花费时间:" + str(allTime) + "s" -------------------------------------------------------------------------------- /downMp4OfJiKeXueYuan.py: -------------------------------------------------------------------------------- 1 | # !/usr/bin/python 2 | # coding:utf-8 3 | 4 | import urllib, os, urllib2, cookielib, re 5 | 6 | # 下载极客学院的视频 7 | # 需要一个vip账号(验证邮箱和手机会有体验vip) 8 | class DownCourse(object): 9 | # 给urllib2添加cookie支持 10 | # path: 下载的视频要保存的文件夹 11 | def __init__(self,path): 12 | # 初始化一个CookieJar来处理Cookie 13 | cookieJar = cookielib.CookieJar() 14 | # 实例化一个全局opener 15 | opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookieJar)) 16 | # 把这个cookie处理机制装上去,大概是这个意思-.- 17 | urllib2.install_opener(opener) 18 | 19 | self.folderPath = path 20 | # 判断文件夹是否存在 21 | folderExists = os.path.exists(self.folderPath) 22 | if not folderExists: 23 | os.mkdir(self.folderPath) 24 | 25 | # 登陆函数 26 | def login(self): 27 | # 从登录页面获取登陆参数 28 | login_url = 'http://passport.jikexueyuan.com/sso/login' 29 | # 登陆信息发送到这个地址 30 | passport_url = 'http://passport.jikexueyuan.com/submit/login?is_ajax=1' 31 | verifyCode_url = 'http://passport.jikexueyuan.com/sso/verify' 32 | 33 | # 获取登陆页面源码 34 | request = urllib2.urlopen(login_url) 35 | html = request.read() 36 | request.close() 37 | 38 | # 获取登陆要post的数据 39 | expire = re.search(r"(?s)value='(.*?)' name='expire",html) 40 | # 验证码 41 | verifyCodeGifPath = '/tmp/jikexueyuan.gif' 42 | request = urllib2.urlopen(verifyCode_url) 43 | gif = request.read() 44 | request.close() 45 | fGif = open(verifyCodeGifPath,'w') 46 | fGif.write(gif) 47 | fGif.close() 48 | # 读取保存到本地的验证码图片 49 | os.system('eog ' + verifyCodeGifPath) 50 | verify = raw_input("请输入图中的验证码:") 51 | 52 | data = { 53 | 'expire': expire.group(1), 54 | 'referer': 'http%3A%2F%2Fwww.jikexueyuan.com%2F', 55 | 'uname': 用户名, 56 | 'password': 密码, 57 | 'verify': verify, 58 | } 59 | post_data = urllib.urlencode(data) 60 | 61 | request = urllib2.Request(passport_url,post_data) 62 | # 给一个useragent,防止被认为是爬虫程序 63 | request.add_header('User-Agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.125 Safari/537.36') 64 | # 发送登录请求 65 | request = urllib2.urlopen(request) 66 | request.close() 67 | print '登陆完成' 68 | 69 | # courseUrl: 课程地址首页,例如:http://www.jikexueyuan.com/course/989.html 70 | def download(self, courseUrl): 71 | # 获取课程名称 72 | request = urllib2.urlopen(courseUrl) 73 | coursePageHtml = request.read() 74 | request.close() 75 | courseName = re.search(r'(?s)<title>(.*?)-',coursePageHtml).group(1) 76 | # 课程数量 77 | courseCount = int(re.search(r'(?s)class="timebox"><span>(.*?)课时',coursePageHtml).group(1)) 78 | # 存储视频的文件夹路径 79 | folderPath = self.folderPath + courseName + '/' 80 | # 判断文件夹是否存在 81 | folderExists = os.path.exists(folderPath) 82 | if not folderExists: 83 | os.mkdir(folderPath) 84 | 85 | print '课程名:' + courseName + ' 课程数量:' + str(courseCount) 86 | # 课程的编号,构建课程的页面地址 87 | i = 0 88 | while i < courseCount: 89 | i += 1 90 | pageUrl = courseUrl.split('.html')[0] + '_' + str(i) + '.html?ss=1' 91 | # 本节课程的html代码 92 | request = urllib2.urlopen(pageUrl) 93 | pageHtml = request.read() 94 | request.close() 95 | # 本节课程的名称 96 | name = re.search(r'(?s)<title>(.*?)-',pageHtml).group(1) 97 | # 本节课程的视频地址 98 | videoUrl = re.search(r'<source src="(.*?)"',pageHtml) 99 | # 有的页面写的课时比实际课时多,会匹配不到视频地址 100 | if videoUrl == None: 101 | continue 
102 | else: 103 | videoUrl = videoUrl.group(1) 104 | print '正在下载' + name + '...' 105 | # 存储视频的Path: 总路径/课程名/每一节的名称 106 | urllib.urlretrieve(videoUrl,folderPath + str(i) + name + '.mp4',self.cbk) 107 | print '下载完成' 108 | 109 | # 从网上下载的可以显示下载进度的函数 110 | # \b是我加的,产生了很奇特的显示效果,还行 111 | def cbk(self,a, b, c): 112 | '''回调函数 113 | @a: 已经下载的数据块 114 | @b: 数据块的大小 115 | @c: 远程文件的大小 116 | ''' 117 | per = 100.0 * a * b / c 118 | if per > 100: 119 | per = 100 120 | print '%.2f%%\b\b\b\b\b\b' % per, 121 | 122 | # 建立下载对象,参数是即将下载的这些视频放的目录,程序会根据课程名在这个文件夹里面再建文件夹 123 | down = DownCourse('/home/geekgao/视频/SpringMVC/') 124 | down.login() 125 | 126 | # 下载一个页面中的所有课程 127 | request = urllib2.urlopen('http://www.jikexueyuan.com/course/springmvc/') 128 | html = request.read() 129 | request.close() 130 | courseUrls = re.findall(r'class="lesson-info-h2"><a href="(.*?)"',html) 131 | 132 | for courseUrl in courseUrls: 133 | down.download(courseUrl) 134 | --------------------------------------------------------------------------------
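
附注(编辑补充):Readme 里描述的"固定数量线程 + 超时自动退出"模式,在 yaopintong.py / yaopintong2.py 中是靠手工维护 thirdCount 计数器加忙等循环实现的。下面是一段示意代码,用 Python 3 标准库的 concurrent.futures 线程池改写同样的筛选步骤;它不是仓库中的文件,其中线程数 50、超时 8 秒、编号区间 800001~830000、页面地址和输出文件 /home/geekgao/1 均取自 yaopintong.py。

# 编辑补充的示意代码(非仓库文件),假设运行环境为 Python 3
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.request import urlopen

def check_page(num):
    # 访问一个药品页面,返回 (编号, 是否为 200);超时交给 urlopen 的 timeout 参数
    url = "http://wapypk.39.net/manual/" + str(num)
    try:
        with urlopen(url, timeout=8) as resp:  # 8 秒未完成就抛异常,相当于原脚本里线程超时自己退出
            return num, resp.getcode() == 200
    except Exception:
        return num, False

def main():
    # max_workers=50:线程池始终最多保持 50 个工作线程,对应原脚本手工维护的 thirdCount < 50
    with ThreadPoolExecutor(max_workers=50) as pool, open("/home/geekgao/1", "a+") as f:
        futures = [pool.submit(check_page, n) for n in range(800001, 830001)]
        for fut in as_completed(futures):
            num, ok = fut.result()
            if ok:
                f.write("http://wapypk.39.net/manual/%d\n" % num)

if __name__ == "__main__":
    main()

这样写的好处:线程数量由 ThreadPoolExecutor 自动控制,不再需要手工加减 thirdCount,也不需要额外的忙等和强行退出逻辑;结果统一在主线程里写文件,避免了原脚本中多个线程不加锁地写同一个文件的问题。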