├── DrivingSubject.py
├── News.py
├── README.md
└── images
    └── ResJson.png

/DrivingSubject.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Jiakaobaodian (驾考宝典) question-bank crawler, lsido.com
# Fetches every kemu1 / kemu4 question for each vehicle type, with update support
import requests
import json
import sys
import MySQLdb
reload(sys)
sys.setdefaultencoding('utf-8')

# GET helper: returns the decoded body, or "" on any failure
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""

# Open the database connection, with basic error reporting
try:
    conn = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='root', db='test', charset='utf8')
    cur = conn.cursor()
except Exception, e:
    print 'An error occurred: %s' % e

# Map the console arguments to a car type and its licence classes
if sys.argv[1] == '小车':
    carType = 'car'
    chexing = 'C1C2C3C4'
elif sys.argv[1] == '货车':
    carType = 'truck'
    chexing = 'A2B2'
elif sys.argv[1] == '客车':
    carType = 'bus'
    chexing = 'A1A3B1'
elif sys.argv[1] == '摩托车':
    carType = 'moto'
    chexing = 'moto'
else:
    carType = 'car'
    chexing = 'C1C2C3C4'
kemu = sys.argv[2]

# Fetch the question-ID list for the requested subject
if sys.argv[2] == '科目一':
    course = 'kemu1'
else:
    course = 'kemu3'
r = getHTMLText('http://api2.jiakaobaodian.com/api/open/exercise/sequence.htm?_r=17801703702540802070&cityCode=511300&page=1&limit=25&course=' + course + '&carType=' + str(carType) + '&_=0.7121756361682074')
pjson = json.loads(r)
data = pjson['data']
a = 0
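# Note (inferred from the responses this loop handles): the API's 'answer'
# field looks like a flag code -- 16 = option A, 32 = B, 64 = C, anything
# else is treated as D. For true/false questions optionA is "正确", so an
# answer of 32 means the statement is false, and optionType == 2 appears
# to mark multiple-answer questions.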
# Walk the ID list and pull each question record from Jiakaobaodian
for i in data:
    qurl = 'http://api2.jiakaobaodian.com/api/open/question/question-list.htm?_r=19604815519963578102&page=1&limit=25&questionIds=' + str(i)
    # The API occasionally fails or returns an empty list, so retry a few times
    jlist = []
    for attempt in range(4):
        try:
            pjson = json.loads(getHTMLText(qurl))
            jlist = pjson['data']
        except:
            jlist = []
        if len(jlist) > 0:
            break

    # Insert into the database
    for k in jlist:
        # Does the table already hold this question at all?
        idCount = cur.execute("select * from yourtable where questionId = " + str(i))
        # ...and if so, does its stored vehicle-type list include the current type?
        CxCount = cur.execute("select * from yourtable where questionId = " + str(i) + " and chexing like '%" + str(chexing) + "%'")
        if idCount == 0:
            if k['optionA'] == "正确":
                # True/false question: answer code 32 means "false"
                if k['answer'] == 32:
                    answer = "错误,"
                else:
                    answer = "正确,"
                imgUrl = k.get('mediaContent', '0')
                explain = k.get('explain', '0')
                sql = "INSERT INTO `yourtable` (`chexing`, `kemu`, `zhangjieId`, `id`, `type`, `area1`, `area2`, `area3`, `area4`, `area5`, `region`, `title`, `content`, `imagesPath`, `musicPath`, `daan`, `score`, `tikuIndex`, `jieshi`, `beiyong`, `imagesWidth`, `imagesHeight`, `A`, `B`, `C`, `D`, `title2`, `xuanxiang2`, `changjingshifan1`, `changjingshifan2`, `questionId`) VALUES (%s, %s, '0', NULL, '判断题', NULL, NULL, NULL, NULL, NULL, NULL, %s, NULL, %s, NULL, %s, '1', '1', %s, '', NULL, NULL, NULL, NULL, NULL, NULL, '', '', '', '', %s);"
                reCount = cur.execute(sql, (chexing, kemu, k['question'], imgUrl, answer, explain, k['questionId']))
                conn.commit()
                print '判断题: ' + str(k['question']) + ' -- inserted into the database'
            else:
                # Choice question: map the answer code to a letter
                if k['answer'] == 16:
                    answer = "A,"
                elif k['answer'] == 32:
                    answer = "B,"
                elif k['answer'] == 64:
                    answer = "C,"
                else:
                    answer = "D,"
                imgUrl = k.get('mediaContent', '0')
                explain = k.get('explain', '0')
                if k['optionType'] == 2:
                    atype = '多选题'
                else:
                    atype = '单选题'
                sql = "INSERT INTO `yourtable` (`chexing`, `kemu`, `zhangjieId`, `id`, `type`, `area1`, `area2`, `area3`, `area4`, `area5`, `region`, `title`, `content`, `imagesPath`, `musicPath`, `daan`, `score`, `tikuIndex`, `jieshi`, `beiyong`, `imagesWidth`, `imagesHeight`, `A`, `B`, `C`, `D`, `title2`, `xuanxiang2`, `changjingshifan1`, `changjingshifan2`, `questionId`) VALUES (%s, %s, '0', NULL, %s, NULL, NULL, NULL, NULL, NULL, NULL, %s, NULL, %s, NULL, %s, '1', '1', %s, '1', '1', '1', %s, %s, %s, %s, '', '', '', '', %s);"
                reCount = cur.execute(sql, (chexing, kemu, atype, k['question'], imgUrl, answer, explain, k['optionA'], k['optionB'], k['optionC'], k['optionD'], k['questionId']))
                conn.commit()
                print str(atype) + ': ' + str(k['question']) + ' -- inserted into the database'
        else:
            if CxCount == 1:
                print str(i) + ': already in the database for this vehicle type'
            else:
                # Known question, new vehicle type: append it to the stored list
                sql = "update `yourtable` set chexing = CONCAT(`chexing`, %s) where questionId = %s"
                reCount = cur.execute(sql, (chexing, i))
    a = a + 1
    print 'Records completed: ' + str(a)
    conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------

/News.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Sohu news crawler, lsido.com
import requests
from bs4 import BeautifulSoup
import sys
import MySQLdb
import json
reload(sys)
sys.setdefaultencoding('utf8')

# GET helper for article pages: returns the decoded body, or "" on failure
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'gbk'
        return r.text
    except:
        return ""

# Parse the article body out of a news detail page
def getContent(url):
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(attrs={"itemprop": "headline"})
    paras = soup.find_all(attrs={"itemprop": "articleBody"})
    [s.extract() for s in soup('script')]
    [s.extract() for s in soup('style')]
    [s.extract() for s in soup.select("div.new_hot1")]
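    # The three extract() passes above strip <script>/<style> tags and Sohu's
    # "new_hot1" related-links box, so only the article markup itself is
    # returned below.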
    soup.prettify()
    for para in paras:
        if len(para) > 0:
            return para

# Walk a news list page and insert every article into the database
def getNewsList(url):
    wbdata = requests.get(url, timeout=30)
    wbdata.raise_for_status()
    wbdata.encoding = 'gbk'
    soup = BeautifulSoup(wbdata.text, 'html.parser')
    # Sohu's roll pages mark article links with a test="a" attribute
    news_titles = soup.find_all(attrs={"test": "a"})
    acount = 0
    for n in news_titles:
        title = n.get_text()
        link = n.get("href")
        try:
            cont = getContent(link)
            html_escaped = MySQLdb.escape_string(cont.encode('utf-8'))
            myc = HttpPost(html_escaped)
        except Exception, e:
            print 'An error occurred: %s' % e
            continue  # skip this article, otherwise myc is undefined below
        try:
            sql = 'your SQL statement here'
            reCount = cur.execute(sql, (title, myc))
            conn.commit()
        except Exception as e:
            print e
            conn.rollback()
        acount = acount + 1
        print('Title: ' + title + ' -- inserted into the database, total: ' + str(acount))

# Generate the news list page URLs
def getNewsPage():
    page = range(940, 944)
    for n in reversed(page):
        n = n - 1
        url = 'http://wei.sohu.com/roll/index_' + str(n) + '.shtml'
        getNewsList(url)

# News "pseudo-original" rewriting. This could be folded into the Python
# script, at some cost: POSTing to a local PHP script measured about 20%
# faster than doing the rewrite in Python (test environment: macOS).
def HttpPost(content):
    url = 'http://localhost/words.php'
    d = {'content': content}
    r = requests.post(url, data=d)
    hjson = json.loads(r.text)
    return hjson['content']

# Entry point
def main():
    getNewsPage()

# Database setup
conn = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='root', db='article', charset='utf8')
cur = conn.cursor()
main()
conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# DrivingSubject.py

(2017.11.16)

A Python 2.7 crawler that fetches every question in the Jiakaobaodian (驾考宝典) driving-test question bank.

With the dependencies installed, run it from the console:
```
python DrivingSubject.py 小车 科目一
```

It detects updates by checking each question against the database, so on Linux you can drop it into crontab for scheduled runs.

See the code for details.

Notes:

Jiakaobaodian endpoint for listing question IDs:
```
http://api2.jiakaobaodian.com/api/open/question/list-by-tag.htm?_r=111922017237088616081&cityCode=511300&page=1&limit=25&course=kemu1&tagId=2&carType=car&_=0.5066246786512065
```

Endpoint for reading a question by ID:
```
http://api2.jiakaobaodian.com/api/open/question/question-list.htm?_r=19604815519963578102&page=1&limit=25&questionIds=909400
```
The JSON it returns looks like this:
![Image text](https://raw.githubusercontent.com/Lsido/PythonScript/master/images/ResJson.png)


# News.py

A single-threaded Python 2.7 crawler that walks the Sohu news list and stores the articles in a database.

News list: http://wei.sohu.com/roll/ -- about 100 pages at 40 articles each, roughly 4,000 articles in total.

Install the dependencies first:
```
pip install requests
pip install beautifulsoup4
pip install lxml
pip install MySQL-python
```

The "pseudo-original" rewrite step is not built into the Python script; you can wire it in yourself.
* Pseudo-original rewrite
The PHP side:

```
function str_reWords($str)
{
    $words = array();
    $content = file_get_contents('词库.txt');
    $content = str_replace("\r", "", $content);
    $content = preg_split('/\n/', $content, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($content as $k => $v)
    {
        if ($k != 0)
        {
            $str_data = explode('_', $v);
            $words += array("$str_data[0]" => "$str_data[1]");
        }
    }
    return strtr($str, $words);
}
die(json_encode(array('content' => str_reWords($_POST['content']))));
```
Word-bank file format, one original_replacement pair per line:
```
善良_善意
好人_不坏的人
```
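If you would rather keep the rewrite inside Python, a minimal sketch of the same word-bank substitution might look like this (it assumes the 词库.txt format shown above; `load_words` and `rewrite` are illustrative names, not part of the scripts in this repo):
```
# coding=utf-8
# Minimal sketch of the PHP str_reWords() substitution, assuming one
# "original_replacement" pair per line of the word bank.
import io

def load_words(path='词库.txt'):
    words = {}
    with io.open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if '_' in line:
                old, new = line.split('_', 1)
                words[old] = new
    return words

def rewrite(text, words):
    # Note: PHP's strtr() replaces in a single pass; sequential replace()
    # calls are a simpler approximation that works for small word banks.
    for old, new in words.items():
        text = text.replace(old, new)
    return text
```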
Python escapes the HTML before storing an article, so when PHP reads it back, unescape it with stripslashes:
```
$content = stripslashes($row['content']); // unescape
$content = str_replace('\n', '', $content); // strip newlines
```

Going further:

The same approach can be adapted to scrape the Tencent, Baidu, and NetEase news lists from their detail pages.
--------------------------------------------------------------------------------

/images/ResJson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lsido/PythonScript/ec8d8ab1318b193426aa66dbdda2bf0bb7859553/images/ResJson.png
--------------------------------------------------------------------------------