├── DrivingSubject.py
├── News.py
├── README.md
└── images
    └── ResJson.png

/DrivingSubject.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Jiakaobaodian (驾考宝典) question-bank crawler, lsido.com
# Fetches every kemu1 / kemu4 question for each vehicle type, with update support
import requests
import json
import sys
import MySQLdb
reload(sys)
sys.setdefaultencoding('utf-8')

# GET helper: returns the decoded body, or "" on any failure
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except:
        return ""

# Open the database connection, with basic error reporting
try:
    conn = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='root', db='test', charset='utf8')
    cur = conn.cursor()
except Exception, e:
    print 'An error occurred: %s' % e

# Map the console arguments to a car type and its licence classes
if sys.argv[1] == '小车':
    carType = 'car'
    chexing = 'C1C2C3C4'
elif sys.argv[1] == '货车':
    carType = 'truck'
    chexing = 'A2B2'
elif sys.argv[1] == '客车':
    carType = 'bus'
    chexing = 'A1A3B1'
elif sys.argv[1] == '摩托车':
    carType = 'moto'
    chexing = 'moto'
else:
    carType = 'car'
    chexing = 'C1C2C3C4'
kemu = sys.argv[2]

# Fetch the question-ID list for the requested subject
if sys.argv[2] == '科目一':
    course = 'kemu1'
else:
    course = 'kemu3'
r = getHTMLText('http://api2.jiakaobaodian.com/api/open/exercise/sequence.htm?_r=17801703702540802070&cityCode=511300&page=1&limit=25&course=' + course + '&carType=' + str(carType) + '&_=0.7121756361682074')
pjson = json.loads(r)
data = pjson['data']
a = 0
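# Note (inferred from the responses this loop handles): the API's 'answer'
# field looks like a flag code -- 16 = option A, 32 = B, 64 = C, anything
# else is treated as D. For true/false questions optionA is "正确", so an
# answer of 32 means the statement is false, and optionType == 2 appears
# to mark multiple-answer questions.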
# Walk the ID list and pull each question record from Jiakaobaodian
for i in data:
    qurl = 'http://api2.jiakaobaodian.com/api/open/question/question-list.htm?_r=19604815519963578102&page=1&limit=25&questionIds=' + str(i)
    # The API occasionally fails or returns an empty list, so retry a few times
    jlist = []
    for attempt in range(4):
        try:
            pjson = json.loads(getHTMLText(qurl))
            jlist = pjson['data']
        except:
            jlist = []
        if len(jlist) > 0:
            break

    # Insert into the database
    for k in jlist:
        # Does the table already hold this question at all?
        idCount = cur.execute("select * from yourtable where questionId = " + str(i))
        # ...and if so, does its stored vehicle-type list include the current type?
        CxCount = cur.execute("select * from yourtable where questionId = " + str(i) + " and chexing like '%" + str(chexing) + "%'")
        if idCount == 0:
            if k['optionA'] == "正确":
                # True/false question: answer code 32 means "false"
                if k['answer'] == 32:
                    answer = "错误,"
                else:
                    answer = "正确,"
                imgUrl = k.get('mediaContent', '0')
                explain = k.get('explain', '0')
                sql = "INSERT INTO `yourtable` (`chexing`, `kemu`, `zhangjieId`, `id`, `type`, `area1`, `area2`, `area3`, `area4`, `area5`, `region`, `title`, `content`, `imagesPath`, `musicPath`, `daan`, `score`, `tikuIndex`, `jieshi`, `beiyong`, `imagesWidth`, `imagesHeight`, `A`, `B`, `C`, `D`, `title2`, `xuanxiang2`, `changjingshifan1`, `changjingshifan2`, `questionId`) VALUES (%s, %s, '0', NULL, '判断题', NULL, NULL, NULL, NULL, NULL, NULL, %s, NULL, %s, NULL, %s, '1', '1', %s, '', NULL, NULL, NULL, NULL, NULL, NULL, '', '', '', '', %s);"
                reCount = cur.execute(sql, (chexing, kemu, k['question'], imgUrl, answer, explain, k['questionId']))
                conn.commit()
                print '判断题: ' + str(k['question']) + ' -- inserted into the database'
            else:
                # Choice question: map the answer code to a letter
                if k['answer'] == 16:
                    answer = "A,"
                elif k['answer'] == 32:
                    answer = "B,"
                elif k['answer'] == 64:
                    answer = "C,"
                else:
                    answer = "D,"
                imgUrl = k.get('mediaContent', '0')
                explain = k.get('explain', '0')
                if k['optionType'] == 2:
                    atype = '多选题'
                else:
                    atype = '单选题'
                sql = "INSERT INTO `yourtable` (`chexing`, `kemu`, `zhangjieId`, `id`, `type`, `area1`, `area2`, `area3`, `area4`, `area5`, `region`, `title`, `content`, `imagesPath`, `musicPath`, `daan`, `score`, `tikuIndex`, `jieshi`, `beiyong`, `imagesWidth`, `imagesHeight`, `A`, `B`, `C`, `D`, `title2`, `xuanxiang2`, `changjingshifan1`, `changjingshifan2`, `questionId`) VALUES (%s, %s, '0', NULL, %s, NULL, NULL, NULL, NULL, NULL, NULL, %s, NULL, %s, NULL, %s, '1', '1', %s, '1', '1', '1', %s, %s, %s, %s, '', '', '', '', %s);"
                reCount = cur.execute(sql, (chexing, kemu, atype, k['question'], imgUrl, answer, explain, k['optionA'], k['optionB'], k['optionC'], k['optionD'], k['questionId']))
                conn.commit()
                print str(atype) + ': ' + str(k['question']) + ' -- inserted into the database'
        else:
            if CxCount == 1:
                print str(i) + ': already in the database for this vehicle type'
            else:
                # Known question, new vehicle type: append it to the stored list
                sql = "update `yourtable` set chexing = CONCAT(`chexing`, %s) where questionId = %s"
                reCount = cur.execute(sql, (chexing, i))
    a = a + 1
    print 'Records completed: ' + str(a)
    conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------

/News.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Sohu news crawler, lsido.com
import requests
from bs4 import BeautifulSoup
import sys
import MySQLdb
import json
reload(sys)
sys.setdefaultencoding('utf8')

# GET helper for article pages: returns the decoded body, or "" on failure
def getHTMLText(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = 'gbk'
        return r.text
    except:
        return ""

# Parse the article body out of a news detail page
def getContent(url):
    html = getHTMLText(url)
    soup = BeautifulSoup(html, 'lxml')
    title = soup.find_all(attrs={"itemprop": "headline"})
    paras = soup.find_all(attrs={"itemprop": "articleBody"})
    [s.extract() for s in soup('script')]
    [s.extract() for s in soup('style')]
    [s.extract() for s in soup.select("div.new_hot1")]
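    # The three extract() passes above strip <script>/<style> tags and Sohu's
    # "new_hot1" related-links box, so only the article markup itself is
    # returned below.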
    soup.prettify()
    for para in paras:
        if len(para) > 0:
            return para

# Walk a news list page and insert every article into the database
def getNewsList(url):
    wbdata = requests.get(url, timeout=30)
    wbdata.raise_for_status()
    wbdata.encoding = 'gbk'
    soup = BeautifulSoup(wbdata.text, 'html.parser')
    # Sohu's roll pages mark article links with a test="a" attribute
    news_titles = soup.find_all(attrs={"test": "a"})
    acount = 0
    for n in news_titles:
        title = n.get_text()
        link = n.get("href")
        try:
            cont = getContent(link)
            html_escaped = MySQLdb.escape_string(cont.encode('utf-8'))
            myc = HttpPost(html_escaped)
        except Exception, e:
            print 'An error occurred: %s' % e
            continue  # skip this article, otherwise myc is undefined below
        try:
            sql = 'your SQL statement here'
            reCount = cur.execute(sql, (title, myc))
            conn.commit()
        except Exception as e:
            print e
            conn.rollback()
        acount = acount + 1
        print('Title: ' + title + ' -- inserted into the database, total: ' + str(acount))

# Generate the news list page URLs
def getNewsPage():
    page = range(940, 944)
    for n in reversed(page):
        n = n - 1
        url = 'http://wei.sohu.com/roll/index_' + str(n) + '.shtml'
        getNewsList(url)

# News "pseudo-original" rewriting. This could be folded into the Python
# script, at some cost: POSTing to a local PHP script measured about 20%
# faster than doing the rewrite in Python (test environment: macOS).
def HttpPost(content):
    url = 'http://localhost/words.php'
    d = {'content': content}
    r = requests.post(url, data=d)
    hjson = json.loads(r.text)
    return hjson['content']

# Entry point
def main():
    getNewsPage()

# Database setup
conn = MySQLdb.connect(host='localhost', port=3306, user='root', passwd='root', db='article', charset='utf8')
cur = conn.cursor()
main()
conn.commit()
cur.close()
conn.close()
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# DrivingSubject.py

(2017.11.16)

A Python 2.7 crawler that fetches every question in the Jiakaobaodian (驾考宝典) driving-test question bank.

With the dependencies installed, run it from the console:
```
python DrivingSubject.py 小车 科目一
```

It detects updates by checking each question against the database, so on Linux you can drop it into crontab for scheduled runs.

See the code for details.

Notes:

Jiakaobaodian endpoint for listing question IDs:
```
http://api2.jiakaobaodian.com/api/open/question/list-by-tag.htm?_r=111922017237088616081&cityCode=511300&page=1&limit=25&course=kemu1&tagId=2&carType=car&_=0.5066246786512065
```

Endpoint for reading a question by ID:
```
http://api2.jiakaobaodian.com/api/open/question/question-list.htm?_r=19604815519963578102&page=1&limit=25&questionIds=909400
```
The JSON it returns looks like this:
![Image text](https://raw.githubusercontent.com/Lsido/PythonScript/master/images/ResJson.png)


# News.py

A single-threaded Python 2.7 crawler that walks the Sohu news list and stores the articles in a database.

News list: http://wei.sohu.com/roll/ -- about 100 pages at 40 articles each, roughly 4,000 articles in total.

Install the dependencies first:
```
pip install requests
pip install beautifulsoup4
pip install lxml
pip install MySQL-python
```

The "pseudo-original" rewrite step is not built into the Python script; you can wire it in yourself.
* Pseudo-original rewrite
The PHP side:

```
function str_reWords($str)
{
    $words = array();
    $content = file_get_contents('词库.txt');
    $content = str_replace("\r", "", $content);
    $content = preg_split('/\n/', $content, -1, PREG_SPLIT_NO_EMPTY);
    foreach ($content as $k => $v)
    {
        if ($k != 0)
        {
            $str_data = explode('_', $v);
            $words += array("$str_data[0]" => "$str_data[1]");
        }
    }
    return strtr($str, $words);
}
die(json_encode(array('content' => str_reWords($_POST['content']))));
```
Word-bank file format, one original_replacement pair per line:
```
善良_善意
好人_不坏的人
```
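If you would rather keep the rewrite inside Python, a minimal sketch of the same word-bank substitution might look like this (it assumes the 词库.txt format shown above; `load_words` and `rewrite` are illustrative names, not part of the scripts in this repo):
```
# coding=utf-8
# Minimal sketch of the PHP str_reWords() substitution, assuming one
# "original_replacement" pair per line of the word bank.
import io

def load_words(path='词库.txt'):
    words = {}
    with io.open(path, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if '_' in line:
                old, new = line.split('_', 1)
                words[old] = new
    return words

def rewrite(text, words):
    # Note: PHP's strtr() replaces in a single pass; sequential replace()
    # calls are a simpler approximation that works for small word banks.
    for old, new in words.items():
        text = text.replace(old, new)
    return text
```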
Python escapes the HTML before storing an article, so when PHP reads it back, unescape it with stripslashes:
```
$content = stripslashes($row['content']); // unescape
$content = str_replace('\n', '', $content); // strip newlines
```

Going further:

The same approach can be adapted to scrape the Tencent, Baidu, and NetEase news lists from their detail pages.
--------------------------------------------------------------------------------

/images/ResJson.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Lsido/PythonScript/ec8d8ab1318b193426aa66dbdda2bf0bb7859553/images/ResJson.png
--------------------------------------------------------------------------------