├── .gitattributes
├── .idea
│   ├── dictionaries
│   │   └── Administrator.xml
│   ├── encodings.xml
│   ├── haodf.iml
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   ├── libraries
│   │   └── R_User_Library.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── README.md
├── baseCode
│   ├── advisory_page_down.py
│   ├── getContent.py
│   ├── getHaodf.py
│   └── study.py
├── debug.log
├── multi_haodf
│   ├── __pycache__
│   │   └── getContent.cpython-37.pyc
│   ├── getContent.py
│   └── multi_start.py
├── source
│   └── haodf.jpg
└── tools
    ├── ConnectDatabase.py
    ├── Logger.py
    └── __pycache__
        └── ConnectDatabase.cpython-37.pyc
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/.idea/dictionaries/Administrator.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/haodf.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/libraries/R_User_Library.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 HelloAtilol
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Haodf (好大夫在线) crawler
2 | 
3 | ## Dependencies (Python 3.6)
4 | - BeautifulSoup
5 | - selenium
6 | - time
7 | - datetime
8 | - random
9 | - pyprind
10 | - pymysql
11 | 
12 | ## Data storage
13 | Four tables are used: all_url, QA, doctor and relative_qa. Their structure is shown in `source/haodf.jpg`:
14 | 
15 | 
16 | ## How to run
17 | 1. Configure the database parameters in `ConnectDatabase.py` and establish the connection;
18 | 2. Run `getHaodf.py` to collect all URLs; the crawl range is set by date:
19 | ```python
20 | # start date of the crawl
21 | CURRENT_DATE = "20180714"
22 | # end date of the crawl
23 | END_DATE = "20181231"
24 | ```
25 | 3. Parse the content of each URL. Haodf pages come in several layouts; two have been found so far, `{"class": "zzx_yh_stream"}` and `{"class": "f-card clearfix js-f-card"}`. If a new layout appears, a new parser has to be written.
26 | 
27 | ```python
28 | # parse the QA stream
29 | qa_list = 1
30 | # default: first layout, {"class": "zzx_yh_stream"}
31 | split_type = 1
32 | qa_content_soups = soup.find_all("div", {"class": "zzx_yh_stream"})
33 | # second layout, {"class": "f-card clearfix js-f-card"}
34 | if len(qa_content_soups) == 0:
35 |     split_type = 3
36 |     print("second layout")
37 |     qa_content_soups = soup.find_all("div", {"class": "f-card clearfix js-f-card"})
38 | # a new page layout has appeared and must be parsed manually
39 | if len(qa_content_soups) == 0:
40 |     split_type = 5
41 |     input("unknown layout!")
42 | ```
43 | 4. Run `getContent.py`.
44 | 
45 | ## Changelog
46 | ### 2019.06.21
47 | - Added a progress bar while reading page URLs;
48 | - Tested on 1,000 pages.
49 | 
50 | ### 2019.07.05
51 | - Added multithreaded crawling;
52 | - Restructured the code;
53 | - Added a file that records URLs that failed to parse;
54 | - Updated the status handling for most known URL parse outcomes.
55 | 
56 | ### 2019.07.12
57 | - Added a log file;
58 | - Fixed a bug where some records failed to be written to the database;
59 | - Consolidated the storage code to reduce database operations.
--------------------------------------------------------------------------------
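The schema figure referenced above (`source/haodf.jpg`) is not reproduced in this text export. As a rough orientation only, the sketch below is inferred from the dictionaries that the scripts pass to `insertData`; field names come from the code, while the actual column types live in the database and the figure. Note that the code writes the related items to a table named `relative_url`, whereas the list above says `relative_qa`, and the `doctor` table is not touched by the code shown here.

```python
# Rough sketch of the records written by the crawler (inferred from the code, not an authoritative schema).

# getHaodf.py -> table all_url: one row per consultation URL found in the sitemap
all_url_row = {
    "qa_date": "20181216",    # sitemap date the URL was listed under (YYYYMMDD)
    "qa_status": "0",         # 0 = not parsed yet, 1 = parsed, >1 = error codes (see multi_haodf/getContent.py)
    "qa_url": "https://www.haodf.com/wenda/abc195366_g_5673322365.htm",
    "qa_title": "...",        # link text from the sitemap page
}

# getContent.py -> table QA: one row per post in a consultation thread
qa_row = {
    "qa_number": "148990",    # refers back to all_url
    "qa_list": 1,             # position of the post within the thread
    "qa_author": "0",         # 0 = patient, 1 = doctor
    "qa_time": "...",         # post time as shown on the page
    "qa_tag": "",             # e.g. gift, thank-you letter, voice message
    "qa_content": "...",      # post text
    "patient_status": "...",
}

# getContent.py -> table relative_url: related QAs / articles / diseases linked from the page
relative_row = {
    "rela_tag": "0",          # 0 = related QA, 1 = article, 2 = disease
    "rela_title": "...",
    "rela_url": "www.haodf.com/...",
    "qa_number": "148990",
}
```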
/baseCode/advisory_page_down.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # @Time    : 2018/8/10 18:54
4 | # @FileName: advisory_page_down.py
5 | # @Function: download the doctor-patient consultation detail pages published between the given start and end dates; the global constants below can be customised. 2018-09-12 0234 (resume after interruption)
6 | 
7 | import datetime
8 | import os
9 | import time
10 | 
11 | import re
12 | from selenium import webdriver
13 | from selenium.common.exceptions import TimeoutException, NoSuchElementException
14 | from selenium.webdriver.common.by import By
15 | from selenium.webdriver.support import expected_conditions as EC
16 | from selenium.webdriver.support.wait import WebDriverWait
17 | 
18 | # initial values: base URL, date range, storage path, etc.
19 | BASE_URL = 'https://www.haodf.com/sitemap-zx/'
20 | DATE_START = '20080222'
21 | DATE_END = '20080223'
22 | DIR_PATH = './'
23 | TIME_WAIT = 30
24 | TIME_SLEEP = 2
25 | # log file encoding
26 | ENCODING_STYLE = 'gb18030'
27 | # status
28 | # date on which the previous run was interrupted; defaults to the start date
29 | CURRENT_DATE = DATE_START
30 | # listing page of that date being crawled at the interruption; defaults to 1
31 | CURRENT_PAGE = 1
32 | # list index at the interruption, i.e. the number of items already crawled; defaults to 0
33 | current_index = 0
34 | 
35 | # headless Chrome
36 | chrome_options = webdriver.ChromeOptions()
37 | chrome_options.add_argument('--headless')
38 | browser = webdriver.Chrome(chrome_options=chrome_options)
39 | 
40 | # explicit wait
41 | wait = WebDriverWait(browser, TIME_WAIT)
42 |
43 |
44 | def down_detail_page(file_path, local_time):
45 |     """
46 |     Loop over the date range, call creat_date_page_url, and download the consultation detail pages locally.
47 |     :param file_path:
48 |     :param local_time:
49 |     :return:
50 |     """
51 |     # parse the start/end date strings into datetime objects
52 |     advisory_date = datetime.datetime.strptime(DATE_START, '%Y%m%d')
53 |     # date on which the previous run was interrupted
54 |     current_date = datetime.datetime.strptime(CURRENT_DATE, '%Y%m%d')
55 |     advisory_date_end = datetime.datetime.strptime(DATE_END, '%Y%m%d')
56 |     # resume after an interruption?
57 |     if advisory_date < current_date:
58 |         advisory_date = current_date
59 |     # else:
60 |     #     print('Crawling from the initial date DATE_START.')
61 |     # page to resume from; normally CURRENT_PAGE, guarded here against values below 1
62 |     current_page = max(CURRENT_PAGE, 1)
63 |     # iterate over the date range
64 |     while advisory_date <= advisory_date_end:
65 |         # The call below builds every listing-page URL for the day and extracts the detail-page URLs;
66 |         # internally it calls get_detail_page to fetch the detail pages.
67 |         # =====================
68 |         # Idea: take a timestamp before and after; if the difference is very small, the day was probably
69 |         # not crawled properly (access denied, bad network or some other error) while the program kept running idly.
70 |         # =====================
71 |         start_time = time.perf_counter()
72 |         print(advisory_date)
73 |         # build each listing-page URL of the day, then extract the titles and URLs from that page
74 |         # creat_date_page_url(advisory_date, file_path, local_time, current_page)
75 |         # every following day starts from page 1
76 |         current_page = 1
77 |         # elapsed time as a float
78 |         delta_time = time.perf_counter() - start_time
79 |         # a fixed per-day time threshold is of little use; printing the elapsed time lets a human decide whether to stop the program
80 |         # it is unclear whether browser.get() can raise here
81 |         # originally one line; split because delta_time triggered a warning
82 |         print(advisory_date.strftime('%Y-%m-%d'), ' pages took ', end='')
83 |         print(delta_time, end='')
84 |         print(' seconds to parse and download')
85 |         # advance one day; given what follows, no time.sleep is needed here
86 |         advisory_date += datetime.timedelta(days=1)
87 |     else:
88 |         print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), ' finished successfully!')
89 |
90 |
91 | def creat_date_page_url(advisory_date, file_path, local_time, current_page):
92 |     """
93 |     Build the URL of every listing page for the given date,
94 |     parse each page for consultation detail-page URLs, then call get_detail_page to save the detail pages.
95 |     :param current_page:
96 |     :param local_time:
97 |     :param file_path:
98 |     :param advisory_date:
99 |     :return:
100 |     """
101 | for date_page in range(current_page, 1000):
102 | date_page_url = BASE_URL + advisory_date.strftime('%Y%m%d') + '_' + str(date_page) + '/'
103 | # 打印状态
104 | print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), ' 开始尝试抓取 ', advisory_date.strftime('%Y-%m-%d'),
105 | ' 日第 ', str(date_page), ' 页问诊记录')
106 | try:
107 | # 获取含 title 和 detail page url的页面
108 | browser.get(date_page_url)
109 | # 等页面加载成功直到时间超过 TIME_WAIT
110 | wait.until(EC.presence_of_element_located((By.XPATH, '//div[@class="map_all"]')))
111 | # 查找页面class name为'hh'的节点。这里也可以用 try except 做
112 | # 判断如果有 hh 存在,对话详情页 title 和 url 一定存在,至少为1个
113 | browser.find_element_by_xpath('//li[@class="hh"]')
114 | # 和 html.xpath 获取 text()不同。用 elements
115 | item = browser.find_elements_by_xpath('//li/a')
116 | # 取详情页数量
117 | len_item = len(item)
118 | # 将 item 的属性值即 title 和 url 存入二维数组中,调用函数creat_arr_title_url()
119 | arr_title_url = creat_arr_title_url(item, len_item)
120 | # 生成最后网页文件名称前缀
121 | pre_file_name = advisory_date.strftime('%Y%m%d') + '_' + str(date_page) + '_'
122 | # 判断是否从某页非第一条 url 开始爬取,并通过修改全局变量确保只执行一次
123 | global current_index
124 | start_index = 0
125 | if current_index:
126 | start_index = current_index
127 | current_index = 0
128 | for i in range(start_index, len_item):
129 | # 打印状态
130 | print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), advisory_date.strftime('%Y-%m-%d'),
131 | ' 日第 ', str(date_page), ' 页问诊列表共有 ', str(len_item), ' 条问诊页地址,正在抓取第 ', str(i+1), ' 条')
132 | # 记录所有成功加载的某日某页面中所有的 title 和 url,包含可能将没有成功保存至本地的
133 | record_title_url_filename = 'TitleandUrl' + DATE_START + DATE_END + local_time
134 | with open(file_path + 'log/' + record_title_url_filename + '.txt', 'a', encoding=ENCODING_STYLE) \
135 | as record_title_url:
136 | record_title_url.write(arr_title_url[i][0] + '\t' + arr_title_url[i][1] + '\n')
137 | # 调用函数获取某一日某一页上所有医患对话详情页 URL 对应的页面并存入本地
138 | get_detail_page(arr_title_url[i][0], pre_file_name, file_path, local_time)
139 | # 记录成功爬取的最后一个 url 的状态,断点时正在爬取的为这里的下一条
140 | # 可能存在已经成功爬取当前 url 问诊记录的前几页然后异常中断
141 | current_status_filename = 'CurrentStatus' + DATE_START + DATE_END + local_time
142 | current_status_content = advisory_date.strftime('%Y-%m-%d') + ' 日第 ' + str(date_page) + \
143 | ' 页问诊列表共有 ' + str(len_item) + ' 条问诊页地址,已成功抓取 ' + str(i+1) + ' 条'
144 | with open(file_path + 'log/' + current_status_filename + '.txt', 'w', encoding=ENCODING_STYLE) \
145 | as current_status:
146 | current_status.write(current_status_content)
147 | # 记录含有 医患对话 title 和 url的 date_page_url
148 | normal_date_page_url = 'NormalDatePageUrl' + DATE_START + DATE_END + local_time
149 | with open(file_path + 'log/' + normal_date_page_url + '.txt', 'a', encoding=ENCODING_STYLE) \
150 | as normal_date_page:
151 | normal_date_page.write(date_page_url + '\n')
152 | time.sleep(TIME_SLEEP)
153 | except NoSuchElementException:
154 | # 记录该日无记录(首页无 title,url)或该日有记录的最后一页的后一页的 url
155 | print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), advisory_date.strftime('%Y-%m-%d'),
156 | ' 日只有 ', str(date_page-1), ' 页问诊列表,全部抓取完毕')
157 | empty_date_page_url = 'EmptyDatePageUrl' + DATE_START + DATE_END + local_time
158 | with open(file_path + 'log/' + empty_date_page_url + '.txt', 'a', encoding=ENCODING_STYLE) \
159 | as empty_date_page:
160 | empty_date_page.write(date_page_url + '\n')
161 | # 防止频繁访问
162 | time.sleep(TIME_SLEEP)
163 | # 遇到某日某页没有 title 和 url,即空白,结束页码循环,等待开始下一日
164 | break
165 | except TimeoutException:
166 | # 考虑 IP 被封
167 | print(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()), date_page_url, ' 加载失败,请检查网络质量,IP!')
168 | # 打印 date_page_url 没有加载成功的情况,记录 url
169 | record_bug_date_page_url = 'BugDatePageUrl' + DATE_START + DATE_END + local_time
170 | with open(file_path + 'log/' + record_bug_date_page_url + '.txt', 'a', encoding=ENCODING_STYLE) \
171 | as record_bug_date_page:
172 | record_bug_date_page.write(date_page_url + '\n')
173 | # 结束循环,开始请求下一日的第一页
174 | break
175 |
176 |
177 | def creat_arr_title_url(item, len_item):
178 |     """
179 |     Extract every title and URL on the current page and store them in a 2-D list.
180 |     :param item:
181 |     :param len_item:
182 |     :return: arr_title_url
183 |     """
184 | # 数组初始化,行数为节点的个数,j 为临时变量
185 | arr_title_url = [[] for j in range(len_item)]
186 | for i in range(len_item):
187 | arr_title_url[i].append(item[i].get_attribute('href'))
188 | arr_title_url[i].append(item[i].text)
189 | return arr_title_url
190 |
191 |
192 | def get_detail_page(detail_page_url, pre_file_name, file_path, local_time):
193 |     """
194 |     Fetch the consultation detail page behind a title/URL pair and save it locally.
195 |     :param detail_page_url:
196 |     :param pre_file_name:
197 |     :param file_path:
198 |     :param local_time:
199 |     :return:
200 |     """
201 |     try:
202 |         # note: the '//' prefix in href values is not handled yet
203 |         browser.get(detail_page_url)
204 |         # wait until all elements are present
205 |         wait.until(EC.presence_of_all_elements_located)
206 |         # save the page source locally; mind the encoding
207 |         source_code = browser.page_source
208 |         # file names look like 20180322_1_xxx.htm; the slice drops the part containing '/', which would otherwise be treated as a path
209 |         # the slice could also be written as detail_page_url.split('/')[-1]
210 |         # file_name = pre_file_name + detail_page_url.split('/')[-1]
211 |         file_name = pre_file_name + detail_page_url[28:] + '.txt'
212 |         # name of the text file that records every page successfully saved locally
213 |         record_filename_name = 'NameofSavedPages' + DATE_START + DATE_END + local_time
214 |         # write the page source; file_name already carries the '.htm' suffix; use the same gbk-family encoding as the site
215 |         with open(file_path + file_name, 'w', encoding=ENCODING_STYLE) as file:
216 |             file.write(source_code)
217 |         # append the saved file name to 'NameofSavedPages' in the log directory; the title is not recorded for now
218 |         with open(file_path + 'log/' + record_filename_name + '.txt', 'a', encoding=ENCODING_STYLE) as record_filename:
219 |             record_filename.write(file_name + '\n')
220 |         # pause after each page to avoid being blocked or getting truncated (~5 kB) files
221 |         time.sleep(TIME_SLEEP)
222 |         # check whether the conversation has further pages
223 |         detail_pages_amount = re.search(r'.*?\D*?(\d+)\D*?页', source_code, re.S)
224 |         if detail_pages_amount:
225 |             for i in range(2, int(detail_pages_amount.group(1)) + 1):
226 |                 print('This consultation has ', detail_pages_amount.group(1), ' pages,', ' fetching page ', str(i), '!')
227 |                 detail_page_more_url = detail_page_url[:-4] + '_p_' + str(i) + '.htm'
228 |                 get_detail_page_more(detail_page_more_url, pre_file_name, file_path, local_time)
229 |         # else:
230 |         #     print(detail_page_url, ' has no further pages!')
231 |     except Exception:
232 |         print(detail_page_url, ' could not be fetched!')
233 |         # name of the text file that records URLs whose HTML could not be saved
234 |         record_errfilename_name = 'NameofUnsavedPages' + DATE_START + DATE_END + local_time
235 |         with open(file_path + 'log/' + record_errfilename_name + '.txt', 'a', encoding=ENCODING_STYLE) \
236 |                 as record_errfilename:
237 |             record_errfilename.write(pre_file_name + '_' + detail_page_url + '\n')
238 |         time.sleep(TIME_SLEEP)
239 |
240 |
241 | def get_detail_page_more(detail_page_url, pre_file_name, file_path, local_time):
242 |     """
243 |     Fetch the follow-up pages when a conversation spans multiple pages.
244 |     :param detail_page_url:
245 |     :param pre_file_name:
246 |     :param file_path:
247 |     :param local_time:
248 |     :return:
249 |     """
250 | try:
251 | browser.get(detail_page_url)
252 | wait.until(EC.presence_of_all_elements_located)
253 | source_code = browser.page_source
254 | file_name = pre_file_name + detail_page_url[28:] + '.txt'
255 | record_filename_name = 'NameofSavedPages' + DATE_START + DATE_END + local_time
256 | with open(file_path + file_name, 'w', encoding=ENCODING_STYLE) as file:
257 | file.write(source_code)
258 | with open(file_path + 'log/' + record_filename_name + '.txt', 'a', encoding=ENCODING_STYLE) as record_filename:
259 | record_filename.write(file_name + '\n')
260 | time.sleep(TIME_SLEEP)
261 | except Exception:
262 | print(detail_page_url, ' 未抓取成功!')
263 | record_errfilename_name = 'NameofUnsavedPages' + DATE_START + DATE_END + local_time
264 | with open(file_path + 'log/' + record_errfilename_name + '.txt', 'a', encoding=ENCODING_STYLE) \
265 | as record_errfilename:
266 | record_errfilename.write(pre_file_name + '_' + detail_page_url + '\n')
267 | time.sleep(TIME_SLEEP)
268 |
269 |
270 | def make_dir():
271 |     """
272 |     Create a folder named after the start/end dates and return its path file_path.
273 |     :return: file_path
274 |     """
275 |     # the HTML files are stored in a folder named after the date range under the initial path
276 |     file_path = DIR_PATH + DATE_START + '_' + DATE_END + '/'
277 |     # path for the TXT logs
278 |     log_path = file_path + 'log/'
279 |     exists = os.path.exists(log_path)
280 |     if not exists:
281 |         os.makedirs(log_path)
282 |         # returning two values would be clumsy
283 |         return file_path
284 |     else:
285 |         return file_path
286 | 
287 | 
288 | def main():
289 |     """
290 |     Produce the file_path and local_time used by the whole program.
291 |     :return:
292 |     """
293 |     # consider adding a check on the constants, e.g. DATE_START must not be later than DATE_END
294 |     local_time = time.strftime('%Y%m%d_%H%M%S', time.localtime())
295 |     # the folder name could also carry a local_time suffix
296 |     # ========================================================
297 |     # with the current path layout, the log files have to be deleted manually if the program stops on an exception
298 |     # ========================================================
299 |     file_path = make_dir()
300 |     try:
301 |         # download the consultation detail pages
302 |         down_detail_page(file_path, local_time)
303 |         print('🍺🍺🍺🍺🍺🍺🍺🍺🍺🍺 all pages from ', DATE_START, ' to ', DATE_END, ' have been saved to ', file_path)
304 |     except Exception:
305 |         print('😰😰😰😰😰😰😰😰😰😰 fetching the pages from ', DATE_START, ' to ', DATE_END, ' failed!')
306 |     browser.close()
307 | 
308 | 
309 | if __name__ == '__main__':
310 |     main()
311 |
--------------------------------------------------------------------------------
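To make the crawl/resume logic in advisory_page_down.py easier to follow, here is a minimal sketch of how the per-day listing URLs are built and what the three resume constants mean; it mirrors the constants and string concatenation used in the script and is illustrative only.

```python
import datetime

BASE_URL = 'https://www.haodf.com/sitemap-zx/'

def listing_url(day: datetime.datetime, page: int) -> str:
    # same pattern as creat_date_page_url: <BASE_URL><YYYYMMDD>_<page>/
    return BASE_URL + day.strftime('%Y%m%d') + '_' + str(page) + '/'

# Resuming after an interruption (the script's globals):
#   CURRENT_DATE  -> day to restart from (the date loop skips ahead to it)
#   CURRENT_PAGE  -> listing page of that day to restart from
#   current_index -> item on that page to restart from
print(listing_url(datetime.datetime(2008, 2, 22), 3))
# -> https://www.haodf.com/sitemap-zx/20080222_3/
```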
/baseCode/getContent.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | # @Time    : 2019/6/16 18:34
4 | # @Author  : 王诚坤
5 | # @File    : getContent.py
6 | # @des     : parse the consultation pages
7 | """
8 | 
9 | from tools import ConnectDatabase as conn
10 | from bs4 import BeautifulSoup
11 | from selenium import webdriver
12 | import time
13 | import random
14 | 
15 | # database connections
16 | select_conn = conn.MySQLCommand()
17 | select_conn.connectMysql(table="all_url")
18 | update_conn = conn.MySQLCommand()
19 | 
20 | driver = webdriver.Chrome()
21 |
22 |
23 | def split_relative(qa_number, relative_soups):
24 |     """
25 |     Parse the "related QAs / articles / diseases" blocks.
26 |     :param qa_number: number of the page being visited
27 |     :param relative_soups: BeautifulSoup objects to parse
28 |     :return: None
29 |     """
30 |     # switch the target table
31 |     update_conn.connectMysql(table="relative_url")
32 | 
33 |     for relative_soup in relative_soups:
34 |         # print(relative_soup)
35 |         r_text = relative_soup.p.find("span", {"class": "fl f18"}).text
36 |         # related tag: 0 = QA, 1 = article, 2 = disease
37 |         if "相关回复" in r_text:
38 |             tag = '0'
39 |         elif "相关文章" in r_text:
40 |             tag = '1'
41 |         elif "相关疾病" in r_text:
42 |             tag = '2'
43 |         rq_lis = relative_soup.find_all("li")
44 |         for rq_li in rq_lis:
45 |             result = {"rela_tag": tag, "rela_title": rq_li.a.string, "rela_url": rq_li.a["href"][2:],
46 |                       "qa_number": qa_number}
47 |             update_conn.insertData(data_dict=result, primary_key="rela_url")
48 |
49 |
50 | def split_content_1(qa_number, qa_list, qa_soups):
51 |     """
52 |     Parse the first entry of the first layout, class="zzx_yh_stream".
53 |     :param qa_list: position of the post within the conversation
54 |     :param qa_number: URL number
55 |     :param qa_soups: BeautifulSoup objects of the conversation
56 |     :return: qa_list, the position of the next post within the conversation
57 |     """
58 |     # switch the target table
59 |     update_conn.connectMysql(table="QA")
60 | 
61 |     # take the opening question
62 |     first_describe = qa_soups[0]
63 |     # patient status and consultation time
64 |     user_soup = first_describe.find("div", {"class": "stream_yh_left"})
65 |     # status
66 |     patient_status = user_soup.find("div", {"class": "yh_l_states"}).span.string
67 |     # post time
68 |     qa_time = user_soup.find("div", {"class": "yh_l_times"}).string
69 |     # qa_author: 0 = patient, 1 = doctor
70 |     first_result = {"qa_number": qa_number, "qa_list": qa_list, "qa_author": '0', "patient_status": patient_status, "qa_time": qa_time}
71 | 
72 |     # consultation text
73 |     content = ""
74 |     describe = first_describe.find("div", {"class": "h_s_info_cons"})
75 |     for child in describe.children:
76 |         if child.name is None or child.name == "script":
77 |             continue
78 |         if child.text == "" or "本人登录后可见" in child.text:
79 |             continue
80 |         content += (child.text.replace("\n", "") + "\n")
81 |     # drop the trailing \n
82 |     first_result["qa_content"] = content[:-1]
83 |     # the QA table has no primary key; records are tied together by the QA number
84 |     update_conn.insertData(first_result, primary_key="")
85 |     qa_list += 1
86 |     qa_soups.remove(first_describe)
87 | 
88 |     # qa_list is the position of the next post within the conversation
89 |     return split_soups_1(qa_number, qa_list, qa_soups)
90 |
91 |
92 | def split_soups_1(qa_number, qa_list, qa_soups):
93 |     """
94 |     Parse the follow-up entries of the first layout, class="zzx_yh_stream".
95 |     :param qa_number: URL number
96 |     :param qa_list: position of the post within the conversation
97 |     :param qa_soups: list of soups to parse
98 |     :return: qa_list, the position of the next post within the conversation
99 |     """
100 | # 更换数据存储的表名
101 | update_conn.connectMysql(table="QA")
102 |
103 | for qa_soup in qa_soups:
104 | # 时间
105 | qa_time = qa_soup.find("div", {"class": "yh_l_times"}).string
106 | # 结果字典初始化
107 | result = {"qa_number": qa_number, "qa_list": qa_list, "qa_time": qa_time, "qa_tag": ""}
108 | # 判断是医生发言还是患者发言
109 | if "yi" in qa_soup.find("div", {"class": "yh_r_t_icon"}).img["src"]:
110 | # 医生发言对应的处理过程
111 | qa_author = '1'
112 | result["qa_author"] = qa_author
113 | # 判断是否是语音
114 | try:
115 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title"})
116 | if content_soup is None:
117 | content_soup = qa_soup.find("p", {"class": "h_s_cons_main mb10"})
118 | if content_soup is not None:
119 | result["qa_tag"] += qa_soup.find("h3", {"class": "h_s_docs_title mb10 ml10"}).text
120 | content = content_soup.text
121 | except AttributeError:
122 | # 语音的处理方法
123 | content_soup = qa_soup.find("div", {"class": "yy_vioce_box"})
124 | if content_soup is not None:
125 | content = content_soup["src"][2:]
126 | else:
127 | content = ""
128 | result["qa_tag"] += "语音"
129 | result["qa_content"] = content
130 | else:
131 | # 患者提问的解析办法
132 | qa_author = '0'
133 | result["qa_author"] = qa_author
134 | content = ""
135 | # 使用最多的结构
136 | content_soup = qa_soup.find("pre", {"class": "h_s_cons_main"})
137 | # 开药方请求
138 | if content_soup is None:
139 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title iconyaofang"})
140 | # 送礼物
141 | if content_soup is None:
142 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title gifts"})
143 | if content_soup is not None:
144 | result["qa_tag"] = "送礼物"
145 | content_soup = qa_soup.find("p", {"class": "h_s_cons_main"})
146 | # 感谢信
147 | if content_soup is None:
148 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title iconmails"})
149 | if content_soup is not None:
150 | result["qa_tag"] = content_soup.text
151 | content_soup = qa_soup.find("p", {"class": "pt5 wb"}).span
152 | # 上传视频
153 | if content_soup is None:
154 | content_soup = qa_soup.find("span", {"class": "fl bingli_hide_word"})
155 |
156 | # 再次上传病历
157 | if content_soup is None:
158 | # 判断是不是有病例专有头部标签
159 | content_soup = qa_soup.find("div", {"class": "h_s_cons_info_top"})
160 | if content_soup is not None:
161 | qa_list = split_content_1(qa_number, qa_list, [qa_soup])
162 |
163 | # 患者Tag
164 | if content_soup is None:
165 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title"})
166 | if content_soup is not None:
167 | result["qa_tag"] = content_soup.text
168 | h_s_cons = qa_soup.find("div", {"class": "h_s_cons"})
169 | if h_s_cons is not None:
170 | for p_soup in h_s_cons.children:
171 | if p_soup.name != "p":
172 | continue
173 | content += p_soup.text
174 |
175 | # 以上方式都未检测到,需要添加解析方式
176 | if content_soup is None:
177 | input("患者出现新的解析方式")
178 | if len(content) > 0:
179 | result["qa_content"] = content
180 | else:
181 | result["qa_content"] = content_soup.text
182 |
183 | # 患者状态
184 | patient_status = qa_soup.find("div", {"class": "yh_l_states"}).span.string
185 | result["patient_status"] = patient_status
186 |
187 | # 将数据保存到数据库
188 | update_conn.insertData(result, primary_key="")
189 | qa_list += 1
190 |
191 | # qa_list 在一个问答中发言的编号
192 | return qa_list
193 |
194 |
195 | def split_content_2(qa_number, qa_list, qa_soups):
196 |     """
197 |     Parse the first entry of the second layout, class="f-card clearfix js-f-card".
198 |     :param qa_list: position of the post within the conversation
199 |     :param qa_number: URL number
200 |     :param qa_soups: BeautifulSoup objects of the conversation
201 |     :return: qa_list, the position of the next post within the conversation
202 |     """
203 | # 更换数据存储的表名
204 | update_conn.connectMysql(table="QA")
205 |
206 | # 取出第一个提问内容
207 | first_describe = qa_soups[0]
208 |
209 | # 患者状态和咨询时间
210 | user_soup = first_describe.find("div", {"class": "f-c-left"})
211 | # 状态
212 | patient_status = user_soup.find("div", {"class": "f-c-l-status"}).span.string
213 | # 发表时间
214 | qa_time = user_soup.find("div", {"class": "f-c-l-date"}).string
215 | # qa_author: 0是患者,1是医生
216 | first_result = {"qa_number": qa_number, "qa_list": qa_list, "qa_author": '0',
217 | "patient_status": patient_status, "qa_time": qa_time}
218 |
219 | # 咨询内容
220 | content = ""
221 | describe = first_describe.find("div", {"class": "f-c-r-wrap"})
222 | for child in describe.children:
223 | if child.name == "h4" or child.name == "p":
224 | content += (child.text+"\n")
225 | # 去掉最后一个\n
226 | first_result["qa_content"] = content[:-1]
227 | update_conn.insertData(first_result, primary_key="")
228 | qa_list += 1
229 |
230 | # 从list中去掉第一个
231 | qa_soups.remove(first_describe)
232 |
233 | # qa_list 在一个问答中发言的编号
234 | return split_soups_2(qa_number, qa_list, qa_soups)
235 |
236 |
237 | def split_soups_2(qa_number, qa_list, qa_soups):
238 |     """
239 |     Parse the follow-up entries of the second layout, class="f-card clearfix js-f-card".
240 |     :param qa_number: URL number
241 |     :param qa_list: position of the post within the conversation
242 |     :param qa_soups: list of soups to parse
243 |     :return: qa_list, the position of the next post within the conversation
244 |     """
245 | # 更换数据存储的表名
246 | update_conn.connectMysql(table="QA")
247 |
248 | for qa_soup in qa_soups:
249 | # 时间
250 | qa_time = qa_soup.find("div", {"class": "f-c-l-date"}).string
251 | # 结果字典初始化
252 | result = {"qa_number": qa_number, "qa_list": qa_list, "qa_time": qa_time, "qa_tag": ""}
253 | # 判断是医生还是患者
254 | if "doctor" in qa_soup.find("img", {"class": "f-c-r-usertype"})["src"]:
255 | # 医生的处理方式
256 | qa_author = '1'
257 | result["qa_author"] = qa_author
258 | qa_tag_soup = qa_soup.find("h2", {"class": "f-c-r-w-title"})
259 | # 添加标签(一问一答、图文问诊)
260 | if qa_tag_soup is not None:
261 | result["qa_tag"] += qa_tag_soup.text
262 | # 判断是不是语音
263 | try:
264 | content_soup = qa_soup.find("h4", {"class": "f-c-r-w-subtitle"})
265 | if content_soup is None:
266 | content_soup = qa_soup.find("p", {"class": "f-c-r-doctext"})
267 | # 医生回答中出现了新的结构,需要手动更新
268 | if content_soup is None:
269 | print(qa_soup.find("div", {"class": "f-c-r-wrap"}))
270 | input("医生出现了新的解析方式,可能是语音,重新查看")
271 | content = content_soup.text.replace("\t", "").replace("\n", "").replace(" ", "")
272 | except AttributeError:
273 | # 这里暂时没有找到对应的语音,出现了语音,重新编写
274 | input("出现了语音!")
275 | content = qa_soup.find("div", {"class": "yy_vioce_box"})["src"][2:]
276 | result["qa_tag"] += "语音"
277 | result["qa_content"] = content
278 | else:
279 | # 患者的处理方式
280 | qa_author = '0'
281 | result["qa_author"] = qa_author
282 | content = qa_soup.find("p", {"class": "f-c-r-w-text"}).text
283 | result["qa_content"] = content.replace("\t", "").replace("\n", "").replace(" ", "")
284 |
285 | # 患者状态
286 | patient_status = qa_soup.find("div", {"class": "f-c-l-status"}).span.string
287 | result["patient_status"] = patient_status
288 | # 将数据保存到数据库
289 | update_conn.insertData(result, primary_key="")
290 | qa_list += 1
291 | # qa_list 在一个问答中发言的编号
292 | return qa_list
293 |
294 |
295 | def change_split_type(split_type, qa_number, qa_list, qa_soups):
296 |     """
297 |     Dispatch to the parser that matches the detected layout.
298 |     :param split_type: layout type
299 |     :param qa_number: number of the URL
300 |     :param qa_list: position within the conversation
301 |     :param qa_soups: list of soups
302 |     :return: qa_list, the position of the next post within the conversation
303 |     """
304 |     if split_type == 1:
305 |         # parse the first page
306 |         return split_content_1(qa_number, qa_list, qa_soups)
307 |     elif split_type == 2:
308 |         # parse the follow-up pages
309 |         return split_soups_1(qa_number, qa_list, qa_soups)
310 | 
311 |     elif split_type == 3:
312 |         return split_content_2(qa_number, qa_list, qa_soups)
313 |     elif split_type == 4:
314 |         return split_soups_2(qa_number, qa_list, qa_soups)
315 |     else:
316 |         raise IndexError("No matching parser; the layout of this URL has to be inspected manually.")
317 |
318 |
319 | def split_page(qa_number, url):
320 |     """
321 |     Parse a single consultation URL.
322 |     :param qa_number: database number of the URL
323 |     :param url: url
324 |     :return: None
325 |     """
326 |
327 | # 判断是问答是医生还是团队, 0是医生,1是团队
328 | if "wenda" in url:
329 | doctor_patient = url.replace("https://www.haodf.com/wenda/", "").replace(".htm", "")
330 | try:
331 | _, _, patient = doctor_patient.split("_")
332 | except ValueError:
333 | _, _, _, patient = doctor_patient.split("_")
334 | update_url = {"qa_patient": patient, "qa_type": '0'}
335 | elif "flow_team" in url:
336 | doctor_patient = url.replace("https://www.haodf.com/doctorteam/", "").replace(".htm", "")
337 | _, _, patient = doctor_patient.split("_")
338 | update_url = {"qa_patient": patient, "qa_type": '1'}
339 | else:
340 | print("出现新的URL方式,请手动解析!")
341 |
342 | # 将页面变为Beautisoup对象
343 | driver.get(url)
344 | try:
345 | soup = BeautifulSoup(driver.page_source.encode('gbk'), "lxml")
346 | except UnicodeEncodeError:
347 | # 编码异常
348 | update_url["qa_status"] = '3'
349 | update_conn.connectMysql(table="all_url")
350 | update_conn.update_database(datadict=update_url, situation="WHERE qa_number = '%s'" % qa_number)
351 | return
352 |
353 | # 判断页面是否存在
354 | try:
355 | # 更新医生id
356 | doctor_id_soup = soup.find("span", {"class": "space_b_url"})
357 | update_url["qa_doctor"] = doctor_id_soup.string
358 | # 更新title
359 | title_soup = soup.find("h1", {"class": "fl f20 fn fyahei pl20 bdn"})
360 | if title_soup is None:
361 | title_soup = soup.find("div", {"class": "fl-title ellps"})
362 | update_url["qa_title"] = title_soup.string
363 | except AttributeError:
364 | # 2代表网页异常
365 | update_url["qa_status"] = '2'
366 | update_conn.connectMysql(table="all_url")
367 | update_conn.update_database(datadict=update_url, situation="WHERE qa_number = '%s'" % qa_number)
368 | return
369 |
370 | # 解析相关问答、文章、疾病
371 | relative_soups = soup.find_all("div", {"class": "mt20 w670 bg_w zzx_t_repeat"})
372 | split_relative(qa_number=qa_number, relative_soups=relative_soups)
373 |
374 | # 解析QA
375 | qa_list = 1
376 | # 默认第一种解析方式{"class": "zzx_yh_stream"}
377 | split_type = 1
378 | qa_content_soups = soup.find_all("div", {"class": "zzx_yh_stream"})
379 |
380 | # 第二种解析方式{"class": "f-card clearfix js-f-card"}
381 | if len(qa_content_soups) == 0:
382 | split_type = 3
383 | print("第二种解析方式")
384 | qa_content_soups = soup.find_all("div", {"class": "f-card clearfix js-f-card"})
385 |
386 | # 出现了新的网站结构。需要手动解析
387 | if len(qa_content_soups) == 0:
388 | split_type = 5
389 | input("未知解析方式!")
390 | try:
391 | qa_list = change_split_type(split_type, qa_number=qa_number, qa_list=qa_list, qa_soups=qa_content_soups)
392 | except AttributeError:
393 | return
394 | except TypeError:
395 | return
396 | # 获取页数,如果有,做翻页处理。
397 | page_soup = soup.find("a", {'class': 'page_turn_a', 'rel': 'true'})
398 | if page_soup is not None:
399 | page_num = page_soup.text.split("\xa0")[1]
400 | for i in range(1, int(page_num)):
401 | driver.get(url.replace(".htm", "_p_%d.htm" % (i+1)))
402 | try:
403 | soup = BeautifulSoup(driver.page_source.encode('gbk'), "lxml")
404 | except UnicodeEncodeError:
405 | # 编码异常
406 | update_url["qa_status"] = '3'
407 | update_conn.connectMysql(table="all_url")
408 | update_conn.update_database(datadict=update_url, situation="WHERE qa_number = '%s'" % qa_number)
409 | return
410 | qa_content_soups = soup.find_all("div", {"class": "zzx_yh_stream"})
411 | qa_list = change_split_type(split_type+1, qa_number=qa_number, qa_list=qa_list, qa_soups=qa_content_soups)
412 |
413 | # 更改URL的status,0代表未解析,1代表已解析,其他代表异常,并更新到数据库
414 | update_url["qa_status"] = '1'
415 | update_conn.connectMysql(table="all_url")
416 | update_conn.update_database(datadict=update_url, situation="WHERE qa_number = '%s'" % qa_number)
417 |
418 |
419 | def main():
420 |     # read the URLs of unparsed pages from the database
421 |     title_list = ["qa_number", "qa_url"]
422 |     situation = "WHERE qa_status = '0'"
423 |     select_cursor = select_conn.select_order(title_list=title_list, situation=situation)
424 |     while True:
425 |         result = select_cursor.fetchone()
426 |         if result is None:
427 |             break
428 |         qa_number, temp_url = result
429 |         # test URLs
430 |         # voice message
431 |         # temp_url = 'https://www.haodf.com/wenda/kongweimin_g_5974272953.htm'
432 |         # doctor
433 |         # temp_url = 'https://www.haodf.com/wenda/abc195366_g_5673322365.htm'
434 |         # team, second layout
435 |         # temp_url = 'https://www.haodf.com/doctorteam/flow_team_6465190653.htm'
436 |         # paginated and without related items, first layout
437 |         # temp_url = 'https://www.haodf.com/wenda/fingerprints_g_6403406888.htm'
438 |         # gift
439 |         # temp_url = "https://www.haodf.com/wenda/blueesky_g_5673307901.htm"
440 | 
441 |         # if the page was crawled before, delete the related records first
442 |         delete_situation = "WHERE qa_number = '%s'" % (str(qa_number))
443 |         update_conn.connectMysql(table="QA")
444 |         update_conn.delete_data(situation=delete_situation)
445 |         update_conn.connectMysql(table="relative_url")
446 |         update_conn.delete_data(situation=delete_situation)
447 | 
448 |         # if qa_number % 100 == 0:
449 |         #     print("taking a 30 s break")
450 |         #     time.sleep(10)
451 | 
452 |         print("---------------------------------------" * 3)
453 |         print("\tParsing URL #%s: %s" % (str(qa_number), temp_url))
454 |         split_page(str(qa_number), temp_url)
455 |         print("---------------------------------------" * 3)
456 |         # sleep for a moment, otherwise the site redirects the crawler; that exception cannot be caught yet
457 |         time.sleep(random.randint(1, 3))
458 |     # close all connections
459 |     driver.close()
460 |     select_conn.closeMysql()
461 |     update_conn.closeMysql()
462 |
463 |
464 | if __name__ == '__main__':
465 | main()
466 |
467 |
--------------------------------------------------------------------------------
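For a quick single-URL check of the layout detection and parsing in baseCode/getContent.py, one of the test URLs already listed in its main() can be fed straight to split_page. A minimal sketch, under two assumptions: the MySQL instance configured in tools/ConnectDatabase.py is reachable (split_page and split_relative write to it), and baseCode is importable as a package from the project root (otherwise run the snippet from a shell started inside baseCode/ and use `import getContent`).

```python
# Minimal single-URL test of baseCode/getContent.py (illustrative sketch only).
from baseCode import getContent  # importing opens the Chrome driver and the DB connections

# one of the test URLs listed in main(): a "team" consultation using the second layout
test_url = "https://www.haodf.com/doctorteam/flow_team_6465190653.htm"
getContent.split_page("0", test_url)   # "0" is a throwaway qa_number for the test

# clean up the module-level resources
getContent.driver.close()
getContent.select_conn.closeMysql()
getContent.update_conn.closeMysql()
```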
/baseCode/getHaodf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | # @Time    : 2019/6/16 14:17
4 | # @Author  : 王诚坤
5 | # @File    : getHaodf.py
6 | # @des     : crawl all of Haodf's online consultations and save them to the database: haodf
7 | """
8 | 
9 | from tools import ConnectDatabase as conn
10 | from selenium import webdriver
11 | import datetime
12 | import pyprind
13 | 
14 | driver = webdriver.Chrome()
15 | # database connection
16 | db_conn = conn.MySQLCommand()
17 | # start date of the crawl
18 | CURRENT_DATE = "20181216"
19 | # end date of the crawl
20 | END_DATE = "20181231"
21 | # base URL
22 | BASE_URL = "https://www.haodf.com/sitemap-zx/"
23 | 
24 | 
25 | def saveURL(current_date):
26 | 
27 |     current_page = 1
28 |     page_ele = "尾页"
29 |     while page_ele == "尾页":
30 |         print("------------------" * 3)
31 |         print("**********Reading %s, page %d************" % (current_date.strftime('%Y%m%d'), current_page))
32 |         temp_url = "%s%s_%d/" % (BASE_URL, current_date.strftime('%Y%m%d'), current_page)
33 |         driver.get(temp_url)
34 | 
35 |         # find all link elements
36 |         item = driver.find_elements_by_xpath('//li/a')
37 |         bar = pyprind.ProgBar(len(item))
38 |         for i in item:
39 |             db_url = {"qa_date": str(current_date.strftime('%Y%m%d')), "qa_status": '0',
40 |                       "qa_url": i.get_attribute('href'), "qa_title": i.text.replace("\"", "\'").replace("\\", "_")}
41 |             # save the record to the database
42 |             db_conn.insertData(db_url, primary_key="qa_url")
43 |             bar.update()
44 | 
45 |         # pagination: keep going while the last pager item is the "last page" (尾页) link
46 |         try:
47 |             page_ele = driver.find_elements_by_class_name("p_num")[-1].text
48 |         except IndexError:
49 |             break
50 |         # print(page_ele)
51 |         current_page += 1
52 | 
53 | 
54 | def getAllURL():
55 |     db_conn.connectMysql(table="all_url")
56 |     current_date = datetime.datetime.strptime(CURRENT_DATE, '%Y%m%d')
57 |     end_date = datetime.datetime.strptime(END_DATE, '%Y%m%d')
58 |     # iterate day by day
59 |     while current_date <= end_date:
60 |         saveURL(current_date)
61 |         current_date += datetime.timedelta(days=1)
62 | 
63 | 
64 | def main():
65 |     # collect all URLs
66 |     getAllURL()
67 | 
68 | 
69 | if __name__ == '__main__':
70 |     main()
71 |     driver.close()
72 |     db_conn.closeMysql()
73 |
--------------------------------------------------------------------------------
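A quick way to verify what a getHaodf.py run has collected is to count the rows in all_url per status. The sketch below reuses the MySQLCommand helper from tools/ConnectDatabase.py; it assumes the same database settings and is for inspection only.

```python
# Sanity check after a getHaodf.py run: count the collected URLs per status (illustrative sketch).
from tools import ConnectDatabase as conn

db = conn.MySQLCommand()
db.connectMysql(table="all_url")
db.cursor.execute("SELECT qa_status, COUNT(*) FROM all_url GROUP BY qa_status")
for status, count in db.cursor.fetchall():
    print(status, count)   # e.g. '0' = collected but not parsed yet
db.closeMysql()
```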
/baseCode/study.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | # @Time    : 2019/7/4 17:46
4 | # @Author  : 王诚坤
5 | # @File    : study.py
6 | # @des     :
7 | """
8 | import threading
9 | import time
10 | 
11 | # a global shared counter
12 | g_num = 0
13 | 
14 | 
15 | def test1(num):
16 |     global g_num
17 | 
18 |     for i in range(num):
19 |         mutex.acquire()  # lock; keep the locked region as small as possible
20 |         g_num += 1
21 |         mutex.release()  # unlock
22 | 
23 |     print("-----in test1 g_num=%d----" % g_num)
24 | 
25 | 
26 | def test2(num):
27 |     global g_num
28 |     for i in range(num):
29 |         mutex.acquire()  # lock
30 |         g_num += 1
31 |         mutex.release()  # unlock
32 |     print("-----in test2 g_num=%d----" % g_num)
33 | 
34 | 
35 | # create a mutex; it starts out unlocked
36 | mutex = threading.Lock()
37 | 
38 | 
39 | def main():
40 |     t1 = threading.Thread(target=test1, args=(1000000,))
41 |     t2 = threading.Thread(target=test2, args=(1000000,))
42 | 
43 |     t1.start()
44 |     t2.start()
45 | 
46 |     # wait for the two threads above to finish...
47 |     time.sleep(2)
48 | 
49 |     print("-----in main Thread g_num = %d---" % g_num)
50 | 
51 | 
52 | if __name__ == "__main__":
53 |     main()
54 |
--------------------------------------------------------------------------------
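study.py waits for its worker threads with a fixed time.sleep(2); the idiomatic way is to join() the threads, which blocks exactly until they finish. A minimal variant of its main(), reusing test1, test2 and g_num from the same file:

```python
import threading

def main():
    t1 = threading.Thread(target=test1, args=(1000000,))
    t2 = threading.Thread(target=test2, args=(1000000,))
    t1.start()
    t2.start()

    # join() blocks until each thread has finished, so no fixed sleep is needed
    t1.join()
    t2.join()

    print("-----in main Thread g_num = %d---" % g_num)
```

This is the same pattern multi_start.py already uses for its worker threads.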
/debug.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloAtilol/haodf/f34a275bad73cd7ff105ba5ea1d937ebb01819e5/debug.log
--------------------------------------------------------------------------------
/multi_haodf/__pycache__/getContent.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloAtilol/haodf/f34a275bad73cd7ff105ba5ea1d937ebb01819e5/multi_haodf/__pycache__/getContent.cpython-37.pyc
--------------------------------------------------------------------------------
/multi_haodf/getContent.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | # @Time    : 2019/6/16 18:34
4 | # @Author  : 王诚坤
5 | # @File    : getContent.py
6 | # @des     : parse the consultation pages
7 | """
8 | 
9 | from tools import ConnectDatabase as conn
10 | from bs4 import BeautifulSoup
11 | from selenium import webdriver
12 | import time
13 | import random
14 | import threading
15 | from tools.Logger import Logger
16 | 
17 | # database connection
18 | update_conn = conn.MySQLCommand()
19 | 
20 | # a mutex shared by all threads so that database access stays consistent
21 | lock = threading.Lock()
22 | 
23 | # set up the logger
24 | logger = Logger("haodf").getLog()
25 |
26 |
27 | def split_relative(qa_number, relative_soups):
28 | """
29 | 负责解析相关问答、文章和疾病
30 | :param qa_number: 访问页面对应的编号
31 | :param relative_soups: 需要解析的Beautifulsoup对象
32 | :return: None
33 | """
34 | for relative_soup in relative_soups:
35 | # print(relative_soup)
36 | r_text = relative_soup.p.find("span", {"class": "fl f18"}).text
37 | # 相关标签,0代表问答;1代表文章;2代表疾病
38 | if "相关回复" in r_text:
39 | tag = '0'
40 | elif "相关文章" in r_text:
41 | tag = '1'
42 | elif "相关疾病" in r_text:
43 | tag = '2'
44 | rq_lis = relative_soup.find_all("li")
45 | for rq_li in rq_lis:
46 | result = {"rela_tag": tag, "rela_title": rq_li.a.string, "rela_url": rq_li.a["href"][2:],
47 | "qa_number": qa_number}
48 | lock.acquire()
49 | # 更换数据存储的表名
50 | update_conn.connectMysql(table="relative_url")
51 | update_conn.insertData(data_dict=result, primary_key="rela_url")
52 | lock.release()
53 |
54 |
55 | def split_content_1(qa_number, qa_list, qa_soups):
56 | """
57 | 解析第一种内容的第一条,class=“zzx_yh_stream”
58 | :param qa_list: 问答的标签
59 | :param qa_number: url编号
60 | :param qa_soups: 问答的Beautifulsoup对象
61 | :return: qa_list 在一个问答中发言的编号
62 | """
63 |
64 | # 取出第一个提问
65 | first_describe = qa_soups[0]
66 |
67 | is_describe = first_describe.find("div", {"class": "h_s_cons_info_top"})
68 |
69 | if is_describe is None:
70 | # 如果第一条不是病情描述
71 | qa_list = split_soups_1(qa_number, qa_list, (first_describe,))
72 | try:
73 | next_soups = qa_soups[1:]
74 | return split_content_1(qa_number, qa_list, next_soups)
75 | except IndexError:
76 | return qa_list
77 |
78 | # 患者状态和咨询时间
79 | user_soup = first_describe.find("div", {"class": "stream_yh_left"})
80 | # 状态
81 | patient_status = user_soup.find("div", {"class": "yh_l_states"}).span.string
82 | # 发表时间
83 | qa_time = user_soup.find("div", {"class": "yh_l_times"}).string
84 | # qa_author: 0是患者,1是医生
85 | first_result = {"qa_number": qa_number, "qa_author": '0', "patient_status": patient_status, "qa_time": qa_time}
86 |
87 | # 咨询内容
88 | content = ""
89 | describe = first_describe.find("div", {"class": "h_s_info_cons"})
90 |
91 | for child in describe.children:
92 | if child.name is None or child.name == "script":
93 | continue
94 | if child.text is "" or "本人登录后可见" in child.text:
95 | continue
96 | content += (child.text.replace("\n", "") + "\n")
97 | # 去掉最后一个\n
98 | first_result["qa_content"] = content[:-1]
99 |
100 | # 将第一条数据加入到对话流
101 | qa_list.append(first_result)
102 | qa_soups.remove(first_describe)
103 |
104 | # qa_list 在一个问答中发言的编号
105 | return split_soups_1(qa_number, qa_list, qa_soups)
106 |
107 |
108 | def split_soups_1(qa_number, qa_list, qa_soups):
109 | """
110 | 解析第一种内容的后续内容class=“zzx_yh_stream”
111 | :param qa_number: url的编号
112 | :param qa_list: 内容在问答种的编号
113 | :param qa_soups: 需要解析的soup的list
114 | :return: qa_list 在一个问答中发言的编号
115 | """
116 |
117 | for qa_soup in qa_soups:
118 | # 时间
119 | qa_time = qa_soup.find("div", {"class": "yh_l_times"}).string
120 | # 结果字典初始化
121 | result = {"qa_number": qa_number, "qa_time": qa_time, "qa_tag": ""}
122 | # 判断是医生发言还是患者发言
123 | if "yi" in qa_soup.find("div", {"class": "yh_r_t_icon"}).img["src"]:
124 | # 医生发言对应的处理过程
125 | qa_author = '1'
126 | result["qa_author"] = qa_author
127 | # 判断是否是语音
128 | try:
129 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title"})
130 | if content_soup is None:
131 | content_soup = qa_soup.find("p", {"class": "h_s_cons_main mb10"})
132 | if content_soup is not None:
133 | result["qa_tag"] += qa_soup.find("h3", {"class": "h_s_docs_title mb10 ml10"}).text
134 | content = content_soup.text
135 | except AttributeError:
136 | # 语音的处理方法
137 | content_soup = qa_soup.find("div", {"class": "yy_vioce_box"})
138 | if content_soup is not None:
139 | content = content_soup["src"][2:]
140 | else:
141 | content = ""
142 | result["qa_tag"] += "语音"
143 | result["qa_content"] = content
144 | else:
145 | # 患者提问的解析办法
146 | qa_author = '0'
147 | result["qa_author"] = qa_author
148 | content = ""
149 | # 使用最多的结构
150 | content_soup = qa_soup.find("pre", {"class": "h_s_cons_main"})
151 | # 开药方请求
152 | if content_soup is None:
153 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title iconyaofang"})
154 | # 送礼物
155 | if content_soup is None:
156 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title gifts"})
157 | if content_soup is not None:
158 | result["qa_tag"] = "送礼物"
159 | content_soup = qa_soup.find("p", {"class": "h_s_cons_main"})
160 | # 感谢信
161 | if content_soup is None:
162 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title iconmails"})
163 | if content_soup is not None:
164 | result["qa_tag"] = content_soup.text
165 | content_soup = qa_soup.find("p", {"class": "pt5 wb"}).span
166 | # 上传视频
167 | if content_soup is None:
168 | content_soup = qa_soup.find("span", {"class": "fl bingli_hide_word"})
169 |
170 | # 再次上传病历
171 | if content_soup is None:
172 | # 判断是不是有病例专有头部标签
173 | content_soup = qa_soup.find("div", {"class": "h_s_cons_info_top"})
174 | if content_soup is not None:
175 | qa_list = split_content_1(qa_number, qa_list, [qa_soup])
176 |
177 | # 患者Tag
178 | if content_soup is None:
179 | content_soup = qa_soup.find("h3", {"class": "h_s_cons_title"})
180 | if content_soup is not None:
181 | result["qa_tag"] = content_soup.text
182 | h_s_cons = qa_soup.find("div", {"class": "h_s_cons"})
183 | if h_s_cons is not None:
184 | for p_soup in h_s_cons.children:
185 | if p_soup.name != "p":
186 | continue
187 | content += p_soup.text
188 |
189 | # 以上方式都未检测到,需要添加解析方式
190 | if content_soup is None:
191 | input("患者出现新的解析方式")
192 | if len(content) > 0:
193 | result["qa_content"] = content
194 | else:
195 | result["qa_content"] = content_soup.text
196 |
197 | # 患者状态
198 | patient_status = qa_soup.find("div", {"class": "yh_l_states"}).span.string
199 | result["patient_status"] = patient_status
200 |
201 | # 将解析出来的结果加入对话流
202 | qa_list.append(result)
203 |
204 | return qa_list
205 |
206 |
207 | def split_content_2(qa_number, qa_list, qa_soups):
208 | """
209 | 解析第二种内容的第一条,class=“f-card clearfix js-f-card”
210 | :param qa_list: 对话流
211 | :param qa_number: url编号
212 | :param qa_soups: 问答的Beautifulsoup对象
213 | :return: qa_list 在一个问答中发言的编号
214 | """
215 |
216 | # 取出第一个提问内容
217 | first_describe = qa_soups[0]
218 |
219 | # 患者状态和咨询时间
220 | user_soup = first_describe.find("div", {"class": "f-c-left"})
221 | # 状态
222 | patient_status = user_soup.find("div", {"class": "f-c-l-status"}).span.string
223 | # 发表时间
224 | qa_time = user_soup.find("div", {"class": "f-c-l-date"}).string
225 | # qa_author: 0是患者,1是医生
226 | first_result = {"qa_number": qa_number, "qa_author": '0',
227 | "patient_status": patient_status, "qa_time": qa_time}
228 |
229 | # 咨询内容
230 | content = ""
231 | describe = first_describe.find("div", {"class": "f-c-r-wrap"})
232 | for child in describe.children:
233 | if child.name == "h4" or child.name == "p":
234 | content += (child.text+"\n")
235 | # 去掉最后一个\n
236 | first_result["qa_content"] = content[:-1]
237 |
238 | lock.acquire()
239 | # 更换数据存储的表名
240 | update_conn.connectMysql(table="QA")
241 | update_conn.insertData(first_result, primary_key="")
242 | lock.release()
243 |
244 | # 加入对话流
245 | qa_list.append(first_result)
246 |
247 | # 从list中去掉第一个
248 | qa_soups.remove(first_describe)
249 |
250 | # qa_list 在一个问答中发言的编号
251 | return split_soups_2(qa_number, qa_list, qa_soups)
252 |
253 |
254 | def split_soups_2(qa_number, qa_list, qa_soups):
255 | """
256 | 解析第一种内容的后续内容class=“f-card clearfix js-f-card”
257 | :param qa_number: url的编号
258 | :param qa_list: 内容在问答种的编号
259 | :param qa_soups: 需要解析的soup的list
260 | :return: qa_list 在一个问答中发言的编号
261 | """
262 | for qa_soup in qa_soups:
263 | # 时间
264 | qa_time = qa_soup.find("div", {"class": "f-c-l-date"}).string
265 | # 结果字典初始化
266 | result = {"qa_number": qa_number, "qa_time": qa_time, "qa_tag": ""}
267 | # 判断是医生还是患者
268 | if "doctor" in qa_soup.find("img", {"class": "f-c-r-usertype"})["src"]:
269 | # 医生的处理方式
270 | qa_author = '1'
271 | result["qa_author"] = qa_author
272 | qa_tag_soup = qa_soup.find("h2", {"class": "f-c-r-w-title"})
273 | # 添加标签(一问一答、图文问诊)
274 | if qa_tag_soup is not None:
275 | result["qa_tag"] += qa_tag_soup.text
276 | # 判断是不是语音
277 | try:
278 | content_soup = qa_soup.find("h4", {"class": "f-c-r-w-subtitle"})
279 | if content_soup is None:
280 | content_soup = qa_soup.find("p", {"class": "f-c-r-doctext"})
281 | # 医生回答中出现了新的结构,需要手动更新
282 | if content_soup is None:
283 | print(qa_soup.find("div", {"class": "f-c-r-wrap"}))
284 | input("医生出现了新的解析方式,可能是语音,重新查看")
285 | content = content_soup.text.replace("\t", "").replace("\n", "").replace(" ", "")
286 | except AttributeError:
287 | # 这里暂时没有找到对应的语音,出现了语音,重新编写
288 | input("出现了语音!")
289 | content = qa_soup.find("div", {"class": "yy_vioce_box"})["src"][2:]
290 | result["qa_tag"] += "语音"
291 | result["qa_content"] = content
292 | else:
293 | # 患者的处理方式
294 | qa_author = '0'
295 | result["qa_author"] = qa_author
296 | content = qa_soup.find("p", {"class": "f-c-r-w-text"}).text
297 | result["qa_content"] = content.replace("\t", "").replace("\n", "").replace(" ", "")
298 |
299 | # 患者状态
300 | patient_status = qa_soup.find("div", {"class": "f-c-l-status"}).span.string
301 | result["patient_status"] = patient_status
302 |
303 |         # append the parsed result to the conversation stream
304 |         qa_list.append(result)
305 | 
306 | 
307 | 
308 |     return qa_list
309 |
310 |
311 | def change_split_type(split_type, qa_number, qa_list, qa_soups):
312 | """
313 | 根据解析类型调用不同的解析方式
314 | :param split_type: 解析类型
315 | :param qa_number: url对应的编号
316 | :param qa_list: 对话流
317 | :param qa_soups: soup的list集合
318 | :return: qa_list 在一个问答中发言的编号
319 | """
320 | if split_type is 1:
321 | # 解析首页
322 | return split_content_1(qa_number, qa_list, qa_soups)
323 | elif split_type is 2:
324 | # 解析后续页面
325 | return split_soups_1(qa_number, qa_list, qa_soups)
326 |
327 | elif split_type is 3:
328 | return split_content_2(qa_number, qa_list, qa_soups)
329 | elif split_type is 4:
330 | return split_soups_2(qa_number, qa_list, qa_soups)
331 | else:
332 | raise IndexError("没有对应的解析方式。需要查看URL对应的解析方式。")
333 |
334 |
335 | def split_page(driver, qa_number, url):
336 | """
337 | 根据url进行解析
338 | :param driver: 浏览器组件
339 | :param qa_number: url对应的数据库编号
340 | :param url: url
341 | :return: None
342 | """
343 |
344 | # 判断是问答是医生还是团队, 0是医生,1是团队
345 | if "wenda" in url:
346 | doctor_patient = url.replace("https://www.haodf.com/wenda/", "").replace(".htm", "")
347 | try:
348 | _, _, patient = doctor_patient.split("_")
349 | except ValueError:
350 | _, _, _, patient = doctor_patient.split("_")
351 | update_url = {"qa_patient": patient, "qa_type": '0'}
352 | elif "flow_team" in url:
353 | doctor_patient = url.replace("https://www.haodf.com/doctorteam/", "").replace(".htm", "")
354 | _, _, patient = doctor_patient.split("_")
355 | update_url = {"qa_patient": patient, "qa_type": '1'}
356 | else:
357 | print("出现新的URL方式,请手动解析!")
358 |
359 | # 将页面变为Beautisoup对象
360 | driver.get(url)
361 | soup = BeautifulSoup(driver.page_source.encode('gbk', errors='ignore'), "lxml")
362 |
363 | # 判断页面是否存在
364 | try:
365 | # 更新医生id
366 | doctor_id_soup = soup.find("span", {"class": "space_b_url"})
367 | update_url["qa_doctor"] = doctor_id_soup.string
368 | # 更新title
369 | title_soup = soup.find("h1", {"class": "fl f20 fn fyahei pl20 bdn"})
370 | if title_soup is None:
371 | title_soup = soup.find("div", {"class": "fl-title ellps"})
372 | update_url["qa_title"] = title_soup.string
373 | except AttributeError:
374 | # 2代表网页异常
375 | lock.acquire()
376 | update_url["qa_status"] = '2'
377 | update_conn.connectMysql(table="all_url")
378 | update_conn.update_database(datadict=update_url, situation="WHERE qa_number = '%s'" % qa_number)
379 | lock.release()
380 | return
381 |
382 | # 解析相关问答、文章、疾病
383 | relative_soups = soup.find_all("div", {"class": "mt20 w670 bg_w zzx_t_repeat"})
384 | split_relative(qa_number=qa_number, relative_soups=relative_soups)
385 |
386 | # 解析QA
387 | qa_list = []
388 | # 默认第一种解析方式{"class": "zzx_yh_stream"}
389 | split_type = 1
390 | qa_content_soups = soup.find_all("div", {"class": "zzx_yh_stream"})
391 |
392 | # 第二种解析方式{"class": "f-card clearfix js-f-card"}
393 | if len(qa_content_soups) == 0:
394 | split_type = 3
395 | # print("第二种解析方式")
396 | qa_content_soups = soup.find_all("div", {"class": "f-card clearfix js-f-card"})
397 | # 出现了新的网站结构。需要手动解析
398 | if len(qa_content_soups) == 0:
399 | split_type = 5
400 | input("未知解析方式!")
401 | try:
402 | qa_list = change_split_type(split_type, qa_number=qa_number, qa_list=qa_list, qa_soups=qa_content_soups)
403 | # 获取页数,如果有,做翻页处理。
404 | page_soup = soup.find("a", {'class': 'page_turn_a', 'rel': 'true'})
405 | if page_soup is not None:
406 | page_num = page_soup.text.split("\xa0")[1]
407 | for i in range(1, int(page_num)):
408 | # 对之后的页面解析
409 | driver.get(url.replace(".htm", "_p_%d.htm" % (i + 1)))
410 | soup = BeautifulSoup(driver.page_source.encode('gbk'), "lxml")
411 | qa_content_soups = soup.find_all("div", {"class": "zzx_yh_stream"})
412 | qa_list = change_split_type(split_type + 1, qa_number=qa_number, qa_list=qa_list,
413 | qa_soups=qa_content_soups)
414 | lock.acquire()
415 | tag = 1
416 | for ql in qa_list:
417 | # 将数据保存到数据库
418 | ql["qa_list"] = tag
419 | update_conn.connectMysql(table="QA")
420 | update_conn.insertData(ql, primary_key="")
421 | tag += 1
422 | lock.release()
423 | qa_status = '1'
424 | except AttributeError as e:
425 | logger.error("%d, %s,\t %s \n" % (qa_number, url, e))
426 | qa_status = '4'
427 | except TypeError as e:
428 | logger.error("%d, %s,\t %s \n" % (qa_number, url, e))
429 | qa_status = '4'
430 | except Exception as e:
431 | logger.error("%d, %s,\t %s \n" % (qa_number, url, e))
432 | qa_status = '5'
433 |     finally:
434 |         # update the URL status:
435 |         # 0 = not parsed yet;
436 |         # 1 = parsed;
437 |         # 2 = page error;
438 |         # 3 = encoding error;
439 |         # 4 = parse error;
440 |         # 5 = other error
441 | lock.acquire()
442 | update_url["qa_status"] = qa_status
443 | update_conn.connectMysql(table="all_url")
444 | update_conn.update_database(datadict=update_url, situation="WHERE qa_number = '%s'" % qa_number)
445 | lock.release()
446 |
447 |
448 | def start(number_urls):
449 |     """
450 |     Entry point used by the worker threads.
451 |     :param number_urls: the (qa_number, url) pairs to parse
452 |     :return: None
453 |     """
454 |     option = webdriver.ChromeOptions()
455 |     option.add_argument('headless')
456 |     driver = webdriver.Chrome(options=option)
457 | 
458 |     for res in number_urls:
459 |         if res is None:
460 |             break
461 |         qa_number, temp_url = res
462 | 
463 |         # progress output
464 |         # print("---------------------------------------" * 3)
465 |         # print("\tParsing URL #%s: %s" % (str(qa_number), temp_url))
466 | 
467 |         # if the page was crawled before, delete the related records first
468 |         lock.acquire()
469 |         delete_situation = "WHERE qa_number = '%s'" % (str(qa_number))
470 |         update_conn.connectMysql(table="QA")
471 |         update_conn.delete_data(situation=delete_situation)
472 |         update_conn.connectMysql(table="relative_url")
473 |         update_conn.delete_data(situation=delete_situation)
474 |         lock.release()
475 | 
476 |         # if qa_number % 100 == 0:
477 |         #     print("taking a 30 s break")
478 |         #     time.sleep(10)
479 | 
480 |         split_page(driver, str(qa_number), temp_url)
481 |         # print("---------------------------------------" * 3)
482 |         # sleep for a moment, otherwise the site redirects the crawler; that exception cannot be caught yet
483 |         # time.sleep(1)
484 |         logger.info(temp_url + "\tcompleted.")
485 |     driver.close()
486 |
487 |
488 | if __name__ == '__main__':
489 | 
490 |     # test URLs
491 |     # voice message
492 |     # temp_urls = 'https://www.haodf.com/wenda/kongweimin_g_5974272953.htm'
493 |     # doctor
494 |     # temp_urls = 'https://www.haodf.com/wenda/abc195366_g_5673322365.htm'
495 |     # team, second layout
496 |     # temp_urls = 'https://www.haodf.com/doctorteam/flow_team_6465190653.htm'
497 |     # paginated and without related items, first layout
498 |     # temp_urls = 'https://www.haodf.com/wenda/fingerprints_g_6403406888.htm'
499 |     # gift
500 |     temp_urls = "https://www.haodf.com/wenda/wanghuigk_g_5737507776.htm"
501 |     start(((148990, temp_urls),))
502 |     update_conn.closeMysql()
503 |
504 |
--------------------------------------------------------------------------------
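start() above closes its WebDriver only when the loop finishes normally; if split_page lets an exception escape, the headless Chrome process is left running. A common pattern is to wrap the loop in try/finally, sketched here against the same function (illustrative only; the per-URL database cleanup and lock handling are omitted for brevity, and driver.quit() is used because it shuts the browser process down completely):

```python
def start(number_urls):
    option = webdriver.ChromeOptions()
    option.add_argument('headless')
    driver = webdriver.Chrome(options=option)
    try:
        for res in number_urls:
            if res is None:
                break
            qa_number, temp_url = res
            split_page(driver, str(qa_number), temp_url)
            logger.info(temp_url + "\tcompleted.")
    finally:
        # always shut the browser down, even if a URL raises
        driver.quit()
```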
/multi_haodf/multi_start.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | # @Time : 2019/7/4 18:27
4 | # @Author : 王诚坤
5 | # @File : multi_start.py
6 | # @des : Multithreaded launcher
7 | """
8 |
9 | from multi_haodf import getContent
10 | import threading
11 | from tools import ConnectDatabase as conn
12 |
13 |
14 | def main():
15 | # Database connection for queries
16 | select_conn = conn.MySQLCommand()
17 | select_conn.connectMysql(table="all_url")
18 |
19 | # Decide the number of threads
20 | num = int(input("Threading Number:\t"))
21 |
22 | print("Current Number:\t %d" % num)
23 | # Get the worker start function
24 | start = getContent.start
25 |
26 | # Query
27 | title_list = ["qa_number", "qa_url"]
28 | situation = "WHERE qa_status = '0' or qa_status ='3'"
29 | select_cursor = select_conn.select_order(title_list=title_list, situation=situation)
30 |
31 | # Flag that controls the loop
32 | tag = True
33 |
34 | while tag:
35 | th_list = []
36 | for i in range(num):
37 | temp_result = select_cursor.fetchmany(10)
38 | if not temp_result:  # fetchmany() returns an empty tuple once the cursor is exhausted
39 | tag = False
40 | break
41 | th = threading.Thread(target=start, args=(temp_result,))
42 | th.start()
43 | th_list.append(th)
44 |
45 | # Make the main thread wait until all worker threads have finished
46 | for t in th_list:
47 | t.join()
48 |
49 | # Close the database connection
50 | select_conn.closeMysql()
51 |
52 |
53 | if __name__ == '__main__':
54 | main()
55 |
56 |
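57 | # --- Usage sketch (illustrative only; not part of the original script) ---
58 | # One way to launch this module is with `python -m multi_haodf.multi_start` from the
59 | # project root, assuming the root is on the import path. With a thread count of 2,
60 | # each pass of the while-loop above hands every thread a batch of up to 10
61 | # (qa_number, qa_url) rows from fetchmany(10) and join()s them before fetching more:
62 | #
63 | #   $ python -m multi_haodf.multi_start
64 | #   Threading Number:   2
65 | #   Current Number:  2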
--------------------------------------------------------------------------------
/source/haodf.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloAtilol/haodf/f34a275bad73cd7ff105ba5ea1d937ebb01819e5/source/haodf.jpg
--------------------------------------------------------------------------------
/tools/ConnectDatabase.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Database access class, providing the connectMysql method
5 | author:王诚坤
6 | date:2018/10/16
7 | update: 2019/03/20
8 | """
9 |
10 | import pymysql
11 | import csv
12 |
13 |
14 | class MySQLCommand(object):
15 | # Initialize the class
16 | def __init__(self):
17 | # Database host
18 | self.host = '192.168.1.181'
19 | # Port
20 | self.port = 3306
21 | # Username
22 | self.user = 'sim509'
23 | # Password
24 | self.password = 'sim509'
25 | # Database name
26 | self.db = 'haodf'
27 | self.conn = pymysql.connect(host=self.host, port=self.port, user=self.user,
28 | passwd=self.password, db=self.db, charset='utf8')
29 | self.cursor = self.conn.cursor()
30 |
31 | def connectMysql(self, table='topic'):
32 | """
33 | Select the table to operate on (the connection itself is created in __init__)
34 | :return:
35 | """
36 | try:
37 | self.table = table
38 | # print(self.table, "table connected!")
39 | except pymysql.Error as e:
40 | print('Failed to connect to the database!')
41 | print(e)
42 |
43 | def insertData(self, data_dict, primary_key=''):
44 | """
45 | Insert a record into the database; it is checked first and skipped if it already exists
46 | :param data_dict: dict of column names to values to insert
47 | :param primary_key: primary key column used for the existence check
48 | :return:
49 | """
50 | # Check whether the record already exists
51 | if primary_key != "":
52 | sqlExit = "SELECT %s FROM %s WHERE %s = '%s' "\
53 | % (primary_key, self.table, primary_key, data_dict[primary_key])
54 | # Run the lookup query
55 | # print(sqlExit)
56 | res = self.cursor.execute(sqlExit)
57 | if res:
58 | # print('Record is already in the database', res)
59 | return 0
60 | # The record does not exist, so perform the insert
61 | try:
62 | # Join the column names
63 | cols = ','.join(data_dict.keys())
64 | # Join the corresponding values
65 | values = '","'.join([str(x).replace("\"", "_").replace("\'", "_") for x in data_dict.values()])
66 | # The INSERT statement
67 | sql = "INSERT INTO %s (%s) VALUES (%s)" % (self.table, cols, '"' + values + '"')
68 | # print(sql)
69 | try:
70 | # Execute the insert
71 | result = self.cursor.execute(sql)
72 | insert_id = self.conn.insert_id()
73 | self.conn.commit()
74 |
75 | if result:
76 | # print('Insert succeeded', insert_id)
77 | return insert_id + 1
78 | except pymysql.Error as e:
79 | # Roll back if an error occurs
80 | self.conn.rollback()
81 | if "key 'PRIMARY'" in e.args[1]:
82 | print('Record already exists; not inserted again!')
83 | else:
84 | print("Insert failed, reason %d: %s" % (e.args[0], e.args[1]))
85 | except pymysql.Error as e:
86 | print("Database error, reason %d: %s" % (e.args[0], e.args[1]))
87 |
88 | def select_order(self, title_list, situation='', order_title="", order_type='ASC'):
89 | """
90 | Select certain columns from the table
91 | :param order_type: sort direction, ASC for ascending, DESC for descending;
92 | :param order_title: name of the column to sort by;
93 | :param situation: the condition clause, i.e. the WHERE clause;
94 | :param title_list: names of the columns to select;
95 | :return: the cursor holding the query results;
96 | """
97 | title = ','.join(title_list)
98 | if order_title != "":
99 | order_title = "ORDER BY %s %s" % (order_title, order_type)
100 | sql = "SELECT %s FROM %s %s %s ;" % (title, self.table, situation, order_title)
101 | self.cursor.execute(sql)
102 | return self.cursor
103 |
104 | def select_distinct(self, title="talker"):
105 | """
106 | Get all distinct values of a column (e.g. all group IDs via talker)
107 | :param title: defaults to talker; any other column can be used instead;
108 | :return: tuple of query results;
109 | """
110 | sqlChatRoom = "SELECT DISTINCT %s FROM %s;" % (title, self.table)
111 | res = self.cursor.execute(sqlChatRoom)
112 | if res:
113 | result = self.cursor.fetchall()
114 | return result
115 | else:
116 | raise Exception("%s has no content!" % title)
117 |
118 | def update_database(self, datadict, situation):  # update rows matching the WHERE clause with the values in datadict
119 | part_sql = ""
120 | for key, value in datadict.items():
121 | part_sql = part_sql + "%s = '%s'," % (key.replace(" ", "_").replace("\"", "_").replace("\'", "_"),
122 | str(value).replace(" ", "_").replace("\"", "_").replace("\'", "_")
123 | .replace("\\", "_"))
124 | sql = "UPDATE %s SET %s %s" % (self.table, part_sql[0: -1], situation)
125 | # print(sql)
126 | res = self.cursor.execute(sql)
127 | self.conn.commit()
128 | return res
129 |
130 | def delete_data(self, situation):
131 | """
132 | Delete records matching the condition
133 | :param situation:
134 | :return:
135 | """
136 | sql = "DELETE FROM %s %s" % (self.table, situation)
137 | res = self.cursor.execute(sql)
138 | self.conn.commit()
139 | return res
140 |
141 | def closeMysql(self):
142 | """
143 | Close the database connection
144 | :return:
145 | """
146 | self.cursor.close()
147 | self.conn.close()
148 | print('Database connection closed!')
149 |
150 |
151 | def main():
152 | """
153 | Import data from a CSV file into the database.
154 | Only the table name and file name need to be changed. (UTF-8 encoding is recommended; otherwise decoding errors may occur.)
155 | :return:
156 | """
157 | # Initialize and connect to the database
158 | conn = MySQLCommand()
159 | conn.connectMysql(table="wechat_contact")
160 | # Read the CSV file
161 | with open('data/wechat_contact_0314.csv', 'r', encoding="UTF-8") as f:
162 | csv_file = csv.DictReader(f)
163 | i = 0
164 | while True:
165 | try:
166 | i = i + 1
167 | data_dict = next(csv_file)
168 | except UnicodeDecodeError as e:
169 | print(e)
170 | print(data_dict)
171 | break
172 | except StopIteration:
173 | print('Finished iterating!')
174 | break
175 | try:
176 | conn.insertData(data_dict, primary_key="username")
177 | except pymysql.err.ProgrammingError as e:
178 | print(e)
179 | print('Error at row ' + str(i) + '; handle it manually')
180 | print(data_dict)
181 | # Close the database connection
182 | conn.closeMysql()
183 |
184 |
185 | if __name__ == '__main__':
186 | main()
187 |
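188 | # --- Usage sketch (illustrative only; not part of the original module) ---
189 | # A minimal example of how MySQLCommand is driven by the crawler code above.
190 | # The table and column names (all_url, qa_number, qa_url, qa_status) come from
191 | # the rest of this repository; treat this as a sketch rather than a tested script.
192 | #
193 | # conn = MySQLCommand()
194 | # conn.connectMysql(table="all_url")
195 | # cursor = conn.select_order(title_list=["qa_number", "qa_url"],
196 | #                            situation="WHERE qa_status = '0'")
197 | # for qa_number, qa_url in cursor.fetchmany(5):
198 | #     conn.update_database(datadict={"qa_status": "1"},
199 | #                          situation="WHERE qa_number = '%s'" % qa_number)
200 | # conn.closeMysql()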
--------------------------------------------------------------------------------
/tools/Logger.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | # @Time : 2019/7/12 17:52
4 | # @Author : 王诚坤
5 | # @File : Logger
6 | # @des : log management
7 | """
8 |
9 | import logging
10 | import os
11 | import time
12 |
13 |
14 | class Logger(object):
15 | def __init__(self, logger):
16 | """
17 | Specify the log file path, log level, and calling file,
18 | and write the log entries to that file
19 | :param logger:
20 | """
21 |
22 | # Create a logger (recorder)
23 | # Logging is mainly handled by the Logger object; a name must be supplied when calling getLogger
24 | self.logger = logging.getLogger(logger)
25 | self.logger.setLevel(logging.DEBUG)
26 |
27 | # Create a handler that writes to the log file
28 | rq = time.strftime('%Y%m%d', time.localtime(time.time()))
29 | log_path = os.path.dirname(os.getcwd()) + '/logs/'  # note: this directory must already exist
30 | log_name = log_path + rq + '.log'  # file name
31 |
32 | # Write the log to disk
33 | fh = logging.FileHandler(log_name)
34 | fh.setLevel(logging.INFO)
35 |
36 | # Create a handler that outputs to the console
37 | ch = logging.StreamHandler()
38 | ch.setLevel(logging.INFO)
39 |
40 | # Define the output format of the handlers
41 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
42 | fh.setFormatter(formatter)
43 | ch.setFormatter(formatter)
44 |
45 | # Add the handlers to the logger
46 | self.logger.addHandler(fh)
47 | self.logger.addHandler(ch)
48 |
49 | def getLog(self):
50 | return self.logger
51 |
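52 | # --- Usage sketch (illustrative only; not part of the original module) ---
53 | # The crawler modules above obtain a logger roughly like this; the name
54 | # "getContent" is just an example. Two caveats: the ../logs/ directory must
55 | # exist beforehand (FileHandler does not create it), and constructing Logger
56 | # twice with the same name adds duplicate handlers.
57 | #
58 | # logger = Logger("getContent").getLog()
59 | # logger.info("parser started")
60 | # logger.error("parse failed: %s" % "timeout")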
--------------------------------------------------------------------------------
/tools/__pycache__/ConnectDatabase.cpython-37.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/HelloAtilol/haodf/f34a275bad73cd7ff105ba5ea1d937ebb01819e5/tools/__pycache__/ConnectDatabase.cpython-37.pyc
--------------------------------------------------------------------------------