├── .gitattributes
├── .idea
│   ├── encodings.xml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   ├── workspace.xml
│   └── yuqing_system.iml
├── .vscode
│   └── settings.json
├── README.md
├── clean
│   ├── 1.chinese_text_word_cloud.html
│   ├── 2.chinese_text_analysis.html
│   ├── 3.chinese_text_classifier.html
│   ├── news.py
│   ├── simhei.ttf
│   ├── stopwords.txt
│   ├── 出租车罢工.csv
│   ├── 地陷事件.csv
│   ├── 好一新大火.csv
│   ├── 相识度计算.py
│   └── 词频统计_LDA主题模型.py
├── dz_spider
│   ├── __init__.py
│   ├── __pycache__
│   │   └── __init__.cpython-36.pyc
│   ├── dz_spider
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── middlewares.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── common.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── __init__.cpython-36.pyc
│   │       │   ├── baidu.cpython-36.pyc
│   │       │   ├── sogou.cpython-36.pyc
│   │       │   └── toutiao.cpython-36.pyc
│   │       ├── baidu.py
│   │       ├── sogou.py
│   │       └── toutiao.py
│   ├── log
│   │   └── app.log
│   ├── run.py
│   └── scrapy.cfg
└── plan
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=Python
2 | *.css linguist-language=Python
3 | *.html linguist-language=Python
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.idea/yuqing_system.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "python.pythonPath": "/Users/yuanlang/work/python/anaconda2/envs/python3_6/bin/python"
3 | }
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # Public-opinion (yuqing) offline crawler design
3 |
4 | ### Scrapy project setup
5 | ```
6 | $ scrapy startproject dz_spider
7 | $ cd dz_spider
8 | $ scrapy genspider baidu www.baicu.com
9 | ```
10 |
11 | ### Managing the Scrapy project with SpiderKeeper
12 | ```
13 | (omitted in the original notes; a typical workflow is sketched below)
14 | ```
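
The original notes leave this section blank. For reference, a common way to run a project like this under SpiderKeeper (these are the usual packages, commands and default ports, not taken from the repository) is roughly:

```
$ pip install spiderkeeper scrapyd scrapyd-client
$ scrapyd                                        # scrapyd service on :6800
$ spiderkeeper --server=http://localhost:6800    # SpiderKeeper UI on :5000
$ cd dz_spider && scrapyd-deploy --build-egg output.egg
```

Then create a project in the SpiderKeeper UI and upload the generated egg, after which the baidu/sogou/toutiao spiders can be scheduled from the browser.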
15 |
16 | ### News body extraction with the `Article` module (clean/news.py)
17 | ### News topic classification (clean/关键字提取.py)
--------------------------------------------------------------------------------
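Note: the repository ships no requirements file. Judging from the imports across `clean/` and `dz_spider/` (and the `cpython-36` byte-code files, which point at Python 3.6), an environment roughly like the following is needed; the package names are inferred from the imported modules (for example, the `newspaper` module is published on PyPI as `newspaper3k`) and no versions are pinned anywhere in the source:

```
scrapy
newspaper3k
requests
pymysql
jieba
pandas
numpy
gensim
matplotlib
wordcloud
redis
pymongo
openpyxl
```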
/clean/news.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe News content extraction
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: news.py
8 | @author: yuanlang
9 | @time: 2019-08-06 16:04
10 | ---------------------------------------
11 | """
12 | import time
13 | from newspaper import Article
14 | import requests
15 | import pymysql
16 | import threading
17 | from queue import Queue
18 |
19 | conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994", db="yuqing_db", charset="utf8")
20 | cursor = conn.cursor()
21 | q=Queue()
22 |
23 |
24 | def download(url):
25 |
26 | try:
27 | print(f"fetch url --------> {url}")
28 | news = Article(url, language='zh')
29 | reponse = requests.get(url, verify=False,timeout=3)
30 | if reponse.status_code==404 or reponse.status_code==503:
31 | sql = "update seed set status=-1 where url='" + url + "'"
32 | print(sql)
33 | cursor.execute(sql)
34 | conn.commit()
35 | return
36 | news.set_html(reponse.content)
37 |         news.parse()  # then parse the article body
38 | text = news.text
39 | if text == "":
40 | sql = "update seed set status=-2 where url='" + url + "'"
41 | print(sql)
42 | cursor.execute(sql)
43 | conn.commit()
44 | return
45 |         # use parameterized queries: quotes inside the article text break string-built SQL
46 |         cursor.execute("insert into context(url,content) values(%s, %s)", (url, text))
47 |         print(f"inserted content for {url}")
48 |         sql = "update seed set status=1 where url=%s"
49 |         print(sql)
50 |         cursor.execute(sql, (url,))
51 | conn.commit()
52 | except Exception as e:
53 | print("exception"+str(repr(e)))
54 | if "TimeoutError" in str(e) or "HTTPSConnectionPool" in str(e) or "Exceeded 30 redirects" in str(e) \
55 | or "Max retries" in str(e) or "HTTPConnectionPool" in str(e) or "Data too long" in str(e):
56 | sql = "update seed set status=-1 where url='" + url + "'"
57 | print(sql)
58 | cursor.execute(sql)
59 | conn.commit()
60 |
61 |
62 | def spider():
63 |
64 |
65 | while True:
66 | cursor.execute("select url from seed where status = 0 limit 1")
67 | items = cursor.fetchall()
68 | for item in items:
69 | q.put(item[0])
70 | # result=[]
71 | # for i in range(20):
72 | # url = q.get()
73 | # t=threading.Thread(target=download,args=(url,))
74 | # t.start()
75 | # time.sleep(1)
76 | # result.append(t)
77 | # for t in result:
78 | # t.join()
79 | url = q.get()
80 | download(url=url)
81 | # time.sleep(1)
82 |
83 | if __name__ == "__main__":
84 | spider()
--------------------------------------------------------------------------------
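news.py polls a `seed` table and writes extracted text into a `context` table, but only the `seed` DDL appears in this dump (in dz_spider/dz_spider/common.py). The sketch below is a possible bootstrap script: the `context` column types are assumptions inferred from the insert statement in news.py, and the connection parameters are simply the ones hard-coded in the scripts.

```python
# Hypothetical schema bootstrap for news.py; the `context` DDL is an assumption,
# only the `seed` DDL comes from the repository (dz_spider/dz_spider/common.py).
import pymysql
from dz_spider.dz_spider.common import seed_table  # run from the repository root

context_table = """create table if not exists `context`(
    `url` varchar(500) not null,
    `content` mediumtext,
    `create_time` timestamp default current_timestamp,
    primary key (`url`)
)ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;
"""

conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994",
                       db="yuqing_db", charset="utf8")
with conn.cursor() as cursor:
    cursor.execute(seed_table)     # URL frontier used by the spiders and news.py
    cursor.execute(context_table)  # extracted article text, keyed by url
conn.commit()
```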
/clean/simhei.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/clean/simhei.ttf
--------------------------------------------------------------------------------
/clean/stopwords.txt:
--------------------------------------------------------------------------------
1 | !
2 | "
3 | #
4 | $
5 | %
6 | &
7 | '
8 | (
9 | )
10 | *
11 | +
12 | ,
13 | -
14 | --
15 | .
16 | ..
17 | ...
18 | ......
19 | ...................
20 | ./
21 | .一
22 | 记者
23 | 数
24 | 年
25 | 月
26 | 日
27 | 时
28 | 分
29 | 秒
30 | /
31 | //
32 | 0
33 | 1
34 | 2
35 | 3
36 | 4
37 | 5
38 | 6
39 | 7
40 | 8
41 | 9
42 | :
43 | ://
44 | ::
45 | ;
46 | <
47 | =
48 | >
49 | >>
50 | ?
51 | @
52 | A
53 | Lex
54 | [
55 | \
56 | ]
57 | 【
58 | 】
59 | ^
60 | _
61 | `
62 | exp
63 | sub
64 | sup
65 | |
66 | }
67 | ~
68 | ~~~~
69 | ·
70 | ×
71 | ×××
72 | Δ
73 | Ψ
74 | γ
75 | μ
76 | φ
77 | φ.
78 | В
79 | —
80 | ——
81 | ———
82 | ‘
83 | ’
84 | ’‘
85 | “
86 | ”
87 | ”,
88 | …
89 | ……
90 | …………………………………………………③
91 | ′∈
92 | ′|
93 | ℃
94 | Ⅲ
95 | ↑
96 | →
97 | ∈[
98 | ∪φ∈
99 | ≈
100 | ①
101 | ②
102 | ②c
103 | ③
104 | ③]
105 | ④
106 | ⑤
107 | ⑥
108 | ⑦
109 | ⑧
110 | ⑨
111 | ⑩
112 | ──
113 | ■
114 | ▲
115 |
116 | 、
117 | 。
118 | 〈
119 | 〉
120 | 《
121 | 》
122 | 》),
123 | 」
124 | 『
125 | 』
126 | 〔
127 | 〕
128 | 〕〔
129 | ㈧
130 | 一
131 | 一.
132 | 一一
133 | 一下
134 | 一个
135 | 一些
136 | 一何
137 | 一切
138 | 一则
139 | 一则通过
140 | 一天
141 | 一定
142 | 一方面
143 | 一旦
144 | 一时
145 | 一来
146 | 一样
147 | 一次
148 | 一片
149 | 一番
150 | 一直
151 | 一致
152 | 一般
153 | 一起
154 | 一转眼
155 | 一边
156 | 一面
157 | 男子
158 | 女子
159 | 七
160 | 万一
161 | 三
162 | 三天两头
163 | 三番两次
164 | 三番五次
165 | 上
166 | 上下
167 | 上升
168 | 上去
169 | 上来
170 | 上述
171 | 上面
172 | 下
173 | 下列
174 | 下去
175 | 下来
176 | 下面
177 | 不
178 | 不一
179 | 不下
180 | 不久
181 | 不了
182 | 不亦乐乎
183 | 不仅
184 | 不仅...而且
185 | 不仅仅
186 | 不仅仅是
187 | 不会
188 | 不但
189 | 不但...而且
190 | 不光
191 | 不免
192 | 不再
193 | 不力
194 | 不单
195 | 不变
196 | 不只
197 | 不可
198 | 不可开交
199 | 不可抗拒
200 | 不同
201 | 不外
202 | 不外乎
203 | 不够
204 | 不大
205 | 不如
206 | 不妨
207 | 不定
208 | 不对
209 | 不少
210 | 不尽
211 | 不尽然
212 | 不巧
213 | 不已
214 | 不常
215 | 不得
216 | 不得不
217 | 不得了
218 | 不得已
219 | 不必
220 | 不怎么
221 | 不怕
222 | 不惟
223 | 不成
224 | 不拘
225 | 不择手段
226 | 不敢
227 | 不料
228 | 不断
229 | 不日
230 | 不时
231 | 不是
232 | 不曾
233 | 不止
234 | 不止一次
235 | 不比
236 | 不消
237 | 不满
238 | 不然
239 | 不然的话
240 | 不特
241 | 不独
242 | 不由得
243 | 不知不觉
244 | 不管
245 | 不管怎样
246 | 不经意
247 | 不胜
248 | 不能
249 | 不能不
250 | 不至于
251 | 不若
252 | 不要
253 | 不论
254 | 不起
255 | 不足
256 | 不过
257 | 不迭
258 | 不问
259 | 不限
260 | 与
261 | 与其
262 | 与其说
263 | 与否
264 | 与此同时
265 | 专门
266 | 且
267 | 且不说
268 | 且说
269 | 两者
270 | 严格
271 | 严重
272 | 个
273 | 个人
274 | 个别
275 | 中小
276 | 中间
277 | 丰富
278 | 串行
279 | 临
280 | 临到
281 | 为
282 | 为主
283 | 为了
284 | 为什么
285 | 为什麽
286 | 为何
287 | 为止
288 | 为此
289 | 为着
290 | 主张
291 | 主要
292 | 举凡
293 | 举行
294 | 乃
295 | 乃至
296 | 乃至于
297 | 么
298 | 之
299 | 之一
300 | 之前
301 | 之后
302 | 之後
303 | 之所以
304 | 之类
305 | 乌乎
306 | 乎
307 | 乒
308 | 乘
309 | 乘势
310 | 乘机
311 | 乘胜
312 | 乘虚
313 | 乘隙
314 | 九
315 | 也
316 | 也好
317 | 也就是说
318 | 也是
319 | 也罢
320 | 了
321 | 了解
322 | 争取
323 | 二
324 | 二来
325 | 二话不说
326 | 二话没说
327 | 于
328 | 于是
329 | 于是乎
330 | 云云
331 | 云尔
332 | 互
333 | 互相
334 | 五
335 | 些
336 | 交口
337 | 亦
338 | 产生
339 | 亲口
340 | 亲手
341 | 亲眼
342 | 亲自
343 | 亲身
344 | 人
345 | 人人
346 | 人们
347 | 人家
348 | 人民
349 | 什么
350 | 什么样
351 | 什麽
352 | 仅
353 | 仅仅
354 | 今
355 | 今后
356 | 今天
357 | 今年
358 | 今後
359 | 介于
360 | 仍
361 | 仍旧
362 | 仍然
363 | 从
364 | 从不
365 | 从严
366 | 从中
367 | 从事
368 | 从今以后
369 | 从优
370 | 从古到今
371 | 从古至今
372 | 从头
373 | 从宽
374 | 从小
375 | 从新
376 | 从无到有
377 | 从早到晚
378 | 从未
379 | 从来
380 | 从此
381 | 从此以后
382 | 从而
383 | 从轻
384 | 从速
385 | 从重
386 | 他
387 | 他人
388 | 他们
389 | 他是
390 | 他的
391 | 代替
392 | 以
393 | 以上
394 | 以下
395 | 以为
396 | 以便
397 | 以免
398 | 以前
399 | 以及
400 | 以后
401 | 以外
402 | 以後
403 | 以故
404 | 以期
405 | 以来
406 | 以至
407 | 以至于
408 | 以致
409 | 们
410 | 任
411 | 任何
412 | 任凭
413 | 任务
414 | 企图
415 | 伙同
416 | 会
417 | 伟大
418 | 传
419 | 传说
420 | 传闻
421 | 似乎
422 | 似的
423 | 但
424 | 但凡
425 | 但愿
426 | 但是
427 | 何
428 | 何乐而不为
429 | 何以
430 | 何况
431 | 何处
432 | 何妨
433 | 何尝
434 | 何必
435 | 何时
436 | 何止
437 | 何苦
438 | 何须
439 | 余外
440 | 作为
441 | 你
442 | 你们
443 | 你是
444 | 你的
445 | 使
446 | 使得
447 | 使用
448 | 例如
449 | 依
450 | 依据
451 | 依照
452 | 依靠
453 | 便
454 | 便于
455 | 促进
456 | 保持
457 | 保管
458 | 保险
459 | 俺
460 | 俺们
461 | 倍加
462 | 倍感
463 | 倒不如
464 | 倒不如说
465 | 倒是
466 | 倘
467 | 倘使
468 | 倘或
469 | 倘然
470 | 倘若
471 | 借
472 | 借以
473 | 借此
474 | 假使
475 | 假如
476 | 假若
477 | 偏偏
478 | 做到
479 | 偶尔
480 | 偶而
481 | 傥然
482 | 像
483 | 儿
484 | 允许
485 | 元/吨
486 | 充其极
487 | 充其量
488 | 充分
489 | 先不先
490 | 先后
491 | 先後
492 | 先生
493 | 光
494 | 光是
495 | 全体
496 | 全力
497 | 全年
498 | 全然
499 | 全身心
500 | 全部
501 | 全都
502 | 全面
503 | 八
504 | 八成
505 | 公然
506 | 六
507 | 兮
508 | 共
509 | 共同
510 | 共总
511 | 关于
512 | 其
513 | 其一
514 | 其中
515 | 其二
516 | 其他
517 | 其余
518 | 其后
519 | 其它
520 | 其实
521 | 其次
522 | 具体
523 | 具体地说
524 | 具体来说
525 | 具体说来
526 | 具有
527 | 兼之
528 | 内
529 | 再
530 | 再其次
531 | 再则
532 | 再有
533 | 再次
534 | 再者
535 | 再者说
536 | 再说
537 | 冒
538 | 冲
539 | 决不
540 | 决定
541 | 决非
542 | 况且
543 | 准备
544 | 凑巧
545 | 凝神
546 | 几
547 | 几乎
548 | 几度
549 | 几时
550 | 几番
551 | 几经
552 | 凡
553 | 凡是
554 | 凭
555 | 凭借
556 | 出
557 | 出于
558 | 出去
559 | 出来
560 | 出现
561 | 分别
562 | 分头
563 | 分期
564 | 分期分批
565 | 切
566 | 切不可
567 | 切切
568 | 切勿
569 | 切莫
570 | 则
571 | 则甚
572 | 刚
573 | 刚好
574 | 刚巧
575 | 刚才
576 | 初
577 | 别
578 | 别人
579 | 别处
580 | 别是
581 | 别的
582 | 别管
583 | 别说
584 | 到
585 | 到了儿
586 | 到处
587 | 到头
588 | 到头来
589 | 到底
590 | 到目前为止
591 | 前后
592 | 前此
593 | 前者
594 | 前进
595 | 前面
596 | 加上
597 | 加之
598 | 加以
599 | 加入
600 | 加强
601 | 动不动
602 | 动辄
603 | 勃然
604 | 匆匆
605 | 十分
606 | 千
607 | 千万
608 | 千万千万
609 | 半
610 | 单
611 | 单单
612 | 单纯
613 | 即
614 | 即令
615 | 即使
616 | 即便
617 | 即刻
618 | 即如
619 | 即将
620 | 即或
621 | 即是说
622 | 即若
623 | 却
624 | 却不
625 | 历
626 | 原来
627 | 去
628 | 又
629 | 又及
630 | 及
631 | 及其
632 | 及时
633 | 及至
634 | 双方
635 | 反之
636 | 反之亦然
637 | 反之则
638 | 反倒
639 | 反倒是
640 | 反应
641 | 反手
642 | 反映
643 | 反而
644 | 反过来
645 | 反过来说
646 | 取得
647 | 取道
648 | 受到
649 | 变成
650 | 古来
651 | 另
652 | 另一个
653 | 另一方面
654 | 另外
655 | 另悉
656 | 另方面
657 | 另行
658 | 只
659 | 只当
660 | 只怕
661 | 只是
662 | 只有
663 | 只消
664 | 只要
665 | 只限
666 | 叫
667 | 叫做
668 | 召开
669 | 叮咚
670 | 叮当
671 | 可
672 | 可以
673 | 可好
674 | 可是
675 | 可能
676 | 可见
677 | 各
678 | 各个
679 | 各人
680 | 各位
681 | 各地
682 | 各式
683 | 各种
684 | 各级
685 | 各自
686 | 合理
687 | 同
688 | 同一
689 | 同时
690 | 同样
691 | 后
692 | 后来
693 | 后者
694 | 后面
695 | 向
696 | 向使
697 | 向着
698 | 吓
699 | 吗
700 | 否则
701 | 吧
702 | 吧哒
703 | 吱
704 | 呀
705 | 呃
706 | 呆呆地
707 | 呐
708 | 呕
709 | 呗
710 | 呜
711 | 呜呼
712 | 呢
713 | 周围
714 | 呵
715 | 呵呵
716 | 呸
717 | 呼哧
718 | 呼啦
719 | 咋
720 | 和
721 | 咚
722 | 咦
723 | 咧
724 | 咱
725 | 咱们
726 | 咳
727 | 哇
728 | 哈
729 | 哈哈
730 | 哉
731 | 哎
732 | 哎呀
733 | 哎哟
734 | 哗
735 | 哗啦
736 | 哟
737 | 哦
738 | 哩
739 | 哪
740 | 哪个
741 | 哪些
742 | 哪儿
743 | 哪天
744 | 哪年
745 | 哪怕
746 | 哪样
747 | 哪边
748 | 哪里
749 | 哼
750 | 哼唷
751 | 唉
752 | 唯有
753 | 啊
754 | 啊呀
755 | 啊哈
756 | 啊哟
757 | 啐
758 | 啥
759 | 啦
760 | 啪达
761 | 啷当
762 | 喀
763 | 喂
764 | 喏
765 | 喔唷
766 | 喽
767 | 嗡
768 | 嗡嗡
769 | 嗬
770 | 嗯
771 | 嗳
772 | 嘎
773 | 嘎嘎
774 | 嘎登
775 | 嘘
776 | 嘛
777 | 嘻
778 | 嘿
779 | 嘿嘿
780 | 四
781 | 因
782 | 因为
783 | 因了
784 | 因此
785 | 因着
786 | 因而
787 | 固
788 | 固然
789 | 在
790 | 在下
791 | 在于
792 | 地
793 | 均
794 | 坚决
795 | 坚持
796 | 基于
797 | 基本
798 | 基本上
799 | 处在
800 | 处处
801 | 处理
802 | 复杂
803 | 多
804 | 多么
805 | 多亏
806 | 多多
807 | 多多少少
808 | 多多益善
809 | 多少
810 | 多年前
811 | 多年来
812 | 多数
813 | 多次
814 | 够瞧的
815 | 大
816 | 大不了
817 | 大举
818 | 大事
819 | 大体
820 | 大体上
821 | 大凡
822 | 大力
823 | 大多
824 | 大多数
825 | 大大
826 | 大家
827 | 大张旗鼓
828 | 大批
829 | 大抵
830 | 大概
831 | 大略
832 | 大约
833 | 大致
834 | 大都
835 | 大量
836 | 大面儿上
837 | 失去
838 | 奇
839 | 奈
840 | 奋勇
841 | 她
842 | 她们
843 | 她是
844 | 她的
845 | 好
846 | 好在
847 | 好的
848 | 好象
849 | 如
850 | 如上
851 | 如上所述
852 | 如下
853 | 如今
854 | 如何
855 | 如其
856 | 如前所述
857 | 如同
858 | 如常
859 | 如是
860 | 如期
861 | 如果
862 | 如次
863 | 如此
864 | 如此等等
865 | 如若
866 | 始而
867 | 姑且
868 | 存在
869 | 存心
870 | 孰料
871 | 孰知
872 | 宁
873 | 宁可
874 | 宁愿
875 | 宁肯
876 | 它
877 | 它们
878 | 它们的
879 | 它是
880 | 它的
881 | 安全
882 | 完全
883 | 完成
884 | 定
885 | 实现
886 | 实际
887 | 宣布
888 | 容易
889 | 密切
890 | 对
891 | 对于
892 | 对应
893 | 对待
894 | 对方
895 | 对比
896 | 将
897 | 将才
898 | 将要
899 | 将近
900 | 小
901 | 少数
902 | 尔
903 | 尔后
904 | 尔尔
905 | 尔等
906 | 尚且
907 | 尤其
908 | 就
909 | 就地
910 | 就是
911 | 就是了
912 | 就是说
913 | 就此
914 | 就算
915 | 就要
916 | 尽
917 | 尽可能
918 | 尽如人意
919 | 尽心尽力
920 | 尽心竭力
921 | 尽快
922 | 尽早
923 | 尽然
924 | 尽管
925 | 尽管如此
926 | 尽量
927 | 局外
928 | 居然
929 | 届时
930 | 属于
931 | 屡
932 | 屡屡
933 | 屡次
934 | 屡次三番
935 | 岂
936 | 岂但
937 | 岂止
938 | 岂非
939 | 川流不息
940 | 左右
941 | 巨大
942 | 巩固
943 | 差一点
944 | 差不多
945 | 己
946 | 已
947 | 已矣
948 | 已经
949 | 巴
950 | 巴巴
951 | 带
952 | 帮助
953 | 常
954 | 常常
955 | 常言说
956 | 常言说得好
957 | 常言道
958 | 平素
959 | 年复一年
960 | 并
961 | 并不
962 | 并不是
963 | 并且
964 | 并排
965 | 并无
966 | 并没
967 | 并没有
968 | 并肩
969 | 并非
970 | 广大
971 | 广泛
972 | 应当
973 | 应用
974 | 应该
975 | 庶乎
976 | 庶几
977 | 开外
978 | 开始
979 | 开展
980 | 引起
981 | 弗
982 | 弹指之间
983 | 强烈
984 | 强调
985 | 归
986 | 归根到底
987 | 归根结底
988 | 归齐
989 | 当
990 | 当下
991 | 当中
992 | 当儿
993 | 当前
994 | 当即
995 | 当口儿
996 | 当地
997 | 当场
998 | 当头
999 | 当庭
1000 | 当时
1001 | 当然
1002 | 当真
1003 | 当着
1004 | 形成
1005 | 彻夜
1006 | 彻底
1007 | 彼
1008 | 彼时
1009 | 彼此
1010 | 往
1011 | 往往
1012 | 待
1013 | 待到
1014 | 很
1015 | 很多
1016 | 很少
1017 | 後来
1018 | 後面
1019 | 得
1020 | 得了
1021 | 得出
1022 | 得到
1023 | 得天独厚
1024 | 得起
1025 | 心里
1026 | 必
1027 | 必定
1028 | 必将
1029 | 必然
1030 | 必要
1031 | 必须
1032 | 快
1033 | 快要
1034 | 忽地
1035 | 忽然
1036 | 怎
1037 | 怎么
1038 | 怎么办
1039 | 怎么样
1040 | 怎奈
1041 | 怎样
1042 | 怎麽
1043 | 怕
1044 | 急匆匆
1045 | 怪
1046 | 怪不得
1047 | 总之
1048 | 总是
1049 | 总的来看
1050 | 总的来说
1051 | 总的说来
1052 | 总结
1053 | 总而言之
1054 | 恍然
1055 | 恐怕
1056 | 恰似
1057 | 恰好
1058 | 恰如
1059 | 恰巧
1060 | 恰恰
1061 | 恰恰相反
1062 | 恰逢
1063 | 您
1064 | 您们
1065 | 您是
1066 | 惟其
1067 | 惯常
1068 | 意思
1069 | 愤然
1070 | 愿意
1071 | 慢说
1072 | 成为
1073 | 成年
1074 | 成年累月
1075 | 成心
1076 | 我
1077 | 我们
1078 | 我是
1079 | 我的
1080 | 或
1081 | 或则
1082 | 或多或少
1083 | 或是
1084 | 或曰
1085 | 或者
1086 | 或许
1087 | 战斗
1088 | 截然
1089 | 截至
1090 | 所
1091 | 所以
1092 | 所在
1093 | 所幸
1094 | 所有
1095 | 所谓
1096 | 才
1097 | 才能
1098 | 扑通
1099 | 打
1100 | 打从
1101 | 打开天窗说亮话
1102 | 扩大
1103 | 把
1104 | 抑或
1105 | 抽冷子
1106 | 拦腰
1107 | 拿
1108 | 按
1109 | 按时
1110 | 按期
1111 | 按照
1112 | 按理
1113 | 按说
1114 | 挨个
1115 | 挨家挨户
1116 | 挨次
1117 | 挨着
1118 | 挨门挨户
1119 | 挨门逐户
1120 | 换句话说
1121 | 换言之
1122 | 据
1123 | 据实
1124 | 据悉
1125 | 据我所知
1126 | 据此
1127 | 据称
1128 | 据说
1129 | 掌握
1130 | 接下来
1131 | 接着
1132 | 接著
1133 | 接连不断
1134 | 放量
1135 | 故
1136 | 故意
1137 | 故此
1138 | 故而
1139 | 敞开儿
1140 | 敢
1141 | 敢于
1142 | 敢情
1143 | 数/
1144 | 整个
1145 | 断然
1146 | 方
1147 | 方便
1148 | 方才
1149 | 方能
1150 | 方面
1151 | 旁人
1152 | 无
1153 | 无宁
1154 | 无法
1155 | 无论
1156 | 既
1157 | 既...又
1158 | 既往
1159 | 既是
1160 | 既然
1161 | 日复一日
1162 | 日渐
1163 | 日益
1164 | 日臻
1165 | 日见
1166 | 时候
1167 | 昂然
1168 | 明显
1169 | 明确
1170 | 是
1171 | 是不是
1172 | 是以
1173 | 是否
1174 | 是的
1175 | 显然
1176 | 显著
1177 | 普通
1178 | 普遍
1179 | 暗中
1180 | 暗地里
1181 | 暗自
1182 | 更
1183 | 更为
1184 | 更加
1185 | 更进一步
1186 | 曾
1187 | 曾经
1188 | 替
1189 | 替代
1190 | 最
1191 | 最后
1192 | 最大
1193 | 最好
1194 | 最後
1195 | 最近
1196 | 最高
1197 | 有
1198 | 有些
1199 | 有关
1200 | 有利
1201 | 有力
1202 | 有及
1203 | 有所
1204 | 有效
1205 | 有时
1206 | 有点
1207 | 有的
1208 | 有的是
1209 | 有着
1210 | 有著
1211 | 望
1212 | 朝
1213 | 朝着
1214 | 末##末
1215 | 本
1216 | 本人
1217 | 本地
1218 | 本着
1219 | 本身
1220 | 权时
1221 | 来
1222 | 来不及
1223 | 来得及
1224 | 来看
1225 | 来着
1226 | 来自
1227 | 来讲
1228 | 来说
1229 | 极
1230 | 极为
1231 | 极了
1232 | 极其
1233 | 极力
1234 | 极大
1235 | 极度
1236 | 极端
1237 | 构成
1238 | 果然
1239 | 果真
1240 | 某
1241 | 某个
1242 | 某些
1243 | 某某
1244 | 根据
1245 | 根本
1246 | 格外
1247 | 梆
1248 | 概
1249 | 次第
1250 | 欢迎
1251 | 欤
1252 | 正值
1253 | 正在
1254 | 正如
1255 | 正巧
1256 | 正常
1257 | 正是
1258 | 此
1259 | 此中
1260 | 此后
1261 | 此地
1262 | 此处
1263 | 此外
1264 | 此时
1265 | 此次
1266 | 此间
1267 | 殆
1268 | 毋宁
1269 | 每
1270 | 每个
1271 | 每天
1272 | 每年
1273 | 每当
1274 | 每时每刻
1275 | 每每
1276 | 每逢
1277 | 比
1278 | 比及
1279 | 比如
1280 | 比如说
1281 | 比方
1282 | 比照
1283 | 比起
1284 | 比较
1285 | 毕竟
1286 | 毫不
1287 | 毫无
1288 | 毫无例外
1289 | 毫无保留地
1290 | 汝
1291 | 沙沙
1292 | 没
1293 | 没奈何
1294 | 没有
1295 | 沿
1296 | 沿着
1297 | 注意
1298 | 活
1299 | 深入
1300 | 清楚
1301 | 满
1302 | 满足
1303 | 漫说
1304 | 焉
1305 | 然
1306 | 然则
1307 | 然后
1308 | 然後
1309 | 然而
1310 | 照
1311 | 照着
1312 | 牢牢
1313 | 特别是
1314 | 特殊
1315 | 特点
1316 | 犹且
1317 | 犹自
1318 | 独
1319 | 独自
1320 | 猛然
1321 | 猛然间
1322 | 率尔
1323 | 率然
1324 | 现代
1325 | 现在
1326 | 理应
1327 | 理当
1328 | 理该
1329 | 瑟瑟
1330 | 甚且
1331 | 甚么
1332 | 甚或
1333 | 甚而
1334 | 甚至
1335 | 甚至于
1336 | 用
1337 | 用来
1338 | 甫
1339 | 甭
1340 | 由
1341 | 由于
1342 | 由是
1343 | 由此
1344 | 由此可见
1345 | 略
1346 | 略为
1347 | 略加
1348 | 略微
1349 | 白
1350 | 白白
1351 | 的
1352 | 的确
1353 | 的话
1354 | 皆可
1355 | 目前
1356 | 直到
1357 | 直接
1358 | 相似
1359 | 相信
1360 | 相反
1361 | 相同
1362 | 相对
1363 | 相对而言
1364 | 相应
1365 | 相当
1366 | 相等
1367 | 省得
1368 | 看
1369 | 看上去
1370 | 看出
1371 | 看到
1372 | 看来
1373 | 看样子
1374 | 看看
1375 | 看见
1376 | 看起来
1377 | 真是
1378 | 真正
1379 | 眨眼
1380 | 着
1381 | 着呢
1382 | 矣
1383 | 矣乎
1384 | 矣哉
1385 | 知道
1386 | 砰
1387 | 确定
1388 | 碰巧
1389 | 社会主义
1390 | 离
1391 | 种
1392 | 积极
1393 | 移动
1394 | 究竟
1395 | 穷年累月
1396 | 突出
1397 | 突然
1398 | 窃
1399 | 立
1400 | 立刻
1401 | 立即
1402 | 立地
1403 | 立时
1404 | 立马
1405 | 竟
1406 | 竟然
1407 | 竟而
1408 | 第
1409 | 第二
1410 | 等
1411 | 等到
1412 | 等等
1413 | 策略地
1414 | 简直
1415 | 简而言之
1416 | 简言之
1417 | 管
1418 | 类如
1419 | 粗
1420 | 精光
1421 | 紧接着
1422 | 累年
1423 | 累次
1424 | 纯
1425 | 纯粹
1426 | 纵
1427 | 纵令
1428 | 纵使
1429 | 纵然
1430 | 练习
1431 | 组成
1432 | 经
1433 | 经常
1434 | 经过
1435 | 结合
1436 | 结果
1437 | 给
1438 | 绝
1439 | 绝不
1440 | 绝对
1441 | 绝非
1442 | 绝顶
1443 | 继之
1444 | 继后
1445 | 继续
1446 | 继而
1447 | 维持
1448 | 综上所述
1449 | 缕缕
1450 | 罢了
1451 | 老
1452 | 老大
1453 | 老是
1454 | 老老实实
1455 | 考虑
1456 | 者
1457 | 而
1458 | 而且
1459 | 而况
1460 | 而又
1461 | 而后
1462 | 而外
1463 | 而已
1464 | 而是
1465 | 而言
1466 | 而论
1467 | 联系
1468 | 联袂
1469 | 背地里
1470 | 背靠背
1471 | 能
1472 | 能否
1473 | 能够
1474 | 腾
1475 | 自
1476 | 自个儿
1477 | 自从
1478 | 自各儿
1479 | 自后
1480 | 自家
1481 | 自己
1482 | 自打
1483 | 自身
1484 | 臭
1485 | 至
1486 | 至于
1487 | 至今
1488 | 至若
1489 | 致
1490 | 般的
1491 | 良好
1492 | 若
1493 | 若夫
1494 | 若是
1495 | 若果
1496 | 若非
1497 | 范围
1498 | 莫
1499 | 莫不
1500 | 莫不然
1501 | 莫如
1502 | 莫若
1503 | 莫非
1504 | 获得
1505 | 藉以
1506 | 虽
1507 | 虽则
1508 | 虽然
1509 | 虽说
1510 | 蛮
1511 | 行为
1512 | 行动
1513 | 表明
1514 | 表示
1515 | 被
1516 | 要
1517 | 要不
1518 | 要不是
1519 | 要不然
1520 | 要么
1521 | 要是
1522 | 要求
1523 | 见
1524 | 规定
1525 | 觉得
1526 | 譬喻
1527 | 譬如
1528 | 认为
1529 | 认真
1530 | 认识
1531 | 让
1532 | 许多
1533 | 论
1534 | 论说
1535 | 设使
1536 | 设或
1537 | 设若
1538 | 诚如
1539 | 诚然
1540 | 话说
1541 | 该
1542 | 该当
1543 | 说明
1544 | 说来
1545 | 说说
1546 | 请勿
1547 | 诸
1548 | 诸位
1549 | 诸如
1550 | 谁
1551 | 谁人
1552 | 谁料
1553 | 谁知
1554 | 谨
1555 | 豁然
1556 | 贼死
1557 | 赖以
1558 | 赶
1559 | 赶快
1560 | 赶早不赶晚
1561 | 起
1562 | 起先
1563 | 起初
1564 | 起头
1565 | 起来
1566 | 起见
1567 | 起首
1568 | 趁
1569 | 趁便
1570 | 趁势
1571 | 趁早
1572 | 趁机
1573 | 趁热
1574 | 趁着
1575 | 越是
1576 | 距
1577 | 跟
1578 | 路经
1579 | 转动
1580 | 转变
1581 | 转贴
1582 | 轰然
1583 | 较
1584 | 较为
1585 | 较之
1586 | 较比
1587 | 边
1588 | 达到
1589 | 达旦
1590 | 迄
1591 | 迅速
1592 | 过
1593 | 过于
1594 | 过去
1595 | 过来
1596 | 运用
1597 | 近
1598 | 近几年来
1599 | 近年来
1600 | 近来
1601 | 还
1602 | 还是
1603 | 还有
1604 | 还要
1605 | 这
1606 | 这一来
1607 | 这个
1608 | 这么
1609 | 这么些
1610 | 这么样
1611 | 这么点儿
1612 | 这些
1613 | 这会儿
1614 | 这儿
1615 | 这就是说
1616 | 这时
1617 | 这样
1618 | 这次
1619 | 这点
1620 | 这种
1621 | 这般
1622 | 这边
1623 | 这里
1624 | 这麽
1625 | 进入
1626 | 进去
1627 | 进来
1628 | 进步
1629 | 进而
1630 | 进行
1631 | 连
1632 | 连同
1633 | 连声
1634 | 连日
1635 | 连日来
1636 | 连袂
1637 | 连连
1638 | 迟早
1639 | 迫于
1640 | 适应
1641 | 适当
1642 | 适用
1643 | 逐步
1644 | 逐渐
1645 | 通常
1646 | 通过
1647 | 造成
1648 | 逢
1649 | 遇到
1650 | 遭到
1651 | 遵循
1652 | 遵照
1653 | 避免
1654 | 那
1655 | 那个
1656 | 那么
1657 | 那么些
1658 | 那么样
1659 | 那些
1660 | 那会儿
1661 | 那儿
1662 | 那时
1663 | 那末
1664 | 那样
1665 | 那般
1666 | 那边
1667 | 那里
1668 | 那麽
1669 | 部分
1670 | 都
1671 | 鄙人
1672 | 采取
1673 | 里面
1674 | 重大
1675 | 重新
1676 | 重要
1677 | 鉴于
1678 | 针对
1679 | 长期以来
1680 | 长此下去
1681 | 长线
1682 | 长话短说
1683 | 问题
1684 | 间或
1685 | 防止
1686 | 阿
1687 | 附近
1688 | 陈年
1689 | 限制
1690 | 陡然
1691 | 除
1692 | 除了
1693 | 除却
1694 | 除去
1695 | 除外
1696 | 除开
1697 | 除此
1698 | 除此之外
1699 | 除此以外
1700 | 除此而外
1701 | 除非
1702 | 随
1703 | 随后
1704 | 随时
1705 | 随着
1706 | 随著
1707 | 隔夜
1708 | 隔日
1709 | 难得
1710 | 难怪
1711 | 难说
1712 | 难道
1713 | 难道说
1714 | 集中
1715 | 零
1716 | 需要
1717 | 非但
1718 | 非常
1719 | 非徒
1720 | 非得
1721 | 非特
1722 | 非独
1723 | 靠
1724 | 顶多
1725 | 顷
1726 | 顷刻
1727 | 顷刻之间
1728 | 顷刻间
1729 | 顺
1730 | 顺着
1731 | 顿时
1732 | 颇
1733 | 风雨无阻
1734 | 饱
1735 | 首先
1736 | 马上
1737 | 高低
1738 | 高兴
1739 | 默然
1740 | 默默地
1741 | 齐
1742 | ︿
1743 | !
1744 | #
1745 | $
1746 | %
1747 | &
1748 | '
1749 | (
1750 | )
1751 | )÷(1-
1752 | )、
1753 | *
1754 | +
1755 | +ξ
1756 | ++
1757 | ,
1758 | ,也
1759 | -
1760 | -β
1761 | --
1762 | -[*]-
1763 | .
1764 | /
1765 | 0
1766 | 0:2
1767 | 1
1768 | 1.
1769 | 12%
1770 | 2
1771 | 2.3%
1772 | 3
1773 | 4
1774 | 5
1775 | 5:0
1776 | 6
1777 | 7
1778 | 8
1779 | 9
1780 | :
1781 | ;
1782 | <
1783 | <±
1784 | <Δ
1785 | <λ
1786 | <φ
1787 | <<
1788 | =
1789 | =″
1790 | =☆
1791 | =(
1792 | =-
1793 | =[
1794 | ={
1795 | >
1796 | >λ
1797 | ?
1798 | @
1799 | A
1800 | LI
1801 | R.L.
1802 | ZXFITL
1803 |
1804 | [*]
1805 | [-
1806 | []
1807 | ]
1808 | ]∧′=[
1809 | ][
1810 | _
1811 | a]
1812 | b]
1813 | c]
1814 | e]
1815 | f]
1816 | ng昉
1817 | {
1818 | {-
1819 | |
1820 | }
1821 | }>
1822 | ~
1823 | ~±
1824 | ~+
1825 | ¥
1826 | secondly
1827 | all
1828 | whose
1829 | under
1830 | sorry
1831 | four
1832 | we'll
1833 | somewhere
1834 | likely
1835 | even
1836 | above
1837 | ever
1838 | never
1839 | ZZ
1840 | hers
1841 | i'd
1842 | howbeit
1843 | i'm
1844 | theres
1845 | changes
1846 | anyhow
1847 | would
1848 | therefore
1849 | is
1850 | hereby
1851 | must
1852 | me
1853 | my
1854 | indicated
1855 | indicates
1856 | keep
1857 | far
1858 | after
1859 | hereupon
1860 | keeps
1861 | every
1862 | over
1863 | before
1864 | better
1865 | then
1866 | them
1867 | they
1868 | reasonably
1869 | each
1870 | went
1871 | mean
1872 | we'd
1873 | rd
1874 | re
1875 | got
1876 | forth
1877 | you're
1878 | little
1879 | whereupon
1880 | uses
1881 | already
1882 | another
1883 | took
1884 | second
1885 | seen
1886 | seem
1887 | relatively
1888 | thoroughly
1889 | latter
1890 | that
1891 | thorough
1892 | nobody
1893 | definitely
1894 | came
1895 | saying
1896 | specify
1897 | do
1898 | next
1899 | despite
1900 | unfortunately
1901 | twice
1902 | best
1903 | said
1904 | away
1905 | there's
1906 | unto
1907 | hopefully
1908 | seven
1909 | we
1910 | ltd
1911 | here
1912 | against
1913 | com
1914 | ZT
1915 | aren't
1916 | been
1917 | much
1918 | concerning
1919 | wish
1920 | say
1921 | near
1922 | unlikely
1923 | cant
1924 | in
1925 | ie
1926 | if
1927 | containing
1928 | beside
1929 | several
1930 | kept
1931 | whereby
1932 | whoever
1933 | the
1934 | yours
1935 | just
1936 | yes
1937 | yet
1938 | had
1939 | has
1940 | t's
1941 | possible
1942 | apart
1943 | right
1944 | old
1945 | somehow
1946 | for
1947 | everything
1948 | asking
1949 | who
1950 | of
1951 | theirs
1952 | plus
1953 | formerly
1954 | down
1955 | c's
1956 | accordingly
1957 | way
1958 | was
1959 | becoming
1960 | tell
1961 | sometime
1962 | no
1963 | whereas
1964 | nd
1965 | welcome
1966 | let's
1967 | certainly
1968 | a's
1969 | did
1970 | it'll
1971 | says
1972 | appear
1973 | alone
1974 | wherever
1975 | example
1976 | usually
1977 | nowhere
1978 | hither
1979 | regardless
1980 | everybody
1981 | thru
1982 | everywhere
1983 | can
1984 | following
1985 | want
1986 | didn't
1987 | may
1988 | such
1989 | whenever
1990 | maybe
1991 | ones
1992 | so
1993 | seeing
1994 | indeed
1995 | course
1996 | still
1997 | thank
1998 | he's
1999 | selves
2000 | ours
2001 | outside
2002 | non
2003 | within
2004 | thereby
2005 | not
2006 | now
2007 | nor
2008 | entirely
2009 | eg
2010 | ex
2011 | et
2012 | hadn't
2013 | furthermore
2014 | looking
2015 | seriously
2016 | shouldn't
2017 | she
2018 | quite
2019 | besides
2020 | think
2021 | first
2022 | ignored
2023 | awfully
2024 | given
2025 | anyone
2026 | indicate
2027 | gives
2028 | mostly
2029 | than
2030 | here's
2031 | were
2032 | and
2033 | appreciate
2034 | himself
2035 | saw
2036 | any
2037 | downwards
2038 | take
2039 | sure
2040 | especially
2041 | later
2042 | that's
2043 | fifth
2044 | don't
2045 | aside
2046 | only
2047 | going
2048 | get
2049 | truly
2050 | cannot
2051 | nearly
2052 | regarding
2053 | us
2054 | where
2055 | up
2056 | namely
2057 | anyways
2058 | wonder
2059 | behind
2060 | between
2061 | it
2062 | across
2063 | come
2064 | many
2065 | whereafter
2066 | according
2067 | comes
2068 | afterwards
2069 | couldn't
2070 | moreover
2071 | considering
2072 | sensible
2073 | hardly
2074 | wants
2075 | former
2076 | those
2077 | these
2078 | [
2079 | somebody
2080 | different
2081 | etc
2082 | insofar
2083 | same
2084 | without
2085 | can't
2086 | very
2087 | you've
2088 | among
2089 | being
2090 | we've
2091 | seems
2092 | around
2093 | using
2094 | specified
2095 | on
2096 | ok
2097 | oh
2098 | whence
2099 | it's
2100 | or
2101 | everyone
2102 | your
2103 | her
2104 | there
2105 | amongst
2106 | trying
2107 | with
2108 | they're
2109 | wasn't
2110 | gone
2111 | certain
2112 | am
2113 | an
2114 | as
2115 | at
2116 | again
2117 | serious
2118 | hello
2119 | since
2120 | consider
2121 | causes
2122 | to
2123 | th
2124 | myself
2125 | i'll
2126 | zero
2127 | further
2128 | what
2129 | brief
2130 | seemed
2131 | c'mon
2132 | allows
2133 | followed
2134 | ask
2135 | viz
2136 | contains
2137 | two
2138 | taken
2139 | more
2140 | knows
2141 | ain't
2142 | particular
2143 | known
2144 | none
2145 | nine
2146 | needs
2147 | rather
2148 | [
2149 | okay
2150 | tried
2151 | tries
2152 | onto
2153 | perhaps
2154 | specifying
2155 | ]
2156 | help
2157 | soon
2158 | through
2159 | its
2160 | seeming
2161 | inward
2162 | actually
2163 | might
2164 | haven't
2165 | someone
2166 | hereafter
2167 | always
2168 | isn't
2169 | beyond
2170 | really
2171 | they'll
2172 | enough
2173 | thereafter
2174 | done
2175 | together
2176 | least
2177 | too
2178 | immediate
2179 | believe
2180 | gotten
2181 | toward
2182 | self
2183 | also
2184 | towards
2185 | most
2186 | nothing
2187 | they'd
2188 | sometimes
2189 | lest
2190 | particularly
2191 | somewhat
2192 | his
2193 | goes
2194 | meanwhile
2195 | during
2196 | him
2197 | greetings
2198 | see
2199 | are
2200 | currently
2201 | please
2202 | various
2203 | probably
2204 | available
2205 | both
2206 | last
2207 | wouldn't
2208 | became
2209 | whole
2210 | liked
2211 | whatever
2212 | except
2213 | throughout
2214 | along
2215 | described
2216 | though
2217 | whom
2218 | beforehand
2219 | what's
2220 | new
2221 | else
2222 | look
2223 | while
2224 | herein
2225 | itself
2226 | wherein
2227 | used
2228 | anybody
2229 | obviously
2230 | thats
2231 | from
2232 | useful
2233 | merely
2234 | follows
2235 | often
2236 | some
2237 | ourselves
2238 | shall
2239 | per
2240 | tends
2241 | either
2242 | be
2243 | by
2244 | anything
2245 | consequently
2246 | into
2247 | appropriate
2248 | we're
2249 | elsewhere
2250 | hasn't
2251 | un
2252 | noone
2253 | associated
2254 | thanks
2255 | having
2256 | once
2257 | edu
2258 | go
2259 | sent
2260 | provides
2261 | yourselves
2262 | they've
2263 | try
2264 | this
2265 | you'd
2266 | yourself
2267 | zz
2268 | zt
2269 | respectively
2270 | let
2271 | others
2272 | until
2273 | weren't
2274 | use
2275 | few
2276 | themselves
2277 | becomes
2278 | anywhere
2279 | something
2280 | six
2281 | allow
2282 | won't
2283 | thence
2284 | willing
2285 | instead
2286 | whither
2287 | doing
2288 | how
2289 | cause
2290 | thereupon
2291 | que
2292 | via
2293 | could
2294 | hence
2295 | third
2296 | doesn't
2297 | their
2298 | exactly
2299 | regards
2300 | herself
2301 | have
2302 | need
2303 | clearly
2304 | i've
2305 | able
2306 | which
2307 | unless
2308 | where's
2309 | eight
2310 | why
2311 | you'll
2312 | normally
2313 | anyway
2314 | one
2315 | should
2316 | mainly
2317 | overall
2318 | qv
2319 | contain
2320 | looks
2321 | neither
2322 | however
2323 | otherwise
2324 | co
2325 | it'd
2326 | corresponding
2327 | thanx
2328 | novel
2329 | value
2330 | will
2331 | almost
2332 | thus
2333 | vs
2334 | when
2335 | gets
2336 | upon
2337 | off
2338 | nevertheless
2339 | well
2340 | less
2341 | presumably
2342 | ought
2343 | who's
2344 | five
2345 | know
2346 | you
2347 | name
2348 | necessary
2349 | like
2350 | become
2351 | therein
2352 | because
2353 | happens
2354 | does
2355 | although
2356 | about
2357 | getting
2358 | own
2359 | three
2360 | inasmuch
2361 | inner
2362 | but
2363 | hi
2364 | he
2365 | whether
2366 | placed
2367 | below
2368 | our
2369 | 上去--
2370 | inc
2371 | lately
2372 | other
2373 | latterly
2374 | out
2375 | 是什么
2376 | 什么时候
2377 | 是什么意思
2378 | 什么意思
2379 | 多少钱
2380 | 有没有
2381 | 更有趣
2382 | 更有甚者
2383 | 更有效
2384 | 更有意义
2385 | 更远的
2386 | 更重要的是
2387 | 正确
2388 | 错误
2389 | 第二把
2390 | 第二波
2391 | 第二大节
2392 | 第二单元
2393 | 第二关
2394 | 第二行
2395 | 第二集
2396 | 第二讲
2397 | 第二款
2398 | 第二类
2399 | 第二盘
2400 | 第二任
2401 | 第二声
2402 | 第二十
2403 | 第二首
2404 | 第二项
2405 | 第三遍
2406 | 第三册
2407 | 第三层
2408 | 第三产业
2409 | 第三大
2410 | 第三单元
2411 | 第三行
2412 | 第三回
2413 | 第三集
2414 | 第三件
2415 | 第三句
2416 | 第三卷
2417 | 第三课
2418 | 第三类
2419 | 第三篇
2420 | 第三期
2421 | 第三日
2422 | 第三声
2423 | 地三鲜
2424 | 第三项
2425 | 第三站
2426 | 第三张
2427 | 第十八
2428 | 第十次
2429 | 第十二
2430 | 的士高
2431 | 第十集
2432 | 第十届
2433 | 第十九
2434 | 第十六
2435 | 第十名
2436 | 第十三
2437 | 第十四
2438 | 第十天
2439 | 第十一
2440 | 第十一个
2441 | 第四版
2442 | 第四册
2443 | 第四场
2444 | 第四代
2445 | 第四单元
2446 | 第四集
2447 | 第四届
2448 | 第四年
2449 | 第四期
2450 | 第四声
2451 | 第四套
2452 | 第四位
2453 | 第四张
2454 | 第四者
2455 | 第四种
2456 | 第五部
2457 | 第五大道
2458 | 第五单元
2459 | 第五集
2460 | 第五卷
2461 | 第五课
2462 | 第五年
2463 | 第五期
2464 | 第五位
2465 | 第五元素
2466 | 第五组
2467 | 召唤
2468 | 最后一班
2469 | 最后一遍
2470 | 最后一关
2471 | 最后一集
2472 | 最后一科
2473 | 最后一颗子弹
2474 | 最后一派
2475 | 最后一题
2476 | 最后一眼
2477 | 最后一页
2478 | 10
2479 | 11
2480 | 12
2481 | 35
2482 | 25
2483 | 2016
2484 | 2015
2485 | 2014
2486 | 又为什么
2487 | 有问题吗
2488 | 有问题么
2489 | 又喜欢
2490 | 有喜欢
2491 | 又小
2492 | 又笑
2493 | 有笑
2494 | 有效地
2495 | 有一百
2496 | 又一遍
2497 | 有一部
2498 | 又一城
2499 | 又一村
2500 | 有一道
2501 | 有意的
2502 | 有一堆
2503 | 有一对
2504 | 有一方
2505 | 有一根
2506 | 有一会了
2507 | 有一批
2508 | 有一片
2509 | 有一期
2510 | 有一起
2511 | 有一群
2512 | 又又
2513 | 由由
2514 | 财新网
2515 | 上午
2516 | 下午
2517 | NULL
2518 | 新华社
2519 | 消息
2520 | 13
2521 | 14
2522 | 15
2523 | 16
2524 | 17
2525 | 18
2526 | 19
2527 | 20
2528 | 21
2529 | 22
2530 | 23
2531 | 24
2532 | 26
2533 | 27
2534 | 28
2535 | 29
2536 | 30
2537 | 31
2538 | 32
2539 | 33
2540 | 34
2541 | 36
2542 | 37
2543 | 38
2544 | 39
2545 | 40
2546 | 41
2547 | 42
2548 | 43
2549 | 44
2550 | 45
2551 | 46
2552 | 47
2553 | 48
2554 | 49
2555 | 50
2556 | 51
2557 | 52
2558 | 53
2559 | 54
2560 | 55
2561 | 56
2562 | 57
2563 | 58
2564 | 59
2565 | 60
2566 | 61
2567 | 62
2568 | 63
2569 | 64
2570 | 65
2571 | 66
2572 | 67
2573 | 68
2574 | 69
2575 | 70
2576 | 71
2577 | 72
2578 | 73
2579 | 74
2580 | 75
2581 | 76
2582 | 77
2583 | 78
2584 | 79
2585 | 80
2586 | 81
2587 | 82
2588 | 83
2589 | 84
2590 | 85
2591 | 86
2592 | 87
2593 | 88
2594 | 89
2595 | 90
2596 | 91
2597 | 92
2598 | 93
2599 | 94
2600 | 95
2601 | 96
2602 | 97
2603 | 98
2604 | 99
2605 | 100
2606 | 01
2607 | 02
2608 | 03
2609 | 04
2610 | 05
2611 | 06
2612 | 07
2613 | 08
2614 | 09
2615 | 达川
2616 | 达州
2617 | 达州市
2618 | 2018
2619 | 现场
2620 | 发生
2621 | 被困
2622 | 发现
2623 | 事件
2624 | 视频
2625 | 介绍
2626 | 成都
2627 | 四川
2628 | 城市
2629 | 地面
2630 | 狂犬病
2631 | 聊城市
2632 | APP
2633 | 威望
2634 | 一新
--------------------------------------------------------------------------------
/clean/相识度计算.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: 相识度计算.py
8 | @author: yuanlang
9 | @time: 2019-08-06 15:13
10 | ---------------------------------------
11 | """
12 | import jieba
13 | # import Levenshtein
14 | import difflib
15 | import numpy as np
16 | import pymysql
17 |
18 | # jieba.load_userdict("dict.txt")
19 |
20 | class StrSimilarity():
21 |
22 | __stop_words=["苑","园","大厦","大街","None","公寓","里","花园","公园","小区","期","区"]
23 |
24 | def __init__(self, word):
25 | self.word = word
26 |
27 | # def stopwordslist(self,filepath):
28 | # stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
29 | # return stopwords
30 |
31 | def stopwordslist(self, filepath):
32 | stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
33 | return stopwords
34 |
35 |     # remove stop words from a sentence
36 | def movestopwords(self,sentence):
37 | # stopwords = self.stopwordslist('语料/hlt_stop_words.txt') # 这里加载停用词的路径
38 | outstr = ''
39 | for word in sentence:
40 | if word not in self.__stop_words:
41 |                 if word != '\t' and word != '\n':
42 | outstr += word
43 | # outstr += " "
44 | return outstr
45 |
46 |     # Compared: str_list is the list of candidate strings to compare against
47 |     # returns a dict mapping each candidate to its number of matches with the segmented source string
48 | def Compared(self, str_list):
49 | dict_data = {}
50 | sarticiple = self.movestopwords(jieba.cut(self.word.strip()))
51 | for strs in str_list:
52 | num = 0
53 | for sart in sarticiple:
54 | if sart in strs:
55 | num = num + 1
56 | else:
57 | num = num
58 | dict_data[strs] = num
59 | return dict_data
60 |
61 |     # NumChecks: dict_data is the match-count dict returned by Compared
62 |     # returns a dict holding the two candidates with the highest counts
63 | def NumChecks(self, dict_data):
64 | list_data = sorted(dict_data.items(), key=lambda asd: asd[1], reverse=True)
65 | length = len(list_data)
66 | json_data = {}
67 | if length >= 2:
68 | datas = list_data[:2]
69 | else:
70 | datas = list_data[:length]
71 | for data in datas:
72 | json_data[data[0]] = data[1]
73 | return json_data
74 |
75 |     # MMedian: dict_data is the top-two dict returned by NumChecks
76 |     # returns a dict of candidate string -> length-based adjustment value
77 | def MMedian(self, dict_data):
78 | median_list = {}
79 | length = len(self.word)
80 | for k, v in dict_data.items():
81 | num = np.median([len(k), length])
82 | if abs(length - num) != 0:
83 | # xx = (1.0/(abs(length-num)))*0.1
84 | xx = (abs(length - num)) * 0.017
85 | else:
86 | xx = 0
87 | median_list[k] = xx
88 | return median_list
89 |
90 |     # Appear: dict_data is the candidate -> adjustment dict returned by MMedian
91 |     # returns the most similar candidate string and its score
92 | def Appear(self, dict_data):
93 | json_data = {}
94 | for k, v in dict_data.items():
95 | fraction = difflib.SequenceMatcher(None, self.word, k).quick_ratio() - v
96 | json_data[k] = fraction
97 | tulp_data = sorted(json_data.items(), key=lambda asd: asd[1], reverse=True)
98 | return tulp_data[0]
99 |
100 |
101 | def main(can_zhao_biao,mu_biao_biao):
102 | conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994", db="yuqing_db", charset="utf8")
103 | cursor = conn.cursor()
104 |
105 | # cursor.execute("select url,title from seed;")
106 | # lj_items = cursor.fetchall()
107 | # str_list = []
108 | # _dict = {}
109 | # for lj_item in lj_items:
110 | # aim = "{0}{1}{2}{3}{4}".format(lj_item[1], lj_item[2], lj_item[3], lj_item[4], lj_item[4])
111 | # str_list.append(aim)
112 | # _dict[aim] = lj_item[0]
113 |
114 |     str_list = ["2018年10月7日达川区南外济民医院门口突然塌陷事件"]  # Compared() expects a list of candidate strings
115 |
116 | while True:
117 | cursor.execute("select url,title from seed limit 1")
118 | f5_items = cursor.fetchall()
119 | if len(f5_items) == 0:
120 | break
121 |
122 | query_str,query_id= '',''
123 | for f5_item in f5_items:
124 | query_id=f5_item[0]
125 | query_str = f5_item[1]
126 |
127 | ss = StrSimilarity(query_str)
128 | list_data = ss.Compared(str_list)
129 | num = ss.NumChecks(list_data)
130 | mmedian = ss.MMedian(num)
131 | print(query_str+" ===> "+ss.Appear(mmedian)[0]+":"+str(ss.Appear(mmedian)[1]))
132 |
133 | # sql="update %s set lj_xiaoqu_id='%s',ration=%12.10f where xiaoqu_id='%s'"%\
134 | # (mu_biao_biao,_dict[ss.Appear(mmedian)[0]],ss.Appear(mmedian)[1],query_id)
135 | # cursor.execute(sql)
136 | # conn.commit()
137 |
138 | if __name__ == "__main__":
139 |     # reference table: hs_community_dict_fang
140 | can_zhao_biao="poi_ration"
141 |     # target table
142 | mu_biao_biao="shop_ration"
143 | main(can_zhao_biao,mu_biao_biao)
--------------------------------------------------------------------------------
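For reference, the StrSimilarity class can be exercised without MySQL. A minimal usage sketch (run from inside `clean/`; the candidate titles below are made-up examples loosely based on the incident CSVs in this folder, not data from the repository):

```python
# Standalone usage sketch for StrSimilarity; the candidate titles are invented examples.
from 相识度计算 import StrSimilarity

candidates = [
    "达川区南外济民医院门口地面塌陷，车辆被困",
    "达州好一新商贸城发生大火",
    "出租车罢工事件最新进展",
]

ss = StrSimilarity("2018年10月7日达川区南外济民医院门口突然塌陷事件")
counts  = ss.Compared(candidates)   # match count per candidate
top_two = ss.NumChecks(counts)      # keep the two best candidates
penalty = ss.MMedian(top_two)       # length-difference adjustment
best, score = ss.Appear(penalty)    # difflib ratio minus the adjustment
print(best, score)
```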
/clean/词频统计_LDA主题模型.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe Data preparation (word frequency statistics and LDA topics)
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: 词频统计_LDA主题模型.py
8 | @author: yuanlang
9 | @time: 2019-08-07 10:00
10 | ---------------------------------------
11 | """
12 | import os
13 | import jieba
14 | import pymysql
15 | import pandas as pd
16 | import gensim
17 | import numpy
18 | import matplotlib.pyplot as plt
19 | from wordcloud import WordCloud  # word cloud package
20 | from gensim import corpora, models, similarities
21 | # matplotlib defaults so Chinese text renders correctly
22 | plt.rcParams['figure.figsize'] = (5.0, 5.0)
23 | plt.rcParams['font.sans-serif'] = ['simhei']
24 | plt.rcParams['axes.unicode_minus'] = False
25 |
26 | print(os.path.dirname(__file__))
27 | # load the stop-word list
28 | stopwords=pd.read_csv(f"{os.path.dirname(__file__)}/stopwords.txt",index_col=False,quoting=3,sep="\t",names=['stopword'], encoding='utf-8')
29 | stopwords=stopwords['stopword'].values
30 |
31 | # read the news content
32 | df = pd.read_csv(f"{os.path.dirname(__file__)}/地陷事件.csv", encoding='utf-8',sep = '&@@&')
33 | # df = pd.read_csv(f"{os.path.dirname(__file__)}/出租车罢工.csv", encoding='utf-8',sep = '&@@&')
34 | # df = pd.read_csv(f"{os.path.dirname(__file__)}/好一新大火.csv", encoding='utf-8',sep = '&@@&')
35 |
36 | # give each row an index via enumerate (the original ++x was a no-op in Python, so every index stayed 0)
37 | lines = [(idx, item) for idx, item in enumerate(df.content.values.tolist())]
38 |
39 | # raw data (load directly from MySQL instead of the CSVs above)
40 | # conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="lang1994", db="yuqing_db", charset="utf8")
41 | # cursor = conn.cursor()
42 | # cursor.execute("select * from context")
43 | # lines=cursor.fetchall()
44 |
45 | def db_to_csv(lines):
46 |     """Save the rows to a local CSV."""
47 | with open("好一新大火.csv","w",encoding="utf-8") as f:
48 | f.writelines("url&@@&content\n")
49 | for line in lines:
50 | text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
51 | print(text)
52 | f.writelines("\""+line[0]+"\""+"&@@&"+"\""+text+"\"\n")
53 |
54 | # db_to_csv(lines)
55 |
56 | def word_count(lines,stopwords):
57 |     # word-frequency statistics
58 | segment = []
59 | for line in lines:
60 | try:
61 | text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
62 |             segs = jieba.lcut(text)
63 | for seg in segs:
64 | if len(seg) > 1 and seg != '\r\n' and seg not in stopwords:
65 | segment.append(seg)
66 | # print(segment)
67 | except Exception as e:
68 | print(e)
69 |
70 | words_df = pd.DataFrame({'segment': segment})
71 | words_stat = words_df.groupby(by=['segment'])['segment'].agg(["size"])
72 | words_stat = words_stat[1300:]
73 | words_stat = words_stat.reset_index().sort_values(by=["size"], ascending=False)
74 | print(words_stat[:1500])
75 | wordcloud = WordCloud(font_path="simhei.ttf", background_color="white", max_font_size=80)
76 | word_frequence = {x[0]: x[1] for x in words_stat.head(1500).values}
77 | wordcloud = wordcloud.fit_words(word_frequence)
78 | plt.imshow(wordcloud)
79 | plt.show()
80 |
81 | # word_count(lines,stopwords)
82 |
83 | def lda(lines,stopwords):
84 |     """LDA topic model over the news corpus."""
85 | sentences = []
86 | for line in lines:
87 | try:
88 | text = line[1].replace("\n", "").replace(" ", "").replace("\t", "")
89 |             segs = jieba.lcut(text)
90 | segs = filter(lambda x: len(x) > 1, segs)
91 | segs = [seg for seg in list(segs) if seg not in stopwords]
92 | sentences.append(segs)
93 | except Exception as e:
94 | print(e)
95 |
96 |     # bag-of-words model
97 | dictionary = corpora.Dictionary(sentences)
98 | corpus = [dictionary.doc2bow(_sentence) for _sentence in sentences]
99 | lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20)
100 |
101 |     # print the learned topics
102 | print(lda.print_topics())
103 | wors={}
104 | for topic in lda.print_topics():
105 | words=topic[1].split("+")
106 | for word in words:
107 | ss=[ii.replace(" ","").replace("\"","") for ii in word.split("*")]
108 | print(wors.get(ss[1],0),ss[0],wors.get(ss[1],0)+float(ss[0]))
109 | wors[ss[1]]=wors.get(ss[1],0)+float(ss[0])
110 | # print(ss)
111 | wors={x:float('%.3f'%y) for x,y in wors.items()}
112 |
113 |     # merge word weights across topics
114 | data_dic = {'count': wors}
115 | data_df = pd.DataFrame(data_dic)
116 | data_df = data_df.reset_index().sort_values(by=["count"], ascending=False)
117 | print(data_df[:10]["index"])
118 | print(data_df[:10].index)
119 | print(data_df[:10]["count"])
120 |
121 | number = numpy.array(data_df[:10]["count"].values*1000)
122 | work_type = data_df[:10]["index"].values
123 |
124 |
125 | labels = tuple(work_type)
126 | fracs = number
127 |
128 | print(labels)
129 |     plt.pie(x=fracs, labels=labels, autopct='%.0f%%')  # autopct shows percentage labels
130 | plt.show()
131 |
132 |
133 | lda(lines,stopwords)
--------------------------------------------------------------------------------
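The script fixes `num_topics=20`. One way to sanity-check that choice is a coherence score. The toy sketch below is independent of the script: the three mini-documents are invented placeholders; with real data you would reuse the `sentences`, `dictionary` and `corpus` built inside `lda()`.

```python
# Toy sketch: comparing topic counts by u_mass coherence (gensim).
# The three "documents" are invented placeholders, not data from this repository.
from gensim import corpora
from gensim.models import LdaModel, CoherenceModel

sentences = [
    ["地面", "塌陷", "医院", "门口", "车辆"],
    ["商贸城", "火灾", "消防", "救援"],
    ["出租车", "罢工", "交通", "运管"],
]
dictionary = corpora.Dictionary(sentences)
corpus = [dictionary.doc2bow(s) for s in sentences]

for k in (2, 3):
    model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, random_state=0)
    cm = CoherenceModel(model=model, corpus=corpus, dictionary=dictionary, coherence="u_mass")
    print(k, cm.get_coherence())  # a less negative u_mass generally means more coherent topics
```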
/dz_spider/__init__.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: __init__.py
8 | @author: yuanlang
9 | @time: 2019-07-26 17:50
10 | ---------------------------------------
11 | """
--------------------------------------------------------------------------------
/dz_spider/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__init__.py
--------------------------------------------------------------------------------
/dz_spider/dz_spider/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/__pycache__/middlewares.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/middlewares.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/pipelines.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/common.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: common.py
8 | @author: yuanlang
9 | @time: 2019-07-26 17:50
10 | ---------------------------------------
11 | """
12 |
13 | site_name=["凤凰山下","达州市人民政府","闽南网","新京报网"]
14 |
15 | # seed URL table
16 | seed_table="""create table if not exists `seed`(
17 | `url` varchar(500) Not null,
18 | `title` varchar(500) default "",
19 | `site_name` char(10) default "",
20 | `status` int(2) default 0,
21 | `create_time` timestamp default current_timestamp,
22 | `update_time` timestamp default current_timestamp,
23 | primary key (`url`)
24 | )ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
25 | """
26 |
27 |
--------------------------------------------------------------------------------
/dz_spider/dz_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DzSpiderItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/dz_spider/dz_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8-*-
2 | # rotate the User-Agent header at random
3 | import random
4 | from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
5 | import base64
6 | import requests
7 | import redis
8 | import datetime
9 | import time
10 |
11 |
12 | class RotateUserAgentMiddleware(UserAgentMiddleware):
13 | def __init__(self, user_agent=''):
14 | self.user_agent = user_agent
15 | self._redis=redis.Redis(host="10.29.4.242",port=6379,db=0)
16 | # self.start_time=datetime.datetime.now()
17 |
18 |
19 | def get_proxy(self,name):
20 | key = self._redis.hgetall(name=name)
21 | rkey = random.choice(list(key.keys())) if key else None
22 | if isinstance(rkey, bytes):
23 | return rkey.decode('utf-8')
24 | else:
25 | return rkey
26 |
27 | def process_request(self, request, spider):
28 | ua = random.choice(self.user_agent_list)
29 | #ip = random.choice(self.ip_proxy)
30 | # ip = self.get_proxy("useful_proxy")
31 | if ua:
32 | request.headers.setdefault('User-Agent', ua)
33 |         # set a proxy (left disabled below)
34 | # request.meta['proxy'] = 'http://{0}'.format(ip)
35 |
36 | #response=requests.get("http://10.29.4.242:5010/get/")
37 | #print('http://{0}'.format(response.text))
38 | #request.meta['proxy'] = 'http://{0}'.format(response.text)
39 | # request.meta['proxy'] = "127.0.0.1:8888"
40 | # proxy_user_pass = 'XXXXXXXXXXXXXXX:KKKKKKKKKKKKKKKK'
41 | # encoded_user_pass = base64.b64encode(proxy_user_pass.encode(encoding='utf-8'))
42 | # request.headers['Proxy-Authorization'] = 'Basic ' + str(encoded_user_pass)
43 |
44 | ip_proxy = ['101.50.1.2:80', '54.36.1.22:3128', '178.238.228.187:9090', '149.56.108.133:3128', '190.2.137.31:1080', '13.125.162.226:3128', '128.199.182.128:3128', '122.216.120.254:80', '157.55.233.183:80', '85.10.247.140:1080', '90.84.242.77:3128', '159.65.156.178:3128', '54.38.100.98:1080', '119.28.221.28:8088', '139.224.24.26:8888', '190.2.137.9:1080', '178.32.181.66:3128', '47.88.35.91:3128', '103.78.213.147:80', '59.44.164.34:3128', '190.2.137.15:1080', '54.36.31.203:3128', '142.44.198.187:3128', '122.114.31.177:808', '66.195.76.86:8080', '122.216.120.244:80', '212.237.34.18:8888', '134.119.205.147:1080', '159.89.201.219:3128', '50.28.48.83:8080', '211.159.219.158:80', '124.51.247.48:3128', '35.162.122.16:8888', '217.182.242.64:3128', '139.59.21.37:3128', '47.89.23.174:8080', '200.16.208.187:8080', '5.135.74.36:1080', '117.242.145.103:8080', '61.5.207.102:80', '61.135.217.7:80', '71.13.112.152:3128', '5.135.74.37:1080', '211.159.177.212:3128', '210.5.149.43:8090', '122.72.18.35:80', '212.237.51.54:8888', '61.136.163.245:8107', '124.193.37.5:8888', '120.78.182.79:3128', '180.173.67.197:9797', '171.97.67.88:3128', '145.239.185.127:1080', '167.99.70.26:8080', '159.65.141.81:3128', '180.235.42.148:8080', '67.205.159.46:3128', '121.8.98.198:80', '151.80.140.233:54566', '139.59.224.113:8080', '47.91.165.126:80', '5.9.78.89:3128', '142.44.202.122:3128', '35.198.103.196:3128', '39.137.47.11:80', '142.44.197.15:3128', '190.2.137.38:1080', '122.216.120.251:80', '159.65.139.226:3128', '116.11.254.37:80', '36.80.123.114:3128', '194.67.220.181:3128', '217.182.216.236:3128', '190.2.137.47:1080', '163.172.217.103:3128', '145.239.185.122:1080', '212.237.37.152:8888', '219.135.164.245:3128', '119.28.26.57:3128', '120.77.254.116:3128', '60.207.106.140:3128', '14.139.189.216:3128', '212.126.117.158:80', '120.26.160.183:8090', '142.44.198.121:3128', '218.50.2.102:8080', '183.179.199.225:8080', '116.58.227.143:3128', '144.202.70.37:3128', '119.28.112.130:3128', '45.63.95.172:3128', '167.99.87.147:8080', '202.175.61.162:8080', '200.63.129.131:80', '194.182.74.203:3128', '77.244.21.75:3128', '118.212.137.135:31288', '145.239.185.121:1080', '190.2.137.45:1080', '5.167.54.154:8080', '50.233.137.38:80', '112.21.164.58:1080', '45.76.56.140:3128', '35.200.194.218:3128', '159.65.142.92:3128', '37.204.219.50:8081', '113.214.13.1:8000', '47.90.72.227:8088', '114.130.42.20:80', '119.28.152.208:80', '167.99.78.239:8080', '144.202.70.81:3128', '151.80.9.177:3128', '151.106.10.230:1080', '104.155.53.214:3128', '123.57.133.142:3128', '151.106.5.26:1080', '5.9.78.28:3128', '47.75.56.36:8118', '66.70.147.195:3128', '114.232.171.58:48354', '122.72.18.34:80', '5.135.74.32:1080', '114.130.42.20:3128']
45 |
46 | user_agent_list = [ \
47 |         "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", \
48 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", \
49 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", \
50 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", \
51 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", \
52 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", \
53 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", \
54 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
55 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
56 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", \
57 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
58 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", \
59 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
60 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
61 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", \
62 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", \
63 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", \
64 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
65 | ]
--------------------------------------------------------------------------------
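Note on the middleware above: the commented-out proxy block builds the Proxy-Authorization header as 'Basic ' + str(encoded_user_pass), which in Python 3 yields a malformed value like "Basic b'...'" because base64.b64encode returns bytes. A minimal sketch of how that header could be assembled if the proxy support is ever re-enabled (user and password below are placeholders, not credentials from this project):

    import base64

    def build_proxy_auth_header(user, password):
        # encode "user:password", then decode the bytes back to str so the
        # header value does not carry a textual b'...' wrapper
        token = base64.b64encode(f"{user}:{password}".encode("utf-8")).decode("ascii")
        return "Basic " + token

    # request.headers['Proxy-Authorization'] = build_proxy_auth_header("user", "password")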
/dz_spider/dz_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import pymongo
3 | from scrapy import log
4 | from scrapy.conf import settings
5 | import threading
6 | from openpyxl import Workbook
7 | import redis
8 | from scrapy.pipelines.images import ImagesPipeline
9 | from scrapy.exceptions import DropItem
10 | import scrapy
11 | import pymysql
12 | from twisted.enterprise import adbapi
13 | import random
14 | import sys
15 | from scrapy.log import logger
16 | # Define your item pipelines here
17 | #
18 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
19 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
20 | # MongoPipeline implemented as a singleton
21 |
22 | Lock = threading.Lock()
23 |
24 |
25 | class MongoPipeline(object):
26 |     # static singleton instance
27 | __instance = None
28 |
29 | def __init__(self):
30 | pass
31 |
32 | def __new__(cls, *args, **kwargs):
33 | if not cls.__instance:
34 | try:
35 | Lock.acquire()
36 | # double check
37 | if not cls.__instance:
38 | cls.client = pymongo.MongoClient(settings['MONGO_URI'])
39 | cls.db = cls.client[settings['MONGO_DATABASE']]
40 | cls.__instance = super(MongoPipeline, cls).__new__(cls, *args, **kwargs)
41 | finally:
42 | Lock.release()
43 | return cls.__instance
44 |
45 | def dorp_connection(self,db_name):
46 | return self.db[db_name].drop()
47 |
48 | def ensure_index(self,db_name,unique_id):
49 | return self.db[db_name].ensure_index(unique_id,unique=True)
50 |
51 |
52 | # def process_item(self, item,spider):
53 | # '''
54 | # 异步增加,修改
55 | # :param item:
56 | # :param spider:
57 | # :return:
58 | # '''
59 | # if item["operation"]=="insert":
60 | #
61 | # try:
62 | # self.db[item["db"]].insert(dict(item["info"]))
63 | # log.msg("[{0} line:{1}] insert {2}=====>>>>>种子入库".
64 | # format(self.__class__.__name__, sys._getframe().f_lineno, item["db"]), level=log.INFO)
65 | # except Exception as e:
66 | # log.msg("[{0} line:{1}] {2}".
67 | # format(self.__class__.__name__, sys._getframe().f_lineno, e),level=log.ERROR)
68 | #
69 | # elif item["operation"]=="upsert":
70 | # self.db[item["db"]].update(item["condition"], item["info"], True)
71 | # log.msg("[{0} line:{1}] upsert {2}=====>>>>更新种子信息"
72 | # .format(self.__class__.__name__, sys._getframe().f_lineno, item["db"]),level=log.INFO)
73 | # elif item["operation"]=="update":
74 | # self.db[item["db"]].update(item["condition"], item["info"], False)
75 | # log.msg("[{0} line:{1}] update {2}=====>>>>更新种子信息"
76 | # .format(self.__class__.__name__, sys._getframe().f_lineno, item["db"]),level=log.INFO)
77 |
78 | def process_item(self, item,db_name):
79 | try:
80 | self.db[db_name].insert(dict(item))
81 |             log.msg("[{0} line:{1}] insert {2}=====>>>>> seed saved".
82 | format(self.__class__.__name__, sys._getframe().f_lineno,db_name), level=log.INFO)
83 | except Exception as e:
84 | log.msg("[{0} line:{1}] {2}".
85 | format(self.__class__.__name__, sys._getframe().f_lineno, e), level=log.ERROR)
86 |
87 | def process_items(self, items, db_name):
88 | try:
89 | self.db[db_name].insert(items)
90 |             log.msg("[{0} line:{1}] insert {2}=====>>>>> seeds saved".
91 | format(self.__class__.__name__, sys._getframe().f_lineno,db_name), level=log.INFO)
92 | except Exception as e:
93 | log.msg("[{0} line:{1}] {2}".
94 | format(self.__class__.__name__, sys._getframe().f_lineno, e), level=log.ERROR)
95 |
96 | def seed_find(self,db_name,conditions,return_range):
97 |         log.msg("[{0} line:{1}] find {2}=====>>>>> query list-page seeds"
98 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name),
99 | level=log.INFO)
100 | return self.db[db_name].find(conditions,return_range)
101 |
102 | def info_update(self,db_name,conditions,info):
103 |         log.msg("[{0} line:{1}] update {2}=====>>>> update seed info"
104 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name),
105 | level=log.INFO)
106 | return self.db[db_name].update(conditions,info,False)
107 |
108 | def info_upsert(self,db_name,conditions,info):
109 |         log.msg("[{0} line:{1}] update {2}=====>>>> update seed info"
110 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name),
111 | level=log.INFO)
112 | return self.db[db_name].update(conditions,info,True)
113 |
114 | def info_update_many(self,db_name,conditions,info):
115 |         log.msg("[{0} line:{1}] update {2}=====>>>> update seed info"
116 | .format(self.__class__.__name__,sys._getframe().f_lineno,db_name),
117 | level=log.INFO)
118 | return self.db[db_name].update_many(conditions,info,False)
119 |
120 |
121 | # ######################################链家房产################################
122 | # ###小区
123 | # def lianjia_xiaoqu_insert_seed(self, seed):
124 | # '''
125 | # 小区列表页种子入库
126 | # :param seed:
127 | # :return:
128 | # '''
129 | # log.msg("[{0} line:{1}] insert LianJiaXiaoQuSeed=====>>>>>链家小区列表页种子入库".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
130 | # return self.db["LianJiaXiaoQuSeed"].insert(seed)
131 | #
132 | # def lianjia_xiaoqu_find_seed1(self):
133 | # '''
134 | # 链家小区列表页种子提取
135 | # :return:
136 | # '''
137 | # print("finid操作======》查询链家小区列表页种子")
138 | # return self.db["LianJiaXiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0})
139 | #
140 | # def lianjia_xiaoqu_find_seed2(self):
141 | # '''
142 | # 链家小区详细信息种子提取
143 | # :return:
144 | # '''
145 | # print("finid操作======》查询链家详细信息页种子")
146 | # return self.db["LianJiaXiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0})
147 | #
148 | # def lianjia_xiaoqu_update_seed(self, seed):
149 | # '''
150 | # 更新小区列表页种子状态
151 | # :param seed:
152 | # :return:
153 | # '''
154 | # log.msg("[{0} line:{1}] update LianJiaXiaoQuSeed=====>>>>>更新链家列表页种子".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
155 | # return self.db["LianJiaXiaoQuSeed"].update_one({"url":seed["url"]},
156 | # {"$set":{"status":seed["status"],
157 | # "ts":seed["ts"]}},True)
158 | # def lianjia_xiaoqu_img_find(self):
159 | # return self.db["LianJiaXiaoQuImg"].find({"status": 0}, {"xiaoquImgs": 1, "_id": 0,"xiaoquId":1})
160 | #
161 | # def lianjia_xiaoqu_img_update(self,item):
162 | # return self.db["LianJiaXiaoQuImg"].update_many({"xiaoquId":item["xiaoquId"]},
163 | # {"$set":{"status":item["status"],
164 | # "ts":item["ts"]}})
165 | #
166 | # def lianjia_xiaoqu_update_info(self, info):
167 | # '''
168 | # 更新链家详细页种子状态和详细信息
169 | # :param info:
170 | # :return:
171 | # '''
172 | # log.msg("[{0} line:{1}] update LianJiaXiaoQuInfo=====>>>>>更新家详细页种子状态和详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
173 | # return self.db["LianJiaXiaoQuInfo"].update({"xiaoqu_url" : info["xiaoqu_url"]},
174 | # {"$set":{"address": info["address"],
175 | # "build_year": info['build_year'],
176 | # "build_type": info['build_type'],
177 | # "property_cost" : info["property_cost"],
178 | # "property_company": info["property_company"],
179 | # "developer": info["developer"],
180 | # "lou_dong_count" : info["lou_dong_count"],
181 | # "house_count": info["house_count"],
182 | # "nerber_shop": info["nerber_shop"],
183 | # "longitude": info["longitude"],
184 | # "latitude": info["latitude"],
185 | # "chengjiao_url":info["chengjiao_url"],
186 | # "imgs":info["imgs"],
187 | # "nerber_xiaoqu":info["nerber_xiaoqu"],
188 | # "xiaoqu_name_other":info["xiaoqu_name_other"],
189 | # "status": info["status"],
190 | # "html":info["html"],
191 | # "follow":info["follow"],
192 | # "sale_url": info["sale_url"],
193 | # "rent_url": info["rent_url"],
194 | # "ts":info["ts"]
195 | # }},True)
196 | #
197 | # ###成交部分
198 | # def lianjia_chengjiao_insert_seed(self, seed):
199 | # '''
200 | # 链家成交种子保存
201 | # :param seed:
202 | # :return:
203 | # '''
204 | # log.msg("[{0} line:{1}] insert LianJiaChengJiaoFangSeed=====>>>>>插入链家成交列表页种子".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
205 | # return self.db["LianJiaChengJiaoFangSeed"].insert_many(seed)
206 | #
207 | # ###二手房部分
208 | # def lianjia_ershoufang_insert_seed(self, seed):
209 | # '''
210 | # 链家二手房种子保存
211 | # :param seed:
212 | # :return:
213 | # '''
214 | # return self.db["LianJiaErShouFangSeed"].insert_many(seed)
215 | #
216 | # def lianjia_ershoufang_find_seed1(self):
217 | # '''
218 | # 链家二手房列表页种子提取
219 | # :return:
220 | # '''
221 | # return self.db["LianJiaErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0})
222 | #
223 | # def lianjia_ershoufang_find_seed2(self):
224 | # '''
225 | # 链家二手房详细页种子提取
226 | # :return:
227 | # '''
228 | # return self.db["LianJiaErShouFangInfo"].find({"status": 0}, {"url": 1, "_id": 0})
229 | #
230 | # def lianjia_ershoufang_update_seed(self,seed):
231 | # '''
232 | # 链家二手房列表页种子状态更新
233 | # :param seed:
234 | # :return:
235 | # '''
236 | # print("update操作======》更新链家二手房列表页状态: "+str(seed))
237 | # return self.db["LianJiaErShouFangSeed"].update_one({"url":seed["url"]},
238 | # {"$set":{"status":seed["status"],
239 | # "ts":seed["ts"]}})
240 | # def lianjia_ershoufang_update_info_seed(self,seed):
241 | # '''
242 | # 链家二手房详细页状态更新
243 | # :param seed:
244 | # :return:
245 | # '''
246 | # print("update操作======》更新链家二手房详细页状态: "+str(seed))
247 | # return self.db["LianJiaErShouFangInfo"].update_one({"url":seed["url"]},
248 | # {"$set":{"status":seed["status"],
249 | # "ts":seed["ts"]}})
250 | # def lianjia_ershoufang_update_info(self, info):
251 | # '''
252 | # 链家二手房详细信息更新
253 | # :param info:
254 | # :return:
255 | # '''
256 | # print("update操作======》更新链家二手房详细信息: " + info['url'])
257 | # self.db["LianJiaErShouFangInfo"].update({"url": info['url']},
258 | # {"$set":{"buyPoint": info["buyPoint"],
259 | # "layout": info['layout'],
260 | # "floor" : info["floor"],
261 | # "buildArea": info["buildArea"],
262 | # "layoutStructure": info["layoutStructure"],
263 | # "area": info["area"],
264 | # "buildType": info["buildType"],
265 | # "chaoXiang": info["chaoXiang"],
266 | # "buildStructure": info["buildStructure"],
267 | # "decoration": info["decoration"],
268 | # "ladderProportion": info["ladderProportion"],
269 | # "heatingMode": info["heatingMode"],
270 | # "propertyRightYear": info["propertyRightYear"],
271 | # "publishDate": info["publishDate"],
272 | # "transAttributes": info["transAttributes"],
273 | # "lastTransaction": info["lastTransaction"],
274 | # "houseUse": info["houseUse"],
275 | # "houseYear": info["houseYear"],
276 | # "propertybelong": info["propertybelong"],
277 | # "emortgage": info["emortgage"],
278 | # "backUp": info["backUp"],
279 | # "houseTag": info["houseTag"],
280 | # "traffic": info["traffic"],
281 | # "decoration_desc": info["decoration_desc"],
282 | # "layout_instru": info["layout_instru"],
283 | # "longitude": info["longitude"],
284 | # "latitude": info["latitude"],
285 | # "ts":info["ts"]
286 | # }} ,True)
287 | #
288 | # def lianjia_chengjiaofang_update_seed(self,seed):
289 | # '''
290 | # 链家成交房列表页种子状态更新
291 | # :param seed:
292 | # :return:
293 | # '''
294 | # log.msg("[{0} line:{1}] update LianJiaChengJiaoFangSeed=====>>>>>更新链家成交房列表页状态".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
295 | # return self.db["LianJiaChengJiaoFangSeed"].update_one({"url":seed["url"]},
296 | # {"$set":{"status":seed["status"],
297 | # "ts":seed["ts"]}})
298 | #
299 | # def lianjia_chengjiaofang_find_seed(self):
300 | # '''
301 | # 链家小区交易种子提取
302 | # :return:
303 | # '''
304 | # return self.db["LianJiaXiaoQuInfo"].find({"status": 1,"chengjiao_url":{"$ne":""}}, {"chengjiao_url": 1, "_id": 0})
305 | #
306 | # def lianjia_chengjiaofang_find_seed1(self):
307 | # '''
308 | # 链家成交房列表页种子提取
309 | # :return:
310 | # '''
311 | # return self.db["LianJiaChengJiaoFangSeed"].find({"status": 0}, {"url": 1, "_id": 0})
312 | #
313 | # def lianjia_chengjiaofang_find_seed2(self):
314 | # '''
315 | # 链家成交房详细页种子提取
316 | # :return:
317 | # '''
318 | # return self.db["LianJiaChengJiaoFangInfo"].find({"status": 0}, {"chengjiao_url": 1, "_id": 0})
319 | #
320 | # def lianjia_chengjiaofang_update_info(self, info):
321 | # '''
322 | # 链家成交房详细信息更新
323 | # :param info:
324 | # :return:
325 | # '''
326 | # log.msg("[{0} line:{1}] update LianJiaChengJiaoFangInfo=====>>>>>更新链家成交房详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
327 | # self.db["LianJiaChengJiaoFangInfo"].update({"chengjiao_url": info['chengjiao_url']},
328 | # {"$set":{"xiaoqu_id": info["xiaoqu_id"],
329 | # "chengjiao_url": info["chengjiao_url"],
330 | # "trade_date": info["trade_date"],
331 | # "trade_channel": info["trade_channel"],
332 | # "total_price": info['total_price'],
333 | # "unit_price" : info["unit_price"],
334 | # "list_price": info["list_price"],
335 | # "transaction_cycle": info['transaction_cycle'],
336 | # "modify_price": info["modify_price"],
337 | # "watch": info["watch"],
338 | # "follow": info["follow"],
339 | # "layout": info["layout"],
340 | # "floor": info["floor"],
341 | # "build_area": info["build_area"],
342 | # "layout_structure": info["layout_structure"],
343 | # "area": info["area"],
344 | # "build_type": info["build_type"],
345 | # "orientation": info["orientation"],
346 | # "house_year": info["house_year"],
347 | # "build_year": info["build_year"],
348 | # "decoration": info["decoration"],
349 | # "build_structure": info["build_structure"],
350 | # "ladder_ratio": info["ladder_ratio"],
351 | # "heating_mode": info["heating_mode"],
352 | # "right_year": info["right_year"],
353 | # "has_elevator": info["has_elevator"],
354 | # "publish_date": info["publish_date"],
355 | # "transaction_attr": info["transaction_attr"],
356 | # "last_tranfic": info["last_tranfic"],
357 | # "house_use": info["house_use"],
358 | # "house_year": info["house_year"],
359 | # "right_belong": info["right_belong"],
360 | # "layout_instru": info["layout_instru"],
361 | # "emortgage": info["emortgage"],
362 | # "back_up": info["back_up"],
363 | # "record": info["record"],
364 | # "house_tag": info["house_tag"],
365 | # "xiaoqu_instru":info["xiaoqu_instru"],
366 | # "sax_analysis": info["sax_analysis"],
367 | # "traffic": info["traffic"],
368 | # "decoration_desc": info["decoration_desc"],
369 | # "layout_instru": info["layout_instru"],
370 | # "buy_point": info["buy_point"],
371 | # "imgs": info["imgs"],
372 | # "ts":info["ts"],
373 | # "status":info["status"],
374 | # "html":info["html"]
375 | # }} ,True)
376 | #
377 | # def lianjia_xiaoqu_update_chengjiao_seed(self, info):
378 | # '''
379 | # 链家成交房详细信息更新
380 | # :param info:
381 | # :return:
382 | # '''
383 | # log.msg("[{0} line:{1}] update LianJiaXiaoQuInfo=====>>>>>更新链家小区成交房种子状态 status=2".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
384 | # self.db["LianJiaXiaoQuInfo"].update({"chengjiao_url": info['chengjiao_url']},
385 | # {"$set":{"status": info["status"],
386 | # "ts":info["ts"]
387 | # }} ,True)
388 | #
389 | #
390 | # #房价
391 | def lianjia_fangjia_insert_seed(self, seed):
392 | '''
393 |         Insert LianJia house-price seeds
394 | :param seed:
395 | :return:
396 | '''
397 |         print("insert ======> inserting LianJia house-price seeds")
398 | return self.db["LianJiaFangJiaSeed"].insert(seed)
399 | #
400 | def lianjia_fangjia_update_seed(self,seed):
401 | '''
402 |         Update the status of a LianJia house-price seed
403 | :param seed:
404 | :return:
405 | '''
406 |         print("update ======> updating LianJia house-price seed status: "+str(seed))
407 | return self.db["LianJiaFangJiaSeed"].update_one({"url":seed["url"]},
408 | {"$set":{"status":seed["status"],
409 | "ts":seed["ts"]}})
410 | #
411 | #
412 | def lianjia_fangjia_find_seed(self):
413 | '''
414 | :return:
415 | '''
416 |         print("find ======> querying LianJia house-price seeds")
417 | return self.db["LianJiaFangJiaSeed"].find({"status": 0}, {"url": 1, "_id": 0})
418 | #
419 | # ####租房部分
420 | # def lianjia_zufang_insert_seed(self, seed):
421 | # return self.db["LianJiaZuFangSeed"].insert_many(seed)
422 | #
423 | # ##未实现循环读取Redis
424 | # def lianjia_zufang_find_seed(self):
425 | # return self.db["LianJiaZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0})
426 | #
427 | # def lianjia_zufang_update_seed(self,seed):
428 | # print("update操作======》更新链家租房url状态: "+str(seed))
429 | # return self.db["LianJiaZuFangSeed"].update_one({"url":seed["url"]},
430 | # {"$set":{"status":seed["status"],
431 | # "ts":seed["ts"]}})
432 | #
433 | # #################################我爱我家##################################################
434 | # ###我爱我家小区
435 | #
436 | # def f5j5j_xiaoqu_insert_seed(self, seed):
437 | # '''
438 | # 我爱我家小区列表页种子保存
439 | # :param seed:
440 | # :return:
441 | # '''
442 | # return self.db["F5J5JXiaoQuSeed"].insert_many(seed)
443 | #
444 | # def f5j5j_xiaoqu_find_seed1(self):
445 | # '''
446 | # 我爱我家小区列表页种子提取
447 | # :return:
448 | # '''
449 | # print("finid操作======》查询我爱我家小区列表页种子")
450 | # return self.db["F5J5JXiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0})
451 | #
452 | # def f5j5j_xiaoqu_find_seed2(self):
453 | # '''
454 | # 我爱我家小区详细信息种子提取
455 | # :return:
456 | # '''
457 | # print("finid操作======》查询我爱我家详细信息页种子")
458 | # return self.db["F5J5JXiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0})
459 | #
460 | # def f5j5j_xiaoqu_update_seed(self, seed):
461 | # '''
462 | # 更新小区列表页种子状态
463 | # :param seed:
464 | # :return:
465 | # '''
466 | # log.msg("[{0} line:{1}] update F5J5JXiaoQuSeed=====>>>>>更新我爱我家列表页种子".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
467 | # return self.db["F5J5JXiaoQuSeed"].update_one({"url":seed["url"]},
468 | # {"$set":{"status":seed["status"],
469 | # "ts":seed["ts"]}})
470 | #
471 | # def f5j5j_xiaoqu_update_info(self, info):
472 | # '''
473 | # 更新我爱我家详细页种子状态和详细信息
474 | # :param info:
475 | # :return:
476 | # '''
477 | # log.msg("[{0} line:{1}] update F5J5JXiaoQuInfo=====>>>>>更新我爱我家详细页种子状态和详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
478 | # return self.db["F5J5JXiaoQuInfo"].update({"xiaoqu_url" : info["xiaoqu_url"]},
479 | # {"$set":{"address": info["address"],
480 | # "build_year": info['build_year'],
481 | # "build_type": info['build_type'],
482 | # "lou_dong_count": info["lou_dong_count"],
483 | # "house_count": info["house_count"],
484 | # "region_three" : info["region_three"],
485 | # "region_four": info["region_four"],
486 | # "follow": info["follow"],
487 | # "chengjiao_url":info["chengjiao_url"],
488 | # "property_company": info["property_company"],
489 | # "developer": info["developer"],
490 | # "greening_rate": info["greening_rate"],
491 | # "traffic": info["traffic"],
492 | # "nerber_shop": info["nerber_shop"],
493 | # "trend": info['trend'],
494 | # "imgs": info["imgs"],
495 | # "longitude": info["longitude"],
496 | # "latitude": info["latitude"],
497 | # "status": info["status"],
498 | # "ts":info["ts"]
499 | # }} ,True)
500 | #
501 | # def f5j5j_xiaoqu_img_find(self):
502 | # return self.db["F5J5JXiaoQuImg"].find({"status": 0}, {"image_urls": 1, "_id": 0,"id":1})
503 | #
504 | # def f5j5j_xiaoqu_img_update(self,item):
505 | # return self.db["F5J5JXiaoQuImg"].update_many({"id":item["id"]},
506 | # {"$set":{"status":item["status"],
507 | # "ts":item["ts"]}})
508 | # #成交
509 | # def f5j5j_chengjiao_find_seed(self):
510 | # '''
511 | # 成交种子提取
512 | # :return:
513 | # '''
514 | # print("finid操作======》查询我爱我家成交房详细信息页种子")
515 | # return self.db["F5J5JXiaoQuInfo"].find({"status": 1}, {"chengjiao_url": 1, "_id": 0})
516 | #
517 | # def f5j5j_chengjiao_update_info(self,seed):
518 | # '''
519 | # 更新种子
520 | # :return:
521 | # '''
522 | # log.msg("[{0} line:{1}] update F5J5JXiaoQuInfo=====>>>>>更新我爱我家小区成交房种子状态[chengjiao_url:{2}]".format(self.__class__.__name__, sys._getframe().f_lineno,seed["chengjiao_url"]), level=log.INFO)
523 | # return self.db["F5J5JXiaoQuInfo"].update_one({"chengjiao_url": seed["chengjiao_url"]},
524 | # {"$set":{
525 | # "status":seed["status"],
526 | # "ts":seed["ts"]
527 | # }},True)
528 | #
529 | # #二手房
530 | # def f5j5j_ershoufang_insert_seed(self, seed):
531 | # return self.db["F5J5JErShouFangSeed"].insert_many(seed)
532 | #
533 | # def f5j5j_ershoufang_find_seed1(self):
534 | # '''
535 | # 我爱我家小区列表页种子提取
536 | # :return:
537 | # '''
538 | # print("finid操作======》查询我爱我家小区列表页种子")
539 | # return self.db["F5J5JErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0})
540 | #
541 | # def f5j5j_ershoufang_find_seed2(self):
542 | # '''
543 | # 我爱我家小区详细信息种子提取
544 | # :return:
545 | # '''
546 | # print("finid操作======》查询我爱我家详细信息页种子")
547 | # return self.db["F5J5JErShouFangInfo"].find({"status": 0}, {"url": 1, "_id": 0})
548 | #
549 | # def f5j5j_zufang_insert_seed(self, seed):
550 | # return self.db["F5J5JZuFangSeed"].insert_many(seed)
551 | #
552 | # def f5j5j_ershoufang_find_seed(self):
553 | # return self.db["F5J5JErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50)
554 | #
555 | # def f5j5j_ershoufang_update_seed(self,seed):
556 | # print("update操作======》更新我爱我家二手房url状态: "+str(seed))
557 | # return self.db["F5J5JErShouFangSeed"].update_one({"url":seed["url"]},
558 | # {"$set":{"status":seed["status"],
559 | # "ts":seed["ts"]}})
560 | #
561 | # def f5j5j_ershoufang_update_info(self, info):
562 | # print("update操作======》更新我爱我家二手房详细信息: " + info['url'])
563 | # self.db["F5J5JErShouFangInfo"].update({"url": info['url']},
564 | # {"$set": {"buyPoint": info["buyPoint"],
565 | # "layout": info['layout'],
566 | # "floor": info["floor"],
567 | # "area": info["area"],
568 | # "publishDate": info["publishDate"],
569 | # "buildYear": info["buildYear"],
570 | # "layout_instru": info["layout_instru"],
571 | # "traffic" :info["traffic"],
572 | # "taxAnalysis": info["taxAnalysis"],
573 | # "loanSituation": info["loanSituation"],
574 | # "arroundMatch": info["arroundMatch"],
575 | # "propertyMortgage": info["propertyMortgage"],
576 | # "xiaoquInfo": info["xiaoquInfo"],
577 | # "arroundMatch": info["arroundMatch"],
578 | # "status":info["status"],
579 | # "ts": info["ts"]
580 | # }}, True)
581 | #
582 | # def f5j5j_zufang_find_seed(self):
583 | # return self.db["F5J5JZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50)
584 | #
585 | # ###########################tc58#################################
586 | # ###五八同城小区
587 | # def tc58_xiaoqu_insert_seed(self, seed):
588 | # '''
589 | # 小区列表页种子入库
590 | # :param seed:
591 | # :return:
592 | # '''
593 | # print("insert操作======》tc58小区列表页种子入库")
594 | # return self.db["TC58XiaoQuSeed"].insert(seed)
595 | #
596 | # def tc58_xiaoqu_update_seed(self, info):
597 | # log.msg("update操作======》tc58小区列表页种子更新{0}".format(info['url']),level=log.INFO)
598 | # self.db["TC58XiaoQuSeed"].update({"url": info['url']},
599 | # {"$set": {"status": info['status'],
600 | # "ts": info["ts"]
601 | # }})
602 | #
603 | # def tc58_xiaoqu_update_info(self, info):
604 | # log.msg("update操作======》tc58小区详细页种子更新{0}".format(info['xiaoqu_url']),level=log.INFO)
605 | # self.db["TC58XiaoQuInfo"].update({"xiaoqu_url": info['xiaoqu_url']},
606 | # {"$set": {"xiaoqu_name_two": info['xiaoqu_name_two'],
607 | # "huan_bi": info["huan_bi"],
608 | # "address":info["address"],
609 | # "build_type":info["build_type"],
610 | # "house_count": info["house_count"],
611 | # "property_type": info["property_type"],
612 | # "property_cost": info["property_cost"],
613 | # "far": info["far"],
614 | # "build_year": info["build_year"],
615 | # "greening_rate": info["greening_rate"],
616 | # "building_foot_print": info["building_foot_print"],
617 | # "building_area": info["building_area"],
618 | # "property_company": info["property_company"],
619 | # "developer": info["developer"],
620 | # "xiaoqu_id": info["xiaoqu_id"],
621 | # "trend": info["trend"],
622 | # "latitude":info["latitude"],
623 | # "longitude":info["longitude"],
624 | # "ts": info["ts"],
625 | # "status": info["status"]
626 | # }},True)
627 | #
628 | # def tc58_xiaoqu_find_seed1(self):
629 | # '''
630 | # tc58小区列表页种子提取
631 | # :return:
632 | # '''
633 | # print("finid操作======》查询链家小区列表页种子")
634 | # return self.db["TC58XiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0})
635 | #
636 | # def tc58_xiaoqu_find_seed2(self):
637 | # '''
638 | # tc58小区详细信息种子提取
639 | # :return:
640 | # '''
641 | # print("finid操作======》查询链家详细信息页种子")
642 | # return self.db["TC58XiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0})
643 | #
644 | #
645 | # def tc58_personfang_update_seed(self, info):
646 | # log.msg("update操作======》个人房源列表页种子更新{0}".format(info['url']),level=log.INFO)
647 | # self.db["TC58PersonFangSeed"].update({"url": info['url']},
648 | # {"$set": {"status": info['status'],
649 | # "ts": info["ts"]
650 | # }},True)
651 | #
652 | # def tc58_personfang_update_info(self, info):
653 | # log.msg("update操作======》tc58个人房源种子更新{0}".format(info['ershoufang_url']),level=log.INFO)
654 | # self.db["TC58PersonFangInfo"].update({"ershoufang_url": info['ershoufang_url']},
655 | # {"$set": {"publish_date": info['publish_date'],
656 | # "total_price": info["total_price"],
657 | # "unit_price": info['unit_price'],
658 | # "xiaoqu_name_two": info["xiaoqu_name_two"],
659 | # "floor": info["floor"],
660 | # "layout": info["layout"],
661 | # "decoration": info["decoration"],
662 | # "area": info["area"],
663 | # "right_year": info["right_year"],
664 | # "orientation": info["orientation"],
665 | # "build_year": info["build_year"],
666 | # "buy_point": info["buy_point"],
667 | # "house_use": info["house_use"],
668 | # "transaction_attr": info["transaction_attr"],
669 | # "status": info["status"],
670 | # "ts": info["ts"]
671 | # }},True)
672 | #
673 | # def tc58_personfang_find_seed(self):
674 | # '''
675 | # tc58小区详细信息种子提取
676 | # :return:
677 | # '''
678 | # print("find操作======》查询tc58详细信息页种子")
679 | # return self.db["TC58PersonFangInfo"].find({"status": 0}, {"ershoufang_url": 1, "_id": 0})
680 | #
681 | # def tc58_xiaoqu_list_insert_seed(self, seed):
682 | # '''
683 | # 小区列表页种子入库
684 | # :param seed:
685 | # :return:
686 | # '''
687 | # print("insert操作======》小区信息")
688 | # return self.db["TC58XiaoQu_list"].insert(seed)
689 | #
690 | #
691 | # #######################二手房
692 | #
693 | # def tc58_ershoufang_find_seed_from_xiaoqu(self):
694 | # return self.db["TC58XiaoQu"].find({"status": 0}, {"erShouFangUrl": 1, "_id": 0}).limit(50)
695 | #
696 | # def tc58_ershoufang_insert_seed(self, seed):
697 | # return self.db["TC58ErShouFangSeed"].insert_many(seed)
698 | #
699 | # def tc58_ershoufang_insert_info(self, info):
700 | # return self.db["TC58ErShouFangInfo"].insert_one({"url":info["url"],
701 | # "title" : info["title"],
702 | # "address": info["address"],
703 | # "totalPrice": info["totalPrice"],
704 | # "unitPrice": info["unitPrice"],
705 | # "area":info["area"],
706 | # "status":0})
707 | #
708 | #
709 | # def tc58_ershoufang_find_seed(self):
710 | # return self.db["TC58ErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50)
711 | #
712 | # ####################租房
713 | #
714 | # def tc58_zufang_find_seed_from_xiaoqu(self):
715 | # return self.db["TC58XiaoQu"].find({"status": 0}, {"zuFangUrl": 1, "_id": 0}).limit(50)
716 | #
717 | # def tc58_zufang_find_seed(self):
718 | # return self.db["TC58ZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50)
719 | #
720 | # def tc58_zufang_insert_seed(self, seed):
721 | # return self.db["TC58ZuFangSeed"].insert_many(seed)
722 | #
723 | # def tc58_zufang_insert_info(self, info):
724 | # return self.db["TC58ZuFangInfo"].insert_one({"url":info["url"],
725 | # "title" : info["title"],
726 | # "address": info["address"],
727 | # "unitPrice": info["unitPrice"],
728 | # "status":0})
729 | # ### 麦田小区信息 ###
730 | #
731 | # def maitian_xiaoqu_insert_seed(self, seed):
732 | # '''
733 | # 生成小区名称及相应的url
734 | # '''
735 | # print("insert操作======》麦田小区url")
736 | # return self.db["maitianXiaoQuSeed"].insert(seed)
737 | #
738 | # def maitian_xiaqu_url(self):
739 | # return self.db["maitianXiaoQuSeed"].find({}).limit(1400)
740 | #
741 | # def maitian_xiaoqu_insert(self, data):
742 | # return self.db["maitian_xiaoqu_info"].insert(data)
743 | #
744 | #
745 | #
746 | # ################################房天下##########################################################
747 | # #####房天下小区
748 | # def fang_xiaoqu_insert_seed(self, seed):
749 | # '''
750 | # 小区列表页种子入库
751 | # :param seed:
752 | # :return:
753 | # '''
754 | # log.msg("[{0} line:{1}] insert FangXiaoQuSeed=====>>>>>插入Fang天下小区url".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
755 | # return self.db["FangXiaoQuSeed"].insert(seed)
756 | #
757 | # def fang_xiaoqu_find_seed1(self):
758 | # return self.db["FangXiaoQuSeed"].find({"status": 0}).limit(50)
759 | #
760 | # def fang_xiaoqu_find_seed2(self):
761 | # return self.db["FangXiaoQuInfo"].find({"status": 0},{"xiaoqu_url": 1, "_id": 0}).limit(50)
762 | #
763 | # def fang_xiaoqu_update_seed(self, info):
764 | # log.msg("[{0} line:{1}] update FangXiaoQuSeed=====>>>>>Fang天下小区列表页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
765 | # self.db["FangXiaoQuSeed"].update({"url": info['url']},
766 | # {"$set": {"status": info['status'],
767 | # "ts": info["ts"]
768 | # }},True)
769 | #
770 | # def fang_xiaoqu_update_info(self, info):
771 | # log.msg("[{0} line:{1}] update FangXiaoQuInfo=====>>>>>Fang天下小区详细页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
772 | # self.db["FangXiaoQuInfo"].update({"xiaoqu_url": info['xiaoqu_url']},
773 | # {"$set": {"xiaoqu_name_two": info['xiaoqu_name_two'],
774 | # "sale_url": info["sale_url"],
775 | # "chengjiao_url":info["chengjiao_url"],
776 | # "rent_url":info["rent_url"],
777 | # "sale_total": info["sale_total"],
778 | # "chengjiao_total": info["chengjiao_total"],
779 | # "rent_total": info["rent_total"],
780 | # "xiaoqu_id": info["xiaoqu_id"],
781 | # "region_two": info["region_two"],
782 | # "region_three": info["region_three"],
783 | # "longitude": info["longitude"],
784 | # "latitude": info["latitude"],
785 | # "trend": info["trend"],
786 | # "unit_price": info["unit_price"],
787 | # "huan_bi": info["huan_bi"],
788 | # "tong_bi": info["tong_bi"],
789 | # "address": info["address"],
790 | # "property_type": info["property_type"],
791 | # "build_year": info["build_year"],
792 | # "developer": info["developer"],
793 | # "build_type": info["build_type"],
794 | # "building_area": info["building_area"],
795 | # "building_foot_print": info["building_foot_print"],
796 | # "property_company": info["property_company"],
797 | # "greening_rate": info["greening_rate"],
798 | # "far": info["far"],
799 | # "property_cost":info["property_cost"],
800 | # "follow":info["follow"],
801 | # "imgs":info["imgs"],
802 | # "layout_imgs": info["layout_imgs"],
803 | # "status": info["status"],
804 | # "ts": info["ts"]
805 | # }},True)
806 | # ####房天下二手房
807 | # def fang_ershoufang_insert_seed(self, seed):
808 | # '''
809 | # 房天下二手房列表页种子入库
810 | # :param seed:
811 | # :return:
812 | # '''
813 | # log.msg("[{0} line:{1}] insert FangErShouFangSeed=====>>>>>房天下二手房url".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
814 | # return self.db["FangErShouFangSeed"].insert(seed)
815 | #
816 | # def fang_ershoufang_find_seed(self):
817 | # return self.db["FangXiaoQuInfo"].find({"$and":[{"sale_url":{"$ne":None}},{"sale_url":""},{"status": 1}]},{"xiaoqu_url": 1, "_id": 0})
818 | #
819 | # def fang_ershoufang_info(self):
820 | # return self.db["FangErShouFangInfo"].find({"status": 1,"zf_jjname": "业主"},{"formUrl": 1, "_id": 0})
821 | #
822 | # def fang_ershoufang_info_update(self, info):
823 | # log.msg("update操作======》Fang天下二手房小区名称", level=log.INFO)
824 | # self.db["FangErShouFangInfo"].update_many({"formUrl": info['formUrl']},
825 | # {"$set": {"xiaoquName": info['xiaoquName'],
826 | # "status":info["status"]
827 | # }}, True)
828 | #
829 | # def fang_ershoufang_find_seed1(self):
830 | # return self.db["FangErShouFangSeed"].find({"status": 0})
831 | #
832 | # def fang_ershoufang_find_seed2(self):
833 | # return self.db["FangErShouFangInfo"].find({"status": 0},{"ershoufang_url": 1, "_id": 0})
834 | #
835 | # def fang_ershoufang_find_seed3(self):
836 | # return self.db["FangXiaoQuInfo"].find({"$and":[{"sale_url":{"$ne":None}},{"sale_url":{"$ne":""}},{"status": 1}]},{"sale_url": 1, "_id": 0})
837 | #
838 | # def fang_ershoufang_update_seed(self, info):
839 | # log.msg("update操作======》Fang天下二手房列表页种子更新{0}".format(info['url']),level=log.INFO)
840 | # self.db["FangErShouFangSeed"].update({"url": info['url']},
841 | # {"$set": {"status": info['status'],
842 | # "ts": info["ts"]
843 | # }},True,True)
844 | # def fang_xiaoqu_ershoufang_update_seed(self, info):
845 | # log.msg("[{0} line:{1}] update FangXiaoQuInfo=====>>>>>Fang天下小区列表页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
846 | # self.db["FangXiaoQuInfo"].update({"sale_url": info['sale_url']},
847 | # {"$set": {"status": info['status'],
848 | # "ts": info["ts"]
849 | # }},True)
850 | #
851 | # def fang_xiaoqu_ershoufang_update_seed2(self, info):
852 | # log.msg("[{0} line:{1}] update FangErShouFangSeed=====>>>>>Fang天下小区列表页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
853 | # self.db["FangXiaoQuInfo"].update({"xiaoqu_url": info['xiaoqu_url']},
854 | # {"$set": {"status": info['status'],
855 | # "ts": info["ts"]
856 | # }},True)
857 | #
858 | # def fang_ershoufang_update_info(self, info):
859 | # log.msg("[{0} line:{1}] update FangErShouFangInfo=====>>>>>Fang天下二手房详细页种子更新".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
860 | # self.db["FangErShouFangInfo"].update({"ershoufang_url": info['ershoufang_url']},
861 | # {"$set": {"total_price": info['total_price'],
862 | # "decoration": info["decoration"],
863 | # "floor":info["floor"],
864 | # "layout":info["layout"],
865 | # "build_area": info["build_area"],
866 | # "unit_price": info["unit_price"],
867 | # "orientation": info["orientation"],
868 | # "region_three": info["region_three"],
869 | # "build_year": info["build_year"],
870 | # "has_elevator": info["has_elevator"],
871 | # "house_use": info["house_use"],
872 | # "build_structure": info["build_structure"],
873 | # "build_type": info["build_type"],
874 | # "publish_date": info["publish_date"],
875 | # "buy_point": info["buy_point"],
876 | # "fzzj": info["fzzj"],
877 | # "xiaoqu_instru": info["xiaoqu_instru"],
878 | # "ye_zhu": info["ye_zhu"],
879 | # "status":info["status"],
880 | # "ts": info["ts"]
881 | # }},True)
882 | #
883 | #
884 | # ######################################中原房产################################
885 | # ###小区
886 | # def zhongyuan_xiaoqu_insert_seed(self, seed):
887 | # '''
888 | # 小区列表页种子入库
889 | # :param seed:
890 | # :return:
891 | # '''
892 | # log.msg("[{0} line:{1}] insert ZhongYuanXiaoQuSeed =====>>>>> 中原房产小区列表页种子入库".format(self.__class__.__name__,sys._getframe().f_lineno),level=log.INFO)
893 | # return self.db["ZhongYuanXiaoQuSeed"].insert(seed)
894 | #
895 | # def zhongyuan_xiaoqu_update_seed(self, seed):
896 | # '''
897 | # 更新小区列表页种子状态
898 | # :param seed:
899 | # :return:
900 | # '''
901 | # log.msg("[{0} line:{1}] update ZhongYuanXiaoQuSeed =====>>>>> 更新中原房产列表页种子".format(self.__class__.__name__,sys._getframe().f_lineno),level=log.INFO)
902 | # return self.db["ZhongYuanXiaoQuSeed"].update_one({"url":seed["url"]},
903 | # {"$set":{"status":seed["status"],
904 | # "ts":seed["ts"]}})
905 | #
906 | # def zhongyuan_xiaoqu_update_info(self, info):
907 | # '''
908 | # 更新中原详细页种子状态和详细信息
909 | # :param info:
910 | # :return:
911 | # '''
912 | # log.msg("[{0} line:{1}] update ZhongYuanXiaoQuInfo=====>>>>>更新家中原地产详细页种子状态和详细信息".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
913 | # return self.db["ZhongYuanXiaoQuInfo"].update({"xiaoqu_url" : info["xiaoqu_url"]},
914 | # {"$set":{"xiaoqu_name_other": info["xiaoqu_name_other"],
915 | # "address": info['address'],
916 | # "region_three": info['region_three'],
917 | # "region_four" : info["region_four"],
918 | # "property_type": info["property_type"],
919 | # "build_year": info["build_year"],
920 | # "property_cost" : info["property_cost"],
921 | # "property_company": info["property_company"],
922 | # "developer": info["developer"],
923 | # "far": info["far"],
924 | # "greening_rate": info["greening_rate"],
925 | # "sale_url":info["sale_url"],
926 | # "rent_url":info["rent_url"],
927 | # "chengjiao_url":info["chengjiao_url"],
928 | # "latitude":info["latitude"],
929 | # "longitude": info["longitude"],
930 | # "ts":info["ts"],
931 | # "status":info["status"],
932 | # "imgs": info["imgs"],
933 | # "trend": info["trend"],
934 | # "html": info["html"],
935 | # "xiaoqu_id":info["xiaoqu_id"]
936 | # }},True)
937 | #
938 | # def zhongyuan_xiaoqu_find_seed1(self):
939 | # '''
940 | # 中原地产小区列表页种子提取
941 | # :return:
942 | # '''
943 | # print("finid操作======》查询中原地产小区列表页种子")
944 | # return self.db["ZhongYuanXiaoQuSeed"].find({"status": 0}, {"url": 1, "_id": 0})
945 | #
946 | # def zhongyuan_xiaoqu_find_seed2(self):
947 | # '''
948 | # 中原小区详细信息种子提取
949 | # :return:
950 | # '''
951 | # print("finid操作======》查询中原地产详细信息页种子")
952 | # return self.db["ZhongYuanXiaoQuInfo"].find({"status": 0}, {"xiaoqu_url": 1, "_id": 0})
953 | #
954 | # def zhongyuan_chengjiaofang_find_seed(self):
955 | # '''
956 | # 中原小区交易种子提取
957 | # :return:
958 | # '''
959 | # return self.db["ZhongYuanXiaoQuInfo"].find({"status": 1,"chengjiao_url":{"$ne":""}}, {"chengjiao_url": 1, "_id": 0})
960 | #
961 | # def zhongyuan_xiaoqu_update_chengjiao_seed(self, info):
962 | # '''
963 | # 中原小区
964 | # :param info:
965 | # :return:
966 | # '''
967 | # log.msg("[{0} line:{1}] update ZhongYuanXiaoQuInfo=====>>>>>更新链家小区成交房种子状态 status=2".format(self.__class__.__name__, sys._getframe().f_lineno), level=log.INFO)
968 | # self.db["ZhongYuanXiaoQuInfo"].update({"chengjiao_url": info['chengjiao_url']},
969 | # {"$set":{"status": info["status"],
970 | # "ts":info["ts"]
971 | # }} ,True)
972 | #
973 | #
974 | # ###############################赶集网###############################################
975 | # def ganji_ershoufang_insert_seed(self, seed):
976 | # return self.db["GanJiErShouFangSeed"].insert_many(seed)
977 | #
978 | # def ganji_ershoufang_find_seed(self):
979 | # return self.db["GanJiErShouFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50)
980 | #
981 | # def ganji_ershoufang_insert_info(self, info):
982 | # return self.db["GanJiErShouFangInfo"].insert_one({"url":info["url"],
983 | # "title" : info["title"],
984 | # "layout": info["layout"],
985 | # "area": info["area"],
986 | # "chaoXiang": info["chaoXiang"],
987 | # "floor": info["floor"],
988 | # "decoration": info["decoration"],
989 | # "xiaoquName": info["xiaoquName"],
990 | # "xiaoquUrl": info["xiaoquUrl"],
991 | # "address": info["address"],
992 | # "unitPrice": info["unitPrice"],
993 | # "totalPrice": info["totalPrice"],
994 | # "status":0})
995 | #
996 | # def ganji_zufang_insert_seed(self, seed):
997 | # return self.db["GanJiZuFangSeed"].insert_many(seed)
998 | #
999 | # def ganji_zufang_find_seed(self):
1000 | # return self.db["GanJiZuFangSeed"].find({"status": 0}, {"url": 1, "_id": 0}).limit(50)
1001 | #
1002 | # def ganji_zufang_insert_info(self, info):
1003 | # return self.db["GanJiZuFangInfo"].insert_one({"url":info["url"],
1004 | # "title" : info["title"],
1005 | # "layout": info["layout"],
1006 | # "area": info["area"],
1007 | # "chaoXiang": info["chaoXiang"],
1008 | # "floor": info["floor"],
1009 | # "decoration": info["decoration"],
1010 | # "xiaoquName": info["xiaoquName"],
1011 | # "xiaoquUrl": info["xiaoquUrl"],
1012 | # "address": info["address"],
1013 | # "unitPrice": info["unitPrice"],
1014 | # "leasehold":info["leasehold"],
1015 | # "publishDate":info["publishDate"],
1016 | # "status":0})
1017 |
1018 | ##########img################################################
1019 | class ImgDownloadPipeline(ImagesPipeline):
1020 |     # yield a download request for each image url
1021 | def get_media_requests(self, item, info):
1022 | for image_url in item['image_urls']:
1023 | yield scrapy.Request(image_url)
1024 |
1025 | def item_completed(self, results, item, info):
1026 | image_paths = [x['path'] for ok, x in results if ok]
1027 | if not image_paths:
1028 | raise DropItem("Item contains no images")
1029 | item['image_paths'] = image_paths
1030 | return item
1031 |
1032 | #########redis#############################################
1033 | class RedisPipeline(object):
1034 |
1035 | def __init__(self):
1036 | if not hasattr(RedisPipeline, 'pool'):
1037 | RedisPipeline.create_pool()
1038 | self._connection = redis.Redis(connection_pool=RedisPipeline.pool)
1039 |
1040 | @staticmethod
1041 | def create_pool():
1042 | RedisPipeline.pool = redis.ConnectionPool(
1043 | host="127.0.0.1",
1044 | port=6379,
1045 | db=0)
1046 |
1047 | def set_lianjia_seed(self, key, value):
1048 |         '''set data with (key, value)
1049 | '''
1050 | return self._connection.lpush(key, value)
1051 |
1052 | def set_seed(self, key, value):
1053 |         '''set data with (key, value)
1054 | '''
1055 | return self._connection.lpush(key, value)
1056 |
1057 | def list_len(self,key):
1058 | '''
1059 |         Get the length of the list
1060 | :return:
1061 | '''
1062 | return self._connection.llen(key)
1063 |
1064 | def delete_key(self,key):
1065 | return self._connection.delete(key)
1066 |
1067 |
1068 |
1069 | ####mysql######################################################
1070 | class MysqlPipline(object):
1071 |     # static singleton instance
1072 | __instance = None
1073 |
1074 | def __init__(self):
1075 | pass
1076 |
1077 | def __new__(cls, *args, **kwargs):
1078 | if not cls.__instance:
1079 | try:
1080 | Lock.acquire()
1081 | # double check
1082 | if not cls.__instance:
1083 | cls.conn = pymysql.connect(host=settings['MYSQL_HOST'],
1084 | port=settings['MYSQL_PORT'],
1085 | user=settings['MYSQL_USER'],
1086 | passwd=settings['MYSQL_PASSWD'],
1087 | db=settings['MYSQL_DB'])
1088 | cls.cursor = cls.conn.cursor()
1089 | cls.__instance = super().__new__(cls, *args, **kwargs)
1090 | finally:
1091 | Lock.release()
1092 | return cls.__instance
1093 |
1094 |     # Use twisted adbapi to make MySQL inserts asynchronous (not implemented yet; process_item is a no-op)
1095 | def process_item(self, item, spider):
1096 | pass
1097 |
1098 | def close(self):
1099 | self.cursor.close()
1100 | self.conn.close()
1101 |
1102 | def handle_error(self, failure, item, spider):
1103 |         # handle errors raised by asynchronous inserts
1104 | print (failure)
1105 |
1106 | def excute_sql(self,sql):
1107 | try:
1108 | logger.info(f"excute_sql===>>> {sql}")
1109 | self.cursor.execute(sql)
1110 | self.conn.commit()
1111 | except Exception as e:
1112 | if "Duplicate" not in str(e):
1113 | self.conn.rollback()
1114 |
1115 |
1116 | # excel
1117 | # class ExcelPipeline(object):
1118 | # def __init__(self):
1119 | # self.wb = Workbook()
1120 | # self.ws = self.wb.active
1121 | # self.ws.append(['文章url', '文章title', '文章发布时间', '文章内容', '文章作者连接', '文章作者', '文章评论数量']) # 设置表头
1122 | #
1123 | # self.wb2 = Workbook()
1124 | # self.ws2 = self.wb2.active
1125 | # self.ws2.append(['文章url', '评论人', '评论时间', '评论内容', '评论给那一条', '评论给谁']) # 设置表头
1126 | #
1127 | # def process_item(self, item, spider):
1128 | # collection_name = item.__class__.__name__
1129 | # if collection_name == "DouBanItem":
1130 | # line = [item['article_url'], item['article_title'], item['article_publish_date'], item['article_content']
1131 | # , item['article_author_url'], item['article_author_name'],
1132 | # item['article_comment_quantity']] # 把数据中每一项整理出来
1133 | # self.ws.append(line) # 将数据以行的形式添加到xlsx中
1134 | # self.wb.save('content.xlsx') # 保存xlsx文件
1135 | # return item
1136 | # if collection_name == "CommentItem":
1137 | # line = [item['article_url'], item['comment_people'], item['comment_time'], item['comment_content']
1138 | # , item['comment_to_which_coment'], item['comment_to_Who']] # 把数据中每一项整理出来
1139 | # self.ws2.append(line) # 将数据以行的形式添加到xlsx中
1140 | # self.wb2.save('comment.xlsx') # 保存xlsx文件
1141 | # return item
1142 |
--------------------------------------------------------------------------------
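The spiders below feed excute_sql with fully formatted SQL built by f-strings, so a single quote in a scraped title breaks the INSERT (and only duplicate-key errors are tolerated). A hedged sketch of a parameterized insert that could replace that pattern; insert_seed is an illustrative helper, not a function from this project:

    import pymysql

    def insert_seed(conn, url, title, site_name, type_=1):
        # let pymysql quote the values instead of interpolating them into the SQL text
        sql = "insert into seed(url, title, site_name, type) values(%s, %s, %s, %s)"
        with conn.cursor() as cursor:
            cursor.execute(sql, (url, title, site_name, type_))
        conn.commit()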
/dz_spider/dz_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for dz_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'dz_spider'
13 |
14 | SPIDER_MODULES = ['dz_spider.spiders']
15 | NEWSPIDER_MODULE = 'dz_spider.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'dz_spider (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | DOWNLOAD_DELAY = 2
31 | # used together with DOWNLOAD_DELAY
32 | DOWNLOAD_TIMEOUT = 30
33 |
34 | # The download delay setting will honor only one of:
35 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
36 | #CONCURRENT_REQUESTS_PER_IP = 16
37 |
38 | # Disable cookies (enabled by default)
39 | COOKIES_ENABLED = False
40 |
41 | # Disable Telnet Console (enabled by default)
42 | TELNETCONSOLE_ENABLED = False
43 |
44 | # Override the default request headers:
45 | # DEFAULT_REQUEST_HEADERS = {
46 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
47 | # 'Accept-Language': 'en',
48 | # }
49 |
50 | # Enable or disable spider middlewares
51 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
52 | #SPIDER_MIDDLEWARES = {
53 | # 'dz_spider.middlewares.DzSpiderSpiderMiddleware': 543,
54 | #}
55 |
56 | # Enable or disable downloader middlewares
57 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
58 | DOWNLOADER_MIDDLEWARES = {
59 | 'dz_spider.middlewares.RotateUserAgentMiddleware':400,
60 | 'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware':500
61 | }
62 |
63 | # Enable or disable extensions
64 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
65 | #EXTENSIONS = {
66 | # 'scrapy.extensions.telnet.TelnetConsole': None,
67 | #}
68 |
69 | # Configure item pipelines
70 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
71 | # item persistence
72 | ITEM_PIPELINES = {
73 | #'house.pipelines.ImgDownloadPipeline': 100,
74 | # 'dz_spider.pipelines.MongoPipeline':2,
75 | 'dz_spider.pipelines.MysqlPipline':3,
76 | #'house.pipelines.ExcelPipeline':1,
77 | }
78 |
79 | # Enable and configure the AutoThrottle extension (disabled by default)
80 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
81 | #AUTOTHROTTLE_ENABLED = True
82 | # The initial download delay
83 | #AUTOTHROTTLE_START_DELAY = 5
84 | # The maximum download delay to be set in case of high latencies
85 | #AUTOTHROTTLE_MAX_DELAY = 60
86 | # The average number of requests Scrapy should be sending in parallel to
87 | # each remote server
88 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
89 | # Enable showing throttling stats for every response received:
90 | #AUTOTHROTTLE_DEBUG = False
91 |
92 | # Enable and configure HTTP caching (disabled by default)
93 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
94 | #HTTPCACHE_ENABLED = True
95 | #HTTPCACHE_EXPIRATION_SECS = 0
96 | #HTTPCACHE_DIR = 'httpcache'
97 | #HTTPCACHE_IGNORE_HTTP_CODES = []
98 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
99 |
100 |
101 | # MongoDB connection
102 | MONGO_URI="mongodb://127.0.0.1:27017/"
103 | MONGO_DATABASE="house"
104 | # MySQL connection
105 | MYSQL_HOST="127.0.0.1"
106 | MYSQL_PORT=3306
107 | MYSQL_DB="yuqing_db"
108 | MYSQL_USER="root"
109 | MYSQL_PASSWD="lang1994"
110 |
111 |
112 | LOG_LEVEL = "DEBUG"
113 |
--------------------------------------------------------------------------------
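MysqlPipline reads settings['MYSQL_HOST'], settings['MYSQL_PORT'] and the other keys above when it opens its connection, so the names here must match those lookups exactly. A small sketch, using Scrapy's get_project_settings, of how the values can be inspected from the directory that contains scrapy.cfg:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.get("MONGO_URI"), settings.get("MONGO_DATABASE"))
    print(settings.get("MYSQL_HOST"), settings.getint("MYSQL_PORT"), settings.get("MYSQL_DB"))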
/dz_spider/dz_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/dz_spider/dz_spider/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/spiders/__pycache__/baidu.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/baidu.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/spiders/__pycache__/sogou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/sogou.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/spiders/__pycache__/toutiao.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/langgithub/yuqing_system/8041e3666f7c4014c34bebea3265d852997f2f55/dz_spider/dz_spider/spiders/__pycache__/toutiao.cpython-36.pyc
--------------------------------------------------------------------------------
/dz_spider/dz_spider/spiders/baidu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import time
3 | import json
4 | import scrapy
5 | import requests
6 | import urllib.parse
7 | from dz_spider.pipelines import MysqlPipline
8 | from scrapy.log import logger
9 |
10 |
11 | class BaiduSpider(scrapy.Spider):
12 | name = 'baidu'
13 | allowed_domains = ['www.baidu.com']
14 | start_urls = ['https://www.baidu.com/s?wd=2018%E5%B9%B48%E6%9C%88%E8%BE%BE%E5%B7%9E%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&oq=2018%E5%B9%B48%E6%9C%88%E8%BE%BE%E5%B7%9E%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&ie=utf-8&rsv_pq=f23a79aa000d332f&rsv_t=7059xeWb4ls1KKoJ0h16REkV2j9830xUMMrCpERps%2BBRpST5YFJuXbPeYuo']
15 | mysql = MysqlPipline()
16 |
17 |
18 | def start_requests(self):
19 | for url in self.start_urls:
20 | for page in range(9):
21 | yield scrapy.Request(url=f"{url}&pn={page}0")
22 |
23 | def parse(self, response):
24 | t1=time.time()
25 | html=scrapy.Selector(text=response.text)
26 | divs=html.css("#content_left > div .f13 .c-tools::attr(data-tools)")
27 | for div in divs:
28 | data_str=div.extract()
29 | data_dict=json.loads(data_str)
30 | url=None
31 | try:
32 | url=requests.get(data_dict['url'],timeout=5).url
33 | schame = urllib.parse.urlparse(url).netloc
34 | sql = f"insert into seed(url,title,site_name,type) values('{url}','{data_dict['title']}','{schame}',1)"
35 | self.mysql.excute_sql(sql)
36 | except Exception as e:
37 | logger.error(f"requests.get(data_dict['url']).url ===>>> {str(e)}")
38 | t2=time.time()
39 | logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
--------------------------------------------------------------------------------
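A minimal sketch of launching this spider programmatically with Scrapy's CrawlerProcess, equivalent to running `scrapy crawl baidu` from the directory that holds scrapy.cfg:

    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl("baidu")   # spider name declared as BaiduSpider.name
    process.start()          # blocks until the crawl finishes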
/dz_spider/dz_spider/spiders/sogou.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import time
3 | import json
4 | import scrapy
5 | import requests
6 | import urllib.parse
7 | from dz_spider.pipelines import MysqlPipline
8 | from scrapy.log import logger
9 |
10 |
11 | class SogouSpider(scrapy.Spider):
12 | name = 'sogou'
13 | allowed_domains = ['www.sogou.com']
14 | start_urls = ['https://www.sogou.com/tx?query=2018%E5%B9%B48%E6%9C%88%E8%BE%BE%E5%B7%9E%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&hdq=sogou-wsse-3f7bcd0b3ea82268&duppid=1&cid=&s_from=result_up&sut=1606&sst0=1565226159021&lkt=0,0,0&sugsuv=006E51C00177C3995CB434B8FE950363&sugtime=1565226159021&ie=utf8&w=01029901&dr=1']
15 | mysql = MysqlPipline()
16 |
17 |
18 | def start_requests(self):
19 | for url in self.start_urls:
20 | for page in range(0,3):
21 | yield scrapy.Request(url=f"{url}&page={page}")
22 |
23 | def parse(self, response):
24 | t1=time.time()
25 | html=scrapy.Selector(text=response.text)
26 | divs=html.css("div.results > div")
27 | for div in divs:
28 | vrwrap=div.css("div.vrwrap")
29 | if len(vrwrap)==0:
30 | title = "".join(div.css("div.rb h3 a::text").extract())
31 | url = "https://www.sogou.com" + div.css("div.rb h3 a::attr(href)").extract()[0]
32 | else:
33 | title="".join(div.css("div.vrwrap h3 a::text").extract())
34 | url = "https://www.sogou.com"+div.css("div.vrwrap h3 a::attr(href)").extract()[0]
35 | try:
36 | _html=scrapy.Selector(text=requests.get(url,verify=False).text)
37 |                 url = _html.re(r'window\.location\.replace\("(.*?)"\)')[0]
38 | schame = urllib.parse.urlparse(url).netloc
39 | sql = f"insert into seed(url,title,site_name,type) values('{url}','{title}','{schame}',1)"
40 | self.mysql.excute_sql(sql)
41 | except Exception as e:
42 |                 logger.error(f"requests.get(url) failed ===>>> {str(e)}")
43 | t2=time.time()
44 | logger.info(f"执行===>>> {response.url} 花费时间{str(t2-t1)}")
45 |
--------------------------------------------------------------------------------
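Sogou result links point at an interstitial page that forwards to the real article with window.location.replace, which is why the spider re-fetches each link and pulls the target out with a regex. A self-contained check of that pattern against a made-up fragment of such a page (the HTML below is illustrative, not captured from sogou):

    import re

    sample = '<script>window.location.replace("http://example.com/news/123.html")</script>'
    match = re.search(r'window\.location\.replace\("(.*?)"\)', sample)
    print(match.group(1) if match else None)  # -> http://example.com/news/123.html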
/dz_spider/dz_spider/spiders/toutiao.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import time
3 | import json
4 | import scrapy
5 | import urllib.parse
6 | from dz_spider.pipelines import MysqlPipline
7 | from scrapy.log import logger
8 | from selenium import webdriver
9 |
10 |
11 | class ToutiaoSpider(scrapy.Spider):
12 | """
13 |     The cookies are a hassle here, so fetch the search API with a Selenium driver directly
14 | """
15 |
16 | name = 'toutiao'
17 | allowed_domains = ['www.toutiao.com']
18 | start_urls = ['https://www.toutiao.com/api/search/content/?aid=24&app_name=web_search&format=json&keyword=2018%E5%B9%B48%E6%9C%88%E5%9B%9B%E5%B7%9D%E8%BE%BE%E5%B7%9E%E5%B8%82%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6&autoload=true&count=20&en_qc=1&cur_tab=1&from=search_tab&pd=synthesis']
19 | mysql = MysqlPipline()
20 |
21 |
22 | def start_requests(self):
23 | driver=webdriver.Chrome(executable_path="/Users/yuanlang/work/javascript/chromedriver")
24 | driver.get("https://www.toutiao.com/search/?keyword=2018%E5%B9%B48%E6%9C%88%E5%9B%9B%E5%B7%9D%E8%BE%BE%E5%B7%9E%E5%B8%82%E5%87%BA%E7%A7%9F%E8%BD%A6%E7%BD%A2%E5%B7%A5%E4%BA%8B%E4%BB%B6")
25 | time.sleep(2)
26 | for url in self.start_urls:
27 | for page in range(0,8):
28 |                 driver.get(url=f"{url}&offset={20*page}&timestamp={'%d'%(time.time()*1000)}")
29 | time.sleep(5)
30 | html=scrapy.Selector(text=driver.page_source)
31 | content=html.css("body > pre::text").extract_first()
32 | data=json.loads(content)["data"]
33 | for item in data:
34 | try:
35 | if "article_url" not in item:
36 | if "display" not in item:
37 | print(item)
38 | continue
39 | print(item["display"])
40 | _url = item["display"]["info"]["url"]
41 | title = item["display"]["emphasized"]["title"]
42 | else:
43 | title = item["abstract"]
44 | _url = item["article_url"]
45 | schame = urllib.parse.urlparse(_url).netloc
46 | sql = f"insert into seed(url,title,site_name,type) values('{_url}','{title}','{schame}',1)"
47 | self.mysql.excute_sql(sql)
48 | except Exception as e:
49 |                     logger.error(f"parse toutiao search item failed ===>>> {str(e)}")
50 |
51 | # time.sleep(6000)
52 |
53 | def parse(self, response):
54 | pass
55 |
56 |
--------------------------------------------------------------------------------
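The toutiao spider above drives the search API through Chrome and sleeps a fixed 5 seconds per page before reading the JSON that the browser renders inside a <pre> tag. A minimal sketch of replacing the fixed sleep with an explicit wait, assuming selenium's standard WebDriverWait API (function name and timeout are illustrative):

    import json
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

    def fetch_search_page(driver: webdriver.Chrome, url: str, timeout: int = 15) -> dict:
        # Chrome renders a raw JSON response as text inside <body><pre>...</pre>.
        driver.get(url)
        pre = WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((By.TAG_NAME, "pre"))
        )
        return json.loads(pre.text)

This keeps the parsing logic in toutiao.py unchanged; only the page-fetch step waits for the response instead of guessing how long it takes.
--------------------------------------------------------------------------------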
/dz_spider/log/app.log:
--------------------------------------------------------------------------------
1 | [1;37m2019-07-26 19:53:26,076 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net')[0m
2 | [1;37m2019-07-26 19:53:28,387 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.163.com/18/1009/07/DTLL52J10001875P.html','四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席_网易新闻','news.163.com')[0m
3 | [1;37m2019-07-26 19:53:29,603 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://ah.people.com.cn/n2/2018/1008/c358314-32129820.html','四川达州地面塌陷 造成至少一人死亡--安徽频道--人民网 ','ah.people.com.cn')[0m
4 | [1;37m2019-07-26 19:53:31,895 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://wemedia.ifeng.com/81218105/wemedia.shtml','最新!达州路面塌陷已发现两名被困者,其中一人抢救无效死亡','wemedia.ifeng.com')[0m
5 | [1;37m2019-07-26 19:53:33,591 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://blog.sina.com.cn/s/blog_5f7396520102z29j.html','达州路面塌陷事件:要追究刑事责任人_丁金坤_新浪博客','blog.sina.com.cn')[0m
6 | [1;37m2019-07-26 19:53:35,960 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://cnahrx.com/a2188-320779-0.shtml','[达川区]达州南外一人行路面发生塌陷 官方发布险情通报_达州今日...','cnahrx.com')[0m
7 | [1;37m2019-07-26 19:53:36,888 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.sina.com.cn/s/2018-10-14/doc-ihmhafir5676446.shtml','四川达州涵洞塌陷区域回填结束 事件原因正在调查|涵洞|..._新浪新闻','news.sina.com.cn')[0m
8 | [1;37m2019-07-26 19:53:38,456 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/ggxw/201810/56571800.html','达州路面塌陷事故救援最新进展:又一名被困人员被发现_..._四川在线','sichuan.scol.com.cn')[0m
9 | [1;37m2019-07-26 19:53:39,057 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://society.huanqiu.com/photonew/2018-10/2905788.html','四川达州一人行道突然塌陷 有人员掉落_社会_环球网','society.huanqiu.com')[0m
10 | [1;37m2019-07-26 19:53:39,644 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.cngold.org/gundong/2018-10-09/c6009808.html','人行道路面塌陷 事故疑似造成4名行人陷落-滚动新闻-金投热点网-金...','news.cngold.org')[0m
11 | [1;37m2019-07-26 19:54:55,739 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sc.gov.cn/10462/10464/13722/2018/10/8/10460284.shtml','达州南外一人行路面发生塌陷 官方发布险情通报- 四川省人民政府','www.sc.gov.cn')[0m
12 | [1;37m2019-07-26 19:58:53,120 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net')[0m
13 | [1;37m2019-07-26 19:58:55,254 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://cnahrx.com/a2188-320779-0.shtml','[达川区]达州南外一人行路面发生塌陷 官方发布险情通报_达州今日...','cnahrx.com')[0m
14 | [1;37m2019-07-26 19:58:55,706 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://k.sina.com.cn/article_1855024094_6e916bde02000dz07.html?cre=tianyi&mod=pcpager_news&loc=17&r=9&doct=0&rfunc=100&tj=none&tr=9','达州人行道涵洞塌陷区域回填已结束,事件原因仍在调查中','k.sina.com.cn')[0m
15 | [1;37m2019-07-26 19:58:56,462 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/ggxw/201810/56571800.html','达州路面塌陷事故救援最新进展:又一名被困人员被发现_..._四川在线','sichuan.scol.com.cn')[0m
16 | [1;37m2019-07-26 19:58:57,918 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.51daifu.com/2018/1009/AF7ACB92C3T658683.shtml','人行道路面塌陷 多人死亡令人惋惜_医生在线','news.51daifu.com')[0m
17 | [1;37m2019-07-26 19:58:58,453 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.cctv.com/2018/10/09/ARTI32aZet0kONxAyM8i1jIZ181009.shtml','达州塌陷事故中的遇难夫妻 4天前刚举行婚礼_新闻频道_央视网(cctv...','news.cctv.com')[0m
18 | [1;37m2019-07-26 19:58:59,616 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.520zg.net/forum.php?mod=viewthread&tid=1239730','人行道突然塌陷,90后新婚夫妻遇难!出事前刚买完喜糖 - 龙都茶坊 -...','bbs.520zg.net')[0m
19 | [1;37m2019-07-26 19:59:00,147 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.xishu365.com/thread-484836-1-1.html','四川达州一人行道路面塌陷,四名大人一名小孩掉入坑内,情况有点不...','bbs.xishu365.com')[0m
20 | [1;37m2019-07-26 19:59:00,751 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.thepaper.cn/newsDetail_forward_2508912','父子俩仍被埋!达州路面塌陷疑似4人陷落,已致2人死亡','m.thepaper.cn')[0m
21 | [1;37m2019-07-26 19:59:01,413 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.yidianzixun.com/article/0KEEVO8G','【一点资讯】一周安全警示:9.26~10.08安全事故简报 www.yidian...','www.yidianzixun.com')[0m
22 | [1;37m2019-07-26 20:00:17,396 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sc.gov.cn/10462/10464/13722/2018/10/8/10460284.shtml','达州南外一人行路面发生塌陷 官方发布险情通报- 四川省人民政府','www.sc.gov.cn')[0m
23 | [1;37m2019-07-26 20:00:19,674 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817853-1-1.html','达州南外地陷事故救援结束!被困父子已被寻获 - 今日达..._凤凰山下','www.dz19.net')[0m
24 | [1;37m2019-07-26 20:00:20,662 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181007/000912644.html','达州南外一人行路面发生塌陷 官方发布险情通报 -四川..._四川新闻网','scnews.newssc.org')[0m
25 | [1;37m2019-07-26 20:00:21,400 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258128561_100144890','达州南外一人行路面塌陷 临时交通管制险情通报_达川区','www.sohu.com')[0m
26 | [1;37m2019-07-26 20:00:22,194 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU3EOHRD05149V0C.html','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在..._网易订阅','dy.163.com')[0m
27 | [1;37m2019-07-26 20:00:27,258 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258128561_100144890','达州南外一人行路面塌陷 临时交通管制险情通报-警法频道-手机搜狐','m.sohu.com')[0m
28 | [1;37m2019-07-26 20:00:28,516 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181014/000914485.html','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍..._四川新闻网','scnews.newssc.org')[0m
29 | [1;37m2019-07-26 20:00:30,309 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.ifeng.com/a/20181015/6945534_0.shtml','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在调查中_...','sc.ifeng.com')[0m
30 | [1;37m2019-07-26 20:00:32,353 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.chinanews.com/sh/shipin/cns-d/2018/10-07/news788198.shtml','四川达州市达川区一人行道突然塌陷-中新网视频','www.chinanews.com')[0m
31 | [1;37m2019-07-26 20:00:35,831 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dsj365.cn/front/article/17733.html','大事记盘点:2018年10月国内热点事件 - 大事记','www.dsj365.cn')[0m
32 | [1;37m2019-07-26 20:00:37,645 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.520zg.net/forum.php?mod=viewthread&tid=1239730','人行道突然塌陷,90后新婚夫妻遇难!出事前刚买完喜糖 - 龙都茶坊 -...','bbs.520zg.net')[0m
33 | [1;37m2019-07-26 20:00:40,940 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.gazx.org/content/2018-10/8/201810815122663654.htm','达州“地陷”是天灾还是人祸?_广安在线','www.gazx.org')[0m
34 | [1;37m2019-07-26 20:00:42,099 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.jinciwei.cn/j447103.html','最新动态!南外人行道塌陷恢复治理工作正在有序进行,赶紧看过来! -...','www.jinciwei.cn')[0m
35 | [1;37m2019-07-26 20:00:43,186 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.nbtv.cn/xwdsg/gn/30067333.shtml','达州人行道路面塌陷事故搜救结束,4名被困者遇难','www.nbtv.cn')[0m
36 | [1;37m2019-07-26 20:00:44,217 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.hainan.net/post-73-653820-1.shtml','这才是真正的坑人,坑死人了,人行道塌陷致一对新婚夫妻遇难_三亚_...','bbs.hainan.net')[0m
37 | [1;37m2019-07-26 20:00:45,580 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dzwww.com/xinwen/shehuixinwen/201810/t20181007_17916739.htm','四川达州一人行路面塌陷 附近路段实施临时交通管制_社会新闻_大众网','www.dzwww.com')[0m
38 | [1;37m2019-07-26 20:00:46,469 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.kandianla.com/kandian/6322.html','人行道路面塌陷 怎么回事 真是太危险了_看点啦','www.kandianla.com')[0m
39 | [1;37m2019-07-26 20:00:49,371 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://zhaopin.baidu.com/m/company?query=4d37003a46a1bac4f0593d376d803df3','达州济民医院-企业名片-百度百聘','zhaopin.baidu.com')[0m
40 | [1;37m2019-07-26 20:00:50,778 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.11467.com/qiye/39876980.htm','达州济民医院','www.11467.com')[0m
41 | [1;37m2019-07-26 20:00:51,450 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.bjnews.com.cn/wevideo/2018/10/08/509511.html','四川达州路面塌陷救援继续 附近居民暂停水电气供应 - ..._新京报网','www.bjnews.com.cn')[0m
42 | [1;37m2019-07-26 20:00:52,201 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181014164416587.html','达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在调查中_...','mini.eastday.com')[0m
43 | [1;37m2019-07-26 20:00:54,105 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DTHPCVH205149V0C.html','达州南外一人行路面发生塌陷 官方发布险情通报_网易订阅','dy.163.com')[0m
44 | [1;37m2019-07-26 20:00:54,620 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181007195326849.html','达州南外一人行路面发生塌陷 官方发布险情通报_社会频道_东方头条','mini.eastday.com')[0m
45 | [1;37m2019-07-26 20:00:56,524 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://news.sina.com.cn/c/2018-10-11/doc-ihmhafiq9423734.shtml','四川达州塌陷事故专家组:尚未完全确定地陷原因|地陷|涵..._新浪新闻','news.sina.com.cn')[0m
46 | [1;37m2019-07-26 20:00:57,808 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://news.sina.com.cn/o/2018-10-08/doc-ihkvrhpt0250131.shtml','四川达州一人行道路面塌陷 两被困者抢救无效死亡|路面..._新浪新闻','news.sina.com.cn')[0m
47 | [1;37m2019-07-26 20:00:58,655 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://gas.newssc.org/system/20181008/002523077.html','达州南外一人行路面发生塌陷 官方发布险情通报 - 四川新闻网广安...','gas.newssc.org')[0m
48 | [1;37m2019-07-26 20:00:59,622 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dingjinkun.blog.caixin.com/archives/189901','达州路面塌陷事件:要追究刑事责任人-丁金坤-财新博客-新世纪的...','dingjinkun.blog.caixin.com')[0m
49 | [1;37m2019-07-26 20:01:00,602 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.chinanews.com/gn/2018/10-08/8644238.shtml','四川达州地面塌陷 造成至少一人死亡-中新网','www.chinanews.com')[0m
50 | [1;37m2019-07-26 20:01:02,375 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.163.com/18/1009/07/DTLL52J10001875P.html#from=relevant','四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席_网易新闻','news.163.com')[0m
51 | [1;37m2019-07-26 20:01:03,545 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258644792_100016713','揪心!四川达州路面塌陷致4人死亡,一对父子,一对新婚夫妇!_..._搜狐','www.sohu.com')[0m
52 | [1;37m2019-07-26 20:01:04,608 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://www.qichacha.com/postnews_19fb84f3c7d4c4036adca34dec5d6107.html','达州南外一人行路面发生塌陷 官方发布险情通报 -四川新闻-四川...','www.qichacha.com')[0m
53 | [1;37m2019-07-26 20:01:06,573 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.chinanews.com/gn/2018/10-08/8644238.shtml','四川达州地面塌陷 造成至少一人死亡-中新网','www.chinanews.com')[0m
54 | [1;37m2019-07-26 20:01:07,058 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dingjinkun.blog.caixin.com/archives/189901','达州路面塌陷事件:要追究刑事责任人-丁金坤-财新博客-新世纪的...','dingjinkun.blog.caixin.com')[0m
55 | [1;37m2019-07-26 20:01:07,982 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258355254_100172646/','一周安全警示:四川塌陷事故2人失踪,定陶塔吊倒塌3人死亡_手机搜狐网','m.sohu.com')[0m
56 | [1;37m2019-07-26 20:01:09,502 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/dzxw/201810/56572825.html','达州地面塌陷吞噬路人后续:隐患排查 周边约200户居民被..._四川在线','sichuan.scol.com.cn')[0m
57 | [1;37m2019-07-26 20:01:10,238 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.sina.com.cn/news/m/2018-10-15/detail-ifxeuwws4249029.shtml','达州人行道路面塌陷事件:涵洞塌陷区域回填已结束_新浪四川_新浪网','sc.sina.com.cn')[0m
58 | [1;37m2019-07-26 20:01:11,676 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1613672926035544522&wfr=spider&for=pc','四川一人行道路突然塌陷,4名路人瞬间陷落,救援现场二次塌陷','baijiahao.baidu.com')[0m
59 | [1;37m2019-07-26 20:01:12,642 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181009/000913005.html','达州路面塌陷4人陷落2人死亡 搜救30多小时一对父子仍..._四川新闻网','scnews.newssc.org')[0m
60 | [1;37m2019-07-26 20:01:13,200 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258432536_99955512','突然陷落! 四川人行道路面塌陷, 共4人遇难, 新婚夫妻和一对父子_...','www.sohu.com')[0m
61 | [1;37m2019-07-26 20:01:14,671 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://sichuan.scol.com.cn/dzxw/201810/56572827.html','达州地面塌陷吞噬路人后续:事发现场周边交通管制范围扩..._四川在线','sichuan.scol.com.cn')[0m
62 | [1;37m2019-07-26 20:01:15,362 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258355254_100172646/','一周安全警示:四川塌陷事故2人失踪,定陶塔吊倒塌3人死亡_手机搜狐网','m.sohu.com')[0m
63 | [1;37m2019-07-26 20:01:17,234 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258664216_151037','重磅!2018全国百强县区出炉!我省7个县区上榜,排名最靠前的是..._...','www.sohu.com')[0m
64 | [1;37m2019-07-26 20:01:18,552 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817460-1-1.html','南外济民医院门口突现“天坑”消防,武警正在搜救::: - ..._凤凰山下','www.dz19.net')[0m
65 | [1;37m2019-07-26 20:01:21,682 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://xin.baidu.com/yuqing?yuqingId=dd5f81d78d803062ef973ba7833ed2cb&fl=1&castk=LTE%3D','【达州市达川区人行道塌陷进入临时恢复阶段 事件原因仍在调查中-...','xin.baidu.com')[0m
66 | [1;37m2019-07-26 20:01:23,974 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://new.qq.com/omn/20181007/20181007A1HWGS00','【附官方险情通报】达州一人行道路面突然塌陷! 4名路人瞬间坠入,1...','new.qq.com')[0m
67 | [1;37m2019-07-26 20:01:24,643 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/259434578_330146','达州人行道涵洞塌陷区域回填已结束,事件原因仍在调查中_搜..._搜狐','www.sohu.com')[0m
68 | [1;37m2019-07-26 20:01:25,409 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz169.net/2018/1008/102442.shtml','达川区人行道塌陷:救援人员争分夺秒不眠不休搜救陷落群众_达州网','www.dz169.net')[0m
69 | [1;37m2019-07-26 20:01:26,571 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.gohoedu.com/guo1207227p1p1.html','达州一人行道路面突然塌陷!4名路人瞬间坠入,1名男孩已被救起 - ...','bbs.gohoedu.com')[0m
70 | [1;37m2019-07-26 20:01:27,354 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.ybvv.com/thread-1452995-1-1.html','黑人!四川一人行道路面突然塌陷! 4名路人瞬间坠入,1名男..._零距离','bbs.ybvv.com')[0m
71 | [1;37m2019-07-26 20:01:28,118 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://info.fire.hc360.com/2018/10/1509011101753.shtml','...事件原因正在调查分析中-达州涵洞,塌陷区-消防行业-hc360慧聪网 ','info.fire.hc360.com')[0m
72 | [1;37m2019-07-26 20:01:29,279 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258066494_116237','突发!达州一人行道路面突然塌陷! 4名路人瞬间坠入,1名男孩..._搜狐','m.sohu.com')[0m
73 | [1;37m2019-07-26 20:01:31,328 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258153846_100137571','南城路面塌陷,救援人员争分夺秒不眠不休搜救:今日上午又一..._搜狐','www.sohu.com')[0m
74 | [1;37m2019-07-26 20:01:32,375 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://wemedia.ifeng.com/81315202/wemedia.shtml','【突发】达州人行道塌陷已致两人死亡,现场救援仍在进行!','wemedia.ifeng.com')[0m
75 | [1;37m2019-07-26 20:01:33,471 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://temp.163.com/special/ntes_404/','达川区人行道塌陷:救援人员争分夺秒不眠不休搜救陷落群众_网易订阅','temp.163.com')[0m
76 | [1;37m2019-07-26 20:01:34,302 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DTJG4TNI0512EL5Q.html','达州塌陷事故最新进展:再次发现被困者 已送往医院救治_网易订阅','dy.163.com')[0m
77 | [1;37m2019-07-26 20:01:34,949 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://scnews.newssc.org/system/20181007/000912640.html','达州南外一人行路面塌陷 附近路段实施临时交通管制 -..._四川新闻网','scnews.newssc.org')[0m
78 | [1;37m2019-07-26 20:01:36,264 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.eastday.com/s/20181008/u1a14284702.html','...进展:又一名被困人员被发现-路面塌陷 被困人员 事故 救援 10月...','news.eastday.com')[0m
79 | [1;37m2019-07-26 20:01:38,222 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://new.qq.com/omn/20181008/20181008A0QJ86.html','达州市达川区人行道塌陷 4名群众陷落2人被搜救','new.qq.com')[0m
80 | [1;37m2019-07-26 20:01:39,310 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.cssqt.com/xw/gn/sp/297837.shtml','四川达州一人行道路面突然塌陷!4名路人瞬间坠入 1名男孩已被救起!...','www.cssqt.com')[0m
81 | [1;37m2019-07-26 20:01:40,317 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.yidianzixun.com/article/0KEEVO8G','【一点资讯】一周安全警示:9.26~10.08安全事故简报 www.yidian...','www.yidianzixun.com')[0m
82 | [1;37m2019-07-26 20:01:41,098 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258644792_100016713','揪心!四川达州路面塌陷致4人死亡,一对父子,一对新婚夫妇!-..._搜狐','m.sohu.com')[0m
83 | [1;37m2019-07-26 20:01:43,128 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.yidianzixun.com/article/0KEEVO8G','【一点资讯】一周安全警示:9.26~10.08安全事故简报 www.yidian...','www.yidianzixun.com')[0m
84 | [1;37m2019-07-26 20:01:43,887 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258644792_100016713','揪心!四川达州路面塌陷致4人死亡,一对父子,一对新婚夫妇!-..._搜狐','m.sohu.com')[0m
85 | [1;37m2019-07-26 20:01:45,247 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.51daifu.com/2018/1009/5BE3E0B7C11T658714.shtml','人行道路面塌陷 意外受伤需及时送医治疗_医生在线','news.51daifu.com')[0m
86 | [1;37m2019-07-26 20:01:46,573 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU0GP7QN0522JH3J.html','揪心!造成4人死亡,地面塌陷谁之过?专家回应事故原因_网易订阅','dy.163.com')[0m
87 | [1;37m2019-07-26 20:01:47,931 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://k.sina.com.cn/article_5710586189_15460a14d02000k2ml.html','达川区人行道塌陷恢复治理正有序推进|塌陷|人行道|涵洞_新浪网','k.sina.com.cn')[0m
88 | [1;37m2019-07-26 20:01:49,097 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dzrbs.com/html/2018-10/14/content_331870.htm','达州人行道路面塌陷事件:涵洞塌陷区域已回填,东环南..._达州日报网','www.dzrbs.com')[0m
89 | [1;37m2019-07-26 20:01:49,915 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.sina.com.cn/news/m/2018-10-09/detail-ihkvrhpt2040783.shtml','达州路面塌陷4人陷落2人死亡 搜救30多小时一对父子仍失..._新浪四川','sc.sina.com.cn')[0m
90 | [1;37m2019-07-26 20:01:51,099 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.ii119.cn/news/201810/15/52515.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中_消防新闻_资讯...','www.ii119.cn')[0m
91 | [1;37m2019-07-26 20:01:52,228 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181008112443666.html','四川达州地面塌陷 造成至少一人死亡_社会频道_东方头条','mini.eastday.com')[0m
92 | [1;37m2019-07-26 20:01:52,764 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU3Q3I5A0512EL5Q.html','达州人行道涵洞塌陷区域回填已结束,事件原因仍在调查中_网易订阅','dy.163.com')[0m
93 | [1;37m2019-07-26 20:01:54,401 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1613872793220182890&wfr=spider&for=pc','四川达州市达川区路面塌陷被困4人全部遇难','baijiahao.baidu.com')[0m
94 | [1;37m2019-07-26 20:01:55,106 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://k.sina.com.cn/article_5044281310_12ca99fde02000m3d1.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中','k.sina.com.cn')[0m
95 | [1;37m2019-07-26 20:01:56,105 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://mini.eastday.com/a/181014162451012.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中_社会频道_东方...','mini.eastday.com')[0m
96 | [1;37m2019-07-26 20:01:57,080 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DTO78BUC05371OH7.html','达川区人行道塌陷:救援人员争分夺秒不眠不休搜救陷落群众_网易订阅','dy.163.com')[0m
97 | [1;37m2019-07-26 20:01:58,787 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://news.dahe.cn/2018/10-08/385850.html','达州路面塌陷事故救援最新进展:又一名被困人员被发现-大河网','news.dahe.cn')[0m
98 | [1;37m2019-07-26 20:01:59,914 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://m.sohu.com/a/258045254_364195','突发!南城一医院门口人行道塌陷 相关部门积极救援-警法频道..._搜狐','m.sohu.com')[0m
99 | [1;37m2019-07-26 20:02:01,079 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://temp.163.com/special/ntes_404/','南城路面塌陷,救援人员争分夺秒不眠不休搜救:今日上午..._网易订阅','temp.163.com')[0m
100 | [1;37m2019-07-26 20:02:01,720 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://ah.people.com.cn/GB/n2/2018/1008/c358314-32129820.html','四川达州地面塌陷 造成至少一人死亡--安徽频道--人民网 ','ah.people.com.cn')[0m
101 | [1;37m2019-07-26 20:02:02,508 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://sc.china.com.cn/2018/shizhou_1009/292010.html','达州路面塌陷4人陷落2人死亡 搜救30多小时一对父子仍失踪 - 市州...','sc.china.com.cn')[0m
102 | [1;37m2019-07-26 20:02:03,328 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://dy.163.com/v2/article/detail/DU3GED150514R9P4.html','达州涵洞塌陷区域回填结束,事件原因正在调查分析中_网易订阅','dy.163.com')[0m
103 | [1;37m2019-07-26 20:02:04,298 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://www.qichacha.com/postnews_19fb84f3c7d4c4036adca34dec5d6107.html','达州南外一人行路面发生塌陷 官方发布险情通报 -四川新闻-四川...','www.qichacha.com')[0m
104 | [1;37m2019-07-26 20:02:05,929 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258145605_255783','四川达州地面塌陷 造成至少一人死亡_达川区','www.sohu.com')[0m
105 | [1;37m2019-07-26 20:02:06,959 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.sohu.com/a/258751336_364195','大搜救中的4个“关键字”_搜狐警法_搜狐网','www.sohu.com')[0m
106 | [1;37m2019-07-26 20:02:07,982 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1614283709015688151&wfr=spider&for=pc','四川达州人行道路面塌陷区域回填结束 事件原因正在调查分析中','baijiahao.baidu.com')[0m
107 | [1;37m2019-07-26 20:02:24,795 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net')[0m
108 | [0;31m2019-07-26 20:02:25,040 [ERROR] baidu.py parse[line:36]: requests.get(data_dict['url']).url ===>>> HTTPConnectionPool(host='www.edushi.com', port=80): Max retries exceeded with url: /zixun/info/2-15-n4537771.html (Caused by NewConnectionError(': Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))[0m
109 | [1;37m2019-07-26 20:03:41,157 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dazhou.gov.cn/articview_20181011102748790.html','达州市达川区人行道路面塌陷灾害搜救回顾 - 达州市人民政府 ','www.dazhou.gov.cn')[0m
110 | [1;37m2019-07-26 20:03:42,389 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://baijiahao.baidu.com/s?id=1613725507012412733&wfr=spider&for=pc','达州南客站对面济民医院门口地面突然塌陷,有人跌落','baijiahao.baidu.com')[0m
111 | [1;37m2019-07-26 20:03:42,844 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.mnw.cn/news/shehui/2068305.html','达州济民医院地面塌陷 造成至少一人死亡-闽南网','www.mnw.cn')[0m
112 | [1;37m2019-07-26 20:03:43,378 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://blog.sina.com.cn/s/blog_5f7396520102z29j.html','达州路面塌陷事件:要追究刑事责任人_丁金坤_新浪博客','blog.sina.com.cn')[0m
113 | [1;37m2019-07-26 20:03:44,322 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://new.qq.com/omn/20181010/20181010B1V7KD.html','南外人行道地面塌陷事故救援结束,新婚夫妇和一对父子均遇难','new.qq.com')[0m
114 | [1;37m2019-07-26 20:03:45,417 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.163.com/18/1009/07/DTLL52J10001875P.html','四川达州地陷2名遇难者系年轻夫妻 国庆刚办宴席_网易新闻','news.163.com')[0m
115 | [1;37m2019-07-26 20:03:46,168 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.bjnews.com.cn/wevideo/2018/10/08/509511.html','四川达州路面塌陷救援继续 附近居民暂停水电气供应 - ..._新京报网','www.bjnews.com.cn')[0m
116 | [1;37m2019-07-26 20:09:02,380 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net')[0m
117 | [1;37m2019-07-26 20:12:48,454 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dz19.net/thread-1817616-1-1.html','南外济民医院门口,发生地陷事故,目前正在抢救中... - ..._凤凰山下','www.dz19.net')[0m
118 | [1;37m2019-07-26 20:14:05,291 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.dazhou.gov.cn/articview_20181011102748790.html','达州市达川区人行道路面塌陷灾害搜救回顾 - 达州市人民政府 ','www.dazhou.gov.cn')[0m
119 | [1;37m2019-07-26 20:16:58,045 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.cbda.cn/html/kx/20181010/123371.html','揪心!四川一路面突然塌陷,新婚仅4天的夫妻双亡,一对父..._中装新网','www.cbda.cn')[0m
120 | [1;37m2019-07-26 20:16:59,684 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.51daifu.com/2018/1009/AF7ACB92C3T658683.shtml','人行道路面塌陷 多人死亡令人惋惜_医生在线','news.51daifu.com')[0m
121 | [1;37m2019-07-26 20:17:00,350 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.crjwz.com/shehui/69073.html','四川达州路面塌陷事故救援进展:两人经抢救无效死亡_今天热点新闻...','news.crjwz.com')[0m
122 | [1;37m2019-07-26 20:17:00,833 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://news.cctv.com/2018/10/09/ARTI32aZet0kONxAyM8i1jIZ181009.shtml','达州塌陷事故中的遇难夫妻 4天前刚举行婚礼_新闻频道_央视网(cctv...','news.cctv.com')[0m
123 | [1;37m2019-07-26 20:17:02,134 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://bbs.xishu365.com/thread-484836-1-1.html','四川达州一人行道路面塌陷,四名大人一名小孩掉入坑内,情况有点不...','bbs.xishu365.com')[0m
124 | [1;37m2019-07-26 20:17:05,739 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.zz-qq.com/1/97382.html','人行道路面塌陷_四川达州一人行道路面塌陷 两人遇难两人被困- ...','www.zz-qq.com')[0m
125 | [1;37m2019-07-26 20:17:07,208 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('https://www.xianjichina.com/news/details_85746.html','达州人行道塌陷的最新进展:两人死亡,施救困难-贤集网资讯','www.xianjichina.com')[0m
126 | [1;37m2019-07-26 20:17:09,077 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.aihami.com/a/dangjian/zugong/375555.html','达州路面塌陷事故 抢修工作现正在紧张的进行中_楚秀网','www.aihami.com')[0m
127 | [1;37m2019-07-26 20:17:11,364 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.gazx.org/content/2018-10/8/201810815122663654.htm','达州“地陷”是天灾还是人祸?_广安在线','www.gazx.org')[0m
128 | [1;37m2019-07-26 20:17:12,308 [DEBUG] pipelines.py excute_sql[line:1108]: excute_sql===>>> insert into seed(url,title,site_name) values('http://www.jinciwei.cn/j447103.html','最新动态!南外人行道塌陷恢复治理工作正在有序进行,赶紧看过来! -...','www.jinciwei.cn')[0m
129 | [1;37m2019-07-26 20:17:23,054 [DEBUG] baidu.py parse[line:39]: 执行===>>> https://www.baidu.com/s?wd=2018%E5%B9%B410%E6%9C%887%E6%97%A5%E8%BE%BE%E5%B7%9D%E5%8C%BA%E5%8D%97%E5%A4%96%E6%B5%8E%E6%B0%91%E5%8C%BB%E9%99%A2%E9%97%A8%E5%8F%A3%E7%AA%81%E7%84%B6%E5%A1%8C%E9%99%B7%E4%BA%8B%E4%BB%B6&oq=2018%E5%B9%B410%E6%9C%887%E6%97%A5%E8%BE%BE%E5%B7%9D%E5%8C%BA%E5%8D%97%E5%A4%96%E6%B5%8E%E6%B0%91%E5%8C%BB%E9%99%A2%E9%97%A8%E5%8F%A3%E7%AA%81%E7%84%B6%E5%A1%8C%E9%99%B7%E4%BA%8B%E4%BB%B6&ie=utf-8&rsv_idx=1&rsv_pq=da4e0d0600051217&rsv_t=0bdcDWC5g2e2v0%2FFpxTTPC6IQO3RvUQxRCleqWWkBvdvuCKNo6MtAkayKAM&pn=30 花费时间22.64210271835327[0m
130 |
--------------------------------------------------------------------------------
/dz_spider/run.py:
--------------------------------------------------------------------------------
1 | # encoding: utf-8
2 | """
3 | --------------------------------------
4 | @describe: entry point, runs a single spider (baidu / toutiao / sogou) via scrapy cmdline
5 | @version: 1.0
6 | @project: yuqing_system
7 | @file: run.py
8 | @author: yuanlang
9 | @time: 2019-07-26 17:12
10 | ---------------------------------------
11 | """
12 |
13 | from scrapy import cmdline
14 | # cmdline.execute(['scrapy', 'crawl', 'baidu'])
15 | cmdline.execute(['scrapy', 'crawl', 'toutiao'])
16 | # cmdline.execute(['scrapy', 'crawl', 'sogou'])
17 |
18 |
19 |
--------------------------------------------------------------------------------
/dz_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = dz_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dz_spider
12 |
--------------------------------------------------------------------------------
/plan:
--------------------------------------------------------------------------------
1 | 1. Oct 7, 2018: sudden pavement collapse outside Jimin Hospital, Nanwai, Dachuan District (达川区南外济民医院门口塌陷事件)
2 | 2. Jun 1, 2018: Haoyixin market fire in Dazhou (好一新大火事件)
3 | 3. Aug 2018: Dazhou taxi strike (出租车罢工事件)
4 |
5 |
6 | Data sources:
7 | Toutiao, Baidu News, Sogou, Weibo
8 |
9 | #达州地陷#  (hashtag: Dazhou ground collapse)
10 | #达州好一新火灾#  (hashtag: Dazhou Haoyixin fire)
11 |
12 |
13 |
14 | News data: keyword extraction and topic clustering analysis (see the sketch after this plan file)
15 | Comment data: keyword extraction and topic clustering analysis
16 |
17 | url title create_time update_time
--------------------------------------------------------------------------------
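The plan above calls for keyword extraction and topic clustering over the collected news and comment text; the clean/ directory already ships 词频统计_LDA主题模型.py and stopwords.txt for that step. A minimal sketch of the analysis, assuming jieba for segmentation and gensim for the LDA model (the stopword handling, topic count, and function name are illustrative, not the project's actual parameters):

    import jieba
    import jieba.analyse
    from gensim import corpora, models

    def analyse(docs, stopwords, num_topics=5):
        # Corpus-level keywords (TF-IDF based extraction over all documents).
        keywords = jieba.analyse.extract_tags(" ".join(docs), topK=20)

        # Per-document segmentation, dropping stopwords and single characters.
        texts = [[w for w in jieba.lcut(doc) if w not in stopwords and len(w) > 1]
                 for doc in docs]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # LDA topic clustering over the bag-of-words corpus.
        lda = models.LdaModel(corpus, num_topics=num_topics, id2word=dictionary)
        return keywords, lda.show_topics(num_topics=num_topics, num_words=8)

Feeding it the titles gathered into the seed table would roughly reproduce the kind of word-cloud and LDA output saved as the HTML reports under clean/.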