├── Captcha1
│   ├── !Test.bat
│   ├── ReadMe.md
│   ├── convert.exe
│   ├── pic
│   │   ├── fnord.tif
│   │   ├── get_price_img.png
│   │   ├── get_price_img1.png
│   │   ├── get_price_img1_binary.png
│   │   ├── get_price_img2.png
│   │   ├── get_price_img2_binary.png
│   │   ├── get_price_img_binary.png
│   │   ├── get_random.jpg
│   │   ├── get_random1.jpg
│   │   ├── get_random1_binary.png
│   │   ├── get_random1_binary_midu.png
│   │   ├── get_random1_binary_midu_pro1.png
│   │   ├── get_random2.jpg
│   │   ├── get_random2_binary.png
│   │   ├── get_random2_binary_midu.png
│   │   ├── get_random2_binary_midu_pro1.png
│   │   ├── get_random_binary.png
│   │   ├── get_random_binary_midu.png
│   │   └── get_random_binary_midu_pro1.png
│   ├── pytesser_pro
│   │   ├── __init__.py
│   │   ├── errors.py
│   │   ├── pytesser_pro.py
│   │   └── util.py
│   ├── tess_test.py
│   └── tesseract.exe
├── NewsSpider
│   ├── NewsSpider.exe
│   ├── NewsSpider.py
│   └── ReadMe.md
├── QunarSpider
│   ├── QunarSpider.py
│   └── ReadMe.md
├── ReadMe.md
├── Spider_Java
│   ├── README.md
│   ├── Spider_Java1
│   │   ├── .classpath
│   │   ├── .project
│   │   ├── bin
│   │   │   ├── synchronizetest
│   │   │   │   ├── Booth.class
│   │   │   │   ├── Reservoir.class
│   │   │   │   └── Test.class
│   │   │   └── wallstreetcnsave
│   │   │       └── WallstreetcnSaveTest.class
│   │   ├── lib
│   │   │   └── mongo-java-driver-2.13.0-rc1.jar
│   │   └── src
│   │       ├── synchronizetest
│   │       │   └── Test.java
│   │       └── wallstreetcnsave
│   │           └── WallstreetcnSaveTest.java
│   └── Spider_Java2
│       ├── .classpath
│       ├── .project
│       ├── bin
│       │   ├── synchronizetest
│       │   │   ├── Booth.class
│       │   │   ├── Reservoir.class
│       │   │   └── Test.class
│       │   └── wallstreetcnsave
│       │       ├── GetrequestUrl.class
│       │       ├── WallstreetcnSave.class
│       │       └── WallstreetcnSaveTest.class
│       ├── lib
│       │   └── mongo-java-driver-2.13.0-rc1.jar
│       └── src
│           ├── synchronizetest
│           │   └── Test.java
│           └── wallstreetcnsave
│               └── WallstreetcnSaveTest.java
├── Spider_Python
│   ├── README.md
│   └── WallstreetcnSaveTest.py
├── WechatSearchProjects
│   ├── README.md
│   ├── Spider_Main.py
│   ├── WechatSearchTest.py
│   └── Wechatproject
│       ├── Wechatproject
│       │   ├── __init__.py
│       │   ├── items.py
│       │   ├── pipelines.py
│       │   ├── settings.py
│       │   └── spiders
│       │       ├── __init__.py
│       │       └── spider.py
│       └── scrapy.cfg
└── ZhihuSpider
    ├── ReadMe.md
    ├── ZhihuSpider.py
    └── config.ini
/Captcha1/!Test.bat:
--------------------------------------------------------------------------------
1 | python tess_test.py ./pic/get_price_img.png
2 | pause
--------------------------------------------------------------------------------
/Captcha1/ReadMe.md:
--------------------------------------------------------------------------------
1 | ### Captcha recognition project, version 1: Captcha1
2 |
3 | This project uses Tesseract V3.01 (V3.02 changed the training procedure and added a shapeclustering step).
4 |
5 | **Tesseract usage:**
6 | * Set the environment variable TESSDATA_PREFIX = "D:\Tesseract-ocr\", i.e. the directory that contains tessdata; the engine looks there for the language data files used for recognition.
7 | * Command format:
8 | `tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfile...]`
9 | * Recognize digits only:
10 | `tesseract imagename outputbase -l eng digits`
11 | * Fixing "empty page!!":
12 | **-psm N**
13 |
14 | 7 = Treat the image as a single text line
15 | tesseract imagename outputbase -l eng -psm 7
16 | * The configfile argument is a file name under the tessdata\configs or tessdata\tessconfigs directories:
17 | `tesseract imagename outputbase -l eng nobatch`
18 |
19 |
20 | **How to use this captcha recognition project (method 1):**
21 |
22 | * Put the downloaded images into the ./pic directory.
23 |
24 | Captcha image name: get_random.jpg
25 | Price image name: get_price_img.png
26 |
27 | * Command format:
28 |
29 | Captcha recognition: python tess_test.py ./pic/get_random.jpg
30 | Price recognition: python tess_test.py ./pic/get_price_img.png
31 |
32 | The recognized text is printed to the console.
33 |
34 | To keep the result in the temporary text file **temp.txt**, change "**cleanup_scratch_flag = True**" to "**cleanup_scratch_flag = False**" in pytesser_pro.py.
35 |
--------------------------------------------------------------------------------
/Captcha1/convert.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/convert.exe
--------------------------------------------------------------------------------
/Captcha1/pic/fnord.tif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/fnord.tif
--------------------------------------------------------------------------------
/Captcha1/pic/get_price_img.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_price_img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img1.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_price_img1_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img1_binary.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_price_img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img2.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_price_img2_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img2_binary.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_price_img_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img_binary.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random.jpg
--------------------------------------------------------------------------------
/Captcha1/pic/get_random1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1.jpg
--------------------------------------------------------------------------------
/Captcha1/pic/get_random1_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1_binary.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random1_binary_midu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1_binary_midu.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random1_binary_midu_pro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1_binary_midu_pro1.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2.jpg
--------------------------------------------------------------------------------
/Captcha1/pic/get_random2_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2_binary.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random2_binary_midu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2_binary_midu.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random2_binary_midu_pro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2_binary_midu_pro1.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random_binary.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random_binary.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random_binary_midu.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random_binary_midu.png
--------------------------------------------------------------------------------
/Captcha1/pic/get_random_binary_midu_pro1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random_binary_midu_pro1.png
--------------------------------------------------------------------------------
/Captcha1/pytesser_pro/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pytesser_pro/__init__.py
--------------------------------------------------------------------------------
/Captcha1/pytesser_pro/errors.py:
--------------------------------------------------------------------------------
1 | """Test for exceptions raised in the tesseract.exe logfile"""
2 |
3 | class Tesser_General_Exception(Exception):
4 | pass
5 |
6 | class Tesser_Invalid_Filetype(Tesser_General_Exception):
7 | pass
8 |
9 | def check_for_errors(logfile = "tesseract.log"):
10 | inf = file(logfile)
11 | text = inf.read()
12 | inf.close()
13 | # All error conditions result in "Error" somewhere in logfile
14 | if text.find("Error") != -1:
15 | raise Tesser_General_Exception, text
--------------------------------------------------------------------------------
/Captcha1/pytesser_pro/pytesser_pro.py:
--------------------------------------------------------------------------------
1 | import Image
2 | import subprocess
3 |
4 | import util
5 | import errors
6 |
7 | tesseract_exe_name = "tesseract" # Name of executable to be called at command line
8 | scratch_image_name = "temp.bmp" # This file must be .bmp or other Tesseract-compatible format
9 | scratch_text_name_root = "temp" # Leave out the .txt extension
10 | cleanup_scratch_flag = False # Temporary files cleaned up after OCR operation
11 |
12 | def call_tesseract(input_filename, output_filename, bool_digits=False):
13 | """Calls external tesseract.exe on input file (restrictions on types),
14 | outputting output_filename+'txt'"""
15 | # args = [tesseract_exe_name, input_filename, output_filename]
16 | if bool_digits:
17 | # args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l eng -psm 7 nobatch eng_digits" # price
18 | args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l test_digits -psm 7 nobatch" # price
19 | else:
20 | args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l eng -psm 7 nobatch eng_characters" # English letters
21 | # args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l test_eng -psm 7 nobatch" # English letters
22 | # print args
23 | proc = subprocess.Popen(args, shell=True)
24 | retcode = proc.wait()
25 | if retcode != 0:
26 | errors.check_for_errors()
27 |
28 | def image_to_string(im, cleanup = cleanup_scratch_flag, bool_digits=False):
29 | """Converts im to file, applies tesseract, and fetches resulting text.
30 | If cleanup=True, delete scratch files after operation."""
31 | try:
32 | util.image_to_scratch(im, scratch_image_name)
33 | call_tesseract(scratch_image_name, scratch_text_name_root, bool_digits)
34 | text = util.retrieve_text(scratch_text_name_root)
35 | finally:
36 | if cleanup:
37 | util.perform_cleanup(scratch_image_name, scratch_text_name_root)
38 | return text
39 |
40 | def image_file_to_string(filename, cleanup = cleanup_scratch_flag, graceful_errors=True, bool_digits=False):
41 | """Applies tesseract to filename; or, if image is incompatible and graceful_errors=True,
42 | converts to compatible format and then applies tesseract. Fetches resulting text.
43 | If cleanup=True, delete scratch files after operation."""
44 | try:
45 | try:
46 | call_tesseract(filename, scratch_text_name_root, bool_digits)
47 | text = util.retrieve_text(scratch_text_name_root)
48 | except errors.Tesser_General_Exception:
49 | if graceful_errors:
50 | im = Image.open(filename)
51 | text = image_to_string(im, cleanup, bool_digits)
52 | else:
53 | raise
54 | finally:
55 | if cleanup:
56 | util.perform_cleanup(scratch_image_name, scratch_text_name_root)
57 | return text
58 |
--------------------------------------------------------------------------------
/Captcha1/pytesser_pro/util.py:
--------------------------------------------------------------------------------
1 | """Utility functions for processing images for delivery to Tesseract"""
2 |
3 | import os
4 |
5 | def image_to_scratch(im, scratch_image_name):
6 | """Saves image in memory to scratch file. .bmp format will be read correctly by Tesseract"""
7 | im.save(scratch_image_name, dpi=(200,200))
8 |
9 | def retrieve_text(scratch_text_name_root):
10 | inf = file(scratch_text_name_root + '.txt')
11 | text = inf.read()
12 | inf.close()
13 | return text
14 |
15 | def perform_cleanup(scratch_image_name, scratch_text_name_root):
16 | """Clean up temporary files from disk"""
17 | for name in (scratch_image_name, scratch_text_name_root + '.txt', "tesseract.log"):
18 | try:
19 | os.remove(name)
20 | except OSError:
21 | pass
22 |
--------------------------------------------------------------------------------
/Captcha1/tess_test.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 |
3 | import os
4 | import sys
5 | import subprocess
6 | from pytesser_pro.pytesser_pro import *
7 | import Image, ImageEnhance, ImageFilter
8 | from pylab import *
9 |
10 |
11 |
12 | # Binarize the image and convert its format
13 | def binary(image_name, binary_image_name):
14 | # white background, black text
15 | args = "convert -monochrome "+image_name+" "+binary_image_name
16 | # print args
17 | proc = subprocess.Popen(args, shell=True)
18 | proc.wait()
19 | im = Image.open(binary_image_name)
20 | w, h = im.size
21 | data = list(im.getdata())
22 | if (data[0], data[w-1], data[(h-1)*w], data[h*w-1]) == (0, 0, 0, 0): # 0 = black, 255 = white
23 | # if not black-on-white, invert the grayscale
24 | args1 = "convert -negate "+binary_image_name+" "+binary_image_name
25 | proc1 = subprocess.Popen(args1, shell=True)
26 | proc1.wait()
27 |
28 | # Count the black pixels within a region
29 | def numpoint(im):
30 | w, h = im.size
31 | # print w, h
32 | data = list(im.getdata())
33 | mumpoint = 0
34 | for x in range(w):
35 | for y in range(h):
36 | if data[y*w+x] == 0: # 0 = black, 255 = white
37 | mumpoint += 1
38 | return mumpoint
39 |
40 | # Remove interference lines by vertical projection
41 | def pointmidu(binary_image_name, midu_image_name):
42 | im = Image.open(binary_image_name)
43 | w, h = im.size
44 | # print w, h
45 | len = 5
46 | for x in range(0, w, len):
47 | box = (x, 0, x+len, h)
48 | im_box = im.crop(box)
49 | num = numpoint(im_box)
50 | # print num
51 | if num < 20:
52 | for i in range(x, x+len):
53 | for j in range(h):
54 | im.putpixel((i, j), 255) # 0 = black, 255 = white
55 | data = list(im.getdata())
56 | data_column = []
57 | for x in range(w):
58 | temp = 0
59 | for y in range(h):
60 | if data[y*w+x] == 0: # 0 = black, 255 = white
61 | temp += 1
62 | data_column.append(temp)
63 | # print data_column
64 | start = 0
65 | for i in range(0, w, 1):
66 | if data_column[i] != 0:
67 | break
68 | else:
69 | start += 1
70 | # print start
71 | end = w-1
72 | for j in range(w-1, -1, -1):
73 | if data_column[j] != 0:
74 | break
75 | else:
76 | end += -1
77 | # print end
78 | box_new = (start, 0, end+1, h)
79 | im_box_new = im.crop(box_new)
80 | im_box_new.save(midu_image_name)
81 |
82 | # Image enhancement
83 | def filter_enhance(midu_image_name, midu_image_name_pro1):
84 | im = Image.open(midu_image_name)
85 | # denoise
86 | im = im.filter(ImageFilter.MedianFilter())
87 | # boost contrast
88 | enhancer = ImageEnhance.Contrast(im)
89 | im = enhancer.enhance(2)
90 | im = im.convert('1')
91 | # im.show()
92 | im.save(midu_image_name_pro1)
93 |
94 | # Character segmentation
95 | def seg(midu_image_name_pro1, midu_image_name_pro2, num):
96 | im = Image.open(midu_image_name_pro1)
97 | w, h = im.size
98 | # print w, h, w/num
99 | len = 2
100 | for i in range(num-1):
101 | start = (i+1)*w/num
102 | end = start+len
103 | for m in range(start, end+1):
104 | for n in range(h):
105 | im.putpixel((m, n), 255) # 0 = black, 255 = white
106 | im.save(midu_image_name_pro2)
107 |
108 | def get_aim1_point(im):
109 | aim = []
110 | w, h = im.size
111 | # print w, h
112 | data = list(im.getdata())
113 | for x in range(0, w, 1):
114 | for y in range(0, h, 1):
115 | if data[y*w+x] == 0: # 0 = black, 255 = white
116 | start_point = (x, y)
117 | # print start_point
118 | aim.append(start_point)
119 | break
120 | return aim
121 |
122 | def get_aim2_point(im):
123 | aim = []
124 | w, h = im.size
125 | # print w, h
126 | data = list(im.getdata())
127 | for x in range(0, w, 1):
128 | for y in range(h-1, -1, -1):
129 | if data[y*w+x] == 0: # 0 = black, 255 = white
130 | start_point = (x, y)
131 | # print start_point
132 | aim.append(start_point)
133 | break
134 | return aim
135 |
136 |
137 | if __name__=='__main__':
138 |
139 | if len(sys.argv) == 1:
140 | image_name = "./pic/get_random.jpg" # captcha image
141 | digits = False
142 | # image_name = "./pic/get_price_img.png" # price image
143 | # digits = True
144 | elif len(sys.argv) == 2:
145 | if sys.argv[1].find("get_random") != -1:
146 | image_name = sys.argv[1]
147 | digits = False
148 | elif sys.argv[1].find("get_price_img") != -1:
149 | image_name = sys.argv[1]
150 | digits = True
151 | else:
152 | print "Please Input the Correct Image Name!"
153 | sys.exit(0)
154 | else:
155 | print "Too Many Arguments!"
156 | sys.exit(0)
157 |
158 |
159 | # binarize and convert format
160 | binary_image_name = os.path.splitext(image_name)[0]+"_binary.png"
161 | binary(image_name, binary_image_name)
162 |
163 | im = Image.open(binary_image_name)
164 | print im.format, im.size, im.mode
165 |
166 |
167 | if digits:
168 | text = image_file_to_string(binary_image_name, bool_digits=digits)
169 | print text.replace("\n", "")
170 | else:
171 | # remove interference lines by projection
172 | fpathandname , fext = os.path.splitext(binary_image_name)
173 | midu_image_name = fpathandname+"_midu"+fext
174 | pointmidu(binary_image_name, midu_image_name)
175 |
176 |
177 | fpathandname , fext = os.path.splitext(midu_image_name)
178 |
179 | # remove interference lines (alternative approach, disabled)
180 | # im = Image.open(midu_image_name)
181 | # w, h = im.size
182 | # data = list(im.getdata())
183 | # aim1 = get_aim1_point(im)
184 | # for x, y in aim1:
185 | # curr = data[y*w+x]
186 | # prev = data[(y-1)*w+x]
187 | # next = data[(y+1)*w+x]
188 | #
189 | # if prev == 0 and next == 0: # 0 = black, 255 = white
190 | # continue
191 | # if prev == 0:
192 | # im.putpixel((x, y), 255)
193 | # im.putpixel((x, y-1), 255)
194 | # elif next == 0:
195 | # im.putpixel((x, y), 255)
196 | # im.putpixel((x, y+1), 255)
197 | # else:
198 | # im.putpixel((x, y), 255)
199 | # data = list(im.getdata())
200 | # aim2 = get_aim2_point(im)
201 | # for x, y in aim2:
202 | # curr = data[y*w+x]
203 | # prev = data[(y-1)*w+x]
204 | # next = data[(y+1)*w+x]
205 | #
206 | # if prev == 0 and next == 0: # 0 = black, 255 = white
207 | # continue
208 | # if prev == 0:
209 | # im.putpixel((x, y), 255)
210 | # im.putpixel((x, y-1), 255)
211 | # elif next == 0:
212 | # im.putpixel((x, y), 255)
213 | # im.putpixel((x, y+1), 255)
214 | # else:
215 | # im.putpixel((x, y), 255)
216 | # midu_image_name_new = fpathandname+"_new"+fext
217 | # im.save(midu_image_name_new)
218 |
219 |
220 | # image enhancement
221 | midu_image_name_pro1 = fpathandname+"_pro1"+fext
222 | filter_enhance(midu_image_name, midu_image_name_pro1)
223 | # character segmentation (disabled)
224 | # num = 4
225 | # midu_image_name_pro2 = fpathandname+"_pro2"+fext
226 | # seg(midu_image_name_pro1, midu_image_name_pro2, num)
227 |
228 | # im = Image.open(midu_image_name)
229 | # text = image_to_string(im)
230 | # print text.replace("\n", "")
231 | text = image_file_to_string(midu_image_name_pro1, bool_digits=digits)
232 | print text.replace("\n", "")
--------------------------------------------------------------------------------
/Captcha1/tesseract.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/tesseract.exe
--------------------------------------------------------------------------------
/NewsSpider/NewsSpider.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/NewsSpider/NewsSpider.exe
--------------------------------------------------------------------------------
/NewsSpider/NewsSpider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import sys
4 | import urllib2
5 | import requests
6 | import re
7 | from lxml import etree
8 |
9 |
10 | def StringListSave(save_path, filename, slist):
11 | if not os.path.exists(save_path):
12 | os.makedirs(save_path)
13 | path = save_path+"/"+filename+".txt"
14 | with open(path, "w+") as fp:
15 | for s in slist:
16 | fp.write("%s\t\t%s\n" % (s[0].encode("utf8"), s[1].encode("utf8")))
17 |
18 | def Page_Info(myPage):
19 | '''Regex'''
20 | mypage_Info = re.findall(r'<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>', myPage, re.S)
21 | return mypage_Info
22 |
23 | def New_Page_Info(new_page):
24 | '''Regex(slowly) or Xpath(fast)'''
25 | # new_page_Info = re.findall(r'.*?(.*?) | ', new_page, re.S)
26 | # # new_page_Info = re.findall(r'.*?(.*?) | ', new_page, re.S) # bugs
27 | # results = []
28 | # for url, item in new_page_Info:
29 | # results.append((item, url+".html"))
30 | # return results
31 | dom = etree.HTML(new_page)
32 | new_items = dom.xpath('//tr/td/a/text()')
33 | new_urls = dom.xpath('//tr/td/a/@href')
34 | assert(len(new_items) == len(new_urls))
35 | return zip(new_items, new_urls)
36 |
37 | def Spider(url):
38 | i = 0
39 | print "downloading ", url
40 | myPage = requests.get(url).content.decode("gbk")
41 | # myPage = urllib2.urlopen(url).read().decode("gbk")
42 | myPageResults = Page_Info(myPage)
43 | save_path = u"网易新闻抓取"
44 | filename = str(i)+"_"+u"新闻排行榜"
45 | StringListSave(save_path, filename, myPageResults)
46 | i += 1
47 | for item, url in myPageResults:
48 | print "downloading ", url
49 | new_page = requests.get(url).content.decode("gbk")
50 | # new_page = urllib2.urlopen(url).read().decode("gbk")
51 | newPageResults = New_Page_Info(new_page)
52 | filename = str(i)+"_"+item
53 | StringListSave(save_path, filename, newPageResults)
54 | i += 1
55 |
56 |
57 | if __name__ == '__main__':
58 | print "start"
59 | start_url = "http://news.163.com/rank/"
60 | Spider(start_url)
61 | print "end"
--------------------------------------------------------------------------------
/NewsSpider/ReadMe.md:
--------------------------------------------------------------------------------
1 | ### The most basic crawler: scraping the [NetEase news rankings](http://news.163.com/rank/)
2 |
3 | **Notes:**
4 |
5 | * Pages are fetched with urllib2 or the requests package.
6 |
7 | * Regular expressions parse the first-level page; XPath parses the second-level pages.
8 |
9 | * The extracted titles and links are saved to local files.
10 |
--------------------------------------------------------------------------------
/QunarSpider/QunarSpider.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding:utf-8 -*-
3 |
4 | import os
5 | import time
6 | import datetime
7 | import codecs
8 | import multiprocessing as mp
9 | from os import makedirs
10 | from os.path import exists
11 | from selenium import webdriver
12 | from selenium.webdriver.common.proxy import *
13 |
14 |
15 | site = 'http://flight.qunar.com'
16 | hot_city_list = [u'上海', u'北京', u'广州', u'深圳']
17 | num = len(hot_city_list)
18 |
19 |
20 | def one_driver_ticket(driver, from_city, to_city):
21 | # time = datetime.datetime.now()
22 | date = datetime.date.today()
23 | tomorrow = date+datetime.timedelta(days=1)
24 | # convert the date object to a string
25 | tomorrow_string = tomorrow.strftime('%Y-%m-%d')
26 |
27 | driver.find_element_by_name('fromCity').clear()
28 | driver.find_element_by_name('fromCity').send_keys(from_city)
29 | driver.find_element_by_name('toCity').clear()
30 | driver.find_element_by_name('toCity').send_keys(to_city)
31 | driver.find_element_by_name('fromDate').clear()
32 | driver.find_element_by_name('fromDate').send_keys(tomorrow_string)
33 | driver.find_element_by_xpath('//button[@type="submit"]').click()
34 | time.sleep(5) # pause to give the browser time to respond
35 |
36 | flag = True
37 | page_num = 0
38 | while flag:
39 | # save the page
40 | # print driver.page_source
41 | source_code = driver.find_element_by_xpath("//*").get_attribute("outerHTML")
42 | print type(source_code)
43 | dstdir = u'./ticket/'
44 | if not exists(dstdir):
45 | makedirs(dstdir)
46 | f = codecs.open(dstdir+from_city+u','+to_city+unicode(tomorrow_string)+u','+unicode(str(page_num+1))+u'.html', 'w+', 'utf8')
47 | f.write(source_code)
48 | f.close()
49 |
50 | next_page = None
51 | try:
52 | next_page = driver.find_element_by_id('nextXI3')
53 | except Exception as e:
54 | print e
55 | pass
56 | print "page: %d" % (page_num+1)
57 | if next_page:
58 | try:
59 | next_page.click()
60 | time.sleep(2) # pause to give the browser time to respond
61 | page_num += 1
62 | except Exception as e:
63 | print 'next_page could not be clicked'
64 | print e
65 | flag = False
66 | else:
67 | flag = False
68 |
69 | def get_proxy_list(file_path):
70 | proxy_list = []
71 | try:
72 | f = open(file_path, 'r')
73 | all_lines = f.readlines() # readlines() reads the whole file line by line and returns the lines as a list
74 | for line in all_lines:
75 | proxy_list.append(line.replace('\r', '').replace('\n', ''))
76 | f.close()
77 | except Exception as e:
78 | print e
79 | return proxy_list
80 |
81 | def ticket_worker_proxy(city_proxy):
82 | city = city_proxy.split(',')[0]
83 | proxy = city_proxy.split(',')[1]
84 | proxy = Proxy({
85 | 'proxyType': ProxyType.MANUAL,
86 | 'httpProxy': proxy,
87 | 'ftpProxy': proxy,
88 | 'sslProxy': proxy,
89 | 'noProxy': '' # addresses that should bypass the proxy
90 | })
91 | driver = webdriver.Firefox(proxy=proxy)
92 | driver.get(site)
93 | driver.maximize_window() # maximize the browser window
94 | for i in xrange(num):
95 | if city == hot_city_list[i]:
96 | continue
97 | from_city = city
98 | to_city = hot_city_list[i]
99 | one_driver_ticket(driver, from_city, to_city)
100 | driver.close()
101 |
102 | def all_ticket_proxy():
103 | hot_city_proxy_list = []
104 | proxy_list = get_proxy_list('./proxy/proxy.txt') # ./ is the current directory, ../ is the parent directory
105 | for i in xrange(num):
106 | hot_city_proxy_list.append(hot_city_list[i]+','+proxy_list[i])
107 | pool = mp.Pool(processes=1)
108 | pool.map(ticket_worker_proxy, hot_city_proxy_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)]
109 | pool.close()
110 | pool.join()
111 |
112 | def ticket_worker_no_proxy(city):
113 | driver = webdriver.Firefox()
114 | # chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
115 | # os.environ['webdriver.chrome.driver'] = chromedriver
116 | # driver = webdriver.Chrome(chromedriver)
117 | driver.get(site)
118 | driver.maximize_window() # maximize the browser window
119 | time.sleep(5) # pause to give the browser time to respond
120 | for i in xrange(num):
121 | if city == hot_city_list[i]:
122 | continue
123 | from_city = city
124 | to_city = hot_city_list[i]
125 | one_driver_ticket(driver, from_city, to_city)
126 | driver.close()
127 |
128 | def all_ticket_no_proxy():
129 | pool = mp.Pool(processes=1)
130 | pool.map(ticket_worker_no_proxy, hot_city_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)]
131 | pool.close()
132 | pool.join()
133 |
134 |
135 | if __name__ == '__main__':
136 | print "start"
137 | start = datetime.datetime.now()
138 | # all_ticket_proxy() # proxy
139 | all_ticket_no_proxy() # no proxy
140 | end = datetime.datetime.now()
141 | print "end"
142 | print "time: ", end-start
143 |
--------------------------------------------------------------------------------
/QunarSpider/ReadMe.md:
--------------------------------------------------------------------------------
1 | ### Selenium with proxy login: scraping the [Qunar](http://flight.qunar.com/) site
2 |
3 | **Notes:**
4 |
5 | * selenium drives a browser to log in and to page through the results.
6 |
7 | * Proxies can be kept in a file that the program reads and uses.
8 |
9 | * Multi-process crawling is supported.
--------------------------------------------------------------------------------
/ReadMe.md:
--------------------------------------------------------------------------------
1 | # [Python Web Crawling for Beginners: The Essentials](https://github.com/lining0806/PythonSpiderNotes)
2 |
3 | ***
4 |
5 | Learning web crawling with Python breaks down into three big parts: **fetching**, **parsing** and **storage**.
6 |
7 | In addition, [Scrapy](http://scrapy.org/), one of the most commonly used crawling frameworks, is covered in detail at the end.
8 |
9 | To start, here are my own articles, which cover the basic concepts and techniques needed to get into web crawling: [宁哥的小站 - Web crawlers](http://www.lining0806.com/category/spider/)
10 | ***
11 |
12 | What happens behind the scenes when you type a URL into the browser and press Enter? For example, if you enter [http://www.lining0806.com/](http://www.lining0806.com/), you will see the homepage of that site.
13 |
14 | In short, the process involves four steps:
15 |
16 | * Look up the IP address for the domain name.
17 | * Send a request to the server at that IP.
18 | * The server answers the request and returns the page content.
19 | * The browser parses the page content.
20 |
21 | What a web crawler does, simply put, is reproduce the browser's job: given a URL, it returns the data the user needs directly, with no human stepping through a browser to get it.
22 |
23 | ## Fetching
24 | At this step, be clear about what you want to fetch: the HTML source, a JSON string, and so on.
25 |
26 | ### 1. Basic fetching
27 |
28 | Most fetching is done with GET requests, i.e. pulling data straight from the remote server.
29 |
30 | First, Python ships with the urllib and urllib2 modules, which cover ordinary page fetching. Beyond that, [requests](https://github.com/kennethreitz/requests) is a very useful package, and there are similar ones such as [httplib2](https://github.com/jcgregorio/httplib2).
31 |
32 | ```
33 | Requests:
34 | import requests
35 | response = requests.get(url)
36 | content = requests.get(url).content
37 | print "response headers:", response.headers
38 | print "content:", content
39 | Urllib2:
40 | import urllib2
41 | response = urllib2.urlopen(url)
42 | content = urllib2.urlopen(url).read()
43 | print "response headers:", response.headers
44 | print "content:", content
45 | Httplib2:
46 | import httplib2
47 | http = httplib2.Http()
48 | response_headers, content = http.request(url, 'GET')
49 | print "response headers:", response_headers
50 | print "content:", content
51 | ```
52 |
53 | Also, for URLs that carry a query string, a GET request normally appends the request data to the URL: a ? separates the URL from the data, and multiple parameters are joined with &.
54 |
55 | ```
56 | data = {'data1':'XXXXX', 'data2':'XXXXX'}
57 | Requests: data is a dict or json
58 | import requests
59 | response = requests.get(url=url, params=data)
60 | Urllib2: data is a string
61 | import urllib, urllib2
62 | data = urllib.urlencode(data)
63 | full_url = url+'?'+data
64 | response = urllib2.urlopen(full_url)
65 | ```
66 |
67 | Related reading: [A look back at scraping the NetEase news rankings](http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/)
68 |
69 | Reference project: [The most basic crawler: scraping the NetEase news rankings](https://github.com/lining0806/PythonSpiderNotes/blob/master/NewsSpider)
70 |
71 | ### 2. Handling logins
72 |
73 | **2.1 Logging in with a form**
74 |
75 | This is a POST request: the form data is sent to the server first, and the cookie the server returns is stored locally.
76 |
77 | ```
78 | data = {'data1':'XXXXX', 'data2':'XXXXX'}
79 | Requests: data is a dict or json
80 | import requests
81 | response = requests.post(url=url, data=data)
82 | Urllib2: data is a string
83 | import urllib, urllib2
84 | data = urllib.urlencode(data)
85 | req = urllib2.Request(url=url, data=data)
86 | response = urllib2.urlopen(req)
87 | ```
88 |
89 | **2.2 Logging in with cookies**
90 |
91 | When you log in with cookies, the server treats you as an already logged-in user and returns logged-in content. So when a captcha is required, one workaround is to use a cookie obtained from a login that already passed the captcha.
92 |
93 | ```
94 | import requests
95 | requests_session = requests.session()
96 | response = requests_session.post(url=url_login, data=data)
97 | ```
98 |
99 | If a captcha is involved, a plain response = requests_session.post(url=url_login, data=data) will not do; the flow should look like this:
100 |
101 | ```
102 | response_captcha = requests_session.get(url=url_login, cookies=cookies)
103 | response1 = requests.get(url_login) # not logged in
104 | response2 = requests_session.get(url_login) # logged in, because the session already holds the response cookie!
105 | response3 = requests_session.get(url_results) # logged in, because the session already holds the response cookie!
106 | ```
107 |
108 | Related reading: [Web crawlers - logging in past a captcha](http://www.lining0806.com/6-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB-%E9%AA%8C%E8%AF%81%E7%A0%81%E7%99%BB%E9%99%86/)
109 |
110 | Reference project: [Logging in with username, password and captcha: scraping Zhihu](https://github.com/lining0806/PythonSpiderNotes/blob/master/ZhihuSpider)
111 |
112 | ### 3. Dealing with anti-crawler mechanisms
113 |
114 | **3.1 Using proxies**
115 |
116 | When to use: the site limits requests by IP address; proxies also help when "too frequent clicks" force you through a captcha login.
117 |
118 | The best approach is to maintain a pool of proxy IPs. Plenty of free proxies can be found online; quality varies, but usable ones turn up after some screening. For the "too frequent clicks" case, throttling how often the crawler hits the site also helps avoid getting banned.
119 |
120 | ```
121 | proxies = {'http':'http://XX.XX.XX.XX:XXXX'}
122 | Requests:
123 | import requests
124 | response = requests.get(url=url, proxies=proxies)
125 | Urllib2:
126 | import urllib2
127 | proxy_support = urllib2.ProxyHandler(proxies)
128 | opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler)
129 | urllib2.install_opener(opener) # install the opener; every later urlopen() call will go through it
130 | response = urllib2.urlopen(url)
131 | ```
132 |
133 | **3.2 Timing**
134 |
135 | When to use: the site limits request frequency.
136 |
137 | Both Requests and Urllib2 can use the sleep() function from the time module:
138 |
139 | ```
140 | import time
141 | time.sleep(1)
142 | ```
143 |
144 | **3.3 Masquerading as a browser, and defeating "anti-hotlinking"**
145 |
146 | Some sites check whether you are visiting with a real browser or with an automated client; adding a User-Agent header that says you are a browser is enough. Some sites also check for a Referer header and whether it is legitimate, in which case add a Referer as well.
147 |
148 | ```
149 | headers = {'User-Agent':'XXXXX'} # pretend to be a browser, for sites that reject crawlers
150 | headers = {'Referer':'XXXXX'}
151 | headers = {'User-Agent':'XXXXX', 'Referer':'XXXXX'}
152 | Requests:
153 | response = requests.get(url=url, headers=headers)
154 | Urllib2:
155 | import urllib, urllib2
156 | req = urllib2.Request(url=url, headers=headers)
157 | response = urllib2.urlopen(req)
158 | ```
159 |
160 | ### 4. Reconnecting after a dropped connection
161 |
162 | Not much to say here.
163 |
164 | ```
165 | def multi_session(session, *arg):
166 | retryTimes = 20
167 | while retryTimes>0:
168 | try:
169 | return session.post(*arg)
170 | except:
171 | print '.',
172 | retryTimes -= 1
173 | ```
174 |
175 | Or:
176 |
177 | ```
178 | def multi_open(opener, *arg):
179 | retryTimes = 20
180 | while retryTimes>0:
181 | try:
182 | return opener.open(*arg)
183 | except:
184 | print '.',
185 | retryTimes -= 1
186 | ```
187 |
188 | With multi_session or multi_open we can keep the crawler's session or opener alive across retries.
189 |
190 | ### 5. Multi-process crawling
191 |
192 | Here is an experimental comparison of parallel crawls of the [Wallstreetcn live news feed](http://live.wallstreetcn.com/ ): [multi-process crawling in Python](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Python) versus [single-threaded and multi-threaded crawling in Java](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Java). A minimal sketch follows below.
193 |
194 | Related reading: [Comparing multi-process and multi-threaded computation in Python and Java](http://www.lining0806.com/%E5%85%B3%E4%BA%8Epython%E5%92%8Cjava%E7%9A%84%E5%A4%9A%E8%BF%9B%E7%A8%8B%E5%A4%9A%E7%BA%BF%E7%A8%8B%E8%AE%A1%E7%AE%97%E6%96%B9%E6%B3%95%E5%AF%B9%E6%AF%94/)
195 |
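As a minimal sketch (not the project code itself), parallel fetching with `multiprocessing.Pool` looks roughly like this; the URL list here is made up for illustration:

```
import requests
import multiprocessing as mp

def fetch(url):
    # each worker process fetches one page and reports its size
    content = requests.get(url).content
    return url, len(content)

if __name__ == '__main__':
    urls = ["http://example.com/page/%d" % i for i in range(1, 9)]  # illustrative URLs
    pool = mp.Pool(processes=4)      # 4 worker processes
    results = pool.map(fetch, urls)  # map(f, [x1, x2, ...]) = [f(x1), f(x2), ...]
    pool.close()
    pool.join()
    for url, size in results:
        print url, size
```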
196 | ### 6. Handling Ajax requests
197 |
198 | For "load more" style pages, much of the data is transferred via Ajax.
199 |
200 | How it works: after the page source is loaded from the page URL, JavaScript runs in the browser, loads additional content and "fills" it into the page. That is why crawling the page URL by itself will not turn up the page's actual content.
201 |
202 | In Google Chrome you can find the link behind each such "request" (right click → Inspect → Network → clear, click "load more", look for the GET request whose Type is text/html, click it, then inspect its GET parameters or copy the Request URL), and repeat the process page by page.
203 |
204 | * If there are pages before the "request", derive the URL of page 1 from the URL found in the previous step, and so on; then fetch the data from the Ajax address (see the sketch after this list).
205 | * Apply regex matching to the returned JSON string. In the JSON data, '\\uxxxx' unicode_escape sequences have to be converted to u'\uxxxx' unicode.
206 |
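A rough sketch of this flow, using the paged live-news API that the Spider_Java/Spider_Python projects crawl; the field name in the regex is taken from WallstreetcnSaveTest.java and may need adjusting:

```
import re
import requests

url = "http://api.wallstreetcn.com/v2/livenews?&page=1"   # page 1 of the Ajax API
raw = requests.get(url).content          # str containing '\uxxxx' escape sequences
text = raw.decode('unicode_escape')      # now unicode, with u'\uxxxx' decoded
items = re.findall(r'"contentHtml":"(.*?)"', text, re.S)
for item in items:
    print item
```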
207 | ### 7. Selenium, the automated testing tool
208 |
209 | Selenium is an automated testing tool. It can drive a browser: filling in text, clicking the mouse, grabbing elements, switching pages, and so on. In short, whatever a browser can do, Selenium can do.
210 |
211 | Given a list of cities, the reference project below uses selenium to dynamically scrape fare information from [Qunar](http://flight.qunar.com/); a condensed sketch is also given after the link.
212 |
213 | Reference project: [Selenium with proxy login: scraping the Qunar site](https://github.com/lining0806/PythonSpiderNotes/blob/master/QunarSpider)
214 |
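A condensed sketch of the flow used in QunarSpider.py (Firefox driver; the form field names are the ones that script uses):

```
import time
import datetime
from selenium import webdriver

driver = webdriver.Firefox()
driver.get('http://flight.qunar.com')
driver.maximize_window()
time.sleep(5)                                   # wait for the page to load

tomorrow = (datetime.date.today()+datetime.timedelta(days=1)).strftime('%Y-%m-%d')
driver.find_element_by_name('fromCity').send_keys(u'北京')
driver.find_element_by_name('toCity').send_keys(u'上海')
driver.find_element_by_name('fromDate').clear()
driver.find_element_by_name('fromDate').send_keys(tomorrow)
driver.find_element_by_xpath('//button[@type="submit"]').click()
time.sleep(5)                                   # wait for the result page
print driver.page_source[:200]                  # first part of the rendered page
driver.close()
```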
215 | ### 8. Captcha recognition
216 |
217 | When a site uses captchas, there are three options:
218 |
219 | * Use proxies and rotate the IP.
220 | * Log in with cookies.
221 | * Recognize the captcha.
222 |
223 | Proxies and cookie logins were covered above, so here is a word on captcha recognition.
224 |
225 | The open-source Tesseract-OCR engine can be used to download and recognize captcha images, and the recognized characters are then handed to the crawler for a simulated login. The captcha image can also be uploaded to a human captcha-solving service. If recognition fails, simply refresh the captcha and try again until it succeeds (a minimal command-line sketch follows the project link).
226 |
227 | Reference project: [Captcha recognition project, version 1: Captcha1](https://github.com/lining0806/PythonSpiderNotes/blob/master/Captcha1)
228 |
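As a minimal sketch of the Captcha1 approach, calling the tesseract command line directly (assumes tesseract is on the PATH and the image exists):

```
import subprocess

image_name = "./pic/get_random.jpg"
output_base = "temp"                    # tesseract writes temp.txt
cmd = "tesseract %s %s -l eng -psm 7 nobatch" % (image_name, output_base)
proc = subprocess.Popen(cmd, shell=True)
proc.wait()

with open(output_base+".txt") as f:
    print f.read().replace("\n", "")
```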
229 | **Two issues to keep in mind when crawling:**
230 |
231 | * How do you monitor a series of websites for updates, i.e. how do you crawl incrementally?
232 | * For massive amounts of data, how do you crawl in a distributed fashion?
233 |
234 | ## Parsing
235 |
236 | After fetching comes parsing: whatever content you need, you extract it from what was fetched.
237 |
238 | Common parsing tools include [regular expressions](http://deerchao.net/tutorials/regex/regex.htm), [BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/), [lxml](http://lxml.de/) and so on; a short lxml example follows.
239 |
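For instance, lxml with XPath, in the spirit of New_Page_Info() in NewsSpider.py; the URL and XPath expressions here are placeholders, to be adapted to the page being parsed:

```
import requests
from lxml import etree

url = "http://example.com/list_page"     # placeholder URL
html = requests.get(url).content
dom = etree.HTML(html)
titles = dom.xpath('//tr/td/a/text()')   # link texts laid out in table cells
links = dom.xpath('//tr/td/a/@href')     # the corresponding hrefs
for title, link in zip(titles, links):
    print title, link
```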
240 | ## Storage
241 |
242 | Once the content we need has been extracted, the next step is storing it.
243 |
244 | We can write it to text files, or into databases such as [MySQL](http://www.mysql.com/) or [MongoDB](https://www.mongodb.org/); a small pymongo example follows.
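A few lines of pymongo are enough to write parsed items into MongoDB; the database and collection names below are illustrative (the Java spider in this repo uses textclassify/WallstreetSaveJava):

```
import pymongo

client = pymongo.MongoClient("localhost", 27017)
collection = client["spider"]["news"]              # illustrative db/collection names
collection.insert({"title": u"some title", "url": "http://example.com/1.html"})
print collection.count()
```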
245 |
246 | **Two issues to keep in mind about storage:**
247 |
248 | * How do you deduplicate pages?
249 | * In what form should the content be stored?
250 |
251 |
252 | ## Scrapy
253 |
254 | Scrapy is an open-source Python crawling framework built on Twisted and very widely used in industry.
255 |
256 | For more, see [Building a web crawler on Scrapy](http://www.lining0806.com/%E5%9F%BA%E4%BA%8Escrapy%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E7%9A%84%E6%90%AD%E5%BB%BA/); the project code for the [WeChat search](http://weixin.sogou.com/weixin) crawl described in that article is also provided as learning material, and a bare-bones spider skeleton is sketched below.
257 |
258 | Reference project: [Recursively scraping WeChat search results with Scrapy or Requests](https://github.com/lining0806/PythonSpiderNotes/blob/master/WechatSearchProjects)
259 |
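A bare-bones spider, to give the flavor (the name, domain and XPath here are placeholders, not the Wechatproject code; yielding plain dicts assumes Scrapy 1.0 or later):

```
import scrapy

class DemoSpider(scrapy.Spider):
    name = "demo"
    start_urls = ["http://example.com/"]

    def parse(self, response):
        # emit every link found on the start page
        for href in response.xpath('//a/@href').extract():
            yield {"url": href}
```

Run it with `scrapy runspider demo_spider.py -o urls.json`.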
260 | ## The Robots protocol
261 |
262 | A well-behaved crawler starts by respecting the **robots protocol**. The robots protocol (also called the crawler or robot exclusion protocol), formally the Robots Exclusion Protocol, is how a website tells search engines which pages may be crawled and which may not.
263 |
264 | A robots.txt text file is placed in the site's root directory (e.g. https://www.taobao.com/robots.txt ); it specifies which pages each kind of crawler may or may not visit, with the pages given as path patterns. Before harvesting the site, a crawler first fetches this robots.txt, parses the rules in it, and then collects the site's data according to those rules.
265 |
266 | ### 1. Robots protocol rules
267 |
268 | User-agent: which crawlers the rule block applies to
269 | Disallow: URLs that must not be accessed
270 | Allow: URLs that may be accessed
271 |
272 | Note: directive names start with a capital letter, the colon is an ASCII colon followed by one space, and "/" stands for the entire site.
273 |
274 | ### 2. Robots protocol examples
275 |
276 | Block all robots:
277 | User-agent: *
278 | Disallow: /
279 | Allow all robots:
280 | User-agent: *
281 | Disallow:
282 | Block a specific robot:
283 | User-agent: BadBot
284 | Disallow: /
285 | Allow a specific robot:
286 | User-agent: GoodBot
287 | Disallow:
288 | Block a specific directory:
289 | User-agent: *
290 | Disallow: /images/
291 | Allow only a specific directory:
292 | User-agent: *
293 | Allow: /images/
294 | Disallow: /
295 | Block specific files:
296 | User-agent: *
297 | Disallow: /*.html$
298 | Allow only specific files:
299 | User-agent: *
300 | Allow: /*.html$
301 | Disallow: /
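In Python 2, the standard-library robotparser module can check a URL against these rules before fetching it:

```
import robotparser

rp = robotparser.RobotFileParser()
rp.set_url("https://www.taobao.com/robots.txt")
rp.read()
print rp.can_fetch("*", "https://www.taobao.com/article")   # is this path allowed for any robot?
```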
--------------------------------------------------------------------------------
/Spider_Java/README.md:
--------------------------------------------------------------------------------
1 | ### Spider_Java
2 |
3 | Target site: [Wallstreetcn live news](http://live.wallstreetcn.com/)
4 |
5 | Single-threaded crawling: Spider_Java1
6 |
7 | Multi-threaded crawling: Spider_Java2
8 |
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | Spider
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/bin/synchronizetest/Booth.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/synchronizetest/Booth.class
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/bin/synchronizetest/Reservoir.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/synchronizetest/Reservoir.class
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/bin/synchronizetest/Test.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/synchronizetest/Test.class
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/bin/wallstreetcnsave/WallstreetcnSaveTest.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/wallstreetcnsave/WallstreetcnSaveTest.class
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/lib/mongo-java-driver-2.13.0-rc1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/lib/mongo-java-driver-2.13.0-rc1.jar
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/src/synchronizetest/Test.java:
--------------------------------------------------------------------------------
1 | /**
2 | *
3 | */
4 | package synchronizetest;
5 |
6 | /**
7 | * @author FIRELING
8 | *
9 | */
10 | public class Test
11 | {
12 | public static void main(String[] args)
13 | {
14 | Reservoir r = new Reservoir(100);
15 | Booth b1 = new Booth(r);
16 | Booth b2 = new Booth(r);
17 | Booth b3 = new Booth(r);
18 | }
19 | }
20 | /**
21 | * contain shared resource
22 | */
23 | class Reservoir {
24 | private int total;
25 | public Reservoir(int t)
26 | {
27 | this.total = t;
28 | }
29 | /**
30 | * Thread safe method
31 | * serialized access to Booth.total
32 | */
33 | public synchronized boolean sellTicket() // the synchronized modifier serializes access to this whole method
34 | {
35 | if(this.total > 0) {
36 | this.total = this.total-1;
37 | return true; // successfully sell one
38 | }
39 | else {
40 | return false; // no more tickets
41 | }
42 | }
43 | }
44 | /**
45 | * create new thread by inheriting Thread
46 | */
47 | class Booth extends Thread {
48 | private static int threadID = 0; // owned by Class object
49 |
50 | private Reservoir release; // sell this reservoir
51 | private int count = 0; // owned by this thread object
52 | /**
53 | * constructor
54 | */
55 | public Booth(Reservoir r) {
56 | super("ID:"+(++threadID));
57 | this.release = r; // all threads share the same reservoir
58 | this.start();
59 | }
60 | /**
61 | * convert object to string
62 | */
63 | public String toString() {
64 | return super.getName();
65 | }
66 | /**
67 | * what does the thread do?
68 | */
69 | public void run() {
70 | while(true) { // main selling loop
71 | if(this.release.sellTicket()) {
72 | this.count = this.count+1;
73 | System.out.println(this.getName()+":sell 1");
74 | try {
75 | sleep((int) (Math.random()*100)); // sleep for a random interval below 100 ms
76 | // sleep(100); // with identical sleep times, each booth sells roughly the same number of tickets
77 | }
78 | catch (InterruptedException e) {
79 | throw new RuntimeException(e);
80 | }
81 | }
82 | else {
83 | break;
84 | }
85 | }
86 | System.out.println(this.getName()+" I sold:"+count);
87 | }
88 | }
89 |
90 |
--------------------------------------------------------------------------------
/Spider_Java/Spider_Java1/src/wallstreetcnsave/WallstreetcnSaveTest.java:
--------------------------------------------------------------------------------
1 | package wallstreetcnsave;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.io.InputStreamReader;
7 | import java.net.HttpURLConnection;
8 | import java.net.URL;
9 | import java.text.DateFormat;
10 | import java.util.ArrayList;
11 | import java.util.Date;
12 | import java.util.HashMap;
13 | import java.util.List;
14 | import java.util.Map;
15 | import java.util.regex.Matcher;
16 | import java.util.regex.Pattern;
17 |
18 | import com.mongodb.BasicDBObject;
19 | import com.mongodb.DB;
20 | import com.mongodb.DBCollection;
21 | import com.mongodb.Mongo;
22 |
23 | public class WallstreetcnSaveTest implements Runnable {
24 |
25 | private static String DataBaseName = "textclassify";
26 | private static String CollectionName = "WallstreetSaveJava";
27 |
28 | private static String url = "http://api.wallstreetcn.com/v2/livenews?&page=";
29 |
30 | private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?";
31 | private static final String REGEXSTRING1 = "type";
32 | private static final String REGEXSTRING2 = "content";
33 | private static final String REGEXSTRING3 = "categoryset";
34 |
35 | // lookup table mapping category codes to names
36 | public static Map GetMap() {
37 | Map map = new HashMap();
38 | map.put("1", "外汇");
39 | map.put("2", "股市");
40 | map.put("3", "商品");
41 | map.put("4", "债市");
42 | map.put("9", "中国");
43 | map.put("10", "美国");
44 | map.put("11", "欧元区");
45 | map.put("12", "日本");
46 | map.put("13", "英国");
47 | map.put("14", "澳洲");
48 | map.put("15", "加拿大");
49 | map.put("16", "瑞士");
50 | map.put("17", "其他地区");
51 | map.put("5", "央行");
52 | return map;
53 | }
54 | private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" };
55 | private static String[] ruleList_property = { "1", "2", "3", "4" };
56 | private static String[] ruleList_centralbank = { "5" };
57 |
58 | private static final int start = 1;
59 | private static final int end = 3000;
60 |
61 | // split "x,x,x"-formatted content and keep the entries that match the rule list
62 | public static String setCategory(String categorySet, String[] ruleList, Map map) {
63 | StringBuffer disStr = new StringBuffer();
64 | String[] strArray = null;
65 | strArray = categorySet.split(","); // split on "," and store the parts in strArray
66 | // pick out the needed entries
67 | int length_strArray = strArray.length;
68 | int length_ruleList = ruleList.length;
69 |
70 | if (length_strArray > 0) {
71 | for (int iArr = 0; iArr < length_strArray; iArr++) {
72 | String s = strArray[iArr];
73 | for (int iRul=0; iRul < length_ruleList; iRul++) {
74 | if (s.equals(ruleList[iRul])) {
75 | disStr.append(map.get(s));
76 | disStr.append(",");
77 | break;
78 | }
79 | }
80 | }
81 | }
82 | if(disStr.length()>1) {
83 | disStr = disStr.deleteCharAt(disStr.length()-1);
84 | }
85 | return disStr.toString();
86 | }
87 |
88 | // read the whole page and return it as an HTML string
89 | private static String httpRequest(String requestUrl) {
90 | StringBuffer buffer = null;
91 | BufferedReader bufferedReader = null;
92 | InputStreamReader inputStreamReader = null;
93 | InputStream inputStream = null;
94 | HttpURLConnection httpUrlConn = null;
95 | try {
96 | // open a GET request
97 | URL url = new URL(requestUrl);
98 | httpUrlConn = (HttpURLConnection) url.openConnection();
99 | httpUrlConn.setDoInput(true);
100 | httpUrlConn.setRequestMethod("GET");
101 | // obtain the input stream
102 | inputStream = httpUrlConn.getInputStream();
103 | inputStreamReader = new InputStreamReader(inputStream, "UTF-8");
104 | bufferedReader = new BufferedReader(inputStreamReader);
105 | // read the result from the input stream
106 | buffer = new StringBuffer();
107 | String str = null;
108 | while ((str = bufferedReader.readLine()) != null) {
109 | str = new String(str.getBytes(), "UTF-8");
110 | buffer.append(str);
111 | }
112 | } catch (Exception e) {
113 | e.printStackTrace();
114 | } finally {
115 | if (bufferedReader != null) {
116 | try {
117 | bufferedReader.close();
118 | } catch (IOException e) {
119 | e.printStackTrace();
120 | }
121 | }
122 | if (inputStreamReader != null) {
123 | try {
124 | inputStreamReader.close();
125 | } catch (IOException e) {
126 | e.printStackTrace();
127 | }
128 | }
129 | if (inputStream != null) {
130 | try {
131 | inputStream.close();
132 | } catch (IOException e) {
133 | e.printStackTrace();
134 | }
135 | }
136 | if (httpUrlConn != null) {
137 | httpUrlConn.disconnect();
138 | }
139 | }
140 | return buffer.toString();
141 | }
142 |
143 | // filter out the useless information
144 | public static List