├── Captcha1 ├── !Test.bat ├── ReadMe.md ├── convert.exe ├── pic │ ├── fnord.tif │ ├── get_price_img.png │ ├── get_price_img1.png │ ├── get_price_img1_binary.png │ ├── get_price_img2.png │ ├── get_price_img2_binary.png │ ├── get_price_img_binary.png │ ├── get_random.jpg │ ├── get_random1.jpg │ ├── get_random1_binary.png │ ├── get_random1_binary_midu.png │ ├── get_random1_binary_midu_pro1.png │ ├── get_random2.jpg │ ├── get_random2_binary.png │ ├── get_random2_binary_midu.png │ ├── get_random2_binary_midu_pro1.png │ ├── get_random_binary.png │ ├── get_random_binary_midu.png │ └── get_random_binary_midu_pro1.png ├── pytesser_pro │ ├── __init__.py │ ├── errors.py │ ├── pytesser_pro.py │ └── util.py ├── tess_test.py └── tesseract.exe ├── NewsSpider ├── NewsSpider.exe ├── NewsSpider.py └── ReadMe.md ├── QunarSpider ├── QunarSpider.py └── ReadMe.md ├── ReadMe.md ├── Spider_Java ├── README.md ├── Spider_Java1 │ ├── .classpath │ ├── .project │ ├── bin │ │ ├── synchronizetest │ │ │ ├── Booth.class │ │ │ ├── Reservoir.class │ │ │ └── Test.class │ │ └── wallstreetcnsave │ │ │ └── WallstreetcnSaveTest.class │ ├── lib │ │ └── mongo-java-driver-2.13.0-rc1.jar │ └── src │ │ ├── synchronizetest │ │ └── Test.java │ │ └── wallstreetcnsave │ │ └── WallstreetcnSaveTest.java └── Spider_Java2 │ ├── .classpath │ ├── .project │ ├── bin │ ├── synchronizetest │ │ ├── Booth.class │ │ ├── Reservoir.class │ │ └── Test.class │ └── wallstreetcnsave │ │ ├── GetrequestUrl.class │ │ ├── WallstreetcnSave.class │ │ └── WallstreetcnSaveTest.class │ ├── lib │ └── mongo-java-driver-2.13.0-rc1.jar │ └── src │ ├── synchronizetest │ └── Test.java │ └── wallstreetcnsave │ └── WallstreetcnSaveTest.java ├── Spider_Python ├── README.md └── WallstreetcnSaveTest.py ├── WechatSearchProjects ├── README.md ├── Spider_Main.py ├── WechatSearchTest.py └── Wechatproject │ ├── Wechatproject │ ├── __init__.py │ ├── items.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── spider.py │ └── scrapy.cfg └── ZhihuSpider ├── ReadMe.md ├── ZhihuSpider.py └── config.ini /Captcha1/!Test.bat: -------------------------------------------------------------------------------- 1 | python tess_test.py ./pic/get_price_img.png 2 | pause -------------------------------------------------------------------------------- /Captcha1/ReadMe.md: -------------------------------------------------------------------------------- 1 | ### 验证码识别项目第一版:Captcha1 2 | 3 | 本项目采用Tesseract V3.01版本(V3.02版本在训练时有改动,多shapeclustering过程) 4 | 5 | **Tesseract用法:** 6 | * 配置环境变量TESSDATA_PREFIX =“D:\Tesseract-ocr\”,即tessdata的目录,在源码中会到这个路径下查找相应的字库文件用来识别。 7 | * 命令格式: 8 | `tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfile...]` 9 | * 只识别成数字 10 | `tesseract imagename outputbase -l eng digits` 11 | * 解决empty page!! 
12 | **-psm N** 13 | 14 | 7 = Treat the image as a single text line 15 | tesseract imagename outputbase -l eng -psm 7 16 | * configfile 参数值为tessdata\configs 和 tessdata\tessconfigs 目录下的文件名: 17 | `tesseract imagename outputbase -l eng nobatch` 18 | 19 | 20 | **验证码识别项目使用方法1:** 21 | 22 | * 将下载的图片放到./pic目录下, 23 | 24 | 验证码图片名称:get_random.jpg 25 | 价格图片名称:get_price_img.png 26 | 27 | * 命令格式: 28 | 29 | 验证码图片识别:python tess_test.py ./pic/get_random.jpg 30 | 价格图片识别:python tess_test.py ./pic/get_price_img.png 31 | 32 | 打印出识别的结果 33 | 34 | 若要将结果存在临时文本文件**temp.txt**中,则修改pytessr_pro.py中代码"**cleanup_scratch_flag = True**"改为"**cleanup_scratch_flag = False**" 35 | -------------------------------------------------------------------------------- /Captcha1/convert.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/convert.exe -------------------------------------------------------------------------------- /Captcha1/pic/fnord.tif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/fnord.tif -------------------------------------------------------------------------------- /Captcha1/pic/get_price_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img.png -------------------------------------------------------------------------------- /Captcha1/pic/get_price_img1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img1.png -------------------------------------------------------------------------------- /Captcha1/pic/get_price_img1_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img1_binary.png -------------------------------------------------------------------------------- /Captcha1/pic/get_price_img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img2.png -------------------------------------------------------------------------------- /Captcha1/pic/get_price_img2_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img2_binary.png -------------------------------------------------------------------------------- /Captcha1/pic/get_price_img_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_price_img_binary.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random.jpg -------------------------------------------------------------------------------- /Captcha1/pic/get_random1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1.jpg -------------------------------------------------------------------------------- /Captcha1/pic/get_random1_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1_binary.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random1_binary_midu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1_binary_midu.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random1_binary_midu_pro1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random1_binary_midu_pro1.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2.jpg -------------------------------------------------------------------------------- /Captcha1/pic/get_random2_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2_binary.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random2_binary_midu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2_binary_midu.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random2_binary_midu_pro1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random2_binary_midu_pro1.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random_binary.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random_binary.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random_binary_midu.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random_binary_midu.png -------------------------------------------------------------------------------- /Captcha1/pic/get_random_binary_midu_pro1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pic/get_random_binary_midu_pro1.png -------------------------------------------------------------------------------- /Captcha1/pytesser_pro/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/pytesser_pro/__init__.py -------------------------------------------------------------------------------- /Captcha1/pytesser_pro/errors.py: -------------------------------------------------------------------------------- 1 | """Test for exceptions raised in the tesseract.exe logfile""" 2 | 3 | class Tesser_General_Exception(Exception): 4 | pass 5 | 6 | class Tesser_Invalid_Filetype(Tesser_General_Exception): 7 | pass 8 | 9 | def check_for_errors(logfile = "tesseract.log"): 10 | inf = file(logfile) 11 | text = inf.read() 12 | inf.close() 13 | # All error conditions result in "Error" somewhere in logfile 14 | if text.find("Error") != -1: 15 | raise Tesser_General_Exception, text -------------------------------------------------------------------------------- /Captcha1/pytesser_pro/pytesser_pro.py: -------------------------------------------------------------------------------- 1 | import Image 2 | import subprocess 3 | 4 | import util 5 | import errors 6 | 7 | tesseract_exe_name = "tesseract" # Name of executable to be called at command line 8 | scratch_image_name = "temp.bmp" # This file must be .bmp or other Tesseract-compatible format 9 | scratch_text_name_root = "temp" # Leave out the .txt extension 10 | cleanup_scratch_flag = False # Temporary files cleaned up after OCR operation 11 | 12 | def call_tesseract(input_filename, output_filename, bool_digits=False): 13 | """Calls external tesseract.exe on input file (restrictions on types), 14 | outputting output_filename+'txt'""" 15 | # args = [tesseract_exe_name, input_filename, output_filename] 16 | if bool_digits: 17 | # args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l eng -psm 7 nobatch eng_digits" # price 18 | args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l test_digits -psm 7 nobatch" # price 19 | else: 20 | args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l eng -psm 7 nobatch eng_characters" # English letters 21 | # args = tesseract_exe_name+" "+input_filename+" "+output_filename+" -l test_eng -psm 7 nobatch" # English letters 22 | # print args 23 | proc = subprocess.Popen(args, shell=True) 24 | retcode = proc.wait() 25 | if retcode != 0: 26 | errors.check_for_errors() 27 | 28 | def image_to_string(im, cleanup = cleanup_scratch_flag, bool_digits=False): 29 | """Converts im to file, applies tesseract, and fetches resulting text. 
30 | If cleanup=True, delete scratch files after operation.""" 31 | try: 32 | util.image_to_scratch(im, scratch_image_name) 33 | call_tesseract(scratch_image_name, scratch_text_name_root, bool_digits) 34 | text = util.retrieve_text(scratch_text_name_root) 35 | finally: 36 | if cleanup: 37 | util.perform_cleanup(scratch_image_name, scratch_text_name_root) 38 | return text 39 | 40 | def image_file_to_string(filename, cleanup = cleanup_scratch_flag, graceful_errors=True, bool_digits=False): 41 | """Applies tesseract to filename; or, if image is incompatible and graceful_errors=True, 42 | converts to compatible format and then applies tesseract. Fetches resulting text. 43 | If cleanup=True, delete scratch files after operation.""" 44 | try: 45 | try: 46 | call_tesseract(filename, scratch_text_name_root, bool_digits) 47 | text = util.retrieve_text(scratch_text_name_root) 48 | except errors.Tesser_General_Exception: 49 | if graceful_errors: 50 | im = Image.open(filename) 51 | text = image_to_string(im, cleanup, bool_digits) 52 | else: 53 | raise 54 | finally: 55 | if cleanup: 56 | util.perform_cleanup(scratch_image_name, scratch_text_name_root) 57 | return text 58 | -------------------------------------------------------------------------------- /Captcha1/pytesser_pro/util.py: -------------------------------------------------------------------------------- 1 | """Utility functions for processing images for delivery to Tesseract""" 2 | 3 | import os 4 | 5 | def image_to_scratch(im, scratch_image_name): 6 | """Saves image in memory to scratch file. .bmp format will be read correctly by Tesseract""" 7 | im.save(scratch_image_name, dpi=(200,200)) 8 | 9 | def retrieve_text(scratch_text_name_root): 10 | inf = file(scratch_text_name_root + '.txt') 11 | text = inf.read() 12 | inf.close() 13 | return text 14 | 15 | def perform_cleanup(scratch_image_name, scratch_text_name_root): 16 | """Clean up temporary files from disk""" 17 | for name in (scratch_image_name, scratch_text_name_root + '.txt', "tesseract.log"): 18 | try: 19 | os.remove(name) 20 | except OSError: 21 | pass 22 | -------------------------------------------------------------------------------- /Captcha1/tess_test.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import os 4 | import sys 5 | import subprocess 6 | from pytesser_pro.pytesser_pro import * 7 | import Image, ImageEnhance, ImageFilter 8 | from pylab import * 9 | 10 | 11 | 12 | # 二值化并转格式 13 | def binary(image_name, binary_image_name): 14 | # 白底黑字 15 | args = "convert -monochrome "+image_name+" "+binary_image_name 16 | # print args 17 | proc = subprocess.Popen(args, shell=True) 18 | proc.wait() 19 | im = Image.open(binary_image_name) 20 | w, h = im.size 21 | data = list(im.getdata()) 22 | if (data[0], data[w-1], data[(h-1)*w], data[h*w-1]) == (0, 0, 0, 0): # 0-黑色,255-白色 23 | # 若非白底黑字则灰度反转 24 | args1 = "convert -negate "+binary_image_name+" "+binary_image_name 25 | proc1 = subprocess.Popen(args1, shell=True) 26 | proc1.wait() 27 | 28 | # 计算范围内点的个数 29 | def numpoint(im): 30 | w, h = im.size 31 | # print w, h 32 | data = list(im.getdata()) 33 | mumpoint = 0 34 | for x in range(w): 35 | for y in range(h): 36 | if data[y*w+x] == 0: # 0-黑色,255-白色 37 | mumpoint += 1 38 | return mumpoint 39 | 40 | # 投影法去干扰线 41 | def pointmidu(binary_image_name, midu_image_name): 42 | im = Image.open(binary_image_name) 43 | w, h = im.size 44 | # print w, h 45 | len = 5 46 | for x in range(0, w, len): 47 | box = (x, 0, x+len, h) 48 | im_box = 
im.crop(box) 49 | num = numpoint(im_box) 50 | # print num 51 | if num < 20: 52 | for i in range(x, x+len): 53 | for j in range(h): 54 | im.putpixel((i, j), 255) # 0-黑色,255-白色 55 | data = list(im.getdata()) 56 | data_column = [] 57 | for x in range(w): 58 | temp = 0 59 | for y in range(h): 60 | if data[y*w+x] == 0: # 0-黑色,255-白色 61 | temp += 1 62 | data_column.append(temp) 63 | # print data_column 64 | start = 0 65 | for i in range(0, w, 1): 66 | if data_column[i] != 0: 67 | break 68 | else: 69 | start += 1 70 | # print start 71 | end = w-1 72 | for j in range(w-1, -1, -1): 73 | if data_column[j] != 0: 74 | break 75 | else: 76 | end += -1 77 | # print end 78 | box_new = (start, 0, end+1, h) 79 | im_box_new = im.crop(box_new) 80 | im_box_new.save(midu_image_name) 81 | 82 | # 图像增强 83 | def filter_enhance(midu_image_name, midu_image_name_pro1): 84 | im = Image.open(midu_image_name) 85 | # 去噪 86 | im = im.filter(ImageFilter.MedianFilter()) 87 | # 亮度加强 88 | enhancer = ImageEnhance.Contrast(im) 89 | im = enhancer.enhance(2) 90 | im = im.convert('1') 91 | # im.show() 92 | im.save(midu_image_name_pro1) 93 | 94 | # 字符分割 95 | def seg(midu_image_name_pro1, midu_image_name_pro2, num): 96 | im = Image.open(midu_image_name_pro1) 97 | w, h = im.size 98 | # print w, h, w/num 99 | len = 2 100 | for i in range(num-1): 101 | start = (i+1)*w/num 102 | end = start+len 103 | for m in range(start, end+1): 104 | for n in range(h): 105 | im.putpixel((m, n), 255) # 0-黑色,255-白色 106 | im.save(midu_image_name_pro2) 107 | 108 | def get_aim1_point(im): 109 | aim = [] 110 | w, h = im.size 111 | # print w, h 112 | data = list(im.getdata()) 113 | for x in range(0, w, 1): 114 | for y in range(0, h, 1): 115 | if data[y*w+x] == 0: # 0-黑色,255-白色 116 | start_point = (x, y) 117 | # print start_point 118 | aim.append(start_point) 119 | break 120 | return aim 121 | 122 | def get_aim2_point(im): 123 | aim = [] 124 | w, h = im.size 125 | # print w, h 126 | data = list(im.getdata()) 127 | for x in range(0, w, 1): 128 | for y in range(h-1, -1, -1): 129 | if data[y*w+x] == 0: # 0-黑色,255-白色 130 | start_point = (x, y) 131 | # print start_point 132 | aim.append(start_point) 133 | break 134 | return aim 135 | 136 | 137 | if __name__=='__main__': 138 | 139 | if len(sys.argv) == 1: 140 | image_name = "./pic/get_random.jpg" # 验证码图片名称 141 | digits = False 142 | # image_name = "./pic/get_price_img.png" # 价格图片名称 143 | # digits = True 144 | elif len(sys.argv) == 2: 145 | if sys.argv[1].find("get_random") != -1: 146 | image_name = sys.argv[1] 147 | digits = False 148 | elif sys.argv[1].find("get_price_img") != -1: 149 | image_name = sys.argv[1] 150 | digits = True 151 | else: 152 | print "Please Input the Correct Image Name!" 153 | sys.exit(0) 154 | else: 155 | print "Too Many Arguments!" 
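# usage hint (see !Test.bat / ReadMe.md): python tess_test.py ./pic/get_random.jpg  or  python tess_test.py ./pic/get_price_img.png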
156 | sys.exit(0) 157 | 158 | 159 | # 二值化并转格式 160 | binary_image_name = os.path.splitext(image_name)[0]+"_binary.png" 161 | binary(image_name, binary_image_name) 162 | 163 | im = Image.open(binary_image_name) 164 | print im.format, im.size, im.mode 165 | 166 | 167 | if digits: 168 | text = image_file_to_string(binary_image_name, bool_digits=digits) 169 | print text.replace("\n", "") 170 | else: 171 | # 投影法去干扰线 172 | fpathandname , fext = os.path.splitext(binary_image_name) 173 | midu_image_name = fpathandname+"_midu"+fext 174 | pointmidu(binary_image_name, midu_image_name) 175 | 176 | 177 | fpathandname , fext = os.path.splitext(midu_image_name) 178 | 179 | # 去干扰线 180 | # im = Image.open(midu_image_name) 181 | # w, h = im.size 182 | # data = list(im.getdata()) 183 | # aim1 = get_aim1_point(im) 184 | # for x, y in aim1: 185 | # curr = data[y*w+x] 186 | # prev = data[(y-1)*w+x] 187 | # next = data[(y+1)*w+x] 188 | # 189 | # if prev == 0 and next == 0: # 0-黑色,255-白色 190 | # continue 191 | # if prev == 0: 192 | # im.putpixel((x, y), 255) 193 | # im.putpixel((x, y-1), 255) 194 | # elif next == 0: 195 | # im.putpixel((x, y), 255) 196 | # im.putpixel((x, y+1), 255) 197 | # else: 198 | # im.putpixel((x, y), 255) 199 | # data = list(im.getdata()) 200 | # aim2 = get_aim2_point(im) 201 | # for x, y in aim2: 202 | # curr = data[y*w+x] 203 | # prev = data[(y-1)*w+x] 204 | # next = data[(y+1)*w+x] 205 | # 206 | # if prev == 0 and next == 0: # 0-黑色,255-白色 207 | # continue 208 | # if prev == 0: 209 | # im.putpixel((x, y), 255) 210 | # im.putpixel((x, y-1), 255) 211 | # elif next == 0: 212 | # im.putpixel((x, y), 255) 213 | # im.putpixel((x, y+1), 255) 214 | # else: 215 | # im.putpixel((x, y), 255) 216 | # midu_image_name_new = fpathandname+"_new"+fext 217 | # im.save(midu_image_name_new) 218 | 219 | 220 | # 图像增强 221 | midu_image_name_pro1 = fpathandname+"_pro1"+fext 222 | filter_enhance(midu_image_name, midu_image_name_pro1) 223 | # 字符分割 224 | # num = 4 225 | # midu_image_name_pro2 = fpathandname+"_pro2"+fext 226 | # seg(midu_image_name_pro1, midu_image_name_pro2, num) 227 | 228 | # im = Image.open(midu_image_name) 229 | # text = image_to_string(im) 230 | # print text.replace("\n", "") 231 | text = image_file_to_string(midu_image_name_pro1, bool_digits=digits) 232 | print text.replace("\n", "") -------------------------------------------------------------------------------- /Captcha1/tesseract.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Captcha1/tesseract.exe -------------------------------------------------------------------------------- /NewsSpider/NewsSpider.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/NewsSpider/NewsSpider.exe -------------------------------------------------------------------------------- /NewsSpider/NewsSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import sys 4 | import urllib2 5 | import requests 6 | import re 7 | from lxml import etree 8 | 9 | 10 | def StringListSave(save_path, filename, slist): 11 | if not os.path.exists(save_path): 12 | os.makedirs(save_path) 13 | path = save_path+"/"+filename+".txt" 14 | with open(path, "w+") as fp: 15 | for s in slist: 16 | fp.write("%s\t\t%s\n" % 
(s[0].encode("utf8"), s[1].encode("utf8"))) 17 | 18 | def Page_Info(myPage): 19 | '''Regex''' 20 | mypage_Info = re.findall(r'

<div class="titleBar" id=".*?"><h2>(.*?)</h2><div class="more"><a href="(.*?)">.*?</a></div></div>
', myPage, re.S) 21 | return mypage_Info 22 | 23 | def New_Page_Info(new_page): 24 | '''Regex(slowly) or Xpath(fast)''' 25 | # new_page_Info = re.findall(r'.*?(.*?)', new_page, re.S) 26 | # # new_page_Info = re.findall(r'.*?(.*?)', new_page, re.S) # bugs 27 | # results = [] 28 | # for url, item in new_page_Info: 29 | # results.append((item, url+".html")) 30 | # return results 31 | dom = etree.HTML(new_page) 32 | new_items = dom.xpath('//tr/td/a/text()') 33 | new_urls = dom.xpath('//tr/td/a/@href') 34 | assert(len(new_items) == len(new_urls)) 35 | return zip(new_items, new_urls) 36 | 37 | def Spider(url): 38 | i = 0 39 | print "downloading ", url 40 | myPage = requests.get(url).content.decode("gbk") 41 | # myPage = urllib2.urlopen(url).read().decode("gbk") 42 | myPageResults = Page_Info(myPage) 43 | save_path = u"网易新闻抓取" 44 | filename = str(i)+"_"+u"新闻排行榜" 45 | StringListSave(save_path, filename, myPageResults) 46 | i += 1 47 | for item, url in myPageResults: 48 | print "downloading ", url 49 | new_page = requests.get(url).content.decode("gbk") 50 | # new_page = urllib2.urlopen(url).read().decode("gbk") 51 | newPageResults = New_Page_Info(new_page) 52 | filename = str(i)+"_"+item 53 | StringListSave(save_path, filename, newPageResults) 54 | i += 1 55 | 56 | 57 | if __name__ == '__main__': 58 | print "start" 59 | start_url = "http://news.163.com/rank/" 60 | Spider(start_url) 61 | print "end" -------------------------------------------------------------------------------- /NewsSpider/ReadMe.md: -------------------------------------------------------------------------------- 1 | ### 网络爬虫之最基本的爬虫:爬取[网易新闻排行榜](http://news.163.com/rank/) 2 | 3 | **一些说明:** 4 | 5 | * 使用urllib2或requests包来爬取页面。 6 | 7 | * 使用正则表达式分析一级页面,使用Xpath来分析二级页面。 8 | 9 | * 将得到的标题和链接,保存为本地文件。 10 | -------------------------------------------------------------------------------- /QunarSpider/QunarSpider.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import os 5 | import time 6 | import datetime 7 | import codecs 8 | import multiprocessing as mp 9 | from os import makedirs 10 | from os.path import exists 11 | from selenium import webdriver 12 | from selenium.webdriver.common.proxy import * 13 | 14 | 15 | site = 'http://flight.qunar.com' 16 | hot_city_list = [u'上海', u'北京', u'广州', u'深圳'] 17 | num = len(hot_city_list) 18 | 19 | 20 | def one_driver_ticket(driver, from_city, to_city): 21 | # time = datetime.datetime.now() 22 | date = datetime.date.today() 23 | tomorrow = date+datetime.timedelta(days=1) 24 | # date格式转为string格式 25 | tomorrow_string = tomorrow.strftime('%Y-%m-%d') 26 | 27 | driver.find_element_by_name('fromCity').clear() 28 | driver.find_element_by_name('fromCity').send_keys(from_city) 29 | driver.find_element_by_name('toCity').clear() 30 | driver.find_element_by_name('toCity').send_keys(to_city) 31 | driver.find_element_by_name('fromDate').clear() 32 | driver.find_element_by_name('fromDate').send_keys(tomorrow_string) 33 | driver.find_element_by_xpath('//button[@type="submit"]').click() 34 | time.sleep(5) # 控制间隔时间,等待浏览器反映 35 | 36 | flag = True 37 | page_num = 0 38 | while flag: 39 | # 保存页面 40 | # print driver.page_source 41 | source_code = driver.find_element_by_xpath("//*").get_attribute("outerHTML") 42 | print type(source_code) 43 | dstdir = u'./ticket/' 44 | if not exists(dstdir): 45 | makedirs(dstdir) 46 | f = codecs.open(dstdir+from_city+u','+to_city+unicode(tomorrow_string)+u','+unicode(str(page_num+1))+u'.html', 'w+', 
'utf8') 47 | f.write(source_code) 48 | f.close() 49 | 50 | next_page = None 51 | try: 52 | next_page = driver.find_element_by_id('nextXI3') 53 | except Exception as e: 54 | print e 55 | pass 56 | print "page: %d" % (page_num+1) 57 | if next_page: 58 | try: 59 | next_page.click() 60 | time.sleep(2) # 控制间隔时间,等待浏览器反映 61 | page_num += 1 62 | except Exception as e: 63 | print 'next_page could not be clicked' 64 | print e 65 | flag = False 66 | else: 67 | flag = False 68 | 69 | def get_proxy_list(file_path): 70 | proxy_list = [] 71 | try: 72 | f = open(file_path, 'r') 73 | all_lines = f.readlines() # readlines()每次按行读取整个文件内容,将读取到的内容放到一个列表中,返回list类型。 74 | for line in all_lines: 75 | proxy_list.append(line.replace('\r', '').replace('\n', '')) 76 | f.close() 77 | except Exception as e: 78 | print e 79 | return proxy_list 80 | 81 | def ticket_worker_proxy(city_proxy): 82 | city = city_proxy.split(',')[0] 83 | proxy = city_proxy.split(',')[1] 84 | proxy = Proxy({ 85 | 'proxyType': ProxyType.MANUAL, 86 | 'httpProxy': proxy, 87 | 'ftpProxy': proxy, 88 | 'sslProxy': proxy, 89 | 'noProxy': '' # 过滤不需要代理的地址 90 | }) 91 | driver = webdriver.Firefox(proxy=proxy) 92 | driver.get(site) 93 | driver.maximize_window() # 将浏览器最大化显示 94 | for i in xrange(num): 95 | if city == hot_city_list[i]: 96 | continue 97 | from_city = city 98 | to_city = hot_city_list[i] 99 | one_driver_ticket(driver, from_city, to_city) 100 | driver.close() 101 | 102 | def all_ticket_proxy(): 103 | hot_city_proxy_list = [] 104 | proxy_list = get_proxy_list('./proxy/proxy.txt') # ./表示当前目录,../表示上一级目录 105 | for i in xrange(num): 106 | hot_city_proxy_list.append(hot_city_list[i]+','+proxy_list[i]) 107 | pool = mp.Pool(processes=1) 108 | pool.map(ticket_worker_proxy, hot_city_proxy_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)] 109 | pool.close() 110 | pool.join() 111 | 112 | def ticket_worker_no_proxy(city): 113 | driver = webdriver.Firefox() 114 | # chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe' 115 | # os.environ['webdriver.chrome.driver'] = chromedriver 116 | # driver = webdriver.Chrome(chromedriver) 117 | driver.get(site) 118 | driver.maximize_window() # 将浏览器最大化显示 119 | time.sleep(5) # 控制间隔时间,等待浏览器反映 120 | for i in xrange(num): 121 | if city == hot_city_list[i]: 122 | continue 123 | from_city = city 124 | to_city = hot_city_list[i] 125 | one_driver_ticket(driver, from_city, to_city) 126 | driver.close() 127 | 128 | def all_ticket_no_proxy(): 129 | pool = mp.Pool(processes=1) 130 | pool.map(ticket_worker_no_proxy, hot_city_list) # map(f, [x1, x2, x3, x4]) = [f(x1), f(x2), f(x3), f(x4)] 131 | pool.close() 132 | pool.join() 133 | 134 | 135 | if __name__ == '__main__': 136 | print "start" 137 | start = datetime.datetime.now() 138 | # all_ticket_proxy() # proxy 139 | all_ticket_no_proxy() # no proxy 140 | end = datetime.datetime.now() 141 | print "end" 142 | print "time: ", end-start 143 | -------------------------------------------------------------------------------- /QunarSpider/ReadMe.md: -------------------------------------------------------------------------------- 1 | ### 网络爬虫之Selenium使用代理登陆:爬取[去哪儿](http://flight.qunar.com/)网站 2 | 3 | **一些说明:** 4 | 5 | * 使用selenium模拟浏览器登陆,获取翻页操作。 6 | 7 | * 代理可以存入一个文件,程序读取并使用。 8 | 9 | * 支持多进程抓取。 -------------------------------------------------------------------------------- /ReadMe.md: -------------------------------------------------------------------------------- 1 | # [Python入门网络爬虫之精华版](https://github.com/lining0806/PythonSpiderNotes) 2 | 3 | *** 4 | 5 
| Python学习网络爬虫主要分3个大的版块:**抓取**,**分析**,**存储** 6 | 7 | 另外,比较常用的爬虫框架[Scrapy](http://scrapy.org/),这里最后也详细介绍一下。 8 | 9 | 首先列举一下本人总结的相关文章,这些覆盖了入门网络爬虫需要的基本概念和技巧:[宁哥的小站-网络爬虫](http://www.lining0806.com/category/spider/) 10 | *** 11 | 12 | 当我们在浏览器中输入一个url后回车,后台会发生什么?比如说你输入[http://www.lining0806.com/](http://www.lining0806.com/),你就会看到宁哥的小站首页。 13 | 14 | 简单来说这段过程发生了以下四个步骤: 15 | 16 | * 查找域名对应的IP地址。 17 | * 向IP对应的服务器发送请求。 18 | * 服务器响应请求,发回网页内容。 19 | * 浏览器解析网页内容。 20 | 21 | 网络爬虫要做的,简单来说,就是实现浏览器的功能。通过指定url,直接返回给用户所需要的数据,而不需要一步步人工去操纵浏览器获取。 22 | 23 | ## 抓取 24 | 这一步,你要明确要得到的内容是什么?是HTML源码,还是Json格式的字符串等。 25 | 26 | #### 1. 最基本的抓取 27 | 28 | 抓取大多数情况属于get请求,即直接从对方服务器上获取数据。 29 | 30 | 首先,Python中自带urllib及urllib2这两个模块,基本上能满足一般的页面抓取。另外,[requests](https://github.com/kennethreitz/requests)也是非常有用的包,与此类似的,还有[httplib2](https://github.com/jcgregorio/httplib2)等等。 31 | 32 | ``` 33 | Requests: 34 | import requests 35 | response = requests.get(url) 36 | content = requests.get(url).content 37 | print "response headers:", response.headers 38 | print "content:", content 39 | Urllib2: 40 | import urllib2 41 | response = urllib2.urlopen(url) 42 | content = urllib2.urlopen(url).read() 43 | print "response headers:", response.headers 44 | print "content:", content 45 | Httplib2: 46 | import httplib2 47 | http = httplib2.Http() 48 | response_headers, content = http.request(url, 'GET') 49 | print "response headers:", response_headers 50 | print "content:", content 51 | ``` 52 | 53 | 此外,对于带有查询字段的url,get请求一般会将来请求的数据附在url之后,以?分割url和传输数据,多个参数用&连接。 54 | 55 | ``` 56 | data = {'data1':'XXXXX', 'data2':'XXXXX'} 57 | Requests:data为dict,json 58 | import requests 59 | response = requests.get(url=url, params=data) 60 | Urllib2:data为string 61 | import urllib, urllib2 62 | data = urllib.urlencode(data) 63 | full_url = url+'?'+data 64 | response = urllib2.urlopen(full_url) 65 | ``` 66 | 67 | 相关参考:[网易新闻排行榜抓取回顾](http://www.lining0806.com/%E7%BD%91%E6%98%93%E6%96%B0%E9%97%BB%E6%8E%92%E8%A1%8C%E6%A6%9C%E6%8A%93%E5%8F%96%E5%9B%9E%E9%A1%BE/) 68 | 69 | 参考项目:[网络爬虫之最基本的爬虫:爬取网易新闻排行榜](https://github.com/lining0806/PythonSpiderNotes/blob/master/NewsSpider) 70 | 71 | ### 2. 对于登陆情况的处理 72 | 73 | **2.1 使用表单登陆** 74 | 75 | 这种情况属于post请求,即先向服务器发送表单数据,服务器再将返回的cookie存入本地。 76 | 77 | ``` 78 | data = {'data1':'XXXXX', 'data2':'XXXXX'} 79 | Requests:data为dict,json 80 | import requests 81 | response = requests.post(url=url, data=data) 82 | Urllib2:data为string 83 | import urllib, urllib2 84 | data = urllib.urlencode(data) 85 | req = urllib2.Request(url=url, data=data) 86 | response = urllib2.urlopen(req) 87 | ``` 88 | 89 | **2.2 使用cookie登陆** 90 | 91 | 使用cookie登陆,服务器会认为你是一个已登陆的用户,所以就会返回给你一个已登陆的内容。因此,需要验证码的情况可以使用带验证码登陆的cookie解决。 92 | 93 | ``` 94 | import requests 95 | requests_session = requests.session() 96 | response = requests_session.post(url=url_login, data=data) 97 | ``` 98 | 99 | 若存在验证码,此时采用response = requests_session.post(url=url_login, data=data)是不行的,做法应该如下: 100 | 101 | ``` 102 | response_captcha = requests_session.get(url=url_login, cookies=cookies) 103 | response1 = requests.get(url_login) # 未登陆 104 | response2 = requests_session.get(url_login) # 已登陆,因为之前拿到了Response Cookie! 105 | response3 = requests_session.get(url_results) # 已登陆,因为之前拿到了Response Cookie! 106 | ``` 107 | 108 | 相关参考:[网络爬虫-验证码登陆](http://www.lining0806.com/6-%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB-%E9%AA%8C%E8%AF%81%E7%A0%81%E7%99%BB%E9%99%86/) 109 | 110 | 参考项目:[网络爬虫之用户名密码及验证码登陆:爬取知乎网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/ZhihuSpider) 111 | 112 | ### 3. 
对于反爬虫机制的处理 113 | 114 | **3.1 使用代理** 115 | 116 | 适用情况:限制IP地址情况,也可解决由于“频繁点击”而需要输入验证码登陆的情况。 117 | 118 | 这种情况最好的办法就是维护一个代理IP池,网上有很多免费的代理IP,良莠不齐,可以通过筛选找到能用的。对于“频繁点击”的情况,我们还可以通过限制爬虫访问网站的频率来避免被网站禁掉。 119 | 120 | ``` 121 | proxies = {'http':'http://XX.XX.XX.XX:XXXX'} 122 | Requests: 123 | import requests 124 | response = requests.get(url=url, proxies=proxies) 125 | Urllib2: 126 | import urllib2 127 | proxy_support = urllib2.ProxyHandler(proxies) 128 | opener = urllib2.build_opener(proxy_support, urllib2.HTTPHandler) 129 | urllib2.install_opener(opener) # 安装opener,此后调用urlopen()时都会使用安装过的opener对象 130 | response = urllib2.urlopen(url) 131 | ``` 132 | 133 | **3.2 时间设置** 134 | 135 | 适用情况:限制频率情况。 136 | 137 | Requests,Urllib2都可以使用time库的sleep()函数: 138 | 139 | ``` 140 | import time 141 | time.sleep(1) 142 | ``` 143 | 144 | **3.3 伪装成浏览器,或者反“反盗链”** 145 | 146 | 有些网站会检查你是不是真的浏览器访问,还是机器自动访问的。这种情况,加上User-Agent,表明你是浏览器访问即可。有时还会检查是否带Referer信息还会检查你的Referer是否合法,一般再加上Referer。 147 | 148 | ``` 149 | headers = {'User-Agent':'XXXXX'} # 伪装成浏览器访问,适用于拒绝爬虫的网站 150 | headers = {'Referer':'XXXXX'} 151 | headers = {'User-Agent':'XXXXX', 'Referer':'XXXXX'} 152 | Requests: 153 | response = requests.get(url=url, headers=headers) 154 | Urllib2: 155 | import urllib, urllib2 156 | req = urllib2.Request(url=url, headers=headers) 157 | response = urllib2.urlopen(req) 158 | ``` 159 | 160 | ### 4. 对于断线重连 161 | 162 | 不多说。 163 | 164 | ``` 165 | def multi_session(session, *arg): 166 | retryTimes = 20 167 | while retryTimes>0: 168 | try: 169 | return session.post(*arg) 170 | except: 171 | print '.', 172 | retryTimes -= 1 173 | ``` 174 | 175 | 或者 176 | 177 | ``` 178 | def multi_open(opener, *arg): 179 | retryTimes = 20 180 | while retryTimes>0: 181 | try: 182 | return opener.open(*arg) 183 | except: 184 | print '.', 185 | retryTimes -= 1 186 | ``` 187 | 188 | 这样我们就可以使用multi_session或multi_open对爬虫抓取的session或opener进行保持。 189 | 190 | ### 5. 多进程抓取 191 | 192 | 这里针对[华尔街见闻](http://live.wallstreetcn.com/ )进行并行抓取的实验对比:[Python多进程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Python) 与 [Java单线程和多线程抓取](https://github.com/lining0806/PythonSpiderNotes/blob/master/Spider_Java) 193 | 194 | 相关参考:[关于Python和Java的多进程多线程计算方法对比](http://www.lining0806.com/%E5%85%B3%E4%BA%8Epython%E5%92%8Cjava%E7%9A%84%E5%A4%9A%E8%BF%9B%E7%A8%8B%E5%A4%9A%E7%BA%BF%E7%A8%8B%E8%AE%A1%E7%AE%97%E6%96%B9%E6%B3%95%E5%AF%B9%E6%AF%94/) 195 | 196 | ### 6. 对于Ajax请求的处理 197 | 198 | 对于“加载更多”情况,使用Ajax来传输很多数据。 199 | 200 | 它的工作原理是:从网页的url加载网页的源代码之后,会在浏览器里执行JavaScript程序。这些程序会加载更多的内容,“填充”到网页里。这就是为什么如果你直接去爬网页本身的url,你会找不到页面的实际内容。 201 | 202 | 这里,若使用Google Chrome分析”请求“对应的链接(方法:右键→审查元素→Network→清空,点击”加载更多“,出现对应的GET链接寻找Type为text/html的,点击,查看get参数或者复制Request URL),循环过程。 203 | 204 | * 如果“请求”之前有页面,依据上一步的网址进行分析推导第1页。以此类推,抓取抓Ajax地址的数据。 205 | * 对返回的json格式数据(str)进行正则匹配。json格式数据中,需从'\\uxxxx'形式的unicode_escape编码转换成u'\uxxxx'的unicode编码。 206 | 207 | ### 7. 自动化测试工具Selenium 208 | 209 | Selenium是一款自动化测试工具。它能实现操纵浏览器,包括字符填充、鼠标点击、获取元素、页面切换等一系列操作。总之,凡是浏览器能做的事,Selenium都能够做到。 210 | 211 | 这里列出在给定城市列表后,使用selenium来动态抓取[去哪儿网](http://flight.qunar.com/)的票价信息的代码。 212 | 213 | 参考项目:[网络爬虫之Selenium使用代理登陆:爬取去哪儿网站](https://github.com/lining0806/PythonSpiderNotes/blob/master/QunarSpider) 214 | 215 | ### 8. 
验证码识别 216 | 217 | 对于网站有验证码的情况,我们有三种办法: 218 | 219 | * 使用代理,更新IP。 220 | * 使用cookie登陆。 221 | * 验证码识别。 222 | 223 | 使用代理和使用cookie登陆之前已经讲过,下面讲一下验证码识别。 224 | 225 | 可以利用开源的Tesseract-OCR系统进行验证码图片的下载及识别,将识别的字符传到爬虫系统进行模拟登陆。当然也可以将验证码图片上传到打码平台上进行识别。如果不成功,可以再次更新验证码识别,直到成功为止。 226 | 227 | 参考项目:[验证码识别项目第一版:Captcha1](https://github.com/lining0806/PythonSpiderNotes/blob/master/Captcha1) 228 | 229 | **爬取有两个需要注意的问题:** 230 | 231 | * 如何监控一系列网站的更新情况,也就是说,如何进行增量式爬取? 232 | * 对于海量数据,如何实现分布式爬取? 233 | 234 | ## 分析 235 | 236 | 抓取之后就是对抓取的内容进行分析,你需要什么内容,就从中提炼出相关的内容来。 237 | 238 | 常见的分析工具有[正则表达式](http://deerchao.net/tutorials/regex/regex.htm),[BeautifulSoup](http://www.crummy.com/software/BeautifulSoup/),[lxml](http://lxml.de/)等等。 239 | 240 | ## 存储 241 | 242 | 分析出我们需要的内容之后,接下来就是存储了。 243 | 244 | 我们可以选择存入文本文件,也可以选择存入[MySQL](http://www.mysql.com/)或[MongoDB](https://www.mongodb.org/)数据库等。 245 | 246 | **存储有两个需要注意的问题:** 247 | 248 | * 如何进行网页去重? 249 | * 内容以什么形式存储? 250 | 251 | 252 | ## Scrapy 253 | 254 | Scrapy是一个基于Twisted的开源的Python爬虫框架,在工业中应用非常广泛。 255 | 256 | 相关内容可以参考[基于Scrapy网络爬虫的搭建](http://www.lining0806.com/%E5%9F%BA%E4%BA%8Escrapy%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E7%9A%84%E6%90%AD%E5%BB%BA/),同时给出这篇文章介绍的[微信搜索](http://weixin.sogou.com/weixin)爬取的项目代码,给大家作为学习参考。 257 | 258 | 参考项目:[使用Scrapy或Requests递归抓取微信搜索结果](https://github.com/lining0806/PythonSpiderNotes/blob/master/WechatSearchProjects) 259 | 260 | ## Robots协议 261 | 262 | 好的网络爬虫,首先需要遵守**Robots协议**。Robots协议(也称为爬虫协议、机器人协议等)的全称是“网络爬虫排除标准”(Robots Exclusion Protocol),网站通过Robots协议告诉搜索引擎哪些页面可以抓取,哪些页面不能抓取。 263 | 264 | 在网站根目录下放一个robots.txt文本文件(如 https://www.taobao.com/robots.txt ),里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面,指定的页面由正则表达式表示。网络爬虫在采集这个网站之前,首先获取到这个robots.txt文本文件,然后解析到其中的规则,然后根据规则来采集网站的数据。 265 | 266 | ### 1. Robots协议规则 267 | 268 | User-agent: 指定对哪些爬虫生效 269 | Disallow: 指定不允许访问的网址 270 | Allow: 指定允许访问的网址 271 | 272 | 注意: 一个英文要大写,冒号是英文状态下,冒号后面有一个空格,"/"代表整个网站 273 | 274 | ### 2. 
Robots协议举例 275 | 276 | 禁止所有机器人访问 277 | User-agent: * 278 | Disallow: / 279 | 允许所有机器人访问 280 | User-agent: * 281 | Disallow: 282 | 禁止特定机器人访问 283 | User-agent: BadBot 284 | Disallow: / 285 | 允许特定机器人访问 286 | User-agent: GoodBot 287 | Disallow: 288 | 禁止访问特定目录 289 | User-agent: * 290 | Disallow: /images/ 291 | 仅允许访问特定目录 292 | User-agent: * 293 | Allow: /images/ 294 | Disallow: / 295 | 禁止访问特定文件 296 | User-agent: * 297 | Disallow: /*.html$ 298 | 仅允许访问特定文件 299 | User-agent: * 300 | Allow: /*.html$ 301 | Disallow: / -------------------------------------------------------------------------------- /Spider_Java/README.md: -------------------------------------------------------------------------------- 1 | ### Spider_Java 2 | 3 | 抓取网址:[华尔街见闻](http://live.wallstreetcn.com/) 4 | 5 | 单线程抓取 Spider_Java1 6 | 7 | 多线程抓取 Spider_Java2 8 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Spider 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/bin/synchronizetest/Booth.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/synchronizetest/Booth.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/bin/synchronizetest/Reservoir.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/synchronizetest/Reservoir.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/bin/synchronizetest/Test.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/synchronizetest/Test.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/bin/wallstreetcnsave/WallstreetcnSaveTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/bin/wallstreetcnsave/WallstreetcnSaveTest.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/lib/mongo-java-driver-2.13.0-rc1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java1/lib/mongo-java-driver-2.13.0-rc1.jar -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/src/synchronizetest/Test.java: 
-------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package synchronizetest; 5 | 6 | /** 7 | * @author FIRELING 8 | * 9 | */ 10 | public class Test 11 | { 12 | public static void main(String[] args) 13 | { 14 | Reservoir r = new Reservoir(100); 15 | Booth b1 = new Booth(r); 16 | Booth b2 = new Booth(r); 17 | Booth b3 = new Booth(r); 18 | } 19 | } 20 | /** 21 | * contain shared resource 22 | */ 23 | class Reservoir { 24 | private int total; 25 | public Reservoir(int t) 26 | { 27 | this.total = t; 28 | } 29 | /** 30 | * Thread safe method 31 | * serialized access to Booth.total 32 | */ 33 | public synchronized boolean sellTicket() // 利用synchronized修饰符同步了整个方法 34 | { 35 | if(this.total > 0) { 36 | this.total = this.total-1; 37 | return true; // successfully sell one 38 | } 39 | else { 40 | return false; // no more tickets 41 | } 42 | } 43 | } 44 | /** 45 | * create new thread by inheriting Thread 46 | */ 47 | class Booth extends Thread { 48 | private static int threadID = 0; // owned by Class object 49 | 50 | private Reservoir release; // sell this reservoir 51 | private int count = 0; // owned by this thread object 52 | /** 53 | * constructor 54 | */ 55 | public Booth(Reservoir r) { 56 | super("ID:"+(++threadID)); 57 | this.release = r; // all threads share the same reservoir 58 | this.start(); 59 | } 60 | /** 61 | * convert object to string 62 | */ 63 | public String toString() { 64 | return super.getName(); 65 | } 66 | /** 67 | * what does the thread do? 68 | */ 69 | public void run() { 70 | while(true) { // 循环体!!! 71 | if(this.release.sellTicket()) { 72 | this.count = this.count+1; 73 | System.out.println(this.getName()+":sell 1"); 74 | try { 75 | sleep((int) Math.random()*100); // random intervals 76 | // sleep(100); // 若sleep时间相同,则每个窗口买票相当 77 | } 78 | catch (InterruptedException e) { 79 | throw new RuntimeException(e); 80 | } 81 | } 82 | else { 83 | break; 84 | } 85 | } 86 | System.out.println(this.getName()+" I sold:"+count); 87 | } 88 | } 89 | 90 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java1/src/wallstreetcnsave/WallstreetcnSaveTest.java: -------------------------------------------------------------------------------- 1 | package wallstreetcnsave; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.net.HttpURLConnection; 8 | import java.net.URL; 9 | import java.text.DateFormat; 10 | import java.util.ArrayList; 11 | import java.util.Date; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | import com.mongodb.BasicDBObject; 19 | import com.mongodb.DB; 20 | import com.mongodb.DBCollection; 21 | import com.mongodb.Mongo; 22 | 23 | public class WallstreetcnSaveTest implements Runnable { 24 | 25 | private static String DataBaseName = "textclassify"; 26 | private static String CollectionName = "WallstreetSaveJava"; 27 | 28 | private static String url = "http://api.wallstreetcn.com/v2/livenews?&page="; 29 | 30 | private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"
<p>
(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?"; 31 | private static final String REGEXSTRING1 = "type"; 32 | private static final String REGEXSTRING2 = "content"; 33 | private static final String REGEXSTRING3 = "categoryset"; 34 | 35 | //map表的存放 36 | public static Map GetMap() { 37 | Map map = new HashMap(); 38 | map.put("1", "外汇"); 39 | map.put("2", "股市"); 40 | map.put("3", "商品"); 41 | map.put("4", "债市"); 42 | map.put("9", "中国"); 43 | map.put("10", "美国"); 44 | map.put("11", "欧元区"); 45 | map.put("12", "日本"); 46 | map.put("13", "英国"); 47 | map.put("14", "澳洲"); 48 | map.put("15", "加拿大"); 49 | map.put("16", "瑞士"); 50 | map.put("17", "其他地区"); 51 | map.put("5", "央行"); 52 | return map; 53 | } 54 | private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" }; 55 | private static String[] ruleList_property = { "1", "2", "3", "4" }; 56 | private static String[] ruleList_centralbank = { "5" }; 57 | 58 | private static final int start = 1; 59 | private static final int end = 3000; 60 | 61 | //对x,x,x格式的内容进行分隔筛选 62 | public static String setCategory(String categorySet, String[] ruleList, Map map) { 63 | StringBuffer disStr = new StringBuffer(); 64 | String[] strArray = null; 65 | strArray = categorySet.split(","); // 拆分字符为",",然后把结果交给数组strArray 66 | // 获取需要的信息 67 | int length_strArray = strArray.length; 68 | int length_ruleList = ruleList.length; 69 | 70 | if (length_strArray > 0) { 71 | for (int iArr = 0; iArr < length_strArray; iArr++) { 72 | String s = strArray[iArr]; 73 | for (int iRul=0; iRul < length_ruleList; iRul++) { 74 | if (s.equals(ruleList[iRul])) { 75 | disStr.append(map.get(s)); 76 | disStr.append(","); 77 | break; 78 | } 79 | } 80 | } 81 | } 82 | if(disStr.length()>1) { 83 | disStr = disStr.deleteCharAt(disStr.length()-1); 84 | } 85 | return disStr.toString(); 86 | } 87 | 88 | //读取整个页面,返回html字符串 89 | private static String httpRequest(String requestUrl) { 90 | StringBuffer buffer = null; 91 | BufferedReader bufferedReader = null; 92 | InputStreamReader inputStreamReader = null; 93 | InputStream inputStream = null; 94 | HttpURLConnection httpUrlConn = null; 95 | try { 96 | // 建立get请求 97 | URL url = new URL(requestUrl); 98 | httpUrlConn = (HttpURLConnection) url.openConnection(); 99 | httpUrlConn.setDoInput(true); 100 | httpUrlConn.setRequestMethod("GET"); 101 | // 获取输入流 102 | inputStream = httpUrlConn.getInputStream(); 103 | inputStreamReader = new InputStreamReader(inputStream, "UTF-8"); 104 | bufferedReader = new BufferedReader(inputStreamReader); 105 | // 从输入流获取结果 106 | buffer = new StringBuffer(); 107 | String str = null; 108 | while ((str = bufferedReader.readLine()) != null) { 109 | str = new String(str.getBytes(), "UTF-8"); 110 | buffer.append(str); 111 | } 112 | } catch (Exception e) { 113 | e.printStackTrace(); 114 | } finally { 115 | if (bufferedReader != null) { 116 | try { 117 | bufferedReader.close(); 118 | } catch (IOException e) { 119 | e.printStackTrace(); 120 | } 121 | } 122 | if (inputStreamReader != null) { 123 | try { 124 | inputStreamReader.close(); 125 | } catch (IOException e) { 126 | e.printStackTrace(); 127 | } 128 | } 129 | if (inputStream != null) { 130 | try { 131 | inputStream.close(); 132 | } catch (IOException e) { 133 | e.printStackTrace(); 134 | } 135 | } 136 | if (httpUrlConn != null) { 137 | httpUrlConn.disconnect(); 138 | } 139 | } 140 | return buffer.toString(); 141 | } 142 | 143 | // 过滤掉无用的信息 144 | public static List> htmlFiter(String html, String Regex) { 145 | List> list = new ArrayList>(); 146 | // 查找目标 147 | 
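// Capture groups of the Regex defined above: group(1) = "type", group(2) = the text between <p> and <\/p> in "contentHtml", group(3) = "categorySet".
// Each successful match is stored as one map entry in the result list below.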
Pattern p = Pattern.compile(Regex); 148 | Matcher m = p.matcher(html); 149 | while (m.find()) { 150 | Map map_save = new HashMap(); 151 | // 可修改部分 152 | map_save.put(REGEXSTRING1, m.group(1)); 153 | map_save.put(REGEXSTRING2, m.group(2)); 154 | map_save.put(REGEXSTRING3, m.group(3)); 155 | 156 | list.add(map_save); 157 | } 158 | return list; 159 | } 160 | 161 | //unicode格式转中文 162 | public static String UnicodeToString(String str) { 163 | Pattern pattern = Pattern.compile("(\\\\u(\\p{XDigit}{4}))"); // XDigit表示16进制数字,正则里的\p表示Unicode块 164 | Matcher matcher = pattern.matcher(str); 165 | char ch; 166 | while (matcher.find()) { 167 | ch = (char) Integer.parseInt(matcher.group(2), 16); // 16进制转10进制作为ascii码,再char转为字符 168 | str = str.replace(matcher.group(1), ch + ""); 169 | } 170 | return str; 171 | } 172 | 173 | public void run() { 174 | // 链接数据库 175 | try { 176 | Mongo mongo = new Mongo("localhost", 27017); 177 | DB db = mongo.getDB(DataBaseName); 178 | DBCollection collection = db.getCollection(CollectionName); 179 | 180 | // 调用抓取的方法获取内容 181 | for (int i = start; i <= end; i++) { 182 | String requestUrl = url + i; 183 | System.out.println(requestUrl); 184 | 185 | String html = httpRequest(requestUrl); 186 | List> resultList = htmlFiter(html, Regex); 187 | 188 | if (resultList.isEmpty()) { 189 | System.out.printf("The end url: %s", requestUrl); 190 | break; 191 | } else { 192 | for (Map result : resultList) { 193 | BasicDBObject dbObject = new BasicDBObject(); 194 | 195 | String type = result.get(REGEXSTRING1); 196 | String content = UnicodeToString(result.get(REGEXSTRING2)); 197 | // String content = result.get(REGEXSTRING2); 198 | 199 | Map map = GetMap(); 200 | String district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); 201 | String property = setCategory(result.get(REGEXSTRING3), ruleList_property, map); 202 | String centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map); 203 | 204 | Date date = new Date(); 205 | DateFormat time = DateFormat.getDateTimeInstance(); 206 | String time_str = time.format(date); 207 | 208 | String source = "wangstreetcn"; 209 | 210 | dbObject.put("content", content); // 具体内容 211 | dbObject.put("createdtime", time_str); // 创建时间 212 | dbObject.put("source", source); // 信息来源 213 | dbObject.put("district", district); // 所属地区 214 | dbObject.put("property", property); // 资产类别 215 | dbObject.put("centralbank", centralbank); // 资产类别 216 | dbObject.put("type", type); //信息类型 217 | 218 | collection.insert(dbObject); 219 | } 220 | } 221 | } 222 | } catch (Exception e) { 223 | e.printStackTrace(); 224 | } 225 | } 226 | 227 | 228 | public static void main(String[] args) throws InterruptedException { 229 | WallstreetcnSaveTest wallstreetcnsave = new WallstreetcnSaveTest(); 230 | wallstreetcnsave.run(); 231 | } 232 | 233 | } 234 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | Spider 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- 
/Spider_Java/Spider_Java2/bin/synchronizetest/Booth.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/bin/synchronizetest/Booth.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/bin/synchronizetest/Reservoir.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/bin/synchronizetest/Reservoir.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/bin/synchronizetest/Test.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/bin/synchronizetest/Test.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/bin/wallstreetcnsave/GetrequestUrl.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/bin/wallstreetcnsave/GetrequestUrl.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/bin/wallstreetcnsave/WallstreetcnSave.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/bin/wallstreetcnsave/WallstreetcnSave.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/bin/wallstreetcnsave/WallstreetcnSaveTest.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/bin/wallstreetcnsave/WallstreetcnSaveTest.class -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/lib/mongo-java-driver-2.13.0-rc1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/Spider_Java/Spider_Java2/lib/mongo-java-driver-2.13.0-rc1.jar -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/src/synchronizetest/Test.java: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | */ 4 | package synchronizetest; 5 | 6 | /** 7 | * @author FIRELING 8 | * 9 | */ 10 | public class Test 11 | { 12 | public static void main(String[] args) 13 | { 14 | Reservoir r = new Reservoir(100); 15 | Booth b1 = new Booth(r); 16 | Booth b2 = new Booth(r); 17 | Booth b3 = new Booth(r); 18 | } 19 | } 20 | /** 21 | * contain shared resource 22 | */ 23 | class Reservoir { 24 | private int total; 25 | public Reservoir(int t) 26 | { 27 | this.total = t; 28 | } 29 | /** 30 | * Thread safe method 31 | * serialized access to Booth.total 32 | */ 33 | public synchronized boolean sellTicket() // 利用synchronized修饰符同步了整个方法 34 | { 35 | 
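// the synchronized modifier serializes this check-and-decrement, so concurrent Booth threads cannot sell the same ticket twice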
if(this.total > 0) { 36 | this.total = this.total-1; 37 | return true; // successfully sell one 38 | } 39 | else { 40 | return false; // no more tickets 41 | } 42 | } 43 | } 44 | /** 45 | * create new thread by inheriting Thread 46 | */ 47 | class Booth extends Thread { 48 | private static int threadID = 0; // owned by Class object 49 | 50 | private Reservoir release; // sell this reservoir 51 | private int count = 0; // owned by this thread object 52 | /** 53 | * constructor 54 | */ 55 | public Booth(Reservoir r) { 56 | super("ID:"+(++threadID)); 57 | this.release = r; // all threads share the same reservoir 58 | this.start(); 59 | } 60 | /** 61 | * convert object to string 62 | */ 63 | public String toString() { 64 | return super.getName(); 65 | } 66 | /** 67 | * what does the thread do? 68 | */ 69 | public void run() { 70 | while(true) { // 循环体!!! 71 | if(this.release.sellTicket()) { 72 | this.count = this.count+1; 73 | System.out.println(this.getName()+":sell 1"); 74 | try { 75 | sleep((int) Math.random()*100); // random intervals 76 | // sleep(100); // 若sleep时间相同,则每个窗口买票相当 77 | } 78 | catch (InterruptedException e) { 79 | throw new RuntimeException(e); 80 | } 81 | } 82 | else { 83 | break; 84 | } 85 | } 86 | System.out.println(this.getName()+" I sold:"+count); 87 | } 88 | } 89 | 90 | -------------------------------------------------------------------------------- /Spider_Java/Spider_Java2/src/wallstreetcnsave/WallstreetcnSaveTest.java: -------------------------------------------------------------------------------- 1 | package wallstreetcnsave; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | import java.net.HttpURLConnection; 8 | import java.net.URL; 9 | import java.text.DateFormat; 10 | import java.util.ArrayList; 11 | import java.util.Date; 12 | import java.util.HashMap; 13 | import java.util.List; 14 | import java.util.Map; 15 | import java.util.regex.Matcher; 16 | import java.util.regex.Pattern; 17 | 18 | import com.mongodb.BasicDBObject; 19 | import com.mongodb.DB; 20 | import com.mongodb.DBCollection; 21 | import com.mongodb.Mongo; 22 | 23 | 24 | class WallstreetcnSave implements Runnable { 25 | 26 | private GetrequestUrl release; 27 | public WallstreetcnSave(GetrequestUrl url) { 28 | this.release = url; // all threads share the same GetrequestUrl 29 | } 30 | 31 | private static String DataBaseName = "textclassify"; 32 | private static String CollectionName = "WallstreetSaveJava"; 33 | 34 | private static String Regex = ".*?\"type\":\"(.*?)\".*?\"contentHtml\":\"
<p>
(.*?)<\\\\/p>\".*?\"categorySet\":\"(.*?)\".*?"; 35 | private static final String REGEXSTRING1 = "type"; 36 | private static final String REGEXSTRING2 = "content"; 37 | private static final String REGEXSTRING3 = "categoryset"; 38 | 39 | //map表的存放 40 | public static Map GetMap() { 41 | Map map = new HashMap(); 42 | map.put("1", "外汇"); 43 | map.put("2", "股市"); 44 | map.put("3", "商品"); 45 | map.put("4", "债市"); 46 | map.put("9", "中国"); 47 | map.put("10", "美国"); 48 | map.put("11", "欧元区"); 49 | map.put("12", "日本"); 50 | map.put("13", "英国"); 51 | map.put("14", "澳洲"); 52 | map.put("15", "加拿大"); 53 | map.put("16", "瑞士"); 54 | map.put("17", "其他地区"); 55 | map.put("5", "央行"); 56 | return map; 57 | } 58 | private static String[] ruleList_district = { "9", "10", "11", "12", "13", "14", "15", "16", "17" }; 59 | private static String[] ruleList_property = { "1", "2", "3", "4" }; 60 | private static String[] ruleList_centralbank = { "5" }; 61 | 62 | //对x,x,x格式的内容进行分隔筛选 63 | public static String setCategory(String categorySet, String[] ruleList, Map map) { 64 | StringBuffer disStr = new StringBuffer(); 65 | String[] strArray = null; 66 | strArray = categorySet.split(","); // 拆分字符为",",然后把结果交给数组strArray 67 | // 获取需要的信息 68 | int length_strArray = strArray.length; 69 | int length_ruleList = ruleList.length; 70 | 71 | if (length_strArray > 0) { 72 | for (int iArr = 0; iArr < length_strArray; iArr++) { 73 | String s = strArray[iArr]; 74 | for (int iRul=0; iRul < length_ruleList; iRul++) { 75 | if (s.equals(ruleList[iRul])) { 76 | disStr.append(map.get(s)); 77 | disStr.append(","); 78 | break; 79 | } 80 | } 81 | } 82 | } 83 | if(disStr.length()>1) { 84 | disStr = disStr.deleteCharAt(disStr.length()-1); 85 | } 86 | return disStr.toString(); 87 | } 88 | 89 | //读取整个页面,返回html字符串 90 | private static String httpRequest(String requestUrl) { 91 | StringBuffer buffer = null; 92 | BufferedReader bufferedReader = null; 93 | InputStreamReader inputStreamReader = null; 94 | InputStream inputStream = null; 95 | HttpURLConnection httpUrlConn = null; 96 | try { 97 | // 建立get请求 98 | URL url = new URL(requestUrl); 99 | httpUrlConn = (HttpURLConnection) url.openConnection(); 100 | httpUrlConn.setDoInput(true); 101 | httpUrlConn.setRequestMethod("GET"); 102 | // 获取输入流 103 | inputStream = httpUrlConn.getInputStream(); 104 | inputStreamReader = new InputStreamReader(inputStream, "UTF-8"); 105 | bufferedReader = new BufferedReader(inputStreamReader); 106 | // 从输入流获取结果 107 | buffer = new StringBuffer(); 108 | String str = null; 109 | while ((str = bufferedReader.readLine()) != null) { 110 | str = new String(str.getBytes(), "UTF-8"); 111 | buffer.append(str); 112 | } 113 | } catch (Exception e) { 114 | e.printStackTrace(); 115 | } finally { 116 | if (bufferedReader != null) { 117 | try { 118 | bufferedReader.close(); 119 | } catch (IOException e) { 120 | e.printStackTrace(); 121 | } 122 | } 123 | if (inputStreamReader != null) { 124 | try { 125 | inputStreamReader.close(); 126 | } catch (IOException e) { 127 | e.printStackTrace(); 128 | } 129 | } 130 | if (inputStream != null) { 131 | try { 132 | inputStream.close(); 133 | } catch (IOException e) { 134 | e.printStackTrace(); 135 | } 136 | } 137 | if (httpUrlConn != null) { 138 | httpUrlConn.disconnect(); 139 | } 140 | } 141 | return buffer.toString(); 142 | } 143 | 144 | // 过滤掉无用的信息 145 | public static List> htmlFiter(String html, String Regex) { 146 | List> list = new ArrayList>(); 147 | // 查找目标 148 | Pattern p = Pattern.compile(Regex); 149 | Matcher m = p.matcher(html); 150 | while 
(m.find()) { 151 | Map map_save = new HashMap(); 152 | // 可修改部分 153 | map_save.put(REGEXSTRING1, m.group(1)); 154 | map_save.put(REGEXSTRING2, m.group(2)); 155 | map_save.put(REGEXSTRING3, m.group(3)); 156 | 157 | list.add(map_save); 158 | } 159 | return list; 160 | } 161 | 162 | //unicode格式转中文 163 | public static String UnicodeToString(String str) { 164 | Pattern pattern = Pattern.compile("(\\\\u(\\p{XDigit}{4}))"); // XDigit表示16进制数字,正则里的\p表示Unicode块 165 | Matcher matcher = pattern.matcher(str); 166 | char ch; 167 | while (matcher.find()) { 168 | ch = (char) Integer.parseInt(matcher.group(2), 16); // 16进制转10进制作为ascii码,再char转为字符 169 | str = str.replace(matcher.group(1), ch + ""); 170 | } 171 | return str; 172 | } 173 | 174 | public void run() { 175 | while(true) { // 循环体!!! 176 | // 链接数据库 177 | try { 178 | Mongo mongo = new Mongo("localhost", 27017); 179 | DB db = mongo.getDB(DataBaseName); 180 | DBCollection collection = db.getCollection(CollectionName); 181 | 182 | // 调用抓取的方法获取内容 183 | String requestUrl = this.release.GetMethod(); 184 | if(requestUrl.equals("")) { 185 | break; 186 | } else { 187 | System.out.println(requestUrl); 188 | 189 | String html = httpRequest(requestUrl); 190 | List> resultList = htmlFiter(html, Regex); 191 | 192 | if (resultList.isEmpty()) { 193 | System.out.printf("The end url: %s", requestUrl); 194 | break; 195 | } else { 196 | for (Map result : resultList) { 197 | BasicDBObject dbObject = new BasicDBObject(); 198 | 199 | String type = result.get(REGEXSTRING1); 200 | String content = UnicodeToString(result.get(REGEXSTRING2)); 201 | 202 | Map map = GetMap(); 203 | String district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); 204 | String property = setCategory(result.get(REGEXSTRING3), ruleList_property, map); 205 | String centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map); 206 | 207 | Date date = new Date(); 208 | DateFormat time = DateFormat.getDateTimeInstance(); 209 | String time_str = time.format(date); 210 | 211 | String source = "wangstreetcn"; 212 | 213 | dbObject.put("content", content); // 具体内容 214 | dbObject.put("createdtime", time_str); // 创建时间 215 | dbObject.put("source", source); // 信息来源 216 | dbObject.put("district", district); // 所属地区 217 | dbObject.put("property", property); // 资产类别 218 | dbObject.put("centralbank", centralbank); // 资产类别 219 | dbObject.put("type", type); //信息类型 220 | 221 | collection.insert(dbObject); 222 | } 223 | } 224 | } 225 | } catch (Exception e) { 226 | e.printStackTrace(); 227 | } 228 | } 229 | } 230 | 231 | public void run1() { 232 | while(true) { // 循环体!!! 
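// NOTE: run1() duplicates the body of run() (only the "\n" in the final printf
// differs) and is never invoked, since Thread.start() only calls run(); it
// appears to be kept as an alternative copy.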
233 | // 链接数据库 234 | try { 235 | Mongo mongo = new Mongo("localhost", 27017); 236 | DB db = mongo.getDB(DataBaseName); 237 | DBCollection collection = db.getCollection(CollectionName); 238 | 239 | // 调用抓取的方法获取内容 240 | String requestUrl = this.release.GetMethod(); 241 | if(requestUrl.equals("")) { 242 | break; 243 | } else { 244 | System.out.println(requestUrl); 245 | 246 | String html = httpRequest(requestUrl); 247 | List> resultList = htmlFiter(html, Regex); 248 | 249 | if (resultList.isEmpty()) { 250 | System.out.printf("The end url: %s\n", requestUrl); 251 | break; 252 | } else { 253 | for (Map result : resultList) { 254 | BasicDBObject dbObject = new BasicDBObject(); 255 | 256 | String type = result.get(REGEXSTRING1); 257 | String content = UnicodeToString(result.get(REGEXSTRING2)); 258 | 259 | Map map = GetMap(); 260 | String district = setCategory(result.get(REGEXSTRING3), ruleList_district, map); 261 | String property = setCategory(result.get(REGEXSTRING3), ruleList_property, map); 262 | String centralbank = setCategory(result.get(REGEXSTRING3), ruleList_centralbank, map); 263 | 264 | Date date = new Date(); 265 | DateFormat time = DateFormat.getDateTimeInstance(); 266 | String time_str = time.format(date); 267 | 268 | String source = "wangstreetcn"; 269 | 270 | dbObject.put("content", content); // 具体内容 271 | dbObject.put("createdtime", time_str); // 创建时间 272 | dbObject.put("source", source); // 信息来源 273 | dbObject.put("district", district); // 所属地区 274 | dbObject.put("property", property); // 资产类别 275 | dbObject.put("centralbank", centralbank); // 资产类别 276 | dbObject.put("type", type); //信息类型 277 | 278 | collection.insert(dbObject); 279 | } 280 | } 281 | } 282 | } catch (Exception e) { 283 | e.printStackTrace(); 284 | } 285 | } 286 | } 287 | 288 | } 289 | 290 | /** 291 | * contain shared resource 292 | */ 293 | class GetrequestUrl { 294 | 295 | private String url = "http://api.wallstreetcn.com/v2/livenews?&page="; 296 | private int start; 297 | private int end = 5000; 298 | 299 | public GetrequestUrl(int start) 300 | { 301 | this.start = start; 302 | } 303 | public GetrequestUrl(int start, int end) 304 | { 305 | this.start = start; 306 | this.end = end; 307 | } 308 | 309 | /** 310 | * Thread safe method 311 | */ 312 | public synchronized String GetMethod() // 利用synchronized修饰符同步了整个方法 313 | { 314 | if(this.start <= this.end) { 315 | String requestUrl = this.url+this.start; 316 | this.start = this.start+1; 317 | return requestUrl; 318 | } 319 | else { 320 | return ""; 321 | } 322 | } 323 | } 324 | 325 | 326 | public class WallstreetcnSaveTest { 327 | public static void main(String[] args) { 328 | // 多线程抓取 329 | int start = 1; 330 | GetrequestUrl url = new GetrequestUrl(start); 331 | // int start = 1, end = 3000; 332 | // GetrequestUrl url = new GetrequestUrl(start, end); 333 | 334 | int thread_num = 1; 335 | while(true) { 336 | if(thread_num++ > 8) break; 337 | Thread thread = new Thread(new WallstreetcnSave(url)); 338 | thread.start(); 339 | } 340 | 341 | } 342 | } 343 | -------------------------------------------------------------------------------- /Spider_Python/README.md: -------------------------------------------------------------------------------- 1 | ### Spider_Python 2 | 3 | 抓取网址:[华尔街见闻](http://live.wallstreetcn.com/) 4 | 5 | 多进程抓取 6 | -------------------------------------------------------------------------------- /Spider_Python/WallstreetcnSaveTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- 
coding:utf-8 -*- 3 | 4 | import sys 5 | import re 6 | import urllib, urllib2 7 | import requests 8 | import pymongo 9 | import datetime 10 | import multiprocessing as mp 11 | 12 | 13 | Category_Map = { 14 | "1":u"外汇", 15 | "2":u"股市", 16 | "3":u"商品", 17 | "4":u"债市", 18 | "5":u"央行", 19 | "9":u"中国", 20 | "10":u"美国", 21 | "11":u"欧元区", 22 | "12":u"日本", 23 | "13":u"英国", 24 | "14":u"澳洲", 25 | "15":u"加拿大", 26 | "16":u"瑞士", 27 | "17":u"其他地区" 28 | } 29 | def num2name(category_num): 30 | if Category_Map.has_key(category_num): 31 | return Category_Map[category_num] 32 | else: 33 | return "" 34 | 35 | class MongoDBIO: 36 | # 申明相关的属性 37 | def __init__(self, host, port, name, password, database, collection): 38 | self.host = host 39 | self.port = port 40 | self.name = name 41 | self.password = password 42 | self.database = database 43 | self.collection = collection 44 | 45 | # 连接数据库,db和posts为数据库和集合的游标 46 | def Connection(self): 47 | # connection = pymongo.Connection() # 连接本地数据库 48 | connection = pymongo.Connection(host=self.host, port=self.port) 49 | # db = connection.datas 50 | db = connection[self.database] 51 | if self.name or self.password: 52 | db.authenticate(name=self.name, password=self.password) # 验证用户名密码 53 | # print "Database:", db.name 54 | # posts = db.cn_live_news 55 | posts = db[self.collection] 56 | # print "Collection:", posts.name 57 | return posts 58 | 59 | # 保存操作 60 | # def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents): 61 | # posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() 62 | # for save_content in save_contents: 63 | # posts.save(save_content) 64 | def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content): 65 | posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() 66 | posts.save(save_content) 67 | 68 | def Spider(url, data): 69 | # # 方法1:requests get 70 | content = requests.get(url=url, params=data).content # GET请求发送 71 | # # 方法2:urllib2 get 72 | # data = urllib.urlencode(data) # 编码工作,由dict转为string 73 | # full_url = url+'?'+data 74 | # print full_url 75 | # content = urllib2.urlopen(full_url).read() # GET请求发送 76 | # # content = requests.get(full_url).content # GET请求发送 77 | # print type(content) # str 78 | return content 79 | 80 | def ContentSave(item): 81 | # 保存配置 82 | save_host = "localhost" 83 | save_port = 27017 84 | save_name = "" 85 | save_password = "" 86 | save_database = "textclassify" 87 | save_collection = "WallstreetcnSave" 88 | 89 | source = "wallstreetcn" 90 | createdtime = datetime.datetime.now() 91 | type = item[0] 92 | content = item[1].decode("unicode_escape") # json格式数据中,需从'\\uxxxx'形式的unicode_escape编码转换成u'\uxxxx'的unicode编码 93 | content = content.encode("utf-8") 94 | # print content 95 | # district的筛选 96 | categorySet = item[2] 97 | category_num = categorySet.split(",") 98 | category_name = map(num2name, category_num) 99 | districtset = set(category_name)&{u"中国", u"美国", u"欧元区", u"日本", u"英国", u"澳洲", u"加拿大", u"瑞士", u"其他地区"} 100 | district = ",".join(districtset) 101 | propertyset = set(category_name)&{u"外汇", u"股市", u"商品", u"债市"} 102 | property = ",".join(propertyset) 103 | centralbankset = set(category_name)&{u"央行"} 104 | centralbank = ",".join(centralbankset) 105 | save_content = { 106 | "source":source, 107 | "createdtime":createdtime, 108 | "content":content, 109 | "type":type, 110 | "district":district, 111 | "property":property, 112 | 
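# district / property / centralbank are built above by mapping the numeric
# categorySet codes through Category_Map and intersecting the resulting names
# with fixed sets, so each field ends up as a comma-joined subset of categories.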
"centralbank":centralbank 113 | } 114 | ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content) 115 | 116 | def func(page): 117 | url = "http://api.wallstreetcn.com/v2/livenews" 118 | # get参数 119 | data = { 120 | "page":page 121 | } 122 | content = Spider(url, data) 123 | items = re.findall(r'"type":"(.*?)","codeType".*?"contentHtml":"(.*?)","data".*?"categorySet":"(.*?)","hasMore"', content) # 正则匹配 124 | if len(items) == 0: 125 | print "The End Page:", page 126 | data = urllib.urlencode(data) # 编码工作,由dict转为string 127 | full_url = url+'?'+data 128 | print full_url 129 | sys.exit(0) # 无错误退出 130 | else: 131 | print "The Page:", page, "Downloading..." 132 | for item in items: 133 | ContentSave(item) 134 | 135 | 136 | if __name__ == '__main__': 137 | 138 | start = datetime.datetime.now() 139 | 140 | start_page = 1 141 | end_page = 3300 142 | 143 | 144 | # 多进程抓取 145 | pages = [i for i in range(start_page, end_page)] 146 | p = mp.Pool() 147 | p.map_async(func, pages) 148 | p.close() 149 | p.join() 150 | 151 | 152 | # 单进程抓取 153 | page = end_page 154 | 155 | while 1: 156 | url = "http://api.wallstreetcn.com/v2/livenews" 157 | # get参数 158 | data = { 159 | "page":page 160 | } 161 | content = Spider(url, data) 162 | items = re.findall(r'"type":"(.*?)","codeType".*?"contentHtml":"(.*?)","data".*?"categorySet":"(.*?)","hasMore"', content) # 正则匹配 163 | if len(items) == 0: 164 | print "The End Page:", page 165 | data = urllib.urlencode(data) # 编码工作,由dict转为string 166 | full_url = url+'?'+data 167 | print full_url 168 | break 169 | else: 170 | print "The Page:", page, "Downloading..." 171 | for item in items: 172 | ContentSave(item) 173 | page += 1 174 | 175 | end = datetime.datetime.now() 176 | print "last time: ", end-start 177 | -------------------------------------------------------------------------------- /WechatSearchProjects/README.md: -------------------------------------------------------------------------------- 1 | ### 使用Scrapy或Requests递归抓取[微信搜索](http://weixin.sogou.com/weixin)结果 2 | 3 | 使用Scrapy方法 或者 使用Requests+BeautifulSoup 4 | 5 | **使用Scrapy方法:** 6 | 7 | * 将querystring替换为你要查询的单词 8 | 9 | * type可以选择 10 | 11 | * i的范围可以调整,对应查询的搜索结果页面数目 12 | -------------------------------------------------------------------------------- /WechatSearchProjects/Spider_Main.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from scrapy.cmdline import execute 3 | import os 4 | 5 | if __name__ == '__main__': 6 | project_name = "Wechatproject" 7 | spider_name = "wechat" 8 | results_name = "results/results.json" 9 | 10 | if not os.path.exists(project_name): 11 | print "Please Edit the project files and Run again!!!" 12 | s = "scrapy startproject %s" % project_name 13 | execute(s.split()) 14 | else: 15 | print "Start Crawling!!!" 
16 | path = os.getcwd() # 获取当前路径 17 | os.chdir(path+"/"+project_name) # 修改当前路径 18 | if os.path.exists(results_name): 19 | os.remove(results_name) 20 | s = "scrapy crawl %s" % spider_name 21 | # s = "scrapy crawl %s -o %s -t json" % (spider_name, results_name) 22 | execute(s.split()) 23 | -------------------------------------------------------------------------------- /WechatSearchProjects/WechatSearchTest.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding:utf-8 -*- 3 | 4 | import sys 5 | import re 6 | import urllib, urllib2 7 | import requests 8 | import pymongo 9 | import datetime 10 | from bs4 import BeautifulSoup 11 | import multiprocessing as mp 12 | 13 | 14 | class MongoDBIO: 15 | # 申明相关的属性 16 | def __init__(self, host, port, name, password, database, collection): 17 | self.host = host 18 | self.port = port 19 | self.name = name 20 | self.password = password 21 | self.database = database 22 | self.collection = collection 23 | 24 | # 连接数据库,db和posts为数据库和集合的游标 25 | def Connection(self): 26 | # connection = pymongo.Connection() # 连接本地数据库 27 | connection = pymongo.Connection(host=self.host, port=self.port) 28 | # db = connection.datas 29 | db = connection[self.database] 30 | if self.name or self.password: 31 | db.authenticate(name=self.name, password=self.password) # 验证用户名密码 32 | # print "Database:", db.name 33 | # posts = db.cn_live_news 34 | posts = db[self.collection] 35 | # print "Collection:", posts.name 36 | return posts 37 | 38 | # # 保存操作 39 | # def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_contents): 40 | # posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() 41 | # 42 | # for save_content in save_contents: 43 | # posts.save(save_content) 44 | # 保存操作 45 | def ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content): 46 | posts = MongoDBIO(save_host, save_port, save_name, save_password, save_database, save_collection).Connection() 47 | posts.save(save_content) 48 | 49 | 50 | def GetTitleUrl(url, data): 51 | content = requests.get(url=url, params=data).content # GET请求发送 52 | soup = BeautifulSoup(content) 53 | tags = soup.findAll("h4") 54 | titleurl = [] 55 | for tag in tags: 56 | item = {"title":tag.text.strip(), "link":tag.find("a").get("href"), "content":""} 57 | titleurl.append(item) 58 | return titleurl 59 | 60 | def GetContent(url): 61 | soup = BeautifulSoup(requests.get(url=url).content) 62 | tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # 提取第一个标签 63 | content_list = [tag_i.text for tag_i in tag.findAll("p")] 64 | content = "".join(content_list) 65 | return content 66 | 67 | def ContentSave(item): 68 | # 保存配置 69 | save_host = "localhost" 70 | save_port = 27017 71 | save_name = "" 72 | save_password = "" 73 | save_database = "testwechat" 74 | save_collection = "result" 75 | 76 | save_content = { 77 | "title":item["title"], 78 | "link":item["link"], 79 | "content":item["content"] 80 | } 81 | 82 | ResultSave(save_host, save_port, save_name, save_password, save_database, save_collection, save_content) 83 | 84 | def func(tuple): 85 | querystring, type, page = tuple[0], tuple[1], tuple[2] 86 | url = "http://weixin.sogou.com/weixin" 87 | # get参数 88 | data = { 89 | "query":querystring, 90 | "type":type, 91 | "page":page 92 | } 93 | 94 | titleurl = GetTitleUrl(url, data) 95 | 96 | for item in titleurl: 97 | url = 
item["link"] 98 | print "url:", url 99 | content = GetContent(url) 100 | item["content"] = content 101 | ContentSave(item) 102 | 103 | 104 | if __name__ == '__main__': 105 | start = datetime.datetime.now() 106 | 107 | querystring = u"清华" 108 | type = 2 # 2-文章,1-微信号 109 | 110 | # 多进程抓取 111 | p = mp.Pool() 112 | p.map_async(func, [(querystring, type, page) for page in range(1, 50, 1)]) 113 | p.close() 114 | p.join() 115 | 116 | # # 单进程抓取 117 | # for page in range(1, 50, 1): 118 | # tuple = (querystring, type, page) 119 | # func(tuple) 120 | 121 | end = datetime.datetime.now() 122 | print "last time: ", end-start 123 | -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/Wechatproject/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lining0806/PythonSpiderNotes/da645036061fcdcd43ecfd16a9980a958c023160/WechatSearchProjects/Wechatproject/Wechatproject/__init__.py -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/Wechatproject/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | from scrapy.item import Item, Field 7 | 8 | class WechatprojectItem(Item): 9 | # define the fields for your item here like: 10 | # name = Field() 11 | title = Field() 12 | link = Field() 13 | content = Field() 14 | pass 15 | -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/Wechatproject/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | # class WechatprojectPipeline(object): 7 | # def process_item(self, item, spider): 8 | # return item 9 | 10 | 11 | 12 | # # MySQL Database 13 | # from twisted.enterprise import adbapi # import twisted package 14 | # class WechatprojectPipeline(object): 15 | # # connnect databases 16 | # def __init__(self): 17 | # self.dbpool = adbapi.ConnectionPool("MySQLdb", 18 | # host = "localhost", 19 | # db = "testwechat", # you must build database named testwechat 20 | # user = "root", 21 | # passwd = "testpasswd", 22 | # charset = "utf8") 23 | # # pipeline default function 24 | # def process_item(self, item, spider): 25 | # query = self.dbpool.runInteraction(self._conditional_insert, item) 26 | # return item 27 | # # insert the data to databases 28 | # def _conditional_insert(self, tx, item): # item dictionary 29 | # # you must build table named result in database testwechat 30 | # tx.execute("insert into result values (%s, %s, %s)", (item["title"], item["link"], item["content"])) 31 | 32 | 33 | # MongoDB Database 34 | import pymongo 35 | class WechatprojectPipeline(object): 36 | # connnect databases 37 | def __init__(self): 38 | connection = pymongo.Connection(host = "localhost", port = 27017) 39 | db = connection["testwechat"] # you need no build database named testdouban 40 | # db.authenticate(name = "root", password = "testpasswd") # no name and password for localhost 41 | self.posts = db["result"] # you need not build collection named book 42 | # pipeline default function 43 | def process_item(self, item, 
spider): 44 | self.posts.insert(dict(item)) # convert json to dict 45 | return item 46 | 47 | 48 | # # Json File 49 | # import json 50 | # import codecs 51 | # class WechatprojectPipeline(object): 52 | # def __init__(self): 53 | # self.file = codecs.open('results.json', 'w', 'utf-8') 54 | # def process_item(self, item, spider): 55 | # line = json.dumps(dict(item))+'\n' 56 | # self.file.write(line) 57 | # return item 58 | 59 | 60 | ############################################################################################# 61 | # '''if you want to download images''' 62 | # from scrapy.http.request import Request 63 | # from scrapy.contrib.pipeline.images import ImagesPipeline 64 | # class MyImagesPipeline(ImagesPipeline): 65 | # #@TODO 66 | # def get_media_requests(self, item, info): 67 | # for image_url in item['image_urls']: # item['image_urls'] contains the image urls 68 | # # yield Request(image_url) 69 | # yield Request(image_url, meta={'name': item['name']}) # item['name'] contains the images name 70 | # def item_completed(self, results, item, info): 71 | # return super(MyImagesPipeline, self).item_completed(results, item, info) 72 | # def file_path(self, request, response=None, info=None): 73 | # f_path = super(MyImagesPipeline, self).file_path(request, response, info) 74 | # f_path = f_path.replace('full', request.meta['name']) 75 | # return f_path 76 | # ########################################################## 77 | # # import hashlib 78 | # # image_guid = hashlib.sha1(request.url).hexdigest() # change to request.url after deprecation 79 | # # return '%s/%s.jpg' % (request.meta['name'], image_guid) 80 | # pass 81 | # # from scrapy.contrib.pipeline.media import MediaPipeline 82 | # # class MyMediaPipeline(MediaPipeline): 83 | # # #@TODO 84 | # # pass 85 | 86 | -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/Wechatproject/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for Wechatproject project 2 | # 3 | # For simplicity, this file contains only the most important settings by 4 | # default. All the other settings are documented here: 5 | # 6 | # http://doc.scrapy.org/en/latest/topics/settings.html 7 | # 8 | 9 | BOT_NAME = 'Wechatproject' 10 | 11 | SPIDER_MODULES = ['Wechatproject.spiders'] 12 | NEWSPIDER_MODULE = 'Wechatproject.spiders' 13 | 14 | ITEM_PIPELINES = ['Wechatproject.pipelines.WechatprojectPipeline'] # add settings 15 | ############################################################################################# 16 | # '''if you want to download images''' 17 | # ITEM_PIPELINES = {'Wechatproject.pipelines.WechatprojectPipeline':1, 'Wechatproject.pipelines.MyImagesPipeline':2 # add settings 18 | # IMAGES_STORE = './images' 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | #USER_AGENT = 'Wechatproject (+http://www.yourdomain.com)' 22 | -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/Wechatproject/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/Wechatproject/spiders/spider.py: -------------------------------------------------------------------------------- 1 | #coding: utf-8 2 | from scrapy.spider import BaseSpider 3 | from scrapy.selector import Selector 4 | from Wechatproject.items import WechatprojectItem 5 | from bs4 import BeautifulSoup 6 | from scrapy.http import Request 7 | 8 | 9 | class WechatSpider(BaseSpider): 10 | ############################################################################################# 11 | '''微信搜索程序''' 12 | name = "wechat" 13 | 14 | start_urls = [] 15 | querystring = u"清华" 16 | type = 2 # 2-文章,1-微信号 17 | for i in range(1, 50, 1): 18 | start_urls.append("http://weixin.sogou.com/weixin?type=%d&query=%s&page=%d" % (type, querystring, i)) 19 | # print start_urls 20 | 21 | ############################################################################################# 22 | ## 递归抓取 23 | 24 | ## 使用xpath()方法,注意item中键对值为string类型,extract()方法返回list 25 | def parse(self, response): 26 | # print response.body 27 | sel = Selector(response) 28 | sites = sel.xpath('//div[@class="txt-box"]/h4/a') 29 | for site in sites: 30 | item = WechatprojectItem() 31 | item["title"] = site.xpath("text()").extract() # 其中在item.py中定义了title = Field() 32 | item["link"] = site.xpath("@href").extract() # 其中在item.py中定义了link = Field() 33 | ############################################################################################# 34 | # yield item ## 只抓取当前页数据 35 | next_url = item["link"][0] 36 | # yield Request(url=next_url, callback=self.parse2) ## 只抓取二级页面数据 37 | yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据 38 | 39 | ## 使用BeautifulSoup方法,注意item中键对值为string类型 40 | def parse(self, response): 41 | # print response.body 42 | soup = BeautifulSoup(response.body) 43 | tags = soup.findAll("h4") 44 | for tag in tags: 45 | item = WechatprojectItem() 46 | item["title"] = tag.text # 其中在item.py中定义了title = Field() 47 | item["link"] = tag.find("a").get("href") # 其中在item.py中定义了link = Field() 48 | ############################################################################################# 49 | # yield item ## 只抓取当前页数据 50 | next_url = item["link"] 51 | # yield Request(url=next_url, callback=self.parse2) ## 只抓取二级页面数据 52 | yield Request(url=next_url, meta={"item":item}, callback=self.parse2) ## 抓取当前页数和二级页面数据 53 | 54 | def parse2(self, response): 55 | soup = BeautifulSoup(response.body) 56 | tag = soup.find("div", attrs={"class":"rich_media_content", "id":"js_content"}) # 提取第一个标签 57 | content_list = [tag_i.text for tag_i in tag.findAll("p")] 58 | content = "".join(content_list) 59 | # print content 60 | # item = WechatprojectItem() ## 只抓取二级页面数据 61 | item = response.meta['item'] ## 抓取当前页数和二级页面数据 62 | item["content"] = content 63 | return item 64 | -------------------------------------------------------------------------------- /WechatSearchProjects/Wechatproject/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # http://doc.scrapy.org/en/latest/topics/scrapyd.html 5 | 6 | [settings] 7 | default = Wechatproject.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Wechatproject 12 | -------------------------------------------------------------------------------- /ZhihuSpider/ReadMe.md: 
-------------------------------------------------------------------------------- 1 | ### 网络爬虫之用户名密码及验证码登陆:爬取[知乎](http://www.zhihu.com/)网站 2 | 3 | **一些说明:** 4 | 5 | * 使用requests包来爬取。首先尝试用用户名密码自动登陆,如果失败,则需要采用cookie登陆。 6 | 7 | * 配置文件config.ini,其中包括用户名密码信息,如果有验证码情况,需要手动登陆一次网站获取cookie信息。 8 | 9 | * 判断登陆成功与否,看生成的html文件中有没有用户信息。 10 | -------------------------------------------------------------------------------- /ZhihuSpider/ZhihuSpider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | ''' 3 | 网络爬虫之用户名密码及验证码登陆:爬取知乎网站 4 | ''' 5 | import requests 6 | import ConfigParser 7 | 8 | def create_session(): 9 | cf = ConfigParser.ConfigParser() 10 | cf.read('config.ini') 11 | cookies = cf.items('cookies') 12 | cookies = dict(cookies) 13 | from pprint import pprint 14 | pprint(cookies) 15 | email = cf.get('info', 'email') 16 | password = cf.get('info', 'password') 17 | 18 | session = requests.session() 19 | login_data = {'email': email, 'password': password} 20 | header = { 21 | 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36', 22 | 'Host': 'www.zhihu.com', 23 | 'Referer': 'http://www.zhihu.com/' 24 | } 25 | r = session.post('http://www.zhihu.com/login/email', data=login_data, headers=header) 26 | if r.json()['r'] == 1: 27 | print 'Login Failed, reason is:', 28 | for m in r.json()['data']: 29 | print r.json()['data'][m] 30 | print 'So we use cookies to login in...' 31 | has_cookies = False 32 | for key in cookies: 33 | if key != '__name__' and cookies[key] != '': 34 | has_cookies = True 35 | break 36 | if has_cookies is False: 37 | raise ValueError('请填写config.ini文件中的cookies项.') 38 | else: 39 | # r = requests.get('http://www.zhihu.com/login/email', cookies=cookies) # 实现验证码登陆 40 | r = session.get('http://www.zhihu.com/login/email', cookies=cookies) # 实现验证码登陆 41 | 42 | with open('login.html', 'w') as fp: 43 | fp.write(r.content) 44 | 45 | return session, cookies 46 | 47 | 48 | if __name__ == '__main__': 49 | requests_session, requests_cookies = create_session() 50 | 51 | # url = 'http://www.zhihu.com/login/email' 52 | url = 'http://www.zhihu.com/topic/19552832' 53 | # content = requests_session.get(url).content # 未登陆 54 | # content = requests.get(url, cookies=requests_cookies).content # 已登陆 55 | content = requests_session.get(url, cookies=requests_cookies).content # 已登陆 56 | with open('url.html', 'w') as fp: 57 | fp.write(content) -------------------------------------------------------------------------------- /ZhihuSpider/config.ini: -------------------------------------------------------------------------------- 1 | [info] 2 | email = xxxx@163.com 3 | password = xxxx 4 | 5 | [cookies] 6 | q_c1 = 7 | cap_id = 8 | _za = 9 | __utmt = 10 | __utma = 11 | __utmb = 12 | __utmc = 13 | __utmz = 14 | __utmv = 15 | z_c0 = 16 | unlock_ticket = --------------------------------------------------------------------------------