├── .gitattributes ├── .gitignore ├── BaseTools ├── CompareUtil.py ├── MyDownload.py ├── MyUtil.py ├── ScreenShotUtil.py ├── __init__.py └── test │ ├── Parent.py │ ├── data │ ├── es-query-lean.md │ ├── result.md │ └── test-file.html │ ├── edit-distance-test.py │ ├── util-test.py │ ├── util-test2.py │ └── util-test3.py ├── DBTools ├── MyES.py ├── MyMongoDB.py ├── MySqlite.py ├── __init__.py └── test │ ├── Parent.py │ ├── es-test.py │ ├── sqlite-test.py │ └── sqlite-test │ └── test.db ├── README.md ├── __init__.py ├── ctrip.com-visa ├── Parent.py ├── README.md ├── img │ ├── ctrip-visa-gqtp.png │ └── ctrip-visa-lsgxx.png └── xc-visa-lqxx.py ├── framework └── base_scrapy │ ├── README.md │ ├── base_scrapy │ ├── __init__.py │ ├── entrypoint.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ └── base_spider.py │ └── scrapy.cfg ├── huaban.com ├── PreviewHtmlTool.py ├── README.md ├── huaban-border-text.py ├── huaban-simple.py ├── img │ ├── huaban-border-txt.png │ ├── huaban-preview-border.png │ └── huaban-simple-1.png ├── test.html └── 淡然小笺赋箴言 │ ├── 13448395_1.jpg │ ├── 13448395_1.txt │ ├── 13448395_10.jpg │ ├── 13448395_10.txt │ ├── 13448395_11.jpg │ ├── 13448395_11.txt │ ├── 13448395_12.jpg │ ├── 13448395_12.txt │ ├── 13448395_13.jpg │ ├── 13448395_13.txt │ ├── 13448395_14.jpg │ ├── 13448395_14.txt │ ├── 13448395_15.jpg │ ├── 13448395_15.txt │ ├── 13448395_16.jpg │ ├── 13448395_16.txt │ ├── 13448395_17.jpg │ ├── 13448395_17.txt │ ├── 13448395_18.jpg │ ├── 13448395_18.txt │ ├── 13448395_19.jpg │ ├── 13448395_19.txt │ ├── 13448395_2.jpg │ ├── 13448395_2.txt │ ├── 13448395_20.jpg │ ├── 13448395_20.txt │ ├── 13448395_21.jpg │ ├── 13448395_21.txt │ ├── 13448395_22.jpg │ ├── 13448395_22.txt │ ├── 13448395_23.jpg │ ├── 13448395_23.txt │ ├── 13448395_24.jpg │ ├── 13448395_24.txt │ ├── 13448395_25.jpg │ ├── 13448395_25.txt │ ├── 13448395_26.jpg │ ├── 13448395_26.txt │ ├── 13448395_27.jpg │ ├── 13448395_27.txt │ ├── 13448395_28.jpg │ ├── 13448395_28.txt │ ├── 13448395_29.jpg │ ├── 13448395_29.txt │ ├── 13448395_3.jpg │ ├── 13448395_3.txt │ ├── 13448395_30.jpg │ ├── 13448395_30.txt │ ├── 13448395_31.jpg │ ├── 13448395_31.txt │ ├── 13448395_32.jpg │ ├── 13448395_32.txt │ ├── 13448395_33.jpg │ ├── 13448395_33.txt │ ├── 13448395_34.jpg │ ├── 13448395_34.txt │ ├── 13448395_35.jpg │ ├── 13448395_35.txt │ ├── 13448395_36.jpg │ ├── 13448395_36.txt │ ├── 13448395_37.jpg │ ├── 13448395_37.txt │ ├── 13448395_38.jpg │ ├── 13448395_38.txt │ ├── 13448395_39.jpg │ ├── 13448395_39.txt │ ├── 13448395_4.jpg │ ├── 13448395_4.txt │ ├── 13448395_40.jpg │ ├── 13448395_40.txt │ ├── 13448395_5.jpg │ ├── 13448395_5.txt │ ├── 13448395_6.jpg │ ├── 13448395_6.txt │ ├── 13448395_7.jpg │ ├── 13448395_7.txt │ ├── 13448395_8.jpg │ ├── 13448395_8.txt │ ├── 13448395_9.jpg │ ├── 13448395_9.txt │ └── index.html ├── jjwxk.net ├── Parent.py ├── README.md ├── img │ ├── jjwxk-free-simple-1.png │ └── jjwxk-free-simple-2.png ├── jjwxk-free-simple.py └── simple-http-server.py ├── mzitu.com ├── Parent.py ├── README.md ├── mzitu-crawler-es.py ├── mzitu_es.py ├── mzitu_for_thread.py ├── scrapy-mzitu-es.py └── scrapy-mzitu-no-es.py └── wallhaven.cc ├── Parent.py ├── README.md ├── img └── 20210623210831.png └── wallpic_scrapy.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python 2 | *.css linguist-language=python 3 | *.html linguist-language=python 4 | 
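# linguist-language 覆盖 GitHub Linguist 的语言识别,把 js/css/html 也统计成 Python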
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | .idea/ 3 | .vscode/ 4 | *.png 5 | *.jpg 6 | *.csv 7 | *.wpr 8 | *.txt 9 | *.log 10 | *.json 11 | *.exe 12 | plugin/* 13 | .DS_Store 14 | .scrapy/ -------------------------------------------------------------------------------- /BaseTools/CompareUtil.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | class EditDistance(): 4 | @classmethod 5 | def minEditDist(cls, sm, sn): 6 | ''' 7 | 计算两个字符串的最小莱温斯坦距离 8 | ''' 9 | m,n = len(sm)+1,len(sn)+1 10 | 11 | # create a matrix (m*n) 12 | matrix = [[0]*n for i in range(m)] 13 | 14 | matrix[0][0]=0 15 | for i in range(1,m): 16 | matrix[i][0] = matrix[i-1][0] + 1 17 | 18 | for j in range(1,n): 19 | matrix[0][j] = matrix[0][j-1]+1 20 | 21 | 22 | for i in range(m): 23 | print(matrix[i]) 24 | 25 | print("********************") 26 | 27 | cost = 0 28 | 29 | for i in range(1,m): 30 | for j in range(1,n): 31 | if sm[i-1]==sn[j-1]: 32 | cost = 0 33 | else: 34 | cost = 1 35 | 36 | matrix[i][j]=min(matrix[i-1][j]+1,matrix[i][j-1]+1,matrix[i-1][j-1]+cost) 37 | 38 | for i in range(m): 39 | print(matrix[i]) 40 | 41 | return matrix[m-1][n-1] 42 | 43 | @classmethod 44 | def similarityDegree(cls, str1, str2): 45 | ''' 46 | 计算两个字符串的相似度 47 | ''' 48 | mindist = 0 49 | if str1 == None and str2 != None: 50 | mindist = len(str2) 51 | return 0 52 | elif str1 != None and str2 == None: 53 | mindist = len(str1) 54 | return 0 55 | elif str1 != None and str2 != None: 56 | mindist = cls.minEditDist(str1,str2) 57 | else: 58 | return 0 59 | maxLength = min(len(str1), len(str2)) 60 | similarityDegree = 1-mindist/maxLength 61 | print(str1, "和", str2, "的相似度为:", similarityDegree) 62 | return similarityDegree 63 | -------------------------------------------------------------------------------- /BaseTools/MyDownload.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import requests 3 | import re 4 | import random 5 | import time 6 | from bs4 import BeautifulSoup 7 | 8 | 9 | class download(): 10 | def __init__(self): 11 | self.iplist = [] ##初始化一个list用来存放我们获取到的IP 12 | # self.get_ip_list() 13 | self.get_ip_list3() 14 | print(self.iplist) 15 | self.user_agent_list = [ 16 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1", 17 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11", 18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6", 19 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6", 20 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1", 21 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5", 22 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5", 23 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 24 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3", 25 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 
Safari/536.3", 26 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 27 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3", 28 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 30 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3", 31 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3", 32 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24", 33 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 34 | ] 35 | 36 | #功能:爬取IP存入ip_list列表 37 | def get_ip_list(self): 38 | #html = requests.get("http://haoip.cc/tiqu.htm") ##不解释咯,获取免费代理IP地址的网站,用正则过滤获取到代理IP 39 | #iplistn = re.findall(r'r/>(.*?)(.*?)@HTTP', html.text, re.S) ##表示从html.text中获取所有r/> 6: 59 | if not tds[6].text.find('天')==-1: 60 | # print('tds[8]为:'+str(tds[8])) 61 | self.iplist.append(tds[1].text + ':' + tds[2].text) 62 | # print(tds[1].text + ':' + tds[2].text) 63 | 64 | #功能:爬取IP存入ip_list列表 65 | def get_ip_list3(self): 66 | web_data = requests.get("https://www.kuaidaili.com/free/", headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}) 67 | soup = BeautifulSoup(web_data.text, 'lxml') 68 | ips = soup.find_all('tr') 69 | for i in range(1, len(ips)): 70 | ip_info = ips[i] 71 | tds = ip_info.find_all('td') 72 | currIp = '' 73 | if len(tds) > 1: 74 | for item in tds: 75 | if item["data-title"] == 'IP': 76 | currIp = item.text 77 | if item["data-title"] == 'PORT': 78 | currIp += ':' + item.text 79 | break 80 | self.iplist.append(currIp) 81 | 82 | def get(self, url, headers, timeout, proxy=None, num_retries=10): ##给函数一个默认参数proxy为空 83 | UA = random.choice(self.user_agent_list) ##从self.user_agent_list中随机取出一个字符串 84 | headers['User-Agent'] = UA ##构造成一个完整的User-Agent (UA代表的是上面随机取出来的字符串哦) 85 | 86 | if proxy == None: ##当代理为空时,不使用代理获取response(别忘了response啥哦!之前说过了!!) 
87 | try: 88 | return requests.get(url, headers=headers, timeout=timeout)##这样服务器就会以为我们是真的浏览器了 89 | except:##如过上面的代码执行报错则执行下面的代码 90 | if num_retries > 0: ##num_retries是我们限定的重试次数 91 | time.sleep(10) ##延迟十秒 92 | print('获取网页出错,10S后将获取倒数第:', num_retries, '次') 93 | return self.get(url, headers, timeout, num_retries - 1) ##调用自身 并将次数减1 94 | else: 95 | print('开始使用代理') 96 | time.sleep(10) 97 | IP = ''.join(str(random.choice(self.iplist)).strip()) ##下面有解释哦 98 | proxy = {'http': IP} 99 | return self.get(url, headers, timeout, proxy) ##代理不为空的时候 100 | else: ##当代理不为空 101 | try: 102 | IP = ''.join(str(random.choice(self.iplist)).strip()) ##将从self.iplist中获取的字符串处理成我们需要的格式(处理了些什么自己看哦,这是基础呢) 103 | proxy = {'http': IP} ##构造成一个代理 104 | return requests.get(url, headers=headers, proxies=proxy, timeout=timeout) ##使用代理获取response 105 | except: 106 | if num_retries > 0: 107 | time.sleep(10) 108 | IP = ''.join(str(random.choice(self.iplist)).strip()) 109 | proxy = {'http': IP} 110 | print('正在更换代理,10S后将重新获取倒数第', num_retries, '次') 111 | print('当前代理是:', proxy) 112 | return self.get(url, headers, timeout, proxy, num_retries - 1) 113 | else: 114 | print('代理也不好使了!取消代理') 115 | return self.get(url, headers, 3) 116 | 117 | # 获取文本编码 118 | def get_encoding(self, text): 119 | return requests.utils.get_encodings_from_content(text) 120 | 121 | # 获取非中文乱码的文本 122 | def get_utf8_content(self, url, headers): 123 | req = request.get(url, headers, timeout=3) 124 | if req.content == None: 125 | return "" 126 | encoding = "utf-8" 127 | if req.encoding == 'ISO-8859-1': 128 | encodings = request.get_encoding(req.text) 129 | if encodings: 130 | encoding = encodings[0] 131 | else: 132 | encoding = req.apparent_encoding 133 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace') 134 | return req.content.decode(encoding, 'replace') #如果设置为replace,则会用?取代非法字符; 135 | return req.content 136 | 137 | 138 | request = download() -------------------------------------------------------------------------------- /BaseTools/MyUtil.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import random,string 3 | from hashlib import md5 4 | import os 5 | import codecs 6 | import tomd 7 | 8 | class MyStr(): 9 | @classmethod 10 | def getRandomPsw(cls, length=6): 11 | src = string.ascii_letters + string.digits 12 | if length < 6: 13 | length = 6 14 | list_passwd_all = random.sample(src, length - 3) #从字母和数字中随机取3位 15 | list_passwd_all.extend(random.sample(string.digits, 1)) #让密码中一定包含数字 16 | list_passwd_all.extend(random.sample(string.ascii_lowercase, 1)) #让密码中一定包含小写字母 17 | list_passwd_all.extend(random.sample(string.ascii_uppercase, 1)) #让密码中一定包含大写字母 18 | random.shuffle(list_passwd_all) #打乱列表顺序 19 | 20 | @classmethod 21 | def getFileMd5(cls, name): 22 | m = md5() 23 | a_file = open(name, 'rb') #需要使用二进制格式读取文件内容 24 | m.update(a_file.read()) 25 | a_file.close() 26 | return m.hexdigest() 27 | 28 | @classmethod 29 | def getMd5(cls, instr, length=32): 30 | m = md5() 31 | m.update(instr) 32 | res = m.hexdigest() 33 | if length < 32: 34 | res = random.sample(res, length) #从字母和数字中随机取3位 35 | return res 36 | 37 | @classmethod 38 | def html2markdown(cls, html): 39 | mdTxt = tomd.Tomd(html).markdown 40 | return mdTxt 41 | 42 | class FileTool(object): 43 | #追加写入:写一个写入数据的接口 44 | @classmethod 45 | def write_behind(cls, filename, content, split='\n'): 46 | ''''' 47 | :param content: 要写入的数据 48 | :param split: 每条数据之间的分隔符 49 | :return: 50 | ''' 51 | if content == None: 52 | return 53 | # 
判断传入的参数是否字符串类型,如果是,写入 . 如果不是,抛出异常 54 | if isinstance(content, str): 55 | #1.打开文件 56 | f = codecs.open(filename, 'a', 'utf-8') 57 | #2.写入数据 58 | f.write(content) 59 | f.write(split) 60 | #3.关闭文件 61 | f.close() 62 | else: 63 | raise TypeError('content must be a str!') 64 | 65 | #追加写入:写入多行数据 66 | @classmethod 67 | def write_behind_muti(cls, filename, str_list, split='\n'): 68 | #判断某个对象是否是某个类型,若是,返回True;否则,返回False 69 | rs = isinstance(str_list, list) 70 | #如果为True 71 | if rs: 72 | #for循环遍历列表,取出每一数据,判断数据类型是否为字符串 73 | for content in str_list: 74 | #如果不是字符串类型 75 | if isinstance(content,str) == False: 76 | #抛出异常 77 | raise TypeError('str_list must be a list of "str",ex:["str1","str2"...]') 78 | #如果没有异常,就可以写入数据了 79 | #1.打开文件 80 | f = open(filename,'a') 81 | #2.写入数据 str1\nstr2\nstr3... 82 | string = split.join(str_list) 83 | f.write(string) 84 | #3.关闭文件 85 | f.close() 86 | else: 87 | #如果传入的不是列表,抛出异常 88 | raise TypeError('str_list must be a list of "str",ex:["str1","str2"...]') 89 | #创建文件夹 90 | @classmethod 91 | def mkdir(cls, path): ##这个函数创建文件夹 92 | isExists = os.path.exists(path) 93 | if not isExists: 94 | print('建了一个名字叫做', path, '的文件夹!') 95 | os.makedirs(path) 96 | return True 97 | else: 98 | print('名字叫做', path, '的文件夹已经存在了!') 99 | return False 100 | #读取文件内容 101 | @classmethod 102 | def read_utf8(cls, path): 103 | isExists = os.path.exists(path) 104 | if isExists: 105 | with open(path, 'r', encoding='UTF-8') as f: 106 | return str(f.read()) 107 | else: 108 | return '' 109 | # 覆盖写入 110 | @classmethod 111 | def overwrite(cls, path, text): 112 | with open(path, 'w', encoding='UTF-8') as f: 113 | f.write(text) 114 | 115 | # 判断文件是否存在 116 | @classmethod 117 | def isExit(cls, path): 118 | return os.path.exists(path) 119 | 120 | # 检查文件名是否合理,替换特殊字符 121 | @classmethod 122 | def replace_invalid_filename(cls, filename, replaced_char='_'): 123 | ''' 124 | 替换有特殊字符的文件名中的特殊字符,默认将特殊字符替换为'_'. 125 | 例如 C/C++ -> C_C++ 126 | ''' 127 | valid_filename = filename 128 | invalid_characaters = '\\/:*?"<>|' 129 | for c in invalid_characaters: 130 | #print 'c:', c 131 | valid_filename = valid_filename.replace(c, replaced_char) 132 | return valid_filename 133 | 134 | 135 | class DateTool(object): 136 | #日期格式化工具类,用类执行一个函数,返回一个对象,对象分别有year\month\day 137 | ''' 138 | 2018-2-1 2018.2.1 2018/2/1 139 | date.year = 2018 140 | date.month = 2 141 | date.day = 1 142 | ''' 143 | #初始化函数 144 | def __init__(self,year=1970,month=1,day=1): 145 | self.year = year 146 | self.month = month 147 | self.day = day 148 | #类函数,传递进来一个日期,返回一个该类的对象 149 | @classmethod 150 | def get_date(cls,date): 151 | #判断date是否为str类型 152 | if not isinstance(date,str): 153 | #不是str类型,直接触发异常 154 | raise TypeError('date must be a str!') 155 | #转换 156 | #判断是-还是.还是空格 157 | if '-' in date: 158 | #分别将2018赋值year 2赋值给month 1赋值给day 159 | # year, month, day = [2018,2,1] 160 | year,month,day = list(map(int,date.split('-'))) 161 | elif '.' 
in date: 162 | year,month,day = list(map(int,date.split('.'))) 163 | elif ' ' in date: 164 | year,month,day = list(map(int,date.split(' '))) 165 | elif '/' in date: 166 | year,month,day = list(map(int,date.split('/'))) 167 | #创建对象 168 | # obj = DateTool(year,month,day) 169 | obj = cls(year,month,day) 170 | #返回对象 171 | return obj -------------------------------------------------------------------------------- /BaseTools/ScreenShotUtil.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import tagui as t 3 | import uuid 4 | 5 | def url2png(url): 6 | t.init() 7 | t.url(url) 8 | # t.type('q', 'decentralization[enter]') 9 | t.snap('page', 'results-' + str(uuid.uuid1()) + '.png') 10 | t.close() 11 | 12 | -------------------------------------------------------------------------------- /BaseTools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/BaseTools/__init__.py -------------------------------------------------------------------------------- /BaseTools/test/Parent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") -------------------------------------------------------------------------------- /BaseTools/test/data/es-query-lean.md: -------------------------------------------------------------------------------- 1 | 2 | 1. query string search 3 | 2. query DSL 4 | 3. query filter 5 | 4. full-text search 6 | 5. phrase search 7 | 6. highlight search 8 | 9 | ## query string search 10 | 11 | 1. took:耗费了几毫秒 12 | 1. timed_out:是否超时,这里是没有 13 | 1. _shards:数据拆成了5个分片,所以对于搜索请求,会打到所有的primary shard(或者是它的某个replica shard也可以) 14 | 1. hits.total:查询结果的数量,3个document 15 | 1. hits.max_score:score的含义,就是document对于一个search的相关度的匹配分数,越相关,就越匹配,分数也高 16 | 1. 
hits.hits:包含了匹配搜索的document的详细数据 17 | 18 | 搜索全部 19 | 20 | ```json 21 | GET /nginx/log_base/_search 22 | 23 | 结果如下: 24 | { 25 | "took" : 18, 26 | "timed_out" : false, 27 | "_shards" : { 28 | "total" : 5, 29 | "successful" : 5, 30 | "skipped" : 0, 31 | "failed" : 0 32 | }, 33 | "hits" : { 34 | "total" : 143405, 35 | "max_score" : 1.0, 36 | “hits”: [ 37 | { 38 | "_index" : "nginx", 39 | "_type" : "log_base", 40 | "_id" : "swZwhmwB82qtm9SxinXv", 41 | "_score" : 10.191514, 42 | "_source" : { 43 | "ip" : "10.95.30.42", 44 | "timestamp" : "17/Jul/2019:00:00:29 +0800", 45 | "url" : "GET /v-dist/static/js/vendor.min.js HTTP/1.1", 46 | "status" : "200", 47 | "bytes" : "782353" 48 | } 49 | }, 50 | {...}, 51 | {...} 52 | ] 53 | } 54 | } 55 | ``` 56 | 57 | query string search 的由来,因为 search 参数都是以 http 请求的 query string 来附带的 58 | 59 | 搜索商品名称中包含yagao的商品,而且按照售价降序排序: 60 | 61 | ```json 62 | # 查询 所有字段 包含 10.95 的数据集 63 | GET /bookdb_index/book/_search?q=10.95 64 | # 查询 ip 包含 10.95.30.42 的数据集 65 | GET nginx/log_base/_search?q=ip:10.95.30.42 66 | # 使用 sort 功能需要定义 timestamp 属性 fielddata=true 有可排序功能 67 | # 出现该错误是因为 5.x 之后,Elasticsearch对排序、聚合所依据的字段用单独的数据结构(fielddata)缓存到内存里了, 68 | # 但是在text字段上默认是禁用的,如果有需要单独开启,这样做的目的是为了节省内存空间。 69 | GET nginx/log_base/_search?q=ip:10.95.30.42&sort=timestamp:desc 70 | # 使用 _mapping 查看结构定义 71 | GET nginx/_mapping/log_base 72 | # 改变某个属性结构 73 | PUT nginx/_mapping/log_base 74 | { 75 | "properties": { 76 | "timestamp":{ 77 | "type": "text", 78 | "fielddata": true 79 | } 80 | } 81 | } 82 | ``` 83 | 84 | 适用于临时的在命令行使用一些工具,比如curl,快速的发出请求,来检索想要的信息; 85 | 86 | 但是如果查询请求很复杂,是很难去构建的在生产环境中,几乎很少使用 query string search 87 | 88 | ## query DSL 89 | 90 |

DSL:Domain Specific Language,特定领域的语言 91 | http request body:请求体,可以用json的格式来构建查询语法,比较方便,可以构建各种复杂的语法,比query string search肯定强大多了

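下面补一个用 Python 客户端 elasticsearch-py 发送 query DSL 的小例子,仅作示意:假设 ES 跑在本地 9200(地址、端口为假设值),索引/类型沿用上文的 nginx/log_base,字段名以实际 mapping 为准。

```python
# 示意:用 elasticsearch-py 执行 query DSL(连接参数为假设值)
from elasticsearch import Elasticsearch

es = Elasticsearch(["127.0.0.1"], port=9200)

body = {
    "query": {"match": {"ip": "10.95.30.42"}},
    # 按上文说明,text 字段排序需要先开启 fielddata
    "sort": [{"timestamp": "desc"}]
}
res = es.search(index="nginx", doc_type="log_base", body=body)
for hit in res["hits"]["hits"]:
    print(hit["_score"], hit["_source"])
```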
92 | 93 | **查询所有** 94 | 95 | ``` 96 | GET nginx/log_base/_search 97 | { 98 | "query": { "match_all": {} } 99 | } 100 | ``` 101 | 102 | **查询 ip 包含 ,同时按照价格降序排序** 103 | 104 | ```json 105 | GET nginx/log_base/_search 106 | { 107 | "query" : { 108 | "match" : { 109 | "ip" : "10.95.30.42" 110 | } 111 | }, 112 | "sort": [ 113 | { "timestamp": "desc" } 114 | ] 115 | } 116 | ``` 117 | 118 | **分页查询** 119 | 120 | ```json 121 | # from:从第几个开始,es 从 0 开始计数的 122 | # size:往后查询 100 个 123 | GET nginx/log_base/_search 124 | { 125 | "query": { "match_all": {} }, 126 | "from": 1, 127 | "size": 100 128 | } 129 | ``` 130 | 131 | **指定要查询展示的属性** 132 | 133 | ```json 134 | GET nginx/log_base/_search 135 | { 136 | "query": { "match_all": {} }, 137 | "_source": ["ip", "status"] 138 | } 139 | ``` 140 | 141 | 更加适合生产环境的使用,可以构建复杂的查询 142 | 143 | ## query filter 144 | 145 | **结果集里面过滤** 146 | 147 | ```json 148 | GET nginx/log_base/_search 149 | { 150 | "query": { 151 | "bool": { 152 | "must": { 153 | "match":{ 154 | "ip" : "10.95.30.42" 155 | } 156 | }, 157 | "filter": { 158 | "match":{ 159 | "status" : "302" 160 | } 161 | } 162 | } 163 | } 164 | } 165 | ``` 166 | 167 | ## full-text search(全文检索) 168 | 169 | ```json 170 | GET nginx/log_base/_search 171 | { 172 | "query" : { 173 | "match" : { 174 | "url" : ".js" 175 | } 176 | } 177 | } 178 | ``` 179 | 180 | ## phrase search(短语搜索) 181 | 182 | 跟全文检索相对应,相反,全文检索会将输入的搜索串拆解开来,去倒排索引里面去一一匹配,只要能匹配上任意一个拆解后的单词,就可以作为结果返回 183 | phrase search,要求输入的搜索串,必须在指定的字段文本中,完全包含一模一样的短语(空格等其他非数字字母分隔开的字符),才可以算匹配,才能作为结果返回 184 | 185 | ```json 186 | GET nginx/log_base/_search 187 | { 188 | "query" : { 189 | "match_phrase" : { 190 | "ip" : "10.94.53.32" 191 | } 192 | } 193 | } 194 | ``` 195 | 196 | ## highlight search(高亮搜索结果) 197 | 198 | ```json 199 | GET nginx/log_base/_search 200 | { 201 | "query" : { 202 | "match" : { 203 | "ip" : "10.94.53.32" 204 | } 205 | }, 206 | "highlight": { 207 | "fields" : { 208 | "ip" : {} 209 | } 210 | } 211 | } 212 | 213 | { 214 | "took" : 295, 215 | "timed_out" : false, 216 | "_shards" : { 217 | "total" : 5, 218 | "successful" : 5, 219 | "skipped" : 0, 220 | "failed" : 0 221 | }, 222 | "hits" : { 223 | "total" : 29977, 224 | "max_score" : 1.5757076, 225 | "hits" : [ 226 | { 227 | "_index" : "nginx", 228 | "_type" : "log_base", 229 | "_id" : "yAZwhmwB82qtm9SxinXv", 230 | "_score" : 1.5757076, 231 | "_source" : { 232 | "ip" : "10.94.53.32", 233 | "timestamp" : "17/Jul/2019:00:01:20 +0800", 234 | "url" : "GET /v-dist/static/css/app.min.css HTTP/1.1", 235 | "status" : "200", 236 | "bytes" : "217190" 237 | }, 238 | "highlight" : { 239 | "ip" : [ 240 | "10.94.53.32" 241 | ] 242 | } 243 | }, 244 | {...} 245 | ] 246 | } 247 | } 248 | ``` 249 | -------------------------------------------------------------------------------- /BaseTools/test/edit-distance-test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from CompareUtil import EditDistance 4 | def main(): 5 | EditDistance.similarityDegree("黄鹤楼","i黄鹤楼van2") 6 | EditDistance.similarityDegree("黄鹤楼","黄黄鹤鹤楼") 7 | EditDistance.similarityDegree("黄鹤楼","鹤楼黄楼黄楼") 8 | EditDistance.similarityDegree("黄鹤楼","鹤鹤楼") 9 | EditDistance.similarityDegree("黄鹤楼","汤逊湖") 10 | EditDistance.similarityDegree("黄鹤楼","岳阳楼") 11 | 12 | if __name__ == '__main__': 13 | main() -------------------------------------------------------------------------------- /BaseTools/test/util-test.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding:utf-8 -*- 2 | import Parent 3 | from MyUtil import FileTool 4 | from MyUtil import DateTool 5 | def main(): 6 | # 指定写入文件的名称 7 | filename = 'test.txt' 8 | # 执行写入功能函数 9 | FileTool.write_behind(filename, 'hello') 10 | FileTool.write_behind(filename, 'world') 11 | print("1.追加单行写\n", FileTool.read_utf8(filename)) 12 | 13 | FileTool.write_behind(filename, '你好!') 14 | print("1.1.追加写中文\n", FileTool.read_utf8(filename)) 15 | 16 | FileTool.write_behind_muti(filename, ['hello', 'world', 'zhangzhang']) 17 | print("2.追加多行写\n", FileTool.read_utf8(filename)) 18 | 19 | FileTool.overwrite(filename, "hello_world!") 20 | print("1.覆写\n", FileTool.read_utf8(filename)) 21 | 22 | FileTool.write_behind(filename, '你好,世界!') 23 | print("1.1.覆写写中文\n", FileTool.read_utf8(filename)) 24 | 25 | 26 | 27 | # 开始进行日期转换 28 | # 转换之后 返回一个结果对象 29 | date = DateTool.get_date('2020 2 22') 30 | #date有三个属性 分别为year,month,day 31 | print("日期转换") 32 | print(date.year) 33 | print(date.month) 34 | print(date.day) 35 | 36 | 37 | if __name__ == '__main__': 38 | main() -------------------------------------------------------------------------------- /BaseTools/test/util-test2.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from MyUtil import FileTool 4 | from MyUtil import MyStr 5 | def html2markdown(input_file_path, output_file_path): 6 | html = FileTool.read_utf8(input_file_path) 7 | mdTxt = MyStr.html2markdown(html) 8 | FileTool.overwrite(output_file_path, mdTxt) 9 | 10 | 11 | if __name__ == '__main__': 12 | html2markdown('data/test-file.html', 'data/result.md') -------------------------------------------------------------------------------- /BaseTools/test/util-test3.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | import ScreenShotUtil as screenshot 4 | 5 | def main(): 6 | screenshot.url2png("https://www.baidu.com/") 7 | 8 | if __name__ == '__main__': 9 | main() -------------------------------------------------------------------------------- /DBTools/MyES.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import os 3 | import time 4 | import csv 5 | from os import walk 6 | from datetime import datetime 7 | from elasticsearch import Elasticsearch 8 | from elasticsearch.helpers import bulk 9 | 10 | class MyESClient(object): 11 | def __init__(self, index_name, index_type, ip ="127.0.0.1", print=False): 12 | ''' 13 | :param index_name: 索引名称 14 | :param index_type: 索引类型 15 | ''' 16 | self.index_name =index_name 17 | self.index_type = index_type 18 | # 无用户名密码状态 19 | self.es = Elasticsearch([ip], port=9200) 20 | #用户名密码状态 21 | self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200) 22 | self.show_es_result = print 23 | 24 | def createIndex(self, index_mappings): 25 | ''' 26 | 创建索引,创建索引名称为ott,类型为ott_type的索引 27 | :param ex: Elasticsearch对象 28 | :return: 29 | ''' 30 | #创建映射 31 | if self.es.indices.exists(index=self.index_name) is not True: 32 | res = self.es.indices.create(index=self.index_name, body=index_mappings) 33 | if self.show_es_result: 34 | print(res) 35 | 36 | def indexDataFromCvsDir(self, cloumnDict): 37 | csvdir = './ElasticSearch/exportExcels' 38 | filenamelist = [] 39 | for (dirpath, dirnames, filenames) in walk(csvdir): 40 | filenamelist.extend(filenames) 41 | break 42 | for file in filenamelist: 43 | csvfile = csvdir + '/' + file 44 | self.indexDataFromCSV(csvfile, 
cloumnDict) 45 | time.sleep(10) 46 | 47 | def indexDataFromCSV(self, csvfile, cloumnList=None): 48 | ''' 49 | 从CSV文件中读取数据,并存储到es中 50 | :param csvfile: csv文件,包括完整路径 51 | :return: 52 | ''' 53 | with open(csvfile) as f: 54 | reader = csv.reader(f) 55 | # 读取一行,下面的reader中已经没有该行了 56 | index = 0 57 | if cloumnList == None: 58 | cloumnList = next(reader) 59 | index = 1 60 | doc = {} 61 | cloumnLength = len(cloumnList) 62 | for item in reader: 63 | if index > 0:#第一行是标题 64 | if cloumnLength <= len(item): 65 | for i in range(cloumnLength): 66 | doc[cloumnList[i]] = item[i] 67 | self.es.index(index=self.index_name, doc_type=self.index_type, body=doc) 68 | index += 1 69 | 70 | def getDataExportCSV(self, csvfile, query={'query': {'match_all': {}}}, cloumnList=None): 71 | ''' 72 | 从数据库导出csv表格 73 | :param csvfile: 74 | :param query: 75 | :param cloumnList: 76 | :return: 77 | ''' 78 | res = self.getDataByBody(query) 79 | if res is not None and len(res['hits']['hits']) > 0: 80 | # fobj = open(csvfile, 'w+') 81 | with open(csvfile, 'w', newline='') as fobj: 82 | if cloumnList == None: 83 | cloumnList = res['hits']['hits'][0]["_source"].keys() 84 | writer = csv.DictWriter(fobj, fieldnames=cloumnList) 85 | writer.writeheader() 86 | for hit in res['hits']['hits']: 87 | writer.writerow(hit["_source"]) 88 | 89 | def indexDataList(self, list=[]): 90 | ''' 91 | 数据存储到es 92 | :return: 93 | ''' 94 | for item in list: 95 | res = self.es.index(index=self.index_name, doc_type=self.index_type, body=item) 96 | if self.show_es_result: 97 | print(res) 98 | 99 | def indexData(self, data, id=None): 100 | ''' 101 | 单条数据添加 102 | :param data: 103 | :return: 104 | ''' 105 | res = self.es.index(index=self.index_name, doc_type=self.index_type, body=data, id=id) 106 | if self.show_es_result: 107 | print(res) 108 | return res 109 | 110 | def bulkIndexData(self, list=[]): 111 | ''' 112 | 用bulk将批量数据存储到es 113 | :return: 114 | ''' 115 | ACTIONS = [] 116 | for line in list: 117 | action = { 118 | "_index": self.index_name, 119 | "_type": self.index_type, 120 | "_source": line 121 | } 122 | ACTIONS.append(action) 123 | # 批量处理 124 | success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True) 125 | if self.show_es_result: 126 | print('Performed %d actions' % success) 127 | return success 128 | 129 | def deleteDataById(self,id): 130 | ''' 131 | 删除索引中的一条 132 | :param id: 133 | :return: 134 | ''' 135 | res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id) 136 | if self.show_es_result: 137 | print(res) 138 | return res 139 | 140 | def getDataId(self,id): 141 | res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id) 142 | # 输出查询到的结果 143 | if self.show_es_result: 144 | print(res) 145 | return res 146 | 147 | def getDataSourceById(self,id): 148 | res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id) 149 | # 输出查询到的结果 150 | if self.show_es_result: 151 | print(res) 152 | if res is not None and len(res['hits']['hits']) > 0: 153 | return res['hits']['hits'][0]["_source"] 154 | else: 155 | return None 156 | 157 | def exit(self, queryBody): 158 | if queryBody == None: 159 | return False 160 | res = self.getDataByBody(queryBody) 161 | if res is not None and len(res['hits']['hits']) > 0: 162 | return True 163 | else: 164 | return False 165 | 166 | def getOneByBody(self, query): 167 | params = {"size":1} 168 | res = self.getDataByBody(query, params) 169 | if res is not None and len(res['hits']['hits']) > 0: 170 | return res['hits']['hits'][0]["_source"] 171 | else: 172 | return 
None 173 | 174 | def getDataByBody(self, queryBody={'query': {'match_all': {}}}, params=None): 175 | # queryBody = {'query': {'match_all': {}}} 176 | _searched = None 177 | if params == None: 178 | _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=queryBody) 179 | else: 180 | _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=queryBody, params=params) 181 | 182 | if self.show_es_result: 183 | print(_searched) 184 | return _searched -------------------------------------------------------------------------------- /DBTools/MyMongoDB.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from pymongo import MongoClient 3 | 4 | class MyMongoClient(object): 5 | def __init__(self, dbname=None, setname=None): 6 | self.dbname = dbname 7 | self.setname = setname 8 | self.client = MongoClient() ##与MongDB建立连接(这是默认连接本地MongDB数据库) 9 | self.db = self.client[dbname] ## 选择一个数据库 10 | self.collection = self.db[setname] ##在这个数据库中,选择一个集合 11 | 12 | def save(self, data): 13 | res = self.collection.save(data) 14 | if SHOW_RESULT: 15 | print(res) 16 | return res 17 | 18 | def getOne(self, query): 19 | res = self.collection.find_one(query) 20 | if SHOW_RESULT: 21 | print(res) 22 | return res 23 | 24 | def isExit(self, query): 25 | if self.getOne(query): 26 | return True 27 | else: 28 | return False 29 | 30 | def get(self, query): 31 | res = self.collection.find(query) 32 | if SHOW_RESULT: 33 | print(res) 34 | return res 35 | 36 | SHOW_RESULT = True 37 | -------------------------------------------------------------------------------- /DBTools/MySqlite.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | #python sqlite 3 | #DB-API 2.0 interface for SQLite databases 4 | 5 | import sqlite3 6 | import os 7 | 8 | ''' 9 | SQLite数据库是一款非常小巧的嵌入式开源数据库软件,也就是说 10 | 没有独立的维护进程,所有的维护都来自于程序本身。 11 | 在python中,使用sqlite3创建数据库的连接,当我们指定的数据库文件不存在的时候 12 | 连接对象会自动创建数据库文件;如果数据库文件已经存在,则连接对象不会再创建 13 | 数据库文件,而是直接打开该数据库文件。 14 | 连接对象可以是硬盘上面的数据库文件,也可以是建立在内存中的,在内存中的数据库 15 | 执行完任何操作后,都不需要提交事务的(commit) 16 | 17 | 创建在硬盘上面: conn = sqlite3.connect('c:\\test\\test.db') 18 | 创建在内存上面: conn = sqlite3.connect('"memory:') 19 | 20 | 下面我们一硬盘上面创建数据库文件为例来具体说明: 21 | conn = sqlite3.connect('c:\\test\\hongten.db') 22 | 其中conn对象是数据库链接对象,而对于数据库链接对象来说,具有以下操作: 23 | 24 | commit() --事务提交 25 | rollback() --事务回滚 26 | close() --关闭一个数据库链接 27 | cursor() --创建一个游标 28 | 29 | cu = conn.cursor() 30 | 这样我们就创建了一个游标对象:cu 31 | 在sqlite3中,所有sql语句的执行都要在游标对象的参与下完成 32 | 对于游标对象cu,具有以下具体操作: 33 | 34 | execute() --执行一条sql语句 35 | executemany() --执行多条sql语句 36 | close() --游标关闭 37 | fetchone() --从结果中取出一条记录 38 | fetchmany() --从结果中取出多条记录 39 | fetchall() --从结果中取出所有记录 40 | scroll() --游标滚动 41 | 42 | ''' 43 | 44 | class MySqlite(object): 45 | def __init__(self, dbpath, tablename, print=False): 46 | self.dbpath = dbpath 47 | self.tablename = tablename 48 | #是否打印sql 49 | self.show_sql = print 50 | #是否打印sql结果 51 | self.show_sql_result = print 52 | 53 | def get_conn(self,path=None): 54 | '''获取到数据库的连接对象,参数为数据库文件的绝对路径 55 | 如果传递的参数是存在,并且是文件,那么就返回硬盘上面改 56 | 路径下的数据库文件的连接对象;否则,返回内存中的数据接 57 | 连接对象''' 58 | if path == None: 59 | path = self.dbpath 60 | if os.path.exists(path) and os.path.isfile(path): 61 | print('硬盘上面:[{}]'.format(path)) 62 | conn = sqlite3.connect(path) 63 | conn.text_factory = str ##!!! 
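# text_factory 决定 TEXT 列返回的类型,这里显式设为 str,让查询结果里的中文直接以字符串形式返回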
64 | return conn 65 | else: 66 | conn = None 67 | print('内存上面:[:memory:]') 68 | return sqlite3.connect(':memory:') 69 | 70 | def get_cursor(self, conn=None): 71 | '''该方法是获取数据库的游标对象,参数为数据库的连接对象 72 | 如果数据库的连接对象不为None,则返回数据库连接对象所创 73 | 建的游标对象;否则返回一个游标对象,该对象是内存中数据 74 | 库连接对象所创建的游标对象''' 75 | if conn is not None: 76 | return conn.cursor() 77 | else: 78 | return self.get_conn().cursor() 79 | 80 | ############################################################### 81 | #### 创建|删除表操作 START 82 | ############################################################### 83 | def dropTable(self, table=None, conn=None): 84 | if table == None: 85 | table = self.tablename 86 | if conn == None: 87 | conn = self.get_conn() 88 | '''如果表存在,则删除表,如果表中存在数据的时候,使用该 89 | 方法的时候要慎用!''' 90 | if table is not None and table != '': 91 | sql = 'DROP TABLE IF EXISTS ' + table 92 | if self.show_sql: 93 | print('执行sql:[{}]'.format(sql)) 94 | cu = self.get_cursor(conn) 95 | cu.execute(sql) 96 | conn.commit() 97 | if self.show_sql_result: 98 | print('删除数据库表[{}]成功!'.format(table)) 99 | self.close_all(conn, cu) 100 | else: 101 | print('the [{}] is empty or equal None!'.format(sql)) 102 | 103 | def createTable(self, sql, conn=None): 104 | if conn == None: 105 | conn = self.get_conn() 106 | '''创建数据库表''' 107 | if sql is not None and sql != '': 108 | cu = self.get_cursor(conn) 109 | if self.show_sql: 110 | print('执行sql:[{}]'.format(sql)) 111 | cu.execute(sql) 112 | conn.commit() 113 | if self.show_sql_result: 114 | print('创建数据库表成功!') 115 | self.close_all(conn, cu) 116 | else: 117 | print('the [{}] is empty or equal None!'.format(sql)) 118 | ############################################################### 119 | #### 创建|删除表操作 END 120 | ############################################################### 121 | 122 | def close_all(self, conn, cu): 123 | '''关闭数据库游标对象和数据库连接对象''' 124 | try: 125 | if cu is not None: 126 | cu.close() 127 | finally: 128 | if cu is not None: 129 | cu.close() 130 | 131 | ############################################################### 132 | #### 数据库操作CRUD START 133 | ############################################################### 134 | def insert(self, sql, data, conn=None): 135 | if conn == None: 136 | conn = self.get_conn() 137 | '''插入数据''' 138 | if sql is not None and sql != '': 139 | if data is not None: 140 | cu = self.get_cursor(conn) 141 | for d in data: 142 | if self.show_sql: 143 | print('执行sql:[{}],参数:[{}]'.format(sql, d)) 144 | cu.execute(sql, d) 145 | conn.commit() 146 | self.close_all(conn, cu) 147 | else: 148 | print('the [{}] is empty or equal None!'.format(sql)) 149 | 150 | def selectAll(self, sql, conn=None): 151 | if conn == None: 152 | conn = self.get_conn() 153 | '''查询所有数据''' 154 | if sql is not None and sql != '': 155 | cu = self.get_cursor(conn) 156 | if self.show_sql: 157 | print('执行sql:[{}]'.format(sql)) 158 | cu.execute(sql) 159 | r = cu.fetchall() 160 | if self.show_sql_result: 161 | if len(r) > 0: 162 | for e in range(len(r)): 163 | print(r[e]) 164 | return r 165 | else: 166 | print('the [{}] is empty or equal None!'.format(sql)) 167 | return None 168 | 169 | def selectOne(self, sql, data, conn=None): 170 | if conn == None: 171 | conn = self.get_conn() 172 | '''查询一条数据''' 173 | if sql is not None and sql != '': 174 | if data is not None: 175 | #Do this instead 176 | d = (data,) 177 | cu = self.get_cursor(conn) 178 | if self.show_sql: 179 | print('执行sql:[{}],参数:[{}]'.format(sql, data)) 180 | cu.execute(sql, d) 181 | r = cu.fetchall() 182 | if self.show_sql_result: 183 | if len(r) > 0: 184 | for e in 
range(len(r)): 185 | print(r[e]) 186 | return r 187 | else: 188 | print('the [{}] equal None!'.format(data)) 189 | else: 190 | print('the [{}] is empty or equal None!'.format(sql)) 191 | return None 192 | 193 | def update(self, sql, data, conn=None): 194 | if conn == None: 195 | conn = self.get_conn() 196 | '''更新数据''' 197 | if sql is not None and sql != '': 198 | if data is not None: 199 | cu = self.get_cursor(conn) 200 | for d in data: 201 | if self.show_sql: 202 | print('执行sql:[{}],参数:[{}]'.format(sql, d)) 203 | cu.execute(sql, d) 204 | conn.commit() 205 | self.close_all(conn, cu) 206 | else: 207 | print('the [{}] is empty or equal None!'.format(sql)) 208 | 209 | def delete(self, sql, data, conn=None): 210 | if conn == None: 211 | conn = self.get_conn() 212 | '''删除数据''' 213 | if sql is not None and sql != '': 214 | if data is not None: 215 | cu = self.get_cursor(conn) 216 | for d in data: 217 | if self.show_sql: 218 | print('执行sql:[{}],参数:[{}]'.format(sql, d)) 219 | cu.execute(sql, d) 220 | conn.commit() 221 | self.close_all(conn, cu) 222 | else: 223 | print('the [{}] is empty or equal None!'.format(sql)) 224 | ############################################################### 225 | #### 数据库操作CRUD END 226 | ############################################################### 227 | 228 | def setDbPath(self, dbpath): 229 | self.dbpath = dbpath 230 | 231 | def setTableName(self, tablename): 232 | self.tablename = tablename 233 | 234 | def openPrint(self): 235 | self.show_sql = True 236 | print('self.show_sql : {}'.format(self.show_sql)) 237 | self.show_sql_result = True 238 | print('self.show_sql_result : {}'.format(self.show_sql_result)) -------------------------------------------------------------------------------- /DBTools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/DBTools/__init__.py -------------------------------------------------------------------------------- /DBTools/test/Parent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") -------------------------------------------------------------------------------- /DBTools/test/es-test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from MyES import MyESClient 4 | 5 | list = [ 6 | {"date": "2017-09-13", 7 | "source": "慧聪网", 8 | "link": "http://info.broadcast.hc360.com/2017/09/130859749974.shtml", 9 | "keyword": "电视", 10 | "title": "付费 电视 行业面临的转型和挑战" 11 | }, 12 | {"date": "2017-09-13", 13 | "source": "中国文明网", 14 | "link": "http://www.wenming.cn/xj_pd/yw/201709/t20170913_4421323.shtml", 15 | "keyword": "电视", 16 | "title": "电视 专题片《巡视利剑》广获好评:铁腕反腐凝聚党心民心" 17 | }, 18 | {"date": "2017-09-13", 19 | "source": "人民电视", 20 | "link": "http://tv.people.com.cn/BIG5/n1/2017/0913/c67816-29533981.html", 21 | "keyword": "电视", 22 | "title": "中国第21批赴刚果(金)维和部隊启程--人民 电视 --人民网" 23 | }, 24 | {"date": "2017-09-13", 25 | "source": "站长之家", 26 | "link": "http://www.chinaz.com/news/2017/0913/804263.shtml", 27 | "keyword": "电视", 28 | "title": "电视 盒子 哪个牌子好? 
吐血奉献三大选购秘笈" 29 | } 30 | ] 31 | 32 | # 提前给elasticsearch安装对应版本的中文分词器 https://github.com/medcl/elasticsearch-analysis-ik 33 | index_mappings = { 34 | "mappings": { 35 | "ott_type": { 36 | "properties": { 37 | "title": { 38 | "type": "text", 39 | "index": True, 40 | "analyzer": "ik_max_word", 41 | "search_analyzer": "ik_max_word" 42 | }, 43 | "date": { 44 | "type": "text", 45 | "index": True 46 | }, 47 | "keyword": { 48 | "type": "text", 49 | "index": False 50 | }, 51 | "source": { 52 | "type": "text", 53 | "index": False 54 | }, 55 | "link": { 56 | "type": "text", 57 | "index": False 58 | } 59 | } 60 | } 61 | } 62 | } 63 | 64 | es = MyESClient("ott", "ott_type", print=True) 65 | 66 | es.createIndex(index_mappings) 67 | 68 | es.indexDataList(list) 69 | 70 | queryBody = { 71 | "query": { 72 | "match": { 73 | "title": "电视" 74 | } 75 | } 76 | } 77 | 78 | es.getDataByBody(queryBody) 79 | 80 | es.getDataExportCSV('es-test/ott.csv') 81 | 82 | es.indexDataFromCSV("es-test/ott.csv") 83 | -------------------------------------------------------------------------------- /DBTools/test/sqlite-test.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*-s 2 | import Parent 3 | from MySqlite import MySqlite 4 | import os 5 | 6 | 7 | ############################################################### 8 | #### 测试操作 START 9 | ############################################################### 10 | def drop_table_test(): 11 | '''删除数据库表测试''' 12 | print('删除数据库表测试...') 13 | sqlite.dropTable(TABLE_NAME) 14 | 15 | 16 | def create_table_test(): 17 | '''创建数据库表测试''' 18 | print('创建数据库表测试...') 19 | create_table_sql = '''CREATE TABLE `student` ( 20 | `id` int(11) NOT NULL, 21 | `name` varchar(20) NOT NULL, 22 | `gender` varchar(4) DEFAULT NULL, 23 | `age` int(11) DEFAULT NULL, 24 | `address` varchar(200) DEFAULT NULL, 25 | `phone` varchar(20) DEFAULT NULL, 26 | PRIMARY KEY (`id`) 27 | )''' 28 | sqlite.createTable(create_table_sql) 29 | 30 | 31 | def save_test(): 32 | '''保存数据测试...''' 33 | print('保存数据测试...') 34 | save_sql = '''INSERT INTO student values (?, ?, ?, ?, ?, ?)''' 35 | data = [(1, 'Hongten', '男', 20, '广东省广州市', 36 | '13423****62'), (2, 'Tom', '男', 22, '美国旧金山', '15423****63'), 37 | (3, 'Jake', '女', 18, '广东省广州市', 38 | '18823****87'), (4, 'Cate', '女', 21, '广东省广州市', '14323****32')] 39 | sqlite.insert(save_sql, data) 40 | 41 | 42 | def fetchall_test(): 43 | '''查询所有数据...''' 44 | print('查询所有数据...') 45 | fetchall_sql = '''SELECT * FROM student''' 46 | sqlite.selectAll(fetchall_sql) 47 | 48 | 49 | def fetchone_test(): 50 | '''查询一条数据...''' 51 | print('查询一条数据...') 52 | fetchone_sql = 'SELECT * FROM student WHERE ID = ? ' 53 | data = 1 54 | sqlite.selectOne(fetchone_sql, data) 55 | 56 | def update_test(): 57 | '''更新数据...''' 58 | print('更新数据...') 59 | update_sql = 'UPDATE student SET name = ? WHERE ID = ? ' 60 | data = [('HongtenAA', 1), ('HongtenBB', 2), ('HongtenCC', 3), ('HongtenDD', 61 | 4)] 62 | sqlite.update(update_sql, data) 63 | 64 | 65 | def delete_test(): 66 | '''删除数据...''' 67 | print('删除数据...') 68 | delete_sql = 'DELETE FROM student WHERE NAME = ? AND ID = ? 
' 69 | data = [('HongtenAA', 1), ('HongtenCC', 3)] 70 | sqlite.delete(delete_sql, data) 71 | 72 | 73 | ############################################################### 74 | #### 测试操作 END 75 | ############################################################### 76 | 77 | 78 | def init(): 79 | '''初始化方法''' 80 | #数据库文件绝句路径 81 | global DB_FILE_PATH 82 | DB_FILE_PATH = os.getcwd() + '/sqlite-test/test.db' 83 | #数据库表名称 84 | global TABLE_NAME 85 | TABLE_NAME = 'student' 86 | 87 | global sqlite 88 | sqlite = MySqlite(DB_FILE_PATH, TABLE_NAME, True) 89 | #如果存在数据库表,则删除表 90 | drop_table_test() 91 | #创建数据库表student 92 | create_table_test() 93 | #向数据库表中插入数据 94 | save_test() 95 | 96 | 97 | def main(): 98 | init() 99 | fetchall_test() 100 | print('#' * 50) 101 | fetchone_test() 102 | print('#' * 50) 103 | update_test() 104 | fetchall_test() 105 | print('#' * 50) 106 | delete_test() 107 | fetchall_test() 108 | 109 | 110 | if __name__ == '__main__': 111 | main() -------------------------------------------------------------------------------- /DBTools/test/sqlite-test/test.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/DBTools/test/sqlite-test/test.db -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 个人python爬虫的学习和实践记录 2 | 3 | 1. [携程签证国家图片和使馆信息爬取](./ctrip.com-visa) 4 | 2. [妹子图图片资源信息爬取](./mzitu.com) 5 | 3. [晋江文学库免费小说爬取](./jjwxk.net) 6 | 4. [花瓣画板异步爬取](./huaban.com) 7 | 5. [wallhaven The best wallpapers on the Net!](./wallhaven.cc) 8 | 9 | >PS:本项目仅学习分享用,请不要用于商业 -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/__init__.py -------------------------------------------------------------------------------- /ctrip.com-visa/Parent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") -------------------------------------------------------------------------------- /ctrip.com-visa/README.md: -------------------------------------------------------------------------------- 1 | ## 背景 2 | 3 | 公司做签证需要国家图片和领事馆信息(该页面目前 404,截止发现时间 2019-06-16) 4 | 5 | ## 启动 6 | 7 | 控制台 cd 到当前目录 8 | 9 | >python xc-visa-lqxx.py 10 | 11 | 相关截图: 12 | 13 | ![国旗](img/ctrip-visa-gqtp.png) 14 | 15 | ![领事馆信息](img/ctrip-visa-lsgxx.png) -------------------------------------------------------------------------------- /ctrip.com-visa/img/ctrip-visa-gqtp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/ctrip.com-visa/img/ctrip-visa-gqtp.png -------------------------------------------------------------------------------- /ctrip.com-visa/img/ctrip-visa-lsgxx.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/ctrip.com-visa/img/ctrip-visa-lsgxx.png -------------------------------------------------------------------------------- /ctrip.com-visa/xc-visa-lqxx.py: -------------------------------------------------------------------------------- 1 | # -*- 
coding:utf-8 -*- 2 | import Parent 3 | from bs4 import BeautifulSoup 4 | import os 5 | import re 6 | # from BaseTools.MyUtil import FileTool 7 | from BaseTools.MyDownload import request 8 | import csv 9 | ## http://vacations.ctrip.com/visa/lsg 10 | ## div.c_con a 11 | ## table.sin_lis td 12 | # lqmc: h4 13 | # lsgmc: p[0] 14 | # lsgdz: p[1] 15 | # lsggzsj: p[3] 16 | class VisaLqxxCrawler(): 17 | def __init__(self): 18 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"} 19 | self.gqtpPath = "./gqtp/" 20 | self.mkdir(self.gqtpPath) 21 | self.mkdir("./lsgxx/") 22 | self.lsgxxFilePath = "./lsgxx/lsgxx.txt" 23 | self.lsgxxCsvPath = "./lsgxx/lsgxx.csv" 24 | self.lsgxxList = [] 25 | def all_url(self, url="http://vacations.ctrip.com/visa/lsg"): 26 | html = self.request(url)##调用request函数把套图地址传进去会返回给我们一个response 27 | all_div = BeautifulSoup(html.text, 'lxml').find_all('div', class_='c_con') 28 | print("一共有 %d 个州" % len(all_div)) 29 | for div in all_div: 30 | all_a = div.find_all('a') 31 | print("该洲一共有 %d 个国家" % len(all_a)) 32 | for a in all_a: 33 | img = a.find("img") 34 | self.headers['referer'] = url 35 | self.save(img["src"]) 36 | href = "http://vacations.ctrip.com" + a['href'] 37 | title = a["title"] 38 | self.currGjmc = title 39 | print(title, href) 40 | self.headers['referer'] = href 41 | self.html(href) 42 | self.exportCsv(self.lsgxxCsvPath) 43 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 44 | try: 45 | html = self.request(href) 46 | #max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 47 | tds = BeautifulSoup(html.text, 'lxml').find('table', class_="sin_lis").find_all('td') 48 | for td in tds: 49 | lsgInfo = {} 50 | lsgInfo["gjmc"] = self.currGjmc 51 | h4 = td.find("h4").get_text() 52 | lsgInfo["lqmc"] = self.trim(h4) 53 | ps = td.find_all('p') 54 | lsgInfo["lqgmc"] = self.trim(ps[0].get_text()) 55 | lsgInfo["lqgdz"] = self.trim(ps[1].get_text()) 56 | lsgInfo["lsggzsj"] = self.trim(ps[2].get_text()) 57 | print(lsgInfo) 58 | self.lsgxxList.append(lsgInfo) 59 | # FileTool.write(self.lsgxxFilePath,lsgInfo.encode("utf-8")) 60 | except Exception as e: 61 | print('发生了异常:', e) 62 | 63 | def exportCsv(self,csvfile, list=None, cloumnList=None): 64 | if list == None: 65 | list = self.lsgxxList 66 | if cloumnList == None and len(list) > 0: 67 | cloumnList = list[0].keys() 68 | # fobj = open(csvfile, 'w+') 69 | # fobj = open(csvfile, 'ab+') 70 | with open(csvfile, 'w', newline='') as fobj: 71 | writer = csv.DictWriter(fobj, fieldnames=cloumnList) 72 | writer.writeheader() 73 | for item in list: 74 | writer.writerow(item) 75 | 76 | def trim(self, myStr): 77 | myStr = re.sub('\n', '', myStr) 78 | myStr = re.sub(' ', '', myStr) 79 | myStr = re.sub('\ufffd', ' ', myStr) 80 | return myStr 81 | 82 | def save(self, img_url): ##这个函数保存图片 83 | try: 84 | index = img_url.rindex("/") 85 | name = img_url[index:] 86 | img = self.request(img_url) 87 | f = open(self.gqtpPath + name, 'ab') 88 | f.write(img.content) 89 | f.close() 90 | except Exception as e: 91 | print('发生了 异常:', e) 92 | 93 | def mkdir(self, path=""): ##这个函数创建文件夹 94 | path = path.strip() 95 | isExists = os.path.exists(path) 96 | if not isExists: 97 | print('建了一个名字叫做', path, '的文件夹!') 98 | os.makedirs(path) 99 | #os.chdir(os.path.join(self.gqtpPath, path)) ##切换到目录 100 | return True 101 | else: 102 | print('名字叫做', path, '的文件夹已经存在了!') 103 | return False 104 | 105 | def request(self, url): ##这个函数获取网页的response 然后返回 
106 | content = request.get(url, headers=self.headers, timeout=3) 107 | return content 108 | 109 | visaLqxxCrawler = VisaLqxxCrawler() 110 | visaLqxxCrawler.all_url() -------------------------------------------------------------------------------- /framework/base_scrapy/README.md: -------------------------------------------------------------------------------- 1 | ## 安装 Scrapy 框架 2 | 3 | ```bash 4 | pip install Scrapy 5 | ``` 6 | 7 | 但是网上都推荐用 Anaconda 安装,初学者建议先安装 Anaconda (请百度安装方法) 8 | 9 | ```bash 10 | conda install scrapy 11 | 或专业点的 ↓ 12 | conda install -c conda-forge scrapy 13 | ``` 14 | 15 | 我用的是 Python3,双环境,所以 16 | 17 | ```bash 18 | pip3 install Scrapy 19 | ``` 20 | 21 | ## 创建 Scrapy 项目 22 | 23 | ```bash 24 | scrapy startproject base_scrapy 25 | 26 | PS:base_scrapy 为项目名,一般看你自己啦 27 | 28 | 于是就生成如下目录和文件: 29 | 30 | base_scrapy 31 | ├── base_scrapy 32 | │ ├── __init__.py 33 | │ ├── __pycache__ 34 | │ ├── items.py 35 | │ ├── middlewares.py 36 | │ ├── pipelines.py 37 | │ ├── settings.py 38 | │ └── spiders 39 | │ ├── __init__.py 40 | │ └── __pycache__ 41 | └── scrapy.cfg 42 | ``` -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/framework/base_scrapy/base_scrapy/__init__.py -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/entrypoint.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from scrapy.cmdline import execute 3 | # 该文件用于调试,第三个变量是 项目名 4 | execute(['scrapy', 'crawl', 'base_scrapy']) -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 在这里定义你的 items,可以定义很多个 class,不同的 spiders 里面引用不同的 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class BaseScrapyItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | url = scrapy.Field() 15 | status = scrapy.Field() 16 | # headers = scrapy.Field() 17 | body = scrapy.Field() 18 | pass -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class BaseScrapySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 
26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Request, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class BaseScrapyDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class BaseScrapyPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for base_scrapy project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://docs.scrapy.org/en/latest/topics/settings.html 9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'base_scrapy' 13 | 14 | SPIDER_MODULES = ['base_scrapy.spiders'] 15 | NEWSPIDER_MODULE = 'base_scrapy.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'base_scrapy (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | #ROBOTSTXT_OBEY = True 23 | # 不遵循 robots.txt 规则 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | #CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | #DOWNLOAD_DELAY = 3 33 | # The download delay setting will honor only one of: 34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 35 | #CONCURRENT_REQUESTS_PER_IP = 16 36 | 37 | # Disable cookies (enabled by default) 38 | #COOKIES_ENABLED = False 39 | 40 | # Disable Telnet Console (enabled by default) 41 | #TELNETCONSOLE_ENABLED = False 42 | 43 | # Override the default request headers: 44 | #DEFAULT_REQUEST_HEADERS = { 45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 46 | # 'Accept-Language': 'en', 47 | #} 48 | 49 | # Enable or disable spider middlewares 50 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 51 | #SPIDER_MIDDLEWARES = { 52 | # 'base_scrapy.middlewares.BaseScrapySpiderMiddleware': 543, 53 | #} 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 57 | #DOWNLOADER_MIDDLEWARES = { 58 | # 'base_scrapy.middlewares.BaseScrapyDownloaderMiddleware': 543, 59 | #} 60 | 61 | # Enable or disable extensions 62 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 63 | #EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | #} 66 | 67 | # Configure item pipelines 68 | # See 
https://docs.scrapy.org/en/latest/topics/item-pipeline.html 69 | #ITEM_PIPELINES = { 70 | # 'base_scrapy.pipelines.BaseScrapyPipeline': 300, 71 | #} 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 75 | #AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | #AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | #AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | #AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | HTTPCACHE_ENABLED = True 89 | HTTPCACHE_EXPIRATION_SECS = 0 90 | HTTPCACHE_DIR = 'httpcache' 91 | HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /framework/base_scrapy/base_scrapy/spiders/base_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 导入scrapy包 3 | import os 4 | import scrapy 5 | from bs4 import BeautifulSoup 6 | # 一个单独的 request 的模块,需要跟进 URL 的时候,需要用它 7 | from scrapy.http import Request 8 | # 这是我定义的需要保存的字段,(导入项目中,items文件中的 BaseScrapyItem 类) 9 | from base_scrapy.items import BaseScrapyItem 10 | 11 | # 在 Scrapy 框架根目录,控制台输入: scrapy crawl base_spider -o data/base_spider/item.json 12 | class BaseSpider(scrapy.Spider): 13 | # 爬虫名字,定义后在项目根目录: scrapy crawl {name} ,运行该爬虫 14 | name = 'base_spider' 15 | # 定义一些常量 16 | data_dir = 'data' 17 | allowed_domains = ['baidu.com'] 18 | bash_url = 'https://www.baidu.com/s?wd=' 19 | 20 | def start_requests(self): 21 | for i in range(1, 10): 22 | url = self.bash_url + str(i) 23 | # 爬取到的页面 提交 给 parse 方法处理 24 | yield Request(url, self.parse) 25 | 26 | def parse(self, response): 27 | ''' 28 | start_requests 已经爬取到页面,那如何提取我们想要的内容,可以在这个方法里面定义。 29 | 也就是用xpath、正则、或是css进行相应提取,这个例子就是让你看看scrapy运行的流程: 30 | 1、定义链接; 31 | 2、通过链接爬取(下载)页面; 32 | 3、定义规则,然后提取数据;(当前步骤) 33 | ''' 34 | # # 根据上面的链接提取个数,文件名:baidu.com-{n}.txt 35 | # file_name = self.allowed_domains[0] + '-' + response.url.split("=")[-1] + '.txt' 36 | # # 文件路径 37 | # file_path = os.path.join(self.data_dir, self.name) 38 | # # 创建文件夹 39 | # if not os.path.exists(file_path): 40 | # os.makedirs(file_path) 41 | # # 拼接文件名 42 | # file_full_name = os.path.join(file_path, file_name) 43 | # with open(file_full_name, 'wb') as f: 44 | # # python文件操作,不多说了; 45 | # f.write(response.body) 46 | # # 打个日志 47 | # self.log('保存文件: %s' % file_full_name) 48 | item = BaseScrapyItem() 49 | item['url'] = response.url 50 | item['status'] = response.status 51 | # item['headers'] = str(response.headers, encoding='utf8') 52 | item['body'] = str(response.body, encoding='utf8') 53 | yield item 
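# --- Illustrative addition (a sketch, not part of the original base_spider.py) ---
# The comment at the top of this file shows the usual way to run the spider:
# `scrapy crawl base_spider -o data/base_spider/item.json` from the project root.
# The same spider can also be launched from a plain Python script; the minimal
# sketch below assumes it is executed from the directory containing scrapy.cfg,
# so that get_project_settings() can locate base_scrapy.settings.

if __name__ == '__main__':
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    # Load the project's settings.py (ROBOTSTXT_OBEY, HTTPCACHE_ENABLED, pipelines, ...)
    process = CrawlerProcess(get_project_settings())
    # Schedule the BaseSpider class defined above and block until the crawl finishes
    process.crawl(BaseSpider)
    process.start()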
-------------------------------------------------------------------------------- /framework/base_scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = base_scrapy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = base_scrapy 12 | -------------------------------------------------------------------------------- /huaban.com/PreviewHtmlTool.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | def saveIndexHtmlFile(save_path, title, border_id, max_page): 3 | template = ''' 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | %(title)s 13 | 112 | 113 | 114 |
115 | %(title)s - 1/%(max_page)s 116 |
117 |
118 |
119 |
120 | 121 |
122 |
123 |
124 |
125 |
126 | 182 | 183 | 184 | ''' 185 | # html = template % {'title':title, 'border_id':border_id, 'max_page':str(max_page)} 186 | html = template.replace("%(title)s", title).replace("%(border_id)s", border_id).replace("%(max_page)s", str(max_page)) 187 | with open(save_path, 'w', encoding='UTF-8') as f: 188 | f.write(html) 189 | 190 | saveIndexHtmlFile("./test.html", "adfaf", "12341", 123) -------------------------------------------------------------------------------- /huaban.com/README.md: -------------------------------------------------------------------------------- 1 | 2 | ## 前情提要 3 | 4 | 最近看到花瓣的一个图集(淡然小笺赋箴言)蛮不错,想用爬虫收集下图集和对应的文字。 5 | 6 | ## 花瓣网 7 | 8 | 0. 测试异步爬虫 9 | 1. 简单文本模式 10 | 2. HTML静态模式 11 | 12 | 13 | ### 测试异步爬虫 14 | 15 | 花瓣里面的所有图片都是异步加载的,需要模拟浏览器操作,简单学习测试一下 selenium 的使用 16 | 17 | 代码来源: 18 | 作者:疯魔的小咸鱼 19 | 链接:https://www.jianshu.com/p/554c6d5af3ca 20 | 21 | PS:selenium 安装注意事项 22 | 23 | - 问题1:selenium 已经放弃 PhantomJS 了,建议使用火狐或者谷歌无界面浏览器。 24 | 25 | 解决方案:selenium 版本降级。 26 | 通过 `pip show selenium` 显示,默认安装版本为 3.14.0。 27 | 将其卸载 `pip uninstall selenium`,重新安装并指定版本号 `pip install selenium==2.48.0`。 28 | 29 | - 问题2: Unable to start phantomjs with ghostdriver: [WinError 2] 系统找不到指定的文件 30 | 31 | 解决方案:下载 phantomjs 到该目录下,或配置 phantomjs 的目录路径到 path 环境变量 32 | 33 | 下载路径 http://phantomjs.org/download.html ,选择对应操作系统下载 34 | 35 | - 问题3:使用 chromedrive ,对应下载地址与版本对照表 36 | 37 | 下载地址:http://npm.taobao.org/mirrors/chromedriver 38 | 39 | 版本对照表:https://blog.csdn.net/yoyocat915/article/details/80580066 40 | 41 | 测试结果: 42 | 43 | ![图片截图](img/huaban-simple-1.png) 44 | 45 | ### 简单文本模式 46 | 47 | 保存画板的里面所有图片信息,包含图片和图片描述 48 | 49 | 以画板 [淡然小笺赋箴言](http://huaban.com/boards/13448395/) 为例: 50 | 51 | ![淡然小笺赋箴言](img/huaban-border-txt.png) 52 | 53 | 保存画板信息的同时,会在同层目录生成一个单 `index.html` 页面 54 | 55 | 我将一部分爬取到的信息(40张)上传到了该项目里面,点击[这里](https://petterobam.github.io/learn-scrapy/huaban.com/%E6%B7%A1%E7%84%B6%E5%B0%8F%E7%AC%BA%E8%B5%8B%E7%AE%B4%E8%A8%80/index.html)预览 56 | 57 | ![预览界面](img/huaban-preview-border.png) 58 | -------------------------------------------------------------------------------- /huaban.com/huaban-border-text.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from selenium import webdriver 3 | import time 4 | import os 5 | import requests 6 | import PreviewHtmlTool 7 | 8 | 9 | class Huaban(): 10 | def __init__(self, username, password): 11 | self.username = username 12 | self.password = password 13 | 14 | # 获取图片和图片文字信息,并存储成文件 15 | def get_picture_info_by_border_url(self, border_url): 16 | 17 | # 使用Chrome浏览器模拟打开网页,但是要把下载的chromedriver.exe放在python的文件路径下, 18 | # 调试好之后换成PhantomJs,速度应该会快一点 19 | # driver = webdriver.PhantomJs() 20 | # driver = webdriver.PhantomJS('../plugin/phantomjs-2.1.1-macosx/bin/phantomjs') 21 | driver = webdriver.Chrome('../plugin/chromedriver') 22 | # 设置全屏 23 | driver.maximize_window() 24 | 25 | if username != None and len(username) > 0: 26 | url = "http://huaban.com" 27 | driver.get(url) 28 | time.sleep(8) 29 | 30 | # 点击登录、呼起登录窗口 31 | driver.find_elements_by_xpath('//a[@class="login bounce btn wbtn"]')[0].click() 32 | # 输入用户名 33 | try: 34 | driver.find_elements_by_xpath('//input[@name="email"]')[0].send_keys(self.username) 35 | print('用户名输入OK!') 36 | except: 37 | print('用户名输入异常!') 38 | time.sleep(3) 39 | # 输入密码 40 | try: 41 | driver.find_elements_by_xpath('//input[@name="password"]')[0].send_keys(self.password) 42 | print('密码输入OK!') 43 | except: 44 | print('密码输入异常!') 45 | time.sleep(3) 46 | # 点击登陆按钮 47 | try: 48 | 
driver.find_elements_by_xpath('//a[@class="btn btn18 rbtn"]')[0].click() 49 | print('点击登陆OK!') 50 | except: 51 | print('点击登陆异常') 52 | time.sleep(3) 53 | 54 | #访问画板,例如 http://huaban.com/boards/13448395/ 55 | driver.get(border_url) 56 | time.sleep(5) 57 | i = 0 58 | page = 1 59 | global name 60 | global store_path 61 | global path 62 | # 获取画板标题 //div[@id="board_card"]/div[@class="inner"]/div[@class="head-line"]/h1 63 | content = driver.find_elements_by_xpath('//div[@id="board_card"]/div[@class="inner"]/div[@class="head-line"]/h1')[0].text 64 | path = "./" + content 65 | # hash_content = str(hash(content)) 66 | # hash_content = border_url[-9:-1] 67 | url_split_list = border_url.split("/") 68 | hash_content = url_split_list[-2] + url_split_list[-1] 69 | 70 | # 保存图片到磁盘文件夹 file_path中,默认为当前脚本运行目录下的文件夹 71 | if not os.path.exists(path): 72 | os.makedirs(path) 73 | #获取图片的总数 //div[@id="board_card"]/div[@class="bar"]/div[@class="tabs"]/a 74 | pictures_count = driver.find_elements_by_xpath('//div[@id="board_card"]/div[@class="bar"]/div[@class="tabs"]/a')[0].text.replace('采集', '') 75 | print(pictures_count) 76 | 77 | # 生成预览用的HTML页面 78 | PreviewHtmlTool.saveIndexHtmlFile(path + "/index.html", content, hash_content, pictures_count) 79 | 80 | pages = int(int(pictures_count) / 20) 81 | print(pages) 82 | #匹配到图片url所在的元素 83 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img') 84 | #匹配图片对应的文字描述 85 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]') 86 | 87 | while page <= pages: 88 | while len(url_elements) < 20 * page: 89 | driver.execute_script("window.scrollBy(0,1000)") 90 | time.sleep(3) 91 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img') 92 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]') 93 | 94 | print("第%s页" % page) 95 | 96 | for url_element in url_elements[20 * (page - 1):20 * page]: 97 | download_url = url_element.get_attribute("src")[:-3] + "658" 98 | pic_info = pic_info_elements[i].get_attribute("data-raw") 99 | i += 1 100 | store_path = hash_content + "_" + str(i) 101 | self.store(download_url, pic_info) 102 | 103 | page += 1 104 | 105 | #最后一页 106 | print("第%s页" % int(page)) 107 | 108 | while len(url_elements) < int(pictures_count): 109 | driver.execute_script("window.scrollBy(0,1000)") 110 | time.sleep(3) 111 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img') 112 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]') 113 | 114 | for url_element in url_elements[20 * (page - 1):]: 115 | download_url = url_element.get_attribute("src")[:-3] + "658" 116 | pic_info = pic_info_elements[i].get_attribute("data-raw") 117 | i += 1 118 | store_path = hash_content + "_" + str(i) 119 | self.store(download_url, pic_info) 120 | 121 | #存储图片到本地 122 | def store(self, picture_url, picture_info): 123 | pic_path = path + '/'+ store_path 124 | 125 | with open(pic_path + '.jpg', 'wb') as f: 126 | picture = requests.get(picture_url) 127 | f.write(picture.content) 128 | print('正在保存图片:' + picture_url) 129 | print(f'文件:{pic_path}.jpg') 130 | 131 | with open(pic_path + '.txt', 'w', encoding='UTF-8') as f: 132 | f.write(picture_info) 133 | print('正在保存图片文字信息:' + picture_url) 134 | print(f'文件:{pic_path}.txt') 135 | 136 | if __name__ == "__main__": 137 | username = input('请输入花瓣账号名:') # '花瓣账号' 138 | password = input('请输入账号对应密码:') # '账号密码' 139 | huaban = Huaban(username, password) 140 | 
#获取画板图片信息[淡然小笺赋箴言] http://huaban.com/boards/13448395/ 141 | border_url = 'http://huaban.com/boards/13448395/' 142 | huaban.get_picture_info_by_border_url(border_url) 143 | -------------------------------------------------------------------------------- /huaban.com/huaban-simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from selenium import webdriver 3 | import time 4 | import os 5 | import requests 6 | 7 | 8 | class Huaban(): 9 | def __init__(self, username, password): 10 | self.username = username 11 | self.password = password 12 | 13 | # 获取图片url并存到列表urls_list 14 | def get_picture_url(self, content): 15 | global path 16 | path = "./" + content 17 | # 保存图片到磁盘文件夹 file_path中,默认为当前脚本运行目录下的文件夹 18 | if not os.path.exists(path): 19 | os.makedirs(path) 20 | url = "http://huaban.com" 21 | # 使用Chrome浏览器模拟打开网页,但是要把下载的 chromedriver.exe 放在python的文件路径下, 22 | # 调试好之后换成 PhantomJs,速度应该会快一点 23 | # driver = webdriver.PhantomJs() 24 | # driver = webdriver.PhantomJS('../plugin/phantomjs-2.1.1-macosx/bin/phantomjs') 25 | driver = webdriver.Chrome('../plugin/chromedriver') 26 | # 设置全屏 27 | driver.maximize_window() 28 | driver.get(url) 29 | time.sleep(8) 30 | 31 | # 点击登录、呼起登录窗口 32 | driver.find_elements_by_xpath('//a[@class="login bounce btn wbtn"]')[0].click() 33 | # 输入用户名 34 | try: 35 | driver.find_elements_by_xpath('//input[@name="email"]')[0].send_keys(self.username) 36 | print('用户名输入OK!') 37 | except: 38 | print('用户名输入异常!') 39 | time.sleep(3) 40 | # 输入密码 41 | try: 42 | driver.find_elements_by_xpath('//input[@name="password"]')[0].send_keys(self.password) 43 | print('密码输入OK!') 44 | except: 45 | print('密码输入异常!') 46 | time.sleep(3) 47 | # 点击登陆按钮 48 | try: 49 | driver.find_elements_by_xpath('//a[@class="btn btn18 rbtn"]')[0].click() 50 | print('点击登陆OK!') 51 | except: 52 | print('点击登陆异常') 53 | time.sleep(3) 54 | #搜索图片 55 | driver.find_elements_by_xpath('//input[@placeholder="搜索你喜欢的"]')[0].send_keys(content) 56 | driver.find_elements_by_xpath('//form[@id="search_form"]/a')[0].click() 57 | time.sleep(5) 58 | i = 0 59 | page = 1 60 | global name 61 | global store_path 62 | global urls_list 63 | urls_list = [] 64 | #获取图片的总数 65 | pictures_count = driver.find_elements_by_xpath('//a[@class="selected"]/i')[0].text 66 | print(pictures_count) 67 | pages = int(int(pictures_count) / 20) 68 | print(pages) 69 | #匹配到图片url所在的元素 70 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img') 71 | #遍历图片元素的列表获取图片的url 72 | for url_element in url_elements: 73 | picture_url = url_element.get_attribute("src")[:-3] + "658" 74 | #防止获取重复的图片url 75 | if picture_url not in urls_list: 76 | urls_list.append(picture_url) 77 | while page <= pages: 78 | while len(urls_list) < 20*page: 79 | driver.execute_script("window.scrollBy(0,1000)") 80 | time.sleep(3) 81 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img') 82 | for url_element in url_elements: 83 | picture_url = url_element.get_attribute("src")[:-3] + "658" 84 | if picture_url not in urls_list: 85 | urls_list.append(picture_url) 86 | print("第%s页" % page) 87 | 88 | for download_url in urls_list[20*(page-1):20*page]: 89 | i += 1 90 | name = content + "_" + str(i) 91 | store_path = name + '.jpg' 92 | self.store(download_url) 93 | page += 1 94 | #最后一页 95 | print("第%s页" % int(page)) 96 | 97 | while len(urls_list) < int(pictures_count): 98 | driver.execute_script("window.scrollBy(0,1000)") 99 | time.sleep(3) 100 | url_elements = 
driver.find_elements_by_xpath('//span[@class="stop"]/../img') 101 | for url_element in url_elements: 102 | picture_url = url_element.get_attribute("src")[:-3] + "658" 103 | if picture_url not in urls_list: 104 | urls_list.append(picture_url) 105 | for download_url in urls_list[20*(page-1): ]: 106 | i += 1 107 | name = content + "_" + str(i) 108 | store_path = name + '.jpg' 109 | self.store(download_url) 110 | 111 | #存储图片到本地 112 | def store(self, picture_url): 113 | picture = requests.get(picture_url) 114 | f = open(os.path.join(path, store_path), 'wb') 115 | f.write(picture.content) 116 | print('正在保存图片:' + picture_url) 117 | print('文件:' + name) 118 | 119 | if __name__ == "__main__": 120 | content = '赵丽颖' 121 | username = input('请输入花瓣账号名:') # '花瓣账号' 122 | password = input('请输入账号对应密码:') # '账号密码' 123 | huaban = Huaban(username, password) 124 | huaban.get_picture_url(content) 125 | -------------------------------------------------------------------------------- /huaban.com/img/huaban-border-txt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-border-txt.png -------------------------------------------------------------------------------- /huaban.com/img/huaban-preview-border.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-preview-border.png -------------------------------------------------------------------------------- /huaban.com/img/huaban-simple-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-simple-1.png -------------------------------------------------------------------------------- /huaban.com/test.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | adfaf 7 | 106 | 107 | 108 |
109 | adfaf - 1/123 110 |
111 |
112 |
113 |
114 | 115 |
116 |
117 |
118 |
119 |
120 | 176 | 177 | 178 | -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_1.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_1.txt: -------------------------------------------------------------------------------- 1 | 入夏偏宜澹薄妆,越罗衣褪郁金黄,翠钿檀注助容光。
相见无言还有恨,几回判却又思量,月窗香径梦悠飏。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_10.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_10.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_10.txt: -------------------------------------------------------------------------------- 1 | 初心已恨花期晚。别后相思长在眼。兰衾犹有旧时香,每到梦回珠泪满。
多应不信人肠断。几夜夜寒谁共暖。欲将恩爱结来生,只恐来生缘又短。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_11.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_11.txt: -------------------------------------------------------------------------------- 1 | 年年此夕东城见,欢意匆匆。明日还重。却在楼台缥缈中。
垂螺拂黛清歌女,曾唱相逢。秋月春风。醉枕香衾一岁同。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_12.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_12.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_12.txt: -------------------------------------------------------------------------------- 1 | 四十年来家国,三千里地山河。凤阁龙楼连霄汉,玉树琼枝作烟萝,几曾识干戈?
一旦归为臣虏,沈腰潘鬓消磨。最是仓皇辞庙日,教坊犹奏别离歌,垂泪对宫娥。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_13.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_13.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_13.txt: -------------------------------------------------------------------------------- 1 | 彩袖殷勤捧玉钟,当年拚却醉颜红。舞低杨柳楼心月,歌尽桃花扇底风。
从别后,忆相逢,几回魂梦与君同。今宵剩把银釭照,犹恐相逢是梦中。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_14.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_14.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_14.txt: -------------------------------------------------------------------------------- 1 | 可怜白雪曲,未遇知音人。
恓惶戎旅下,蹉跎淮海滨。
涧树含朝雨,山鸟哢馀春。
我有一瓢酒,可以慰风尘。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_15.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_15.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_15.txt: -------------------------------------------------------------------------------- 1 | 罗带惹香,犹系别时红豆。泪痕新,金缕旧,断离肠。
一双娇燕语雕梁,还是去年时节。绿杨浓,芳草歇,柳花狂。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_16.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_16.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_16.txt: -------------------------------------------------------------------------------- 1 | 天涯旧恨,独自凄凉人不问。欲见回肠,断尽金炉小篆香。
黛蛾长敛,任是春风吹不展。困倚危楼,过尽飞鸿字字愁。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_17.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_17.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_17.txt: -------------------------------------------------------------------------------- 1 | 人生愁恨何能免,销魂独我情何限!故国梦重归,觉来双泪垂。
高楼谁与上?长记秋晴望。往事已成空,还如一梦中。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_18.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_18.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_18.txt: -------------------------------------------------------------------------------- 1 | 恨君不似江楼月,南北东西,南北东西,只有相随无别离。
恨君却似江楼月,暂满还亏,暂满还亏,待得团圆是几时? -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_19.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_19.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_19.txt: -------------------------------------------------------------------------------- 1 | 多少恨,昨夜梦魂中。还似旧时游上苑,车如流水马如龙。花月正春风。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_2.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_2.txt: -------------------------------------------------------------------------------- 1 | 身外闲愁空满,眼中欢事常稀。明年应赋送君诗。细从今夜数,相会几多时。
浅酒欲邀谁劝,深情惟有君知。东溪春近好同归。柳垂江上影,梅谢雪中枝。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_20.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_20.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_20.txt: -------------------------------------------------------------------------------- 1 | 涉江玩秋水,爱此红蕖鲜。
攀荷弄其珠,荡漾不成圆。
佳人彩云里,欲赠隔远天。
相思无因见,怅望凉风前。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_21.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_21.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_21.txt: -------------------------------------------------------------------------------- 1 | 急景流年真一箭。残雪声中,省识东风面。风里垂杨千万线,昨宵染就鹅黄浅。
又是廉纤春雨暗。倚遍危楼,高处人难见。已恨平芜随雁远,暝烟更界平芜断。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_22.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_22.txt: -------------------------------------------------------------------------------- 1 | 淡烟飘薄。莺花谢、清和院落。树阴翠、密叶成幄。麦秋霁景,夏云忽变奇峰、倚寥廊。波暖银塘,涨新萍绿鱼跃。想端忧多暇,陈王是日,嫩苔生阁。
正铄石天高,流金昼永,楚榭光风转蕙,披襟处、波翻翠幕。以文会友,沈李浮瓜忍轻诺。别馆清闲,避炎蒸、岂须河朔。但尊前随分,雅歌艳舞,尽成欢乐。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_23.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_23.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_23.txt: -------------------------------------------------------------------------------- 1 | 春到南楼雪尽。惊动灯期花信。小雨一番寒。倚阑干。
莫把栏干频倚。一望几重烟水。何处是京华。暮云遮。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_24.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_24.txt: -------------------------------------------------------------------------------- 1 | 柳色披衫金缕凤,纤手轻拈红豆弄,翠蛾双敛正含情。桃花洞,瑶台梦,一片春愁谁与共? -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_25.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_25.txt: -------------------------------------------------------------------------------- 1 | 阑珊心绪,醉倚绿琴相伴住。一枕新愁,残夜花香月满楼。
繁笙脆管,吹得锦屏春梦远。只有垂杨,不放秋千影过墙。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_26.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_26.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_26.txt: -------------------------------------------------------------------------------- 1 | 卷尽愁云,素娥临夜新梳洗。暗尘不起。酥润凌波地。
辇路重来,仿佛灯前事。情如水。小楼熏被。春梦笙歌里。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_27.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_27.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_27.txt: -------------------------------------------------------------------------------- 1 | 杨柳丝丝弄轻柔,烟缕织成愁。海棠未雨,梨花先雪,一半春休。
而今往事难重省,归梦绕秦楼。相思只在:丁香枝上,豆蔻梢头。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_28.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_28.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_28.txt: -------------------------------------------------------------------------------- 1 | 秋池阁。风傍晓庭帘幕。霜叶未衰吹未落。半惊鸦喜鹊。
自笑浮名情薄。似与世人疏略。一片懒心双懒脚。好教闲处著。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_29.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_29.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_29.txt: -------------------------------------------------------------------------------- 1 | 谢却荼蘼,一片月明如水。篆香消,犹未睡,早鸦啼。
嫩寒无赖罗衣薄,休傍阑干角。最愁人,灯欲落,雁还飞。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_3.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_3.txt: -------------------------------------------------------------------------------- 1 | 背庭缘恐花羞坠。心事遥山里。小帘愁卷月笼明。一寸秋怀禁得、几蛩声。
井梧不放西风起。供与离人睡。梦和新月未圆时。起看檐蛛结网、又寻思。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_30.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_30.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_30.txt: -------------------------------------------------------------------------------- 1 | 雨后春容清更丽。只有离人,幽恨终难洗。北固山前三面水。碧琼梳拥青螺髻。
一纸乡书来万里。问我何年,真个成归计。白首送春拚一醉。东风吹破千行泪。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_31.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_31.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_31.txt: -------------------------------------------------------------------------------- 1 | 如花貌。当来便约,永结同心偕老。为妙年、俊格聪明,凌厉多方怜爱,何期养成心性近,元来都不相表。渐作分飞计料。
稍觉因情难供,恁殛恼。争克罢同欢笑。已是断弦尤续,覆水难收,常向人前诵谈,空遣时传音耗。漫悔懊。此事何时坏了。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_32.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_32.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_32.txt: -------------------------------------------------------------------------------- 1 | 西施宜笑复宜颦,丑女效之徒累身。
君王虽爱蛾眉好,无奈宫中妒杀人! -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_33.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_33.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_33.txt: -------------------------------------------------------------------------------- 1 | 十里春风,二分明月,蕊仙飞下琼楼。看冰花翦翦,拥碎玉成毬。想长日、云阶伫立,太真肌骨,飞燕风流。敛群芳、清丽精神,都付扬州。
雨窗数朵,梦惊回、天际香浮。似阆苑花神,怜人冷落,骑鹤来游。为问竹西风景,长空淡、烟水悠悠。又黄昏,羌管孤城,吹起新愁。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_34.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_34.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_34.txt: -------------------------------------------------------------------------------- 1 | 记得来时春未暮,执手攀花,袖染花梢露。暗卜春心共花语,争寻双朵争先去。
多情因甚相辜负,轻拆轻离,欲向谁分诉。泪湿海棠花枝处,东君空把奴分付。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_35.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_35.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_35.txt: -------------------------------------------------------------------------------- 1 | 名花倾国两相欢,常得君王带笑看。
解释春风无限恨,沉香亭北倚栏杆。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_36.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_36.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_36.txt: -------------------------------------------------------------------------------- 1 | 无情最是江头柳。长条折尽还依旧。木叶下平湖。雁来书有无。雁无书尚可。妙语凭谁和。风雨断肠时。小山生桂枝。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_37.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_37.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_37.txt: -------------------------------------------------------------------------------- 1 | 画楼影蘸清溪水。歌声响彻行云里。帘幕燕双双。绿杨低映窗。
曲中特地误。要试周郎顾。醉里客魂消。春风大小乔。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_38.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_38.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_38.txt: -------------------------------------------------------------------------------- 1 | 燕语莺啼人乍远。却恨西园,依旧莺和燕。笑语十分愁一半。翠园特地春光暖。
只道书来无过雁。不道柔肠,近日无肠断。柄玉莫摇湘泪点。怕君唤作秋风扇。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_39.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_39.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_39.txt: -------------------------------------------------------------------------------- 1 | 可怜今夕月,向何处、去悠悠?
是别有人间,那边才见,光影东头?
是天外空汗漫,但长风、浩浩送中秋?
飞镜无根谁系?嫦娥不嫁谁留? 
谓经海底问无由,恍惚使人愁。
怕万里长鲸,纵横触破,玉殿琼楼。
虾蟆故堪浴水,问云何、玉兔解沉浮?
若道都齐无恙,云何渐渐如钩? -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_4.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_4.txt: -------------------------------------------------------------------------------- 1 | 香叆雕盘,寒生冰箸,画堂别是风光。主人情重,开宴出红妆。腻玉圆搓素颈,藕丝嫩、新织仙裳。双歌罢,虚檐转月,余韵尚悠扬。
人间,何处有,司空见惯,应谓寻常。坐中有狂客,恼乱愁肠。报道金钗坠也,十指露、春笋纤长。亲曾见,全胜宋玉,想像赋高唐。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_40.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_40.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_40.txt: -------------------------------------------------------------------------------- 1 | 风骨萧然,称独立、群仙首。春江雪、一枝梅秀。小样香檀,映朗玉、纤纤手。未久。转新声、泠泠山溜。
曲里传情,更浓似、尊中酒。信倾盖、相逢如旧。别后相思,记敏政堂前柳。知否。又拚了、一场消瘦。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_5.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_5.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_5.txt: -------------------------------------------------------------------------------- 1 | 雕阴无树水南流,雉堞连云古帝州。
带雨晚驼鸣远戍,望乡孤客倚高楼。
明妃去日花应笑,蔡琰归时鬓已秋。
一曲单于暮烽起,扶苏城上月如钩。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_6.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_6.txt: -------------------------------------------------------------------------------- 1 | 美女妖且闲,采桑歧路间。
柔条纷冉冉,叶落何翩翩。
攘袖见素手,皓腕约金环。
头上金爵钗,腰佩翠琅玕。
明珠交玉体,珊瑚间木难。
罗衣何飘飘,轻裾随风还。
顾盼遗光彩,长啸气若兰。
行徒用息驾,休者以忘餐。
借问女安居,乃在城南端。
青楼临大路,高门结重关。
容华耀朝日,谁不希令颜?
媒氏何所营?玉帛不时安。
佳人慕高义,求贤良独难。
众人徒嗷嗷,安知彼所观?
盛年处房室,中夜起长叹。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_7.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_7.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_7.txt: -------------------------------------------------------------------------------- 1 | 斑骓路与阳台近。前度无题初借问。暖风鞭袖尽闲垂,微月帘栊曾暗认。
梅花未足凭芳信。弦语岂堪传素恨。翠眉饶似远山长,寄与此愁颦不尽。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_8.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_8.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_8.txt: -------------------------------------------------------------------------------- 1 | 留春不住。恰似年光无味处。满眼飞英。弹指东风太浅情。
筝弦未稳。学得新声难破恨。转枕花前。且占香红一夜眠。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_9.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_9.jpg -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/13448395_9.txt: -------------------------------------------------------------------------------- 1 | 出墙花,当路柳。借问芳心谁有。红解笑,绿能颦。千般恼乱春。
北来人,南去客。朝暮等闲攀折。怜晚芳,惜残阳。情知枉断肠。 -------------------------------------------------------------------------------- /huaban.com/淡然小笺赋箴言/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 淡然小笺赋箴言 11 | 110 | 111 | 112 |
113 | 淡然小笺赋箴言 - 1/40 114 |
115 |
116 |
117 |
118 | 119 |
120 |
121 |
122 |
123 |
124 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /jjwxk.net/Parent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") -------------------------------------------------------------------------------- /jjwxk.net/README.md: -------------------------------------------------------------------------------- 1 | ## 前情提要 2 | 3 | 最近想学习学习怎么写小说,爬点资源分析分析 4 | 5 | ## 晋江文学库 6 | 7 | 免费小说爬取 8 | 9 | 1. 简单模式 10 | 2. HTML静态模式 11 | 3. Sqlite模式 12 | 4. ES模式 13 | 5. ES+ECHARTS模式 14 | 15 | ### 简单模式 16 | 17 | 运行方法:进入到当前目录下 18 | 19 | ``` 20 | python jjwxk-free-simple.py 21 | ``` 22 | 23 | 1. 简单模式是以文本文件作为数据媒介,存储格式用文件夹实现层次,每个子文件夹一本小说数据 24 | 2. 所有小说列表和小说具体内容爬取步骤分离,可以分两个线程运行 free_list 和 book_list 方法,每个方法不支持多进程运行 25 | 3. 实现了简单的断点续爬,利用文本文件记录爬取进度,包括小说列表进度、已完成的小说进度、当前这本小说章节进度 26 | 27 | 部分截图: 28 | 29 | ![所有小说图片截图](img/jjwxk-free-simple-1.png) 30 | 31 | ![单本小说图片截图](img/jjwxk-free-simple-2.png) 32 | 33 | >PS:本项目仅学习分享用,请不要用于商业 -------------------------------------------------------------------------------- /jjwxk.net/img/jjwxk-free-simple-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/jjwxk.net/img/jjwxk-free-simple-1.png -------------------------------------------------------------------------------- /jjwxk.net/img/jjwxk-free-simple-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/jjwxk.net/img/jjwxk-free-simple-2.png -------------------------------------------------------------------------------- /jjwxk.net/jjwxk-free-simple.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from bs4 import BeautifulSoup 4 | from BaseTools.MyDownload import request 5 | from BaseTools.MyUtil import FileTool 6 | import time 7 | 8 | class jjwxk_free_simple(): 9 | def __init__(self): 10 | self.headers = { 11 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 12 | # 'Accept-Encoding': 'gzip, deflate', 13 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8', 14 | 'Host': 'www.jjwxc.net', 15 | 'Upgrade-Insecure-Requests': '1', 16 | 'User-Agent':"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24" 17 | } 18 | self.basePath = "jjwxk_free_simple/" 19 | FileTool.mkdir(self.basePath) 20 | self.baseListFilePath = self.basePath + "book-list.txt" 21 | self.baseUrlFilePath = self.basePath + "book-url.txt" 22 | self.basePageFilePath = self.basePath + "book-page.txt" 23 | self.totalFinishFilePath = self.basePath + "total.txt" 24 | self.finishBookLineCountFilePath = self.basePath + "book-total.txt" 25 | self.globalPageCount = 0 26 | self.pageCount = 0 27 | self.lineCount = 0 28 | self.readFinishCountInfo() 29 | 30 | # 抓取入口:默认 http://www.jjwxc.net/bookbase_slave.php?booktype=free 31 | def free_list(self, limitPage=1, url="http://www.jjwxc.net/bookbase_slave.php?booktype=free"): 32 | html_content = self.request_content(url) ##调用request_content返回html文本给我们 33 | FileTool.write_behind(self.basePageFilePath, url) 34 | html_ele = BeautifulSoup(html_content, 'lxml') 35 | self.globalPageCount = self.globalPageCount + 1 36 | if self.globalPageCount > limitPage: 37 | return 38 | 39 | if 
self.globalPageCount >= self.pageCount: 40 | # 如果当前页码比记录的页码大,行数从第一行开始记录,否则就当前页码记录 41 | if(self.globalPageCount > self.pageCount): 42 | self.lineCount = 0 43 | self.pageCount = self.globalPageCount 44 | 45 | # 获取图书表格元素 46 | book_table = html_ele.find("table", class_="cytable") 47 | if book_table == None: 48 | return 49 | list_tr = book_table.find_all("tr") 50 | count = -1 51 | for tr in list_tr: 52 | count = count + 1 53 | if count == 0 or self.lineCount >= count: 54 | continue 55 | list_td = tr.find_all("td") 56 | book_list_url = None 57 | book_info_arr = [] 58 | count_td = 0 59 | for td in list_td: 60 | book_info_arr.append(td.get_text().replace('\n', '').replace(' ', '')) 61 | if count_td == 1: 62 | book_list_url = "http://www.jjwxc.net/" + td.find("a")['href'] 63 | count_td = count_td + 1 64 | FileTool.write_behind(self.baseUrlFilePath, book_list_url) 65 | book_list_info = " | ".join(book_info_arr) 66 | FileTool.write_behind(self.baseListFilePath, book_list_info) 67 | self.lineCount = count 68 | # 完成一行,记录一下count信息,便于后面断点爬取 69 | self.saveFinishCountInfo() 70 | else: 71 | self.globalPageCount = self.pageCount - 1 72 | 73 | # page_next = "http://www.jjwxc.net/" + html_ele.find_all("div", class_="controlbar")[1].find_all("a")[2]["href"] 74 | page_next = "http://www.jjwxc.net/bookbase_slave.php?booktype=free&opt=&endstr=&orderstr=4&page=" + str(self.globalPageCount + 1) 75 | if page_next == None or "" == page_next: 76 | return 77 | print("书籍清单第", self.globalPageCount, "页信息:[", url, "]抓取完毕") 78 | # 暂停一秒,防止爬虫被发现 79 | # time.sleep(1) 80 | self.headers['Referer'] = url 81 | # 继续拉取下一页 82 | self.free_list(limitPage, page_next) 83 | 84 | # 从保存的书籍链接记录里面抓取每一本书的内容 85 | def book_list(self): 86 | book_count = 0 87 | book_finish_count = self.readSimpleFinishCountInfo(self.finishBookLineCountFilePath) 88 | for line in open(self.baseUrlFilePath): 89 | # 逐行读取此前爬取的书籍链接,去掉最后的换行符号 90 | url = line.replace("\n", "") 91 | book_count = book_count + 1 92 | if book_count <= book_finish_count: 93 | print("[", url, "],该本书已经抓取过!") 94 | continue 95 | self.book_one(url) 96 | # 记录抓取书的数量,实现简单断点续爬 97 | FileTool.overwrite(self.finishBookLineCountFilePath, str(book_count)) 98 | print("[", url, "],该本书所有章节已经抓取完毕!") 99 | 100 | # 保存一本书的内容 101 | def book_one(self, url="http://www.jjwxc.net/onebook.php?novelid=3468871"): 102 | html_content = self.request_content(url) ##调用request_content返回html文本给我们 103 | html_ele = BeautifulSoup(html_content, 'lxml') 104 | # 获取图书表格元素 105 | book_table = html_ele.find("table", id="oneboolt") 106 | list_tr = book_table.find_all("tr") 107 | self.headers['Referer'] = url 108 | if len(list_tr) > 0: 109 | book_title = list_tr[0].find("h1").get_text() 110 | # 去掉文件夹中特殊字符,防止小说名中特殊字符 111 | book_floder = self.basePath + FileTool.replace_invalid_filename(book_title) + "/" 112 | FileTool.mkdir(book_floder) 113 | book_chapter_file = book_floder + "0.chapter_list.txt" 114 | book_chapter_url_file = book_floder + "0.chapter_url_list.txt" 115 | book_chapter_finish_count_file = book_floder + "0.current_count.txt" 116 | chapter_count = 0 117 | chapter_finish_count = self.readSimpleFinishCountInfo(book_chapter_finish_count_file) 118 | for tr in list_tr: 119 | if "itemprop" in tr.attrs: 120 | chapter_count = chapter_count + 1 121 | if chapter_count <= chapter_finish_count: 122 | print("第", chapter_count, "章,该章节已经抓取过!") 123 | continue 124 | list_td = tr.find_all("td") 125 | count_td = 0 126 | chapter_info_arr = [] 127 | chapter_url = None 128 | chapter_title = None 129 | for td in list_td: 130 | 
chapter_info_arr.append(td.get_text().replace('\n', '').replace(' ', '')) 131 | if count_td == 1: 132 | chapter_a = td.find("a") 133 | if chapter_a != None: 134 | chapter_url = chapter_a['href'] 135 | chapter_title = chapter_a.get_text() 136 | count_td = count_td + 1 137 | if chapter_url == None: 138 | print("第", chapter_count, "章,该章节已丢失!") 139 | chapter_url = "第" + str(chapter_count) + "章,该章节已丢失!" 140 | else: 141 | # 去掉文件名中的特殊字符 142 | curr_filename = FileTool.replace_invalid_filename(str(chapter_count) + "." + chapter_title + ".txt") 143 | curr_chapter_file_path = book_floder + curr_filename 144 | self.save_chapter(curr_chapter_file_path, chapter_url) 145 | FileTool.write_behind(book_chapter_url_file, chapter_url) 146 | chapter_info = " | ".join(chapter_info_arr) 147 | FileTool.write_behind(book_chapter_file, chapter_info) 148 | # 记录完成的章节数,简单实现断点续爬 149 | FileTool.overwrite(book_chapter_finish_count_file, str(chapter_count)) 150 | print("第", chapter_count, "章,该章节已经抓取完毕!") 151 | 152 | # 保存一个章节的内容 153 | def save_chapter(self, path, chapter_url): 154 | html_content = self.request_content(chapter_url) ##调用request_content返回html文本给我们 155 | html_ele = BeautifulSoup(html_content, 'lxml') 156 | novelDiv = html_ele.find("div", class_="noveltext") 157 | if novelDiv == None: 158 | return 159 | novelHtmls = novelDiv.contents 160 | novelTextArr = [] 161 | # 处理小说文本数据,保证简单换行,保证基本格式 162 | for novelHtml in novelHtmls: 163 | if novelHtml.name == "div" or novelHtml.name == "br": 164 | continue 165 | else: 166 | text = novelHtml.string 167 | if text == None: 168 | continue 169 | text = text.replace('\n', '').replace("\r", "").replace(" ", "") 170 | if len(text) > 0: 171 | novelTextArr.append(text) 172 | novelText = "\n\n".join(novelTextArr) 173 | FileTool.overwrite(path, novelText) 174 | 175 | 176 | # 读取简单的数字信息 177 | def readSimpleFinishCountInfo(self, path): 178 | isExists = FileTool.isExit(path) 179 | if isExists: 180 | countTxt = FileTool.read_utf8(path) 181 | return int(countTxt) 182 | else: 183 | return 0 184 | 185 | # 保存已完成的条数信息 186 | def saveFinishCountInfo(self): 187 | FileTool.overwrite(self.totalFinishFilePath, str(self.pageCount) + "-" + str(self.lineCount)) 188 | 189 | # 读取已完成的条数信息 190 | def readFinishCountInfo(self): 191 | isExists = FileTool.isExit(self.totalFinishFilePath) 192 | if isExists: 193 | countTxt = FileTool.read_utf8(self.totalFinishFilePath) 194 | countStrArr = countTxt.split("-") 195 | self.pageCount = int(countStrArr[0]) 196 | self.lineCount = int(countStrArr[1]) 197 | else: 198 | self.pageCount = 0 199 | self.lineCount = 0 200 | 201 | # 获取网页html文本内容 202 | def request_content(self, url): 203 | try: 204 | return request.get_utf8_content(url, headers=self.headers) 205 | except: 206 | return "" 207 | 208 | 209 | jjwxk = jjwxk_free_simple() 210 | jjwxk.free_list() 211 | # while jjwxk.globalPageCount < 10000: 212 | # try: 213 | # jjwxk.free_list() 214 | # except Exception as e: 215 | # print('except:', e) 216 | # finally: 217 | # print('finally...') 218 | jjwxk.book_list() -------------------------------------------------------------------------------- /jjwxk.net/simple-http-server.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # python2.0为SimpleHTTPServer 3 | import http.server 4 | # python2.0为SocketServer 5 | import socketserver 6 | # 自定义端口 7 | PORT = 8888 8 | # 服务句柄定义 9 | Handler = http.server.SimpleHTTPRequestHandler 10 | # TCP服务 11 | httpd = socketserver.TCPServer(("", PORT), Handler) 12 | # 启动Web服务 13 | 
print("Web服务端口为:", PORT) 14 | httpd.serve_forever() -------------------------------------------------------------------------------- /mzitu.com/Parent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") -------------------------------------------------------------------------------- /mzitu.com/README.md: -------------------------------------------------------------------------------- 1 | ## 前情提要 2 | 3 | 爬取 妹子图 网站的 图片 4 | 学习python的一个练手例子,来自 https://cuiqingcai.com/4352.html 5 | 6 | ## 可启动文件 7 | 8 | 1. scrapy-mzitu-no-es.py,简单文件夹格式目录存储,简单断点续爬,未使用数据库 9 | 2. scrapy-mzitu-es.py,基于ES数据库存储,图片相对本目录存储路径,断点续爬 10 | 3. mzitu-crawler-es.py,简单多线程爬取,基于ES数据库存储,图片相对本目录存储路径,断点续爬 11 | 12 | ## 启动方法 13 | 14 | 控制台 cd 到当前目录 15 | 16 | >python filename.py 17 | 18 | >PS:本项目仅学习分享用,请不要用于商业 -------------------------------------------------------------------------------- /mzitu.com/mzitu-crawler-es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import time 3 | import threading 4 | import multiprocessing 5 | from mzitu_for_thread import MzituThread 6 | from mzitu_es import mzitu_es 7 | 8 | SLEEP_TIME = 1 9 | def mzitu_crawler(max_threads=5): 10 | def pageurl_crawler(): 11 | mzituThread = MzituThread(mzitu_es) 12 | while True: 13 | if mzituThread.scrapy_one() is not True: 14 | time.sleep(SLEEP_TIME) 15 | 16 | threads = [] 17 | while True: 18 | """ 19 | threads 或者 crawl_queue为真都代表我们还没下载完成,程序就会继续执行 20 | """ 21 | for thread in threads: 22 | if not thread.is_alive(): ##is_alive是判断是否为空,不是空则在队列中删掉 23 | threads.remove(thread) 24 | while len(threads) < max_threads: ##线程池中的线程少于max_threads 或者 crawl_qeue时 25 | thread = threading.Thread(target=pageurl_crawler) ##创建线程 26 | thread.setDaemon(True) ##设置守护线程 27 | thread.start() ##启动线程 28 | threads.append(thread) ##添加进线程队列 29 | time.sleep(SLEEP_TIME) 30 | 31 | def process_crawler(): 32 | process = [] 33 | num_cpus = multiprocessing.cpu_count() 34 | print('将会启动进程数为:', num_cpus) 35 | for i in range(num_cpus): 36 | p = multiprocessing.Process(target=mzitu_crawler) ##创建进程 37 | p.start() ##启动进程 38 | process.append(p) ##添加进进程队列 39 | for p in process: 40 | p.join() ##等待进程队列里面的进程结束 41 | 42 | if __name__ == "__main__": 43 | #mzituThread = MzituThread(mzitu_es) 44 | #mzituThread.all_url() # 抓取所有需要带处理的链接 45 | process_crawler() -------------------------------------------------------------------------------- /mzitu.com/mzitu_es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from DBTools.MyES import MyESClient 4 | from datetime import datetime 5 | 6 | class MzituEs(): 7 | def __init__(self): 8 | self.init_es() 9 | 10 | def init_es(self): 11 | self.esindex = "mzitu" 12 | self.estype = "mzitu_imgs" 13 | index_mappings = { 14 | "mappings": { 15 | self.estype: { 16 | "properties": { 17 | "imgThemeTitle": { 18 | "type": "text", 19 | "index": True, 20 | "analyzer": "ik_max_word", 21 | "search_analyzer": "ik_max_word" 22 | }, 23 | "imgThemeUrl": { 24 | "type": "keyword", 25 | "index": True 26 | }, 27 | "createTime": { 28 | "type": "date", 29 | "index": True 30 | }, 31 | "scrapyStatus":{ 32 | "type": "integer", 33 | "index": True, 34 | # 0,1,2 待爬取,爬取中,已完成 35 | "null_value": 0 36 | } 37 | } 38 | } 39 | } 40 | } 41 | self.es = MyESClient(self.esindex, self.estype) 42 | self.es.createIndex(index_mappings) 43 | self.currdata = {} 44 | self.currdata["imgUrlList"] = [] 45 | 46 | def 
save_es(self, data=None): 47 | ''' 48 | 存储当前数据到ES,并清空 49 | :return: 50 | ''' 51 | if data == None: 52 | data = self.currdata 53 | data["createTime"] = datetime.now() 54 | data["scrapyStatus"] = 0 55 | self.currdata = {} 56 | self.currdata["imgUrlList"] = [] 57 | self.es.indexData(data, data["imgThemeUrl"]) 58 | 59 | def get_one_need_scrapy_es(self): 60 | ''' 61 | 从ES库中找一个待爬取的数据 62 | ''' 63 | queryBody = { 64 | "query": { 65 | "bool": { 66 | "must": [ 67 | { 68 | "term": { 69 | "scrapyStatus": { 70 | "value": 0 71 | } 72 | } 73 | } 74 | ] 75 | } 76 | } 77 | } 78 | res = self.es.getOneByBody(queryBody) 79 | return res 80 | 81 | def get_by_themeId_es(self, themeId): 82 | res = self.es.getDataSourceById(themeId) 83 | return res 84 | 85 | def exit_es(self, themeurl): 86 | queryBody = { 87 | "query": { 88 | "bool": { 89 | "must": [ 90 | { 91 | "term": { 92 | "imgThemeUrl": { 93 | "value": themeurl 94 | } 95 | } 96 | } 97 | ] 98 | } 99 | } 100 | } 101 | if self.es.exit(queryBody): 102 | print("ES数据库里面已经存在!!") 103 | return True 104 | else: 105 | return False 106 | 107 | mzitu_es = MzituEs() -------------------------------------------------------------------------------- /mzitu.com/mzitu_for_thread.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | import datetime 4 | from bs4 import BeautifulSoup 5 | import os 6 | # import lxml 7 | from BaseTools.MyDownload import request ##导入模块变了一下 8 | 9 | class MzituThread(object): 10 | def __init__(self, mzitu_es): 11 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"} 12 | self.currPath = "./mzitu/" 13 | self.currdata = {} 14 | self.currdata["imgUrlList"] = [] 15 | self.es = mzitu_es 16 | 17 | def all_url(self, url='http://www.mzitu.com/all'): 18 | html = self.request(url)##调用request函数把套图地址传进去会返回给我们一个response 19 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a') 20 | for a in all_a: 21 | try: 22 | title = a.get_text() 23 | href = a['href'] 24 | print(title, href) ##加点提示不然太枯燥了 25 | if self.es.exit_es(href): 26 | continue 27 | self.currdata["imgThemeTitle"] = title 28 | self.currdata["imgThemeUrl"] = href 29 | self.es.save_es(self.currdata) 30 | except Exception as e: 31 | print(e) 32 | continue 33 | 34 | def scrapy_one(self, url=None): 35 | try: 36 | data = None 37 | if url == None: 38 | data = self.es.get_one_need_scrapy_es() 39 | else: 40 | data = self.es.get_by_themeId_es(url) 41 | 42 | if data == None: 43 | return False 44 | else: 45 | data["scrapyStatus"]=1 46 | self.es.save_es(data) ## 更新状态为爬取中 47 | href = data["imgThemeUrl"] 48 | self.mkdir(href) ##调用mkdir函数创建文件夹! 49 | self.html(href, data) ##调用html函数把href参数传递过去! 
50 | data["scrapyStatus"]=2 51 | self.es.save_es(data) ## 保存数据,并更新状态为已完成 52 | return True 53 | except Exception as e: 54 | print(e) 55 | return False 56 | 57 | 58 | def html(self, href, data=None): ##这个函数是处理套图地址获得图片的页面地址 59 | try: 60 | html = self.request(href) 61 | self.headers['referer'] = href 62 | ## max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 63 | # max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text() 64 | max_span = 100 65 | pageDiv = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi') 66 | if len(pageDiv) > 1: 67 | max_span = pageDiv.find_all('span')[-2].get_text() 68 | for page in range(1, int(max_span) + 1): 69 | page_url = href + '/' + str(page) 70 | self.img(page_url, data) ##调用img函数 71 | except Exception as e: 72 | print('发生了异常:', e) 73 | 74 | def img(self, page_url, data=None): ##这个函数处理图片页面地址获得图片的实际地址 75 | img_html = self.request(page_url) 76 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 77 | print(img_url) 78 | self.saveImg(img_url, data) 79 | 80 | def saveImg(self, img_url, data=None): ##这个函数保存图片 81 | name = img_url[-9:-4] 82 | currUrl = self.currPath + name + '.jpg' 83 | isExists = os.path.exists(currUrl) 84 | if not isExists: 85 | img = self.request(img_url) 86 | f = open(currUrl, 'ab') 87 | f.write(img.content) 88 | f.close() 89 | print('该图片下载完毕') 90 | if data == None: 91 | self.currdata["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl}) 92 | else: 93 | data["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl}) 94 | else: 95 | print('该图片已经存在') 96 | 97 | def mkdir(self, path): ##这个函数创建文件夹 98 | if USE_ONE_DIR: 99 | path = "" 100 | elif USE_DEF_DIR: 101 | if path == None: 102 | path = self.currdata["imgThemeUrl"] 103 | index = path.rindex("/") 104 | path = path[index + 1:] 105 | else: 106 | path = path.strip() 107 | isExists = os.path.exists(os.path.join("./mzitu", path)) 108 | if not isExists: 109 | print('建了一个名字叫做', path, '的文件夹!') 110 | os.makedirs(os.path.join("./mzitu", path)) 111 | self.currPath = "./mzitu/" + path + "/" 112 | ## os.chdir(os.path.join("./mzitu", path)) ##切换到目录 113 | return True 114 | else: 115 | print('名字叫做', self.currPath, '的文件夹已经存在了!') 116 | return False 117 | 118 | def request(self, url): ##这个函数获取网页的response 然后返回 119 | content = request.get(url, headers=self.headers, timeout=3) 120 | return content 121 | 122 | 123 | 124 | USE_ONE_DIR = False 125 | USE_DEF_DIR = True 126 | 127 | #mzituThread = MzituThread() ##实例化 128 | #mzituThread.all_url() 129 | #mzituThread.scrapy_one() -------------------------------------------------------------------------------- /mzitu.com/scrapy-mzitu-es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | import datetime 4 | from bs4 import BeautifulSoup 5 | import os 6 | # import lxml 7 | from BaseTools.MyDownload import request ##导入模块变了一下 8 | from mzitu_es import mzitu_es 9 | 10 | class mzitu(): 11 | 12 | def __init__(self): 13 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"} 14 | self.currPath = "./mzitu/" 15 | self.currdata = {} 16 | self.currdata["imgUrlList"] = [] 17 | 18 | def all_url(self, url): 19 | html = self.request(url)##调用request函数把套图地址传进去会返回给我们一个response 20 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', 
class_="archives").find_all('a') 21 | for a in all_a: 22 | title = a.get_text() 23 | href = a['href'] 24 | print(title, href) ##加点提示不然太枯燥了 25 | if mzitu_es.exit_es(href): 26 | continue 27 | self.currdata["imgThemeTitle"] = title 28 | self.currdata["imgThemeUrl"] = href 29 | #path = str(title).replace("?", '_') ##我注意到有个标题带有 ? 这个符号Windows系统是不能创建文件夹的所以要替换掉 30 | self.mkdir(title) ##调用mkdir函数创建文件夹!这儿path代表的是标题title哦!!!!!不要糊涂了哦! 31 | self.html(href) ##调用html函数把href参数传递过去!href是啥还记的吧? 就是套图的地址哦!!不要迷糊了哦! 32 | mzitu_es.save_es(self.currdata) 33 | 34 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 35 | try: 36 | html = self.request(href) 37 | self.headers['referer'] = href 38 | #max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 39 | max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text() 40 | for page in range(1, int(max_span) + 1): 41 | page_url = href + '/' + str(page) 42 | self.img(page_url) ##调用img函数 43 | except Exception as e: 44 | print('发生了异常:', e) 45 | 46 | def img(self, page_url): ##这个函数处理图片页面地址获得图片的实际地址 47 | img_html = self.request(page_url) 48 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 49 | print(img_url) 50 | self.saveImg(img_url) 51 | 52 | def saveImg(self, img_url): ##这个函数保存图片 53 | name = img_url[-9:-4] 54 | currUrl = self.currPath + name + '.jpg' 55 | isExists = os.path.exists(currUrl) 56 | if not isExists: 57 | img = self.request(img_url) 58 | f = open(currUrl, 'ab') 59 | f.write(img.content) 60 | f.close() 61 | print('该图片下载完毕') 62 | self.currdata["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl}) 63 | else: 64 | print('该图片已经存在') 65 | 66 | def mkdir(self, path): ##这个函数创建文件夹 67 | if USE_ONE_DIR: 68 | path = "" 69 | elif USE_DEF_DIR: 70 | path = self.currdata["imgThemeUrl"] 71 | index = path.rindex("/") 72 | path = path[index + 1:] 73 | else: 74 | path = path.strip() 75 | isExists = os.path.exists(os.path.join("./mzitu", path)) 76 | if not isExists: 77 | print('建了一个名字叫做', path, '的文件夹!') 78 | os.makedirs(os.path.join("./mzitu", path)) 79 | self.currPath = "./mzitu/" + path + "/" 80 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录 81 | return True 82 | else: 83 | print('名字叫做', self.currPath, '的文件夹已经存在了!') 84 | return False 85 | 86 | def request(self, url): ##这个函数获取网页的response 然后返回 87 | content = request.get(url, headers=self.headers, timeout=3) 88 | return content 89 | 90 | USE_ONE_DIR = False 91 | USE_DEF_DIR = True 92 | Mzitu = mzitu() ##实例化 93 | Mzitu.all_url('http://www.mzitu.com/all') ##给函数all_url传入参数 你可以当作启动爬虫(就是入口) 94 | -------------------------------------------------------------------------------- /mzitu.com/scrapy-mzitu-no-es.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from bs4 import BeautifulSoup 4 | import os 5 | from BaseTools.MyDownload import request 6 | 7 | class mzitu(): 8 | def __init__(self): 9 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"} 10 | self.basePath = "./mzitu-no-es/" 11 | self.currPath = self.basePath 12 | self.mkdir(self.basePath) 13 | self.totalFinishPath = "./mzitu-no-es/totalPage.txt" 14 | self.totalFinish = self.getTotalFinish() 15 | 16 | def all_url(self, url): 17 | html = self.request(url)##调用request函数把套图地址传进去会返回给我们一个response 18 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', 
class_="archives").find_all('a') 19 | count = 0 20 | for a in all_a: 21 | count = count + 1 22 | if count > self.totalFinish: 23 | self.overwriteTotalFinish(count) 24 | else: 25 | print("第", count, "页已经抓取过,跳过!") 26 | continue 27 | title = a.get_text() 28 | href = a['href'] 29 | print(title, href) ##加点提示不然太枯燥了 30 | #path = str(title).replace("?", '_') ##我注意到有个标题带有 ? 这个符号Windows系统是不能创建文件夹的所以要替换掉 31 | self.mkdir(title) ##调用mkdir函数创建文件夹!这儿path代表的是标题title哦!!!!!不要糊涂了哦! 32 | self.html(href) ##调用html函数把href参数传递过去!href是啥还记的吧? 就是套图的地址哦!!不要迷糊了哦! 33 | self.totalFinish = count 34 | 35 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址 36 | try: 37 | html = self.request(href) 38 | self.headers['referer'] = href 39 | # max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text() 40 | # max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text() 41 | max_span = 100 42 | pageDiv = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi') 43 | if len(pageDiv) > 1: 44 | max_span = pageDiv.find_all('span')[-2].get_text() 45 | for page in range(1, int(max_span) + 1): 46 | page_url = href + '/' + str(page) 47 | self.img(page_url) ##调用img函数 48 | except Exception as e: 49 | print('发生了异常:', e) 50 | 51 | def img(self, page_url): ##这个函数处理图片页面地址获得图片的实际地址 52 | img_html = self.request(page_url) 53 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src'] 54 | print(img_url) 55 | self.saveImg(img_url) 56 | 57 | def saveImg(self, img_url): ##这个函数保存图片 58 | name = img_url[-9:-4] 59 | imgPath = self.currPath + name + '.jpg' 60 | isExists = os.path.exists(imgPath) 61 | if not isExists: 62 | img = self.request(img_url) 63 | f = open(imgPath, 'ab') 64 | f.write(img.content) 65 | f.close() 66 | print('该图片下载完毕') 67 | else: 68 | print('该图片已经存在') 69 | 70 | def mkdir(self, path): ##这个函数创建文件夹 71 | if USE_ONE_DIR: 72 | path = "" 73 | elif USE_DEF_DIR: 74 | index = path.rindex("/") 75 | path = path[index + 1:] 76 | else: 77 | path = path.strip() 78 | self.currPath = os.path.join(self.basePath, path) 79 | isExists = os.path.exists(self.currPath) 80 | if not isExists: 81 | print('建了一个名字叫做', path, '的文件夹!') 82 | os.makedirs(self.currPath) 83 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录 84 | return True 85 | else: 86 | print('名字叫做', self.currPath, '的文件夹已经存在了!') 87 | return False 88 | 89 | 90 | def request(self, url): ##这个函数获取网页的response 然后返回 91 | content = request.get(url, headers=self.headers, timeout=3) 92 | return content 93 | 94 | def getTotalFinish(self): 95 | isExists = os.path.exists(self.totalFinishPath) 96 | if isExists: 97 | with open(self.totalFinishPath, 'r', encoding='UTF-8') as f: 98 | return int(f.read()) 99 | else: 100 | return 0 101 | 102 | def overwriteTotalFinish(self, count): 103 | with open(self.totalFinishPath, 'w', encoding='UTF-8') as f: 104 | f.write(str(count)) 105 | 106 | USE_ONE_DIR = True 107 | USE_DEF_DIR = False 108 | Mzitu = mzitu() ##实例化 109 | Mzitu.all_url('http://www.mzitu.com/all') ##给函数all_url传入参数 你可以当作启动爬虫(就是入口) 110 | -------------------------------------------------------------------------------- /wallhaven.cc/Parent.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") -------------------------------------------------------------------------------- /wallhaven.cc/README.md: -------------------------------------------------------------------------------- 1 | ## 前情提要 2 | 3 | 最近朋友发了个壁纸的网站,情不自禁花十几分钟写了个爬取。 4 | 5 | 
![](img/20210623210831.png) 6 | 7 | Supports simple page-by-page resumable crawling 8 | 9 | ## How to run 10 | 11 | In a console, cd into this directory 12 | 13 | >python wallpic_scrapy.py 14 | 15 | >PS: This project is for learning and sharing only; please do not use it commercially. -------------------------------------------------------------------------------- /wallhaven.cc/img/20210623210831.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/wallhaven.cc/img/20210623210831.png -------------------------------------------------------------------------------- /wallhaven.cc/wallpic_scrapy.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | import Parent 3 | from bs4 import BeautifulSoup 4 | import os 5 | from BaseTools.MyDownload import request 6 | 7 | class wallpic(): 8 | def __init__(self): 9 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"} 10 | self.basePath = "./wallpic/" 11 | self.currPath = self.basePath 12 | self.mkdir(self.basePath) 13 | self.totalFinishPath = "./wallpic/totalPage.txt" 14 | self.totalFinish = self.getTotalFinish() 15 | 16 | def all_get(self, totalPage): 17 | count = 0 18 | while count < totalPage: 19 | count = count + 1 20 | if count > self.totalFinish: 21 | self.overwriteTotalFinish(count) 22 | else: 23 | print("Page", count, "has already been crawled, skipping!") 24 | continue 25 | title = '第' + str(count) + '页/' 26 | href = 'https://wallhaven.cc/toplist?page=' + str(count) 27 | print(title, href) ## print some progress so it is less boring 28 | ## call mkdir() to create the folder! Note that path here is the page title! 29 | path = title 30 | self.mkdir(path) 31 | self.html(href) ## call html() and pass href over! href is the listing page URL! 32 | self.totalFinish = count 33 | 34 | def html(self, href): ## this function takes a listing page URL and resolves the per-image page URLs 35 | try: 36 | html = self.request(href) 37 | self.headers['referer'] = href 38 | figures = BeautifulSoup(html.text, 'lxml').find('section', class_='thumb-listing-page').find_all('figure') 39 | for figure in figures: 40 | page_url = figure.find_all('a')[0]['href'] 41 | self.img(page_url) ## call img() 42 | except Exception as e: 43 | print('An exception occurred:', e) 44 | 45 | def img(self, page_url): ## this function takes an image page URL and resolves the actual image URL 46 | img_html = self.request(page_url) 47 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='scrollbox').find_all('img')[0]['src'] 48 | print(img_url) 49 | self.saveImg(img_url) 50 | 51 | def saveImg(self, img_url): ## this function saves the image 52 | name = img_url[-9:-4] 53 | imgPath = self.currPath + name + '.jpg' 54 | isExists = os.path.exists(imgPath) 55 | if not isExists: 56 | img = self.request(img_url) 57 | f = open(imgPath, 'ab') 58 | f.write(img.content) 59 | f.close() 60 | print('Image downloaded') 61 | else: 62 | print('Image already exists') 63 | 64 | def mkdir(self, path): ## this function creates the folder 65 | if USE_ONE_DIR: 66 | path = "" 67 | elif USE_DEF_DIR: 68 | index = path.rindex("/") 69 | path = path[index + 1:] 70 | else: 71 | path = path.strip() 72 | self.currPath = os.path.join(self.basePath, path) 73 | isExists = os.path.exists(self.currPath) 74 | if not isExists: 75 | print('Created a folder named', path, '!') 76 | os.makedirs(self.currPath) 77 | #os.chdir(os.path.join("./mzitu", path)) ## switch into the directory 78 | return True 79 | else: 80 | print('A folder named', self.currPath, 'already exists!') 81 | return False 82 | 83 | 84 | def request(self, url): ## this function fetches the page response and returns it 85 | content = request.get(url, headers=self.headers, timeout=3) 86 | return content 87 | 88 | def getTotalFinish(self): 89 | isExists = 
os.path.exists(self.totalFinishPath) 90 | if isExists: 91 | with open(self.totalFinishPath, 'r', encoding='UTF-8') as f: 92 | return int(f.read()) 93 | else: 94 | return 0 95 | 96 | def overwriteTotalFinish(self, count): 97 | with open(self.totalFinishPath, 'w', encoding='UTF-8') as f: 98 | f.write(str(count)) 99 | 100 | USE_ONE_DIR = False 101 | USE_DEF_DIR = False 102 | WallPic = wallpic() ## instantiate 103 | 104 | if __name__ == "__main__": 105 | ## pass in the number of pages you want to crawl; think of this as launching the crawler (the entry point) 106 | WallPic.all_get(11) 107 | --------------------------------------------------------------------------------
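
The ES-backed mzitu scripts above resume work by looking up records whose scrapyStatus is still 0 (pending). The DBTools MyESClient wrapper is not reproduced in this listing, so the snippet below is only a minimal sketch of the equivalent lookup using the official elasticsearch Python client; the client setup, host URL, and helper name are assumptions, while the index name, field name, and status values follow mzitu_es.py.

# -*- coding:utf-8 -*-
# Minimal sketch (assumption): the "fetch one pending record" lookup that
# mzitu_es.get_one_need_scrapy_es() delegates to MyESClient.getOneByBody(),
# expressed with the official elasticsearch client instead of the wrapper.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local ES instance

def get_one_pending(index="mzitu"):
    body = {
        "size": 1,  # only one document is needed per worker
        "query": {"bool": {"must": [{"term": {"scrapyStatus": {"value": 0}}}]}}
    }
    res = es.search(index=index, body=body)
    hits = res["hits"]["hits"]
    return hits[0]["_source"] if hits else None  # None when nothing is pending

if __name__ == "__main__":
    print(get_one_pending())

Marking a record as "crawling" or "finished" is then just re-indexing the same document with scrapyStatus set to 1 or 2, which is what the scrapy_one() / save_es() pair does through the wrapper.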