├── .gitattributes
├── .gitignore
├── BaseTools
├── CompareUtil.py
├── MyDownload.py
├── MyUtil.py
├── ScreenShotUtil.py
├── __init__.py
└── test
│ ├── Parent.py
│ ├── data
│ │ ├── es-query-lean.md
│ │ ├── result.md
│ │ └── test-file.html
│ ├── edit-distance-test.py
│ ├── util-test.py
│ ├── util-test2.py
│ └── util-test3.py
├── DBTools
├── MyES.py
├── MyMongoDB.py
├── MySqlite.py
├── __init__.py
└── test
│ ├── Parent.py
│ ├── es-test.py
│ ├── sqlite-test.py
│ └── sqlite-test
│ │ └── test.db
├── README.md
├── __init__.py
├── ctrip.com-visa
├── Parent.py
├── README.md
├── img
│ ├── ctrip-visa-gqtp.png
│ └── ctrip-visa-lsgxx.png
└── xc-visa-lqxx.py
├── framework
└── base_scrapy
│ ├── README.md
│ ├── base_scrapy
│ │ ├── __init__.py
│ │ ├── entrypoint.py
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ └── base_spider.py
│ └── scrapy.cfg
├── huaban.com
├── PreviewHtmlTool.py
├── README.md
├── huaban-border-text.py
├── huaban-simple.py
├── img
│ ├── huaban-border-txt.png
│ ├── huaban-preview-border.png
│ └── huaban-simple-1.png
├── test.html
└── 淡然小笺赋箴言
│ ├── 13448395_1.jpg
│ ├── 13448395_1.txt
│ ├── 13448395_10.jpg
│ ├── 13448395_10.txt
│ ├── 13448395_11.jpg
│ ├── 13448395_11.txt
│ ├── 13448395_12.jpg
│ ├── 13448395_12.txt
│ ├── 13448395_13.jpg
│ ├── 13448395_13.txt
│ ├── 13448395_14.jpg
│ ├── 13448395_14.txt
│ ├── 13448395_15.jpg
│ ├── 13448395_15.txt
│ ├── 13448395_16.jpg
│ ├── 13448395_16.txt
│ ├── 13448395_17.jpg
│ ├── 13448395_17.txt
│ ├── 13448395_18.jpg
│ ├── 13448395_18.txt
│ ├── 13448395_19.jpg
│ ├── 13448395_19.txt
│ ├── 13448395_2.jpg
│ ├── 13448395_2.txt
│ ├── 13448395_20.jpg
│ ├── 13448395_20.txt
│ ├── 13448395_21.jpg
│ ├── 13448395_21.txt
│ ├── 13448395_22.jpg
│ ├── 13448395_22.txt
│ ├── 13448395_23.jpg
│ ├── 13448395_23.txt
│ ├── 13448395_24.jpg
│ ├── 13448395_24.txt
│ ├── 13448395_25.jpg
│ ├── 13448395_25.txt
│ ├── 13448395_26.jpg
│ ├── 13448395_26.txt
│ ├── 13448395_27.jpg
│ ├── 13448395_27.txt
│ ├── 13448395_28.jpg
│ ├── 13448395_28.txt
│ ├── 13448395_29.jpg
│ ├── 13448395_29.txt
│ ├── 13448395_3.jpg
│ ├── 13448395_3.txt
│ ├── 13448395_30.jpg
│ ├── 13448395_30.txt
│ ├── 13448395_31.jpg
│ ├── 13448395_31.txt
│ ├── 13448395_32.jpg
│ ├── 13448395_32.txt
│ ├── 13448395_33.jpg
│ ├── 13448395_33.txt
│ ├── 13448395_34.jpg
│ ├── 13448395_34.txt
│ ├── 13448395_35.jpg
│ ├── 13448395_35.txt
│ ├── 13448395_36.jpg
│ ├── 13448395_36.txt
│ ├── 13448395_37.jpg
│ ├── 13448395_37.txt
│ ├── 13448395_38.jpg
│ ├── 13448395_38.txt
│ ├── 13448395_39.jpg
│ ├── 13448395_39.txt
│ ├── 13448395_4.jpg
│ ├── 13448395_4.txt
│ ├── 13448395_40.jpg
│ ├── 13448395_40.txt
│ ├── 13448395_5.jpg
│ ├── 13448395_5.txt
│ ├── 13448395_6.jpg
│ ├── 13448395_6.txt
│ ├── 13448395_7.jpg
│ ├── 13448395_7.txt
│ ├── 13448395_8.jpg
│ ├── 13448395_8.txt
│ ├── 13448395_9.jpg
│ ├── 13448395_9.txt
│ └── index.html
├── jjwxk.net
├── Parent.py
├── README.md
├── img
│ ├── jjwxk-free-simple-1.png
│ └── jjwxk-free-simple-2.png
├── jjwxk-free-simple.py
└── simple-http-server.py
├── mzitu.com
├── Parent.py
├── README.md
├── mzitu-crawler-es.py
├── mzitu_es.py
├── mzitu_for_thread.py
├── scrapy-mzitu-es.py
└── scrapy-mzitu-no-es.py
└── wallhaven.cc
├── Parent.py
├── README.md
├── img
│ └── 20210623210831.png
└── wallpic_scrapy.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .idea/
3 | .vscode/
4 | *.png
5 | *.jpg
6 | *.csv
7 | *.wpr
8 | *.txt
9 | *.log
10 | *.json
11 | *.exe
12 | plugin/*
13 | .DS_Store
14 | .scrapy/
--------------------------------------------------------------------------------
/BaseTools/CompareUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | class EditDistance():
4 | @classmethod
5 | def minEditDist(cls, sm, sn):
6 | '''
7 | 计算两个字符串的最小莱温斯坦距离
8 | '''
9 | m,n = len(sm)+1,len(sn)+1
10 |
11 | # create a matrix (m*n)
12 | matrix = [[0]*n for i in range(m)]
13 |
14 | matrix[0][0]=0
15 | for i in range(1,m):
16 | matrix[i][0] = matrix[i-1][0] + 1
17 |
18 | for j in range(1,n):
19 | matrix[0][j] = matrix[0][j-1]+1
20 |
21 |
22 | for i in range(m):
23 | print(matrix[i])
24 |
25 | print("********************")
26 |
27 | cost = 0
28 |
29 | for i in range(1,m):
30 | for j in range(1,n):
31 | if sm[i-1]==sn[j-1]:
32 | cost = 0
33 | else:
34 | cost = 1
35 |
36 | matrix[i][j]=min(matrix[i-1][j]+1,matrix[i][j-1]+1,matrix[i-1][j-1]+cost)
37 |
38 | for i in range(m):
39 | print(matrix[i])
40 |
41 | return matrix[m-1][n-1]
42 |
43 | @classmethod
44 | def similarityDegree(cls, str1, str2):
45 | '''
46 | 计算两个字符串的相似度
47 | '''
48 |         mindist = 0
49 |         if str1 == None and str2 != None:
50 |             return 0
51 |         elif str1 != None and str2 == None:
52 |             return 0
53 |         elif str1 != None and str2 != None:
54 |             mindist = cls.minEditDist(str1, str2)
55 |         else:
56 |             return 0
57 |         # normalize by the longer string so the result stays within [0, 1]
58 |         maxLength = max(len(str1), len(str2))
59 |         if maxLength == 0:
60 |             return 1
61 |         similarityDegree = 1 - mindist / maxLength
62 |         print(str1, "和", str2, "的相似度为:", similarityDegree)
63 |         return similarityDegree
--------------------------------------------------------------------------------
/BaseTools/MyDownload.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import requests
3 | import re
4 | import random
5 | import time
6 | from bs4 import BeautifulSoup
7 |
8 |
9 | class download():
10 | def __init__(self):
11 | self.iplist = [] ##初始化一个list用来存放我们获取到的IP
12 | # self.get_ip_list()
13 | self.get_ip_list3()
14 | print(self.iplist)
15 | self.user_agent_list = [
16 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
17 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
19 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
20 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
21 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
22 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
23 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
24 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
25 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
26 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
27 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
28 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
30 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
31 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
32 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
33 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
34 | ]
35 |
36 | #功能:爬取IP存入ip_list列表
37 | def get_ip_list(self):
38 |         # Legacy scraper for a free-proxy listing site (not called; __init__ uses get_ip_list3 below).
39 |         # html = requests.get("http://haoip.cc/tiqu.htm")  # page listing free proxy IPs
40 |         # iplistn = re.findall(r'r/>(.*?)(.*?)@HTTP', html.text, re.S)  # pull the proxy entries out of the page text
41 |         # Table-based variant: keep rows whose validity column (tds[6]) mentions '天' (days):
42 |         # if len(tds) > 6:
43 |         #     if not tds[6].text.find('天') == -1:
44 |         #         self.iplist.append(tds[1].text + ':' + tds[2].text)
45 |         pass
46 |
64 | #功能:爬取IP存入ip_list列表
65 | def get_ip_list3(self):
66 | web_data = requests.get("https://www.kuaidaili.com/free/", headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'})
67 | soup = BeautifulSoup(web_data.text, 'lxml')
68 | ips = soup.find_all('tr')
69 | for i in range(1, len(ips)):
70 | ip_info = ips[i]
71 | tds = ip_info.find_all('td')
72 | currIp = ''
73 | if len(tds) > 1:
74 | for item in tds:
75 | if item["data-title"] == 'IP':
76 | currIp = item.text
77 | if item["data-title"] == 'PORT':
78 | currIp += ':' + item.text
79 | break
80 | self.iplist.append(currIp)
81 |
82 | def get(self, url, headers, timeout, proxy=None, num_retries=10): ##给函数一个默认参数proxy为空
83 | UA = random.choice(self.user_agent_list) ##从self.user_agent_list中随机取出一个字符串
84 | headers['User-Agent'] = UA ##构造成一个完整的User-Agent (UA代表的是上面随机取出来的字符串哦)
85 |
86 | if proxy == None: ##当代理为空时,不使用代理获取response(别忘了response啥哦!之前说过了!!)
87 | try:
88 | return requests.get(url, headers=headers, timeout=timeout)##这样服务器就会以为我们是真的浏览器了
89 | except:##如过上面的代码执行报错则执行下面的代码
90 | if num_retries > 0: ##num_retries是我们限定的重试次数
91 | time.sleep(10) ##延迟十秒
92 | print('获取网页出错,10S后将获取倒数第:', num_retries, '次')
93 |                     return self.get(url, headers, timeout, num_retries=num_retries - 1)  ## retry with num_retries reduced by 1 (keyword argument so the count is not passed as proxy)
94 | else:
95 | print('开始使用代理')
96 | time.sleep(10)
97 | IP = ''.join(str(random.choice(self.iplist)).strip()) ##下面有解释哦
98 | proxy = {'http': IP}
99 | return self.get(url, headers, timeout, proxy) ##代理不为空的时候
100 | else: ##当代理不为空
101 | try:
102 | IP = ''.join(str(random.choice(self.iplist)).strip()) ##将从self.iplist中获取的字符串处理成我们需要的格式(处理了些什么自己看哦,这是基础呢)
103 | proxy = {'http': IP} ##构造成一个代理
104 | return requests.get(url, headers=headers, proxies=proxy, timeout=timeout) ##使用代理获取response
105 | except:
106 | if num_retries > 0:
107 | time.sleep(10)
108 | IP = ''.join(str(random.choice(self.iplist)).strip())
109 | proxy = {'http': IP}
110 | print('正在更换代理,10S后将重新获取倒数第', num_retries, '次')
111 | print('当前代理是:', proxy)
112 | return self.get(url, headers, timeout, proxy, num_retries - 1)
113 | else:
114 | print('代理也不好使了!取消代理')
115 | return self.get(url, headers, 3)
116 |
117 | # 获取文本编码
118 | def get_encoding(self, text):
119 | return requests.utils.get_encodings_from_content(text)
120 |
121 | # 获取非中文乱码的文本
122 | def get_utf8_content(self, url, headers):
123 | req = request.get(url, headers, timeout=3)
124 | if req.content == None:
125 | return ""
126 | encoding = "utf-8"
127 | if req.encoding == 'ISO-8859-1':
128 | encodings = request.get_encoding(req.text)
129 | if encodings:
130 | encoding = encodings[0]
131 | else:
132 | encoding = req.apparent_encoding
133 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
134 | return req.content.decode(encoding, 'replace') #如果设置为replace,则会用?取代非法字符;
135 | return req.content
136 |
137 |
138 | request = download()
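139 |
140 | # Minimal usage sketch: other scripts in this repo import the module-level `request`
141 | # instance (e.g. ctrip.com-visa/xc-visa-lqxx.py does `from BaseTools.MyDownload import request`)
142 | # and call get(url, headers, timeout). The URL below is just an example target.
143 | if __name__ == '__main__':
144 |     demo_headers = {}  # get() fills in a random User-Agent by itself
145 |     resp = request.get('https://httpbin.org/get', demo_headers, timeout=3)
146 |     print(resp.status_code, len(resp.text))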
--------------------------------------------------------------------------------
/BaseTools/MyUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import random,string
3 | from hashlib import md5
4 | import os
5 | import codecs
6 | import tomd
7 |
8 | class MyStr():
9 | @classmethod
10 | def getRandomPsw(cls, length=6):
11 | src = string.ascii_letters + string.digits
12 | if length < 6:
13 | length = 6
14 |         list_passwd_all = random.sample(src, length - 3)  # pick length-3 random characters from letters and digits
15 |         list_passwd_all.extend(random.sample(string.digits, 1))  # make sure the password contains a digit
16 |         list_passwd_all.extend(random.sample(string.ascii_lowercase, 1))  # make sure it contains a lowercase letter
17 |         list_passwd_all.extend(random.sample(string.ascii_uppercase, 1))  # make sure it contains an uppercase letter
18 |         random.shuffle(list_passwd_all)  # shuffle the character order
19 |         return ''.join(list_passwd_all)
20 | @classmethod
21 | def getFileMd5(cls, name):
22 | m = md5()
23 | a_file = open(name, 'rb') #需要使用二进制格式读取文件内容
24 | m.update(a_file.read())
25 | a_file.close()
26 | return m.hexdigest()
27 |
28 | @classmethod
29 | def getMd5(cls, instr, length=32):
30 |         m = md5()
31 |         m.update(instr.encode('utf-8') if isinstance(instr, str) else instr)  # md5 needs bytes, so encode str input
32 |         res = m.hexdigest()
33 |         if length < 32:
34 |             res = res[:length]  # truncate the 32-char hex digest to the requested length
35 |         return res
36 |
37 | @classmethod
38 | def html2markdown(cls, html):
39 | mdTxt = tomd.Tomd(html).markdown
40 | return mdTxt
41 |
42 | class FileTool(object):
43 | #追加写入:写一个写入数据的接口
44 | @classmethod
45 | def write_behind(cls, filename, content, split='\n'):
46 | '''''
47 | :param content: 要写入的数据
48 | :param split: 每条数据之间的分隔符
49 | :return:
50 | '''
51 | if content == None:
52 | return
53 | # 判断传入的参数是否字符串类型,如果是,写入 . 如果不是,抛出异常
54 | if isinstance(content, str):
55 | #1.打开文件
56 | f = codecs.open(filename, 'a', 'utf-8')
57 | #2.写入数据
58 | f.write(content)
59 | f.write(split)
60 | #3.关闭文件
61 | f.close()
62 | else:
63 | raise TypeError('content must be a str!')
64 |
65 | #追加写入:写入多行数据
66 | @classmethod
67 | def write_behind_muti(cls, filename, str_list, split='\n'):
68 | #判断某个对象是否是某个类型,若是,返回True;否则,返回False
69 | rs = isinstance(str_list, list)
70 | #如果为True
71 | if rs:
72 | #for循环遍历列表,取出每一数据,判断数据类型是否为字符串
73 | for content in str_list:
74 | #如果不是字符串类型
75 | if isinstance(content,str) == False:
76 | #抛出异常
77 | raise TypeError('str_list must be a list of "str",ex:["str1","str2"...]')
78 | #如果没有异常,就可以写入数据了
79 | #1.打开文件
80 |             f = codecs.open(filename, 'a', 'utf-8')
81 | #2.写入数据 str1\nstr2\nstr3...
82 | string = split.join(str_list)
83 | f.write(string)
84 | #3.关闭文件
85 | f.close()
86 | else:
87 | #如果传入的不是列表,抛出异常
88 | raise TypeError('str_list must be a list of "str",ex:["str1","str2"...]')
89 | #创建文件夹
90 | @classmethod
91 | def mkdir(cls, path): ##这个函数创建文件夹
92 | isExists = os.path.exists(path)
93 | if not isExists:
94 | print('建了一个名字叫做', path, '的文件夹!')
95 | os.makedirs(path)
96 | return True
97 | else:
98 | print('名字叫做', path, '的文件夹已经存在了!')
99 | return False
100 | #读取文件内容
101 | @classmethod
102 | def read_utf8(cls, path):
103 | isExists = os.path.exists(path)
104 | if isExists:
105 | with open(path, 'r', encoding='UTF-8') as f:
106 | return str(f.read())
107 | else:
108 | return ''
109 | # 覆盖写入
110 | @classmethod
111 | def overwrite(cls, path, text):
112 | with open(path, 'w', encoding='UTF-8') as f:
113 | f.write(text)
114 |
115 | # 判断文件是否存在
116 | @classmethod
117 | def isExit(cls, path):
118 | return os.path.exists(path)
119 |
120 | # 检查文件名是否合理,替换特殊字符
121 | @classmethod
122 | def replace_invalid_filename(cls, filename, replaced_char='_'):
123 | '''
124 | 替换有特殊字符的文件名中的特殊字符,默认将特殊字符替换为'_'.
125 | 例如 C/C++ -> C_C++
126 | '''
127 | valid_filename = filename
128 | invalid_characaters = '\\/:*?"<>|'
129 | for c in invalid_characaters:
130 | #print 'c:', c
131 | valid_filename = valid_filename.replace(c, replaced_char)
132 | return valid_filename
133 |
134 |
135 | class DateTool(object):
136 | #日期格式化工具类,用类执行一个函数,返回一个对象,对象分别有year\month\day
137 | '''
138 | 2018-2-1 2018.2.1 2018/2/1
139 | date.year = 2018
140 | date.month = 2
141 | date.day = 1
142 | '''
143 | #初始化函数
144 | def __init__(self,year=1970,month=1,day=1):
145 | self.year = year
146 | self.month = month
147 | self.day = day
148 | #类函数,传递进来一个日期,返回一个该类的对象
149 | @classmethod
150 | def get_date(cls,date):
151 | #判断date是否为str类型
152 | if not isinstance(date,str):
153 | #不是str类型,直接触发异常
154 | raise TypeError('date must be a str!')
155 | #转换
156 | #判断是-还是.还是空格
157 | if '-' in date:
158 | #分别将2018赋值year 2赋值给month 1赋值给day
159 | # year, month, day = [2018,2,1]
160 | year,month,day = list(map(int,date.split('-')))
161 | elif '.' in date:
162 | year,month,day = list(map(int,date.split('.')))
163 | elif ' ' in date:
164 | year,month,day = list(map(int,date.split(' ')))
165 | elif '/' in date:
166 | year,month,day = list(map(int,date.split('/')))
167 | #创建对象
168 | # obj = DateTool(year,month,day)
169 | obj = cls(year,month,day)
170 | #返回对象
171 | return obj
--------------------------------------------------------------------------------
/BaseTools/ScreenShotUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import tagui as t
3 | import uuid
4 |
5 | def url2png(url):
6 | t.init()
7 | t.url(url)
8 | # t.type('q', 'decentralization[enter]')
9 | t.snap('page', 'results-' + str(uuid.uuid1()) + '.png')
10 | t.close()
11 |
12 |
--------------------------------------------------------------------------------
/BaseTools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/BaseTools/__init__.py
--------------------------------------------------------------------------------
/BaseTools/test/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/BaseTools/test/data/es-query-lean.md:
--------------------------------------------------------------------------------
1 |
2 | 1. query string search
3 | 2. query DSL
4 | 3. query filter
5 | 4. full-text search
6 | 5. phrase search
7 | 6. highlight search
8 |
9 | ## query string search
10 |
11 | 1. took:耗费了几毫秒
12 | 1. timed_out:是否超时,这里是没有
13 | 1. _shards:数据拆成了5个分片,所以对于搜索请求,会打到所有的primary shard(或者是它的某个replica shard也可以)
14 | 1. hits.total:查询结果的数量,3个document
15 | 1. hits.max_score:score的含义,就是document对于一个search的相关度的匹配分数,越相关,就越匹配,分数也高
16 | 1. hits.hits:包含了匹配搜索的document的详细数据
17 |
18 | 搜索全部
19 |
20 | ```json
21 | GET /nginx/log_base/_search
22 |
23 | 结果如下:
24 | {
25 | "took" : 18,
26 | "timed_out" : false,
27 | "_shards" : {
28 | "total" : 5,
29 | "successful" : 5,
30 | "skipped" : 0,
31 | "failed" : 0
32 | },
33 | "hits" : {
34 | "total" : 143405,
35 | "max_score" : 1.0,
36 |     "hits" : [
37 | {
38 | "_index" : "nginx",
39 | "_type" : "log_base",
40 | "_id" : "swZwhmwB82qtm9SxinXv",
41 | "_score" : 10.191514,
42 | "_source" : {
43 | "ip" : "10.95.30.42",
44 | "timestamp" : "17/Jul/2019:00:00:29 +0800",
45 | "url" : "GET /v-dist/static/js/vendor.min.js HTTP/1.1",
46 | "status" : "200",
47 | "bytes" : "782353"
48 | }
49 | },
50 | {...},
51 | {...}
52 | ]
53 | }
54 | }
55 | ```
56 |
57 | query string search 的由来,因为 search 参数都是以 http 请求的 query string 来附带的
58 |
59 | 查询 nginx 日志中包含指定 ip 的记录,并按照 timestamp 降序排序:
60 |
61 | ```json
62 | # 查询 所有字段 包含 10.95 的数据集
63 | GET /bookdb_index/book/_search?q=10.95
64 | # 查询 ip 包含 10.95.30.42 的数据集
65 | GET nginx/log_base/_search?q=ip:10.95.30.42
66 | # 使用 sort 功能需要定义 timestamp 属性 fielddata=true 有可排序功能
67 | # 出现该错误是因为 5.x 之后,Elasticsearch对排序、聚合所依据的字段用单独的数据结构(fielddata)缓存到内存里了,
68 | # 但是在text字段上默认是禁用的,如果有需要单独开启,这样做的目的是为了节省内存空间。
69 | GET nginx/log_base/_search?q=ip:10.95.30.42&sort=timestamp:desc
70 | # 使用 _mapping 查看结构定义
71 | GET nginx/_mapping/log_base
72 | # 改变某个属性结构
73 | PUT nginx/_mapping/log_base
74 | {
75 | "properties": {
76 | "timestamp":{
77 | "type": "text",
78 | "fielddata": true
79 | }
80 | }
81 | }
82 | ```
83 |
84 | 适用于临时的在命令行使用一些工具,比如curl,快速的发出请求,来检索想要的信息;
85 |
86 | 但是如果查询请求很复杂,是很难去构建的在生产环境中,几乎很少使用 query string search
87 |
88 | ## query DSL
89 |
90 | DSL:Domain Specific Language,特定领域的语言
91 | http request body:请求体,可以用json的格式来构建查询语法,比较方便,可以构建各种复杂的语法,比query string search肯定强大多了
92 |
93 | **查询所有**
94 |
95 | ```
96 | GET nginx/log_base/_search
97 | {
98 | "query": { "match_all": {} }
99 | }
100 | ```
101 |
102 | **查询 ip 包含 10.95.30.42,同时按照 timestamp 降序排序**
103 |
104 | ```json
105 | GET nginx/log_base/_search
106 | {
107 | "query" : {
108 | "match" : {
109 | "ip" : "10.95.30.42"
110 | }
111 | },
112 | "sort": [
113 | { "timestamp": "desc" }
114 | ]
115 | }
116 | ```
117 |
118 | **分页查询**
119 |
120 | ```json
121 | # from:从第几个开始,es 从 0 开始计数的
122 | # size:往后查询 100 个
123 | GET nginx/log_base/_search
124 | {
125 | "query": { "match_all": {} },
126 | "from": 1,
127 | "size": 100
128 | }
129 | ```
130 |
131 | **指定要查询展示的属性**
132 |
133 | ```json
134 | GET nginx/log_base/_search
135 | {
136 | "query": { "match_all": {} },
137 | "_source": ["ip", "status"]
138 | }
139 | ```
140 |
141 | 更加适合生产环境的使用,可以构建复杂的查询
142 |
143 | ## query filter
144 |
145 | **结果集里面过滤**
146 |
147 | ```json
148 | GET nginx/log_base/_search
149 | {
150 | "query": {
151 | "bool": {
152 | "must": {
153 | "match":{
154 | "ip" : "10.95.30.42"
155 | }
156 | },
157 | "filter": {
158 | "match":{
159 | "status" : "302"
160 | }
161 | }
162 | }
163 | }
164 | }
165 | ```
166 |
167 | ## full-text search(全文检索)
168 |
169 | ```json
170 | GET nginx/log_base/_search
171 | {
172 | "query" : {
173 | "match" : {
174 | "url" : ".js"
175 | }
176 | }
177 | }
178 | ```
179 |
180 | ## phrase search(短语搜索)
181 |
182 | 跟全文检索相对应,相反,全文检索会将输入的搜索串拆解开来,去倒排索引里面去一一匹配,只要能匹配上任意一个拆解后的单词,就可以作为结果返回
183 | phrase search,要求输入的搜索串,必须在指定的字段文本中,完全包含一模一样的短语(空格等其他非数字字母分隔开的字符),才可以算匹配,才能作为结果返回
184 |
185 | ```json
186 | GET nginx/log_base/_search
187 | {
188 | "query" : {
189 | "match_phrase" : {
190 | "ip" : "10.94.53.32"
191 | }
192 | }
193 | }
194 | ```
195 |
196 | ## highlight search(高亮搜索结果)
197 |
198 | ```json
199 | GET nginx/log_base/_search
200 | {
201 | "query" : {
202 | "match" : {
203 | "ip" : "10.94.53.32"
204 | }
205 | },
206 | "highlight": {
207 | "fields" : {
208 | "ip" : {}
209 | }
210 | }
211 | }
212 |
213 | {
214 | "took" : 295,
215 | "timed_out" : false,
216 | "_shards" : {
217 | "total" : 5,
218 | "successful" : 5,
219 | "skipped" : 0,
220 | "failed" : 0
221 | },
222 | "hits" : {
223 | "total" : 29977,
224 | "max_score" : 1.5757076,
225 | "hits" : [
226 | {
227 | "_index" : "nginx",
228 | "_type" : "log_base",
229 | "_id" : "yAZwhmwB82qtm9SxinXv",
230 | "_score" : 1.5757076,
231 | "_source" : {
232 | "ip" : "10.94.53.32",
233 | "timestamp" : "17/Jul/2019:00:01:20 +0800",
234 | "url" : "GET /v-dist/static/css/app.min.css HTTP/1.1",
235 | "status" : "200",
236 | "bytes" : "217190"
237 | },
238 | "highlight" : {
239 | "ip" : [
240 | "10.94.53.32"
241 | ]
242 | }
243 | },
244 | {...}
245 | ]
246 | }
247 | }
248 | ```
249 |
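250 | ## Running these queries from Python
251 |
252 | The same DSL bodies can be sent from Python. Below is a minimal sketch that uses this repo's `DBTools/MyES.py` wrapper (it assumes an Elasticsearch node on 127.0.0.1:9200 and the `nginx`/`log_base` index used in the notes above, and that it is run from the DBTools directory, as the scripts in DBTools/test do):
253 |
254 | ```python
255 | from MyES import MyESClient
256 |
257 | # thin wrapper around elasticsearch-py; print=True echoes every response
258 | es = MyESClient("nginx", "log_base", print=True)
259 |
260 | query_body = {
261 |     "query": {"match": {"ip": "10.95.30.42"}},
262 |     # sorting on timestamp needs the fielddata mapping change shown earlier
263 |     "sort": [{"timestamp": "desc"}]
264 | }
265 | res = es.getDataByBody(query_body)
266 | print(res["hits"]["total"])
267 | ```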
--------------------------------------------------------------------------------
/BaseTools/test/edit-distance-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from CompareUtil import EditDistance
4 | def main():
5 | EditDistance.similarityDegree("黄鹤楼","i黄鹤楼van2")
6 | EditDistance.similarityDegree("黄鹤楼","黄黄鹤鹤楼")
7 | EditDistance.similarityDegree("黄鹤楼","鹤楼黄楼黄楼")
8 | EditDistance.similarityDegree("黄鹤楼","鹤鹤楼")
9 | EditDistance.similarityDegree("黄鹤楼","汤逊湖")
10 | EditDistance.similarityDegree("黄鹤楼","岳阳楼")
11 |
12 | if __name__ == '__main__':
13 | main()
--------------------------------------------------------------------------------
/BaseTools/test/util-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MyUtil import FileTool
4 | from MyUtil import DateTool
5 | def main():
6 | # 指定写入文件的名称
7 | filename = 'test.txt'
8 | # 执行写入功能函数
9 | FileTool.write_behind(filename, 'hello')
10 | FileTool.write_behind(filename, 'world')
11 | print("1.追加单行写\n", FileTool.read_utf8(filename))
12 |
13 | FileTool.write_behind(filename, '你好!')
14 | print("1.1.追加写中文\n", FileTool.read_utf8(filename))
15 |
16 | FileTool.write_behind_muti(filename, ['hello', 'world', 'zhangzhang'])
17 | print("2.追加多行写\n", FileTool.read_utf8(filename))
18 |
19 | FileTool.overwrite(filename, "hello_world!")
20 | print("1.覆写\n", FileTool.read_utf8(filename))
21 |
22 | FileTool.write_behind(filename, '你好,世界!')
23 | print("1.1.覆写写中文\n", FileTool.read_utf8(filename))
24 |
25 |
26 |
27 | # 开始进行日期转换
28 | # 转换之后 返回一个结果对象
29 | date = DateTool.get_date('2020 2 22')
30 | #date有三个属性 分别为year,month,day
31 | print("日期转换")
32 | print(date.year)
33 | print(date.month)
34 | print(date.day)
35 |
36 |
37 | if __name__ == '__main__':
38 | main()
--------------------------------------------------------------------------------
/BaseTools/test/util-test2.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MyUtil import FileTool
4 | from MyUtil import MyStr
5 | def html2markdown(input_file_path, output_file_path):
6 | html = FileTool.read_utf8(input_file_path)
7 | mdTxt = MyStr.html2markdown(html)
8 | FileTool.overwrite(output_file_path, mdTxt)
9 |
10 |
11 | if __name__ == '__main__':
12 | html2markdown('data/test-file.html', 'data/result.md')
--------------------------------------------------------------------------------
/BaseTools/test/util-test3.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | import ScreenShotUtil as screenshot
4 |
5 | def main():
6 | screenshot.url2png("https://www.baidu.com/")
7 |
8 | if __name__ == '__main__':
9 | main()
--------------------------------------------------------------------------------
/DBTools/MyES.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import os
3 | import time
4 | import csv
5 | from os import walk
6 | from datetime import datetime
7 | from elasticsearch import Elasticsearch
8 | from elasticsearch.helpers import bulk
9 |
10 | class MyESClient(object):
11 | def __init__(self, index_name, index_type, ip ="127.0.0.1", print=False):
12 | '''
13 | :param index_name: 索引名称
14 | :param index_type: 索引类型
15 | '''
16 | self.index_name =index_name
17 | self.index_type = index_type
18 | # 无用户名密码状态
19 | self.es = Elasticsearch([ip], port=9200)
20 | #用户名密码状态
21 | self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200)
22 | self.show_es_result = print
23 |
24 | def createIndex(self, index_mappings):
25 | '''
26 | 创建索引,创建索引名称为ott,类型为ott_type的索引
27 | :param ex: Elasticsearch对象
28 | :return:
29 | '''
30 | #创建映射
31 | if self.es.indices.exists(index=self.index_name) is not True:
32 | res = self.es.indices.create(index=self.index_name, body=index_mappings)
33 | if self.show_es_result:
34 | print(res)
35 |
36 | def indexDataFromCvsDir(self, cloumnDict):
37 | csvdir = './ElasticSearch/exportExcels'
38 | filenamelist = []
39 | for (dirpath, dirnames, filenames) in walk(csvdir):
40 | filenamelist.extend(filenames)
41 | break
42 | for file in filenamelist:
43 | csvfile = csvdir + '/' + file
44 | self.indexDataFromCSV(csvfile, cloumnDict)
45 | time.sleep(10)
46 |
47 | def indexDataFromCSV(self, csvfile, cloumnList=None):
48 | '''
49 | 从CSV文件中读取数据,并存储到es中
50 | :param csvfile: csv文件,包括完整路径
51 | :return:
52 | '''
53 | with open(csvfile) as f:
54 | reader = csv.reader(f)
55 | # 读取一行,下面的reader中已经没有该行了
56 | index = 0
57 | if cloumnList == None:
58 | cloumnList = next(reader)
59 | index = 1
60 | doc = {}
61 | cloumnLength = len(cloumnList)
62 | for item in reader:
63 | if index > 0:#第一行是标题
64 | if cloumnLength <= len(item):
65 | for i in range(cloumnLength):
66 | doc[cloumnList[i]] = item[i]
67 | self.es.index(index=self.index_name, doc_type=self.index_type, body=doc)
68 | index += 1
69 |
70 | def getDataExportCSV(self, csvfile, query={'query': {'match_all': {}}}, cloumnList=None):
71 | '''
72 | 从数据库导出csv表格
73 | :param csvfile:
74 | :param query:
75 | :param cloumnList:
76 | :return:
77 | '''
78 | res = self.getDataByBody(query)
79 | if res is not None and len(res['hits']['hits']) > 0:
80 | # fobj = open(csvfile, 'w+')
81 | with open(csvfile, 'w', newline='') as fobj:
82 | if cloumnList == None:
83 | cloumnList = res['hits']['hits'][0]["_source"].keys()
84 | writer = csv.DictWriter(fobj, fieldnames=cloumnList)
85 | writer.writeheader()
86 | for hit in res['hits']['hits']:
87 | writer.writerow(hit["_source"])
88 |
89 | def indexDataList(self, list=[]):
90 | '''
91 | 数据存储到es
92 | :return:
93 | '''
94 | for item in list:
95 | res = self.es.index(index=self.index_name, doc_type=self.index_type, body=item)
96 | if self.show_es_result:
97 | print(res)
98 |
99 | def indexData(self, data, id=None):
100 | '''
101 | 单条数据添加
102 | :param data:
103 | :return:
104 | '''
105 | res = self.es.index(index=self.index_name, doc_type=self.index_type, body=data, id=id)
106 | if self.show_es_result:
107 | print(res)
108 | return res
109 |
110 | def bulkIndexData(self, list=[]):
111 | '''
112 | 用bulk将批量数据存储到es
113 | :return:
114 | '''
115 | ACTIONS = []
116 | for line in list:
117 | action = {
118 | "_index": self.index_name,
119 | "_type": self.index_type,
120 | "_source": line
121 | }
122 | ACTIONS.append(action)
123 | # 批量处理
124 | success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True)
125 | if self.show_es_result:
126 | print('Performed %d actions' % success)
127 | return success
128 |
129 | def deleteDataById(self,id):
130 | '''
131 | 删除索引中的一条
132 | :param id:
133 | :return:
134 | '''
135 | res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id)
136 | if self.show_es_result:
137 | print(res)
138 | return res
139 |
140 | def getDataId(self,id):
141 | res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
142 | # 输出查询到的结果
143 | if self.show_es_result:
144 | print(res)
145 | return res
146 |
147 | def getDataSourceById(self,id):
148 | res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
149 | # 输出查询到的结果
150 | if self.show_es_result:
151 | print(res)
152 |         if res is not None and res.get('found'):
153 |             return res["_source"]  # es.get() returns a single document, not a hits list
154 |         else:
155 |             return None
156 |
157 | def exit(self, queryBody):
158 | if queryBody == None:
159 | return False
160 | res = self.getDataByBody(queryBody)
161 | if res is not None and len(res['hits']['hits']) > 0:
162 | return True
163 | else:
164 | return False
165 |
166 | def getOneByBody(self, query):
167 | params = {"size":1}
168 | res = self.getDataByBody(query, params)
169 | if res is not None and len(res['hits']['hits']) > 0:
170 | return res['hits']['hits'][0]["_source"]
171 | else:
172 | return None
173 |
174 | def getDataByBody(self, queryBody={'query': {'match_all': {}}}, params=None):
175 | # queryBody = {'query': {'match_all': {}}}
176 | _searched = None
177 | if params == None:
178 | _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=queryBody)
179 | else:
180 | _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=queryBody, params=params)
181 |
182 | if self.show_es_result:
183 | print(_searched)
184 | return _searched
--------------------------------------------------------------------------------
/DBTools/MyMongoDB.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from pymongo import MongoClient
3 |
4 | class MyMongoClient(object):
5 | def __init__(self, dbname=None, setname=None):
6 | self.dbname = dbname
7 | self.setname = setname
8 | self.client = MongoClient() ##与MongDB建立连接(这是默认连接本地MongDB数据库)
9 | self.db = self.client[dbname] ## 选择一个数据库
10 | self.collection = self.db[setname] ##在这个数据库中,选择一个集合
11 |
12 | def save(self, data):
13 | res = self.collection.save(data)
14 | if SHOW_RESULT:
15 | print(res)
16 | return res
17 |
18 | def getOne(self, query):
19 | res = self.collection.find_one(query)
20 | if SHOW_RESULT:
21 | print(res)
22 | return res
23 |
24 | def isExit(self, query):
25 | if self.getOne(query):
26 | return True
27 | else:
28 | return False
29 |
30 | def get(self, query):
31 | res = self.collection.find(query)
32 | if SHOW_RESULT:
33 | print(res)
34 | return res
35 |
36 | SHOW_RESULT = True
37 |
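38 | # Minimal usage sketch (assumes a MongoDB instance on the default localhost port; the db and
39 | # collection names are just examples). Note: Collection.save() above was removed in pymongo 4.0,
40 | # so this helper targets pymongo 3.x.
41 | if __name__ == '__main__':
42 |     client = MyMongoClient('test_db', 'test_set')
43 |     client.save({'keyword': 'demo', 'title': 'demo document'})
44 |     print(client.isExit({'keyword': 'demo'}))
45 |     for doc in client.get({'keyword': 'demo'}):
46 |         print(doc)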
--------------------------------------------------------------------------------
/DBTools/MySqlite.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | #python sqlite
3 | #DB-API 2.0 interface for SQLite databases
4 |
5 | import sqlite3
6 | import os
7 |
8 | '''
9 | SQLite数据库是一款非常小巧的嵌入式开源数据库软件,也就是说
10 | 没有独立的维护进程,所有的维护都来自于程序本身。
11 | 在python中,使用sqlite3创建数据库的连接,当我们指定的数据库文件不存在的时候
12 | 连接对象会自动创建数据库文件;如果数据库文件已经存在,则连接对象不会再创建
13 | 数据库文件,而是直接打开该数据库文件。
14 | 连接对象可以是硬盘上面的数据库文件,也可以是建立在内存中的,在内存中的数据库
15 | 执行完任何操作后,都不需要提交事务的(commit)
16 |
17 | 创建在硬盘上面: conn = sqlite3.connect('c:\\test\\test.db')
18 | 创建在内存上面: conn = sqlite3.connect(':memory:')
19 |
20 | 下面我们以硬盘上面创建数据库文件为例来具体说明:
21 | conn = sqlite3.connect('c:\\test\\hongten.db')
22 | 其中conn对象是数据库链接对象,而对于数据库链接对象来说,具有以下操作:
23 |
24 | commit() --事务提交
25 | rollback() --事务回滚
26 | close() --关闭一个数据库链接
27 | cursor() --创建一个游标
28 |
29 | cu = conn.cursor()
30 | 这样我们就创建了一个游标对象:cu
31 | 在sqlite3中,所有sql语句的执行都要在游标对象的参与下完成
32 | 对于游标对象cu,具有以下具体操作:
33 |
34 | execute() --执行一条sql语句
35 | executemany() --执行多条sql语句
36 | close() --游标关闭
37 | fetchone() --从结果中取出一条记录
38 | fetchmany() --从结果中取出多条记录
39 | fetchall() --从结果中取出所有记录
40 | scroll() --游标滚动
41 |
42 | '''
43 |
44 | class MySqlite(object):
45 | def __init__(self, dbpath, tablename, print=False):
46 | self.dbpath = dbpath
47 | self.tablename = tablename
48 | #是否打印sql
49 | self.show_sql = print
50 | #是否打印sql结果
51 | self.show_sql_result = print
52 |
53 | def get_conn(self,path=None):
54 | '''获取到数据库的连接对象,参数为数据库文件的绝对路径
55 | 如果传递的参数是存在,并且是文件,那么就返回硬盘上面改
56 | 路径下的数据库文件的连接对象;否则,返回内存中的数据接
57 | 连接对象'''
58 | if path == None:
59 | path = self.dbpath
60 | if os.path.exists(path) and os.path.isfile(path):
61 | print('硬盘上面:[{}]'.format(path))
62 | conn = sqlite3.connect(path)
63 | conn.text_factory = str ##!!!
64 | return conn
65 | else:
66 | conn = None
67 | print('内存上面:[:memory:]')
68 | return sqlite3.connect(':memory:')
69 |
70 | def get_cursor(self, conn=None):
71 | '''该方法是获取数据库的游标对象,参数为数据库的连接对象
72 | 如果数据库的连接对象不为None,则返回数据库连接对象所创
73 | 建的游标对象;否则返回一个游标对象,该对象是内存中数据
74 | 库连接对象所创建的游标对象'''
75 | if conn is not None:
76 | return conn.cursor()
77 | else:
78 | return self.get_conn().cursor()
79 |
80 | ###############################################################
81 | #### 创建|删除表操作 START
82 | ###############################################################
83 | def dropTable(self, table=None, conn=None):
84 | if table == None:
85 | table = self.tablename
86 | if conn == None:
87 | conn = self.get_conn()
88 | '''如果表存在,则删除表,如果表中存在数据的时候,使用该
89 | 方法的时候要慎用!'''
90 | if table is not None and table != '':
91 | sql = 'DROP TABLE IF EXISTS ' + table
92 | if self.show_sql:
93 | print('执行sql:[{}]'.format(sql))
94 | cu = self.get_cursor(conn)
95 | cu.execute(sql)
96 | conn.commit()
97 | if self.show_sql_result:
98 | print('删除数据库表[{}]成功!'.format(table))
99 | self.close_all(conn, cu)
100 | else:
101 | print('the [{}] is empty or equal None!'.format(sql))
102 |
103 | def createTable(self, sql, conn=None):
104 | if conn == None:
105 | conn = self.get_conn()
106 | '''创建数据库表'''
107 | if sql is not None and sql != '':
108 | cu = self.get_cursor(conn)
109 | if self.show_sql:
110 | print('执行sql:[{}]'.format(sql))
111 | cu.execute(sql)
112 | conn.commit()
113 | if self.show_sql_result:
114 | print('创建数据库表成功!')
115 | self.close_all(conn, cu)
116 | else:
117 | print('the [{}] is empty or equal None!'.format(sql))
118 | ###############################################################
119 | #### 创建|删除表操作 END
120 | ###############################################################
121 |
122 | def close_all(self, conn, cu):
123 | '''关闭数据库游标对象和数据库连接对象'''
124 | try:
125 | if cu is not None:
126 | cu.close()
127 |         finally:
128 |             if conn is not None:
129 |                 conn.close()  # close the connection too, instead of closing the cursor twice
130 |
131 | ###############################################################
132 | #### 数据库操作CRUD START
133 | ###############################################################
134 | def insert(self, sql, data, conn=None):
135 | if conn == None:
136 | conn = self.get_conn()
137 | '''插入数据'''
138 | if sql is not None and sql != '':
139 | if data is not None:
140 | cu = self.get_cursor(conn)
141 | for d in data:
142 | if self.show_sql:
143 | print('执行sql:[{}],参数:[{}]'.format(sql, d))
144 | cu.execute(sql, d)
145 | conn.commit()
146 | self.close_all(conn, cu)
147 | else:
148 | print('the [{}] is empty or equal None!'.format(sql))
149 |
150 | def selectAll(self, sql, conn=None):
151 | if conn == None:
152 | conn = self.get_conn()
153 | '''查询所有数据'''
154 | if sql is not None and sql != '':
155 | cu = self.get_cursor(conn)
156 | if self.show_sql:
157 | print('执行sql:[{}]'.format(sql))
158 | cu.execute(sql)
159 | r = cu.fetchall()
160 | if self.show_sql_result:
161 | if len(r) > 0:
162 | for e in range(len(r)):
163 | print(r[e])
164 | return r
165 | else:
166 | print('the [{}] is empty or equal None!'.format(sql))
167 | return None
168 |
169 | def selectOne(self, sql, data, conn=None):
170 | if conn == None:
171 | conn = self.get_conn()
172 | '''查询一条数据'''
173 | if sql is not None and sql != '':
174 | if data is not None:
175 | #Do this instead
176 | d = (data,)
177 | cu = self.get_cursor(conn)
178 | if self.show_sql:
179 | print('执行sql:[{}],参数:[{}]'.format(sql, data))
180 | cu.execute(sql, d)
181 | r = cu.fetchall()
182 | if self.show_sql_result:
183 | if len(r) > 0:
184 | for e in range(len(r)):
185 | print(r[e])
186 | return r
187 | else:
188 | print('the [{}] equal None!'.format(data))
189 | else:
190 | print('the [{}] is empty or equal None!'.format(sql))
191 | return None
192 |
193 | def update(self, sql, data, conn=None):
194 | if conn == None:
195 | conn = self.get_conn()
196 | '''更新数据'''
197 | if sql is not None and sql != '':
198 | if data is not None:
199 | cu = self.get_cursor(conn)
200 | for d in data:
201 | if self.show_sql:
202 | print('执行sql:[{}],参数:[{}]'.format(sql, d))
203 | cu.execute(sql, d)
204 | conn.commit()
205 | self.close_all(conn, cu)
206 | else:
207 | print('the [{}] is empty or equal None!'.format(sql))
208 |
209 | def delete(self, sql, data, conn=None):
210 | if conn == None:
211 | conn = self.get_conn()
212 | '''删除数据'''
213 | if sql is not None and sql != '':
214 | if data is not None:
215 | cu = self.get_cursor(conn)
216 | for d in data:
217 | if self.show_sql:
218 | print('执行sql:[{}],参数:[{}]'.format(sql, d))
219 | cu.execute(sql, d)
220 | conn.commit()
221 | self.close_all(conn, cu)
222 | else:
223 | print('the [{}] is empty or equal None!'.format(sql))
224 | ###############################################################
225 | #### 数据库操作CRUD END
226 | ###############################################################
227 |
228 | def setDbPath(self, dbpath):
229 | self.dbpath = dbpath
230 |
231 | def setTableName(self, tablename):
232 | self.tablename = tablename
233 |
234 | def openPrint(self):
235 | self.show_sql = True
236 | print('self.show_sql : {}'.format(self.show_sql))
237 | self.show_sql_result = True
238 | print('self.show_sql_result : {}'.format(self.show_sql_result))
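239 |
240 | # Minimal usage sketch (mirrors DBTools/test/sqlite-test.py). Note that get_conn() falls back to
241 | # an in-memory database when the file does not exist yet, so the db file is created first.
242 | if __name__ == '__main__':
243 |     demo_path = os.getcwd() + '/sqlite-test/test.db'  # same path the test script uses
244 |     if not os.path.exists(demo_path):
245 |         os.makedirs(os.path.dirname(demo_path), exist_ok=True)
246 |         sqlite3.connect(demo_path).close()  # create an empty db file so get_conn() opens it
247 |     db = MySqlite(demo_path, 'student', True)
248 |     db.createTable('CREATE TABLE IF NOT EXISTS student (id int PRIMARY KEY, name varchar(20))')
249 |     db.insert('INSERT OR REPLACE INTO student values (?, ?)', [(1, 'Hongten')])
250 |     db.selectAll('SELECT * FROM student')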
--------------------------------------------------------------------------------
/DBTools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/DBTools/__init__.py
--------------------------------------------------------------------------------
/DBTools/test/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/DBTools/test/es-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MyES import MyESClient
4 |
5 | list = [
6 | {"date": "2017-09-13",
7 | "source": "慧聪网",
8 | "link": "http://info.broadcast.hc360.com/2017/09/130859749974.shtml",
9 | "keyword": "电视",
10 | "title": "付费 电视 行业面临的转型和挑战"
11 | },
12 | {"date": "2017-09-13",
13 | "source": "中国文明网",
14 | "link": "http://www.wenming.cn/xj_pd/yw/201709/t20170913_4421323.shtml",
15 | "keyword": "电视",
16 | "title": "电视 专题片《巡视利剑》广获好评:铁腕反腐凝聚党心民心"
17 | },
18 | {"date": "2017-09-13",
19 | "source": "人民电视",
20 | "link": "http://tv.people.com.cn/BIG5/n1/2017/0913/c67816-29533981.html",
21 | "keyword": "电视",
22 | "title": "中国第21批赴刚果(金)维和部隊启程--人民 电视 --人民网"
23 | },
24 | {"date": "2017-09-13",
25 | "source": "站长之家",
26 | "link": "http://www.chinaz.com/news/2017/0913/804263.shtml",
27 | "keyword": "电视",
28 | "title": "电视 盒子 哪个牌子好? 吐血奉献三大选购秘笈"
29 | }
30 | ]
31 |
32 | # 提前给elasticsearch安装对应版本的中文分词器 https://github.com/medcl/elasticsearch-analysis-ik
33 | index_mappings = {
34 | "mappings": {
35 | "ott_type": {
36 | "properties": {
37 | "title": {
38 | "type": "text",
39 | "index": True,
40 | "analyzer": "ik_max_word",
41 | "search_analyzer": "ik_max_word"
42 | },
43 | "date": {
44 | "type": "text",
45 | "index": True
46 | },
47 | "keyword": {
48 | "type": "text",
49 | "index": False
50 | },
51 | "source": {
52 | "type": "text",
53 | "index": False
54 | },
55 | "link": {
56 | "type": "text",
57 | "index": False
58 | }
59 | }
60 | }
61 | }
62 | }
63 |
64 | es = MyESClient("ott", "ott_type", print=True)
65 |
66 | es.createIndex(index_mappings)
67 |
68 | es.indexDataList(list)
69 |
70 | queryBody = {
71 | "query": {
72 | "match": {
73 | "title": "电视"
74 | }
75 | }
76 | }
77 |
78 | es.getDataByBody(queryBody)
79 |
80 | es.getDataExportCSV('es-test/ott.csv')
81 |
82 | es.indexDataFromCSV("es-test/ott.csv")
83 |
--------------------------------------------------------------------------------
/DBTools/test/sqlite-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MySqlite import MySqlite
4 | import os
5 |
6 |
7 | ###############################################################
8 | #### 测试操作 START
9 | ###############################################################
10 | def drop_table_test():
11 | '''删除数据库表测试'''
12 | print('删除数据库表测试...')
13 | sqlite.dropTable(TABLE_NAME)
14 |
15 |
16 | def create_table_test():
17 | '''创建数据库表测试'''
18 | print('创建数据库表测试...')
19 | create_table_sql = '''CREATE TABLE `student` (
20 | `id` int(11) NOT NULL,
21 | `name` varchar(20) NOT NULL,
22 | `gender` varchar(4) DEFAULT NULL,
23 | `age` int(11) DEFAULT NULL,
24 | `address` varchar(200) DEFAULT NULL,
25 | `phone` varchar(20) DEFAULT NULL,
26 | PRIMARY KEY (`id`)
27 | )'''
28 | sqlite.createTable(create_table_sql)
29 |
30 |
31 | def save_test():
32 | '''保存数据测试...'''
33 | print('保存数据测试...')
34 | save_sql = '''INSERT INTO student values (?, ?, ?, ?, ?, ?)'''
35 | data = [(1, 'Hongten', '男', 20, '广东省广州市',
36 | '13423****62'), (2, 'Tom', '男', 22, '美国旧金山', '15423****63'),
37 | (3, 'Jake', '女', 18, '广东省广州市',
38 | '18823****87'), (4, 'Cate', '女', 21, '广东省广州市', '14323****32')]
39 | sqlite.insert(save_sql, data)
40 |
41 |
42 | def fetchall_test():
43 | '''查询所有数据...'''
44 | print('查询所有数据...')
45 | fetchall_sql = '''SELECT * FROM student'''
46 | sqlite.selectAll(fetchall_sql)
47 |
48 |
49 | def fetchone_test():
50 | '''查询一条数据...'''
51 | print('查询一条数据...')
52 | fetchone_sql = 'SELECT * FROM student WHERE ID = ? '
53 | data = 1
54 | sqlite.selectOne(fetchone_sql, data)
55 |
56 | def update_test():
57 | '''更新数据...'''
58 | print('更新数据...')
59 | update_sql = 'UPDATE student SET name = ? WHERE ID = ? '
60 | data = [('HongtenAA', 1), ('HongtenBB', 2), ('HongtenCC', 3), ('HongtenDD',
61 | 4)]
62 | sqlite.update(update_sql, data)
63 |
64 |
65 | def delete_test():
66 | '''删除数据...'''
67 | print('删除数据...')
68 | delete_sql = 'DELETE FROM student WHERE NAME = ? AND ID = ? '
69 | data = [('HongtenAA', 1), ('HongtenCC', 3)]
70 | sqlite.delete(delete_sql, data)
71 |
72 |
73 | ###############################################################
74 | #### 测试操作 END
75 | ###############################################################
76 |
77 |
78 | def init():
79 | '''初始化方法'''
80 |     #数据库文件绝对路径
81 | global DB_FILE_PATH
82 | DB_FILE_PATH = os.getcwd() + '/sqlite-test/test.db'
83 | #数据库表名称
84 | global TABLE_NAME
85 | TABLE_NAME = 'student'
86 |
87 | global sqlite
88 | sqlite = MySqlite(DB_FILE_PATH, TABLE_NAME, True)
89 | #如果存在数据库表,则删除表
90 | drop_table_test()
91 | #创建数据库表student
92 | create_table_test()
93 | #向数据库表中插入数据
94 | save_test()
95 |
96 |
97 | def main():
98 | init()
99 | fetchall_test()
100 | print('#' * 50)
101 | fetchone_test()
102 | print('#' * 50)
103 | update_test()
104 | fetchall_test()
105 | print('#' * 50)
106 | delete_test()
107 | fetchall_test()
108 |
109 |
110 | if __name__ == '__main__':
111 | main()
--------------------------------------------------------------------------------
/DBTools/test/sqlite-test/test.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/DBTools/test/sqlite-test/test.db
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Personal notes and practice projects from learning Python web scraping
2 |
3 | 1. [Ctrip visa: country images and consulate info](./ctrip.com-visa)
4 | 2. [Mzitu image gallery crawling](./mzitu.com)
5 | 3. [Jinjiang (jjwxc) free novel crawling](./jjwxk.net)
6 | 4. [Huaban board crawling (async)](./huaban.com)
7 | 5. [wallhaven The best wallpapers on the Net!](./wallhaven.cc)
8 |
9 | >PS: This project is for learning and sharing only; please do not use it for commercial purposes.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/__init__.py
--------------------------------------------------------------------------------
/ctrip.com-visa/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/ctrip.com-visa/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | Our visa product at work needed country images and consulate information (the source page now returns 404; noticed on 2019-06-16)
4 |
5 | ## Run
6 |
7 | In a terminal, cd into this directory, then run:
8 |
9 | >python xc-visa-lqxx.py
10 |
11 | Screenshots:
12 |
13 | 
14 |
15 | 
--------------------------------------------------------------------------------
/ctrip.com-visa/img/ctrip-visa-gqtp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/ctrip.com-visa/img/ctrip-visa-gqtp.png
--------------------------------------------------------------------------------
/ctrip.com-visa/img/ctrip-visa-lsgxx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/ctrip.com-visa/img/ctrip-visa-lsgxx.png
--------------------------------------------------------------------------------
/ctrip.com-visa/xc-visa-lqxx.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | import os
5 | import re
6 | # from BaseTools.MyUtil import FileTool
7 | from BaseTools.MyDownload import request
8 | import csv
9 | ## http://vacations.ctrip.com/visa/lsg
10 | ## div.c_con a
11 | ## table.sin_lis td
12 | # lqmc: h4
13 | # lsgmc: p[0]
14 | # lsgdz: p[1]
15 | # lsggzsj: p[3]
16 | class VisaLqxxCrawler():
17 | def __init__(self):
18 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
19 | self.gqtpPath = "./gqtp/"
20 | self.mkdir(self.gqtpPath)
21 | self.mkdir("./lsgxx/")
22 | self.lsgxxFilePath = "./lsgxx/lsgxx.txt"
23 | self.lsgxxCsvPath = "./lsgxx/lsgxx.csv"
24 | self.lsgxxList = []
25 | def all_url(self, url="http://vacations.ctrip.com/visa/lsg"):
26 | html = self.request(url)##调用request函数把套图地址传进去会返回给我们一个response
27 | all_div = BeautifulSoup(html.text, 'lxml').find_all('div', class_='c_con')
28 | print("一共有 %d 个州" % len(all_div))
29 | for div in all_div:
30 | all_a = div.find_all('a')
31 | print("该洲一共有 %d 个国家" % len(all_a))
32 | for a in all_a:
33 | img = a.find("img")
34 | self.headers['referer'] = url
35 | self.save(img["src"])
36 | href = "http://vacations.ctrip.com" + a['href']
37 | title = a["title"]
38 | self.currGjmc = title
39 | print(title, href)
40 | self.headers['referer'] = href
41 | self.html(href)
42 | self.exportCsv(self.lsgxxCsvPath)
43 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址
44 | try:
45 | html = self.request(href)
46 | #max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
47 | tds = BeautifulSoup(html.text, 'lxml').find('table', class_="sin_lis").find_all('td')
48 | for td in tds:
49 | lsgInfo = {}
50 | lsgInfo["gjmc"] = self.currGjmc
51 | h4 = td.find("h4").get_text()
52 | lsgInfo["lqmc"] = self.trim(h4)
53 | ps = td.find_all('p')
54 | lsgInfo["lqgmc"] = self.trim(ps[0].get_text())
55 | lsgInfo["lqgdz"] = self.trim(ps[1].get_text())
56 | lsgInfo["lsggzsj"] = self.trim(ps[2].get_text())
57 | print(lsgInfo)
58 | self.lsgxxList.append(lsgInfo)
59 | # FileTool.write(self.lsgxxFilePath,lsgInfo.encode("utf-8"))
60 | except Exception as e:
61 | print('发生了异常:', e)
62 |
63 | def exportCsv(self,csvfile, list=None, cloumnList=None):
64 | if list == None:
65 | list = self.lsgxxList
66 | if cloumnList == None and len(list) > 0:
67 | cloumnList = list[0].keys()
68 | # fobj = open(csvfile, 'w+')
69 | # fobj = open(csvfile, 'ab+')
70 | with open(csvfile, 'w', newline='') as fobj:
71 | writer = csv.DictWriter(fobj, fieldnames=cloumnList)
72 | writer.writeheader()
73 | for item in list:
74 | writer.writerow(item)
75 |
76 | def trim(self, myStr):
77 | myStr = re.sub('\n', '', myStr)
78 | myStr = re.sub(' ', '', myStr)
79 | myStr = re.sub('\ufffd', ' ', myStr)
80 | return myStr
81 |
82 | def save(self, img_url): ##这个函数保存图片
83 | try:
84 | index = img_url.rindex("/")
85 | name = img_url[index:]
86 | img = self.request(img_url)
87 | f = open(self.gqtpPath + name, 'ab')
88 | f.write(img.content)
89 | f.close()
90 | except Exception as e:
91 | print('发生了 异常:', e)
92 |
93 | def mkdir(self, path=""): ##这个函数创建文件夹
94 | path = path.strip()
95 | isExists = os.path.exists(path)
96 | if not isExists:
97 | print('建了一个名字叫做', path, '的文件夹!')
98 | os.makedirs(path)
99 | #os.chdir(os.path.join(self.gqtpPath, path)) ##切换到目录
100 | return True
101 | else:
102 | print('名字叫做', path, '的文件夹已经存在了!')
103 | return False
104 |
105 | def request(self, url): ##这个函数获取网页的response 然后返回
106 | content = request.get(url, headers=self.headers, timeout=3)
107 | return content
108 |
109 | visaLqxxCrawler = VisaLqxxCrawler()
110 | visaLqxxCrawler.all_url()
--------------------------------------------------------------------------------
/framework/base_scrapy/README.md:
--------------------------------------------------------------------------------
1 | ## Install the Scrapy framework
2 |
3 | ```bash
4 | pip install Scrapy
5 | ```
6 |
7 | Most guides online recommend installing it through Anaconda instead; beginners may want to install Anaconda first (see its installation guide)
8 |
9 | ```bash
10 | conda install scrapy
11 | # or, more explicitly:
12 | conda install -c conda-forge scrapy
13 | ```
14 |
15 | I run Python 3 alongside Python 2, so in my case:
16 |
17 | ```bash
18 | pip3 install Scrapy
19 | ```
20 |
21 | ## Create a Scrapy project
22 |
23 | ```bash
24 | scrapy startproject base_scrapy
25 |
26 | PS: base_scrapy is the project name; use whatever name you like
27 |
28 | This generates the following directories and files:
29 |
30 | base_scrapy
31 | ├── base_scrapy
32 | │   ├── __init__.py
33 | │   ├── __pycache__
34 | │   ├── items.py
35 | │   ├── middlewares.py
36 | │   ├── pipelines.py
37 | │   ├── settings.py
38 | │   └── spiders
39 | │       ├── __init__.py
40 | │       └── __pycache__
41 | └── scrapy.cfg
42 | ```
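43 |
44 | ## A minimal spider
45 |
46 | For reference, a spider in this project might look like the sketch below (an illustrative example; the actual `spiders/base_spider.py` may differ). It fills in the `BaseScrapyItem` fields defined in `items.py`:
47 |
48 | ```python
49 | # base_scrapy/spiders/example_spider.py  (illustrative file name)
50 | import scrapy
51 | from base_scrapy.items import BaseScrapyItem
52 |
53 |
54 | class ExampleSpider(scrapy.Spider):
55 |     name = 'base_scrapy'  # the spider name that entrypoint.py runs: scrapy crawl base_scrapy
56 |     start_urls = ['http://quotes.toscrape.com/']  # example start page
57 |
58 |     def parse(self, response):
59 |         item = BaseScrapyItem()
60 |         item['url'] = response.url
61 |         item['status'] = response.status
62 |         item['body'] = response.text
63 |         yield item
64 | ```
65 |
66 | Run it from the project root with `scrapy crawl base_scrapy`, or with `python base_scrapy/entrypoint.py`.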
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/framework/base_scrapy/base_scrapy/__init__.py
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/entrypoint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.cmdline import execute
3 | # This file is used for debugging: it runs the crawler via Scrapy's command line; the third list element is the spider name
4 | execute(['scrapy', 'crawl', 'base_scrapy'])
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | # 在这里定义你的 items,可以定义很多个 class,不同的 spiders 里面引用不同的
5 | # See documentation in:
6 | # https://docs.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BaseScrapyItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | url = scrapy.Field()
15 | status = scrapy.Field()
16 | # headers = scrapy.Field()
17 | body = scrapy.Field()
18 | pass
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class BaseScrapySpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Request, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class BaseScrapyDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BaseScrapyPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for base_scrapy project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://docs.scrapy.org/en/latest/topics/settings.html
9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'base_scrapy'
13 |
14 | SPIDER_MODULES = ['base_scrapy.spiders']
15 | NEWSPIDER_MODULE = 'base_scrapy.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'base_scrapy (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | #ROBOTSTXT_OBEY = True
23 | # Do not obey robots.txt rules
24 | ROBOTSTXT_OBEY = False
25 |
26 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
27 | #CONCURRENT_REQUESTS = 32
28 |
29 | # Configure a delay for requests for the same website (default: 0)
30 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
31 | # See also autothrottle settings and docs
32 | #DOWNLOAD_DELAY = 3
33 | # The download delay setting will honor only one of:
34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
35 | #CONCURRENT_REQUESTS_PER_IP = 16
36 |
37 | # Disable cookies (enabled by default)
38 | #COOKIES_ENABLED = False
39 |
40 | # Disable Telnet Console (enabled by default)
41 | #TELNETCONSOLE_ENABLED = False
42 |
43 | # Override the default request headers:
44 | #DEFAULT_REQUEST_HEADERS = {
45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | # 'Accept-Language': 'en',
47 | #}
48 |
49 | # Enable or disable spider middlewares
50 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'base_scrapy.middlewares.BaseScrapySpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'base_scrapy.middlewares.BaseScrapyDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
69 | #ITEM_PIPELINES = {
70 | # 'base_scrapy.pipelines.BaseScrapyPipeline': 300,
71 | #}
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | HTTPCACHE_ENABLED = True
89 | HTTPCACHE_EXPIRATION_SECS = 0
90 | HTTPCACHE_DIR = 'httpcache'
91 | HTTPCACHE_IGNORE_HTTP_CODES = []
92 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/spiders/base_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # import the scrapy package
3 | import os
4 | import scrapy
5 | from bs4 import BeautifulSoup
6 | # the standalone Request module; needed when the spider has to follow URLs
7 | from scrapy.http import Request
8 | # the fields I defined to save (imports the BaseScrapyItem class from the project's items.py)
9 | from base_scrapy.items import BaseScrapyItem
10 |
11 | # From the Scrapy project root, run in a console: scrapy crawl base_spider -o data/base_spider/item.json
12 | class BaseSpider(scrapy.Spider):
13 |     # spider name; once defined, run the spider from the project root with: scrapy crawl {name}
14 | name = 'base_spider'
15 |     # define some constants
16 | data_dir = 'data'
17 | allowed_domains = ['baidu.com']
18 | bash_url = 'https://www.baidu.com/s?wd='
19 |
20 | def start_requests(self):
21 | for i in range(1, 10):
22 | url = self.bash_url + str(i)
23 |             # hand the crawled page to the parse method for processing
24 | yield Request(url, self.parse)
25 |
26 | def parse(self, response):
27 |         '''
28 |         start_requests has already fetched the page; how to extract the content we want is defined in this method,
29 |         i.e. extract it with XPath, regular expressions, or CSS selectors. This example just shows Scrapy's flow:
30 |         1. define the links;
31 |         2. crawl (download) the pages through those links;
32 |         3. define rules, then extract the data. (this step)
33 |         '''
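        # Illustrative only (not used below): with Scrapy >= 1.8 the response object
        # exposes selector shortcuts for this extraction step, e.g.
        #   page_title = response.xpath('//title/text()').get()
        #   first_link = response.css('a::attr(href)').get()
        # (older Scrapy releases use .extract_first() instead of .get())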
34 |         # # derive a count from the link above; file name: baidu.com-{n}.txt
35 |         # file_name = self.allowed_domains[0] + '-' + response.url.split("=")[-1] + '.txt'
36 |         # # file path
37 |         # file_path = os.path.join(self.data_dir, self.name)
38 |         # # create the folder
39 |         # if not os.path.exists(file_path):
40 |         #     os.makedirs(file_path)
41 |         # # build the full file name
42 |         # file_full_name = os.path.join(file_path, file_name)
43 |         # with open(file_full_name, 'wb') as f:
44 |         #     # plain python file handling, nothing special
45 |         #     f.write(response.body)
46 |         #     # log it
47 |         #     self.log('保存文件: %s' % file_full_name)
48 | item = BaseScrapyItem()
49 | item['url'] = response.url
50 | item['status'] = response.status
51 | # item['headers'] = str(response.headers, encoding='utf8')
52 | item['body'] = str(response.body, encoding='utf8')
53 | yield item
--------------------------------------------------------------------------------
/framework/base_scrapy/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = base_scrapy.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = base_scrapy
12 |
--------------------------------------------------------------------------------
/huaban.com/PreviewHtmlTool.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | def saveIndexHtmlFile(save_path, title, border_id, max_page):
3 | template = '''
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | %(title)s
13 |
112 |
113 |
114 |
115 | %(title)s - 1/%(max_page)s
116 |
117 |
118 |
119 |
120 |
s_1.jpg)
121 |
122 |
123 |
124 |
125 |
126 |
182 |
183 |
184 | '''
185 | # html = template % {'title':title, 'border_id':border_id, 'max_page':str(max_page)}
186 | html = template.replace("%(title)s", title).replace("%(border_id)s", border_id).replace("%(max_page)s", str(max_page))
187 | with open(save_path, 'w', encoding='UTF-8') as f:
188 | f.write(html)
189 |
190 | saveIndexHtmlFile("./test.html", "adfaf", "12341", 123)
--------------------------------------------------------------------------------
/huaban.com/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Background
3 |
4 | I recently came across a nice Huaban board (淡然小笺赋箴言) and wanted to crawl the board's images together with their accompanying text.
5 |
6 | ## Huaban (huaban.com)
7 |
8 | 0. Asynchronous crawler test
9 | 1. Plain-text mode
10 | 2. Static HTML mode
11 |
12 |
13 | ### Asynchronous crawler test
14 |
15 | All images on Huaban are loaded asynchronously, so the crawler has to drive a real browser; this is a quick exercise in learning and testing selenium.
16 |
17 | Code source:
18 | Author: 疯魔的小咸鱼
19 | Link: https://www.jianshu.com/p/554c6d5af3ca
20 |
21 | PS: notes on installing selenium
22 |
23 | - Issue 1: selenium has dropped PhantomJS; headless Firefox or Chrome is recommended instead (see the sketch after this list).
24 |
25 |   Workaround used here: downgrade selenium.
26 |   `pip show selenium` reports that the default installed version is 3.14.0.
27 |   Uninstall it with `pip uninstall selenium`, then reinstall a pinned version with `pip install selenium==2.48.0`.
28 |
29 | - Issue 2: Unable to start phantomjs with ghostdriver: [WinError 2] The system cannot find the file specified
30 |
31 |   Workaround: download phantomjs into that directory, or add the phantomjs directory to the PATH environment variable
32 |
33 |   Download page: http://phantomjs.org/download.html , pick the build for your operating system
34 |
35 | - Issue 3: when using chromedriver, the download mirror and the driver/browser version table
36 |
37 |   Download mirror: http://npm.taobao.org/mirrors/chromedriver
38 |
39 |   Version table: https://blog.csdn.net/yoyocat915/article/details/80580066
40 |
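As an alternative to downgrading selenium for PhantomJS, headless Chrome works with current selenium releases. A minimal sketch, assuming chromedriver is on PATH and its major version matches the installed Chrome (illustrative only, not part of the scripts in this folder):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")               # run Chrome without a visible window
options.add_argument("--window-size=1920,1080")
# selenium 3.14+/4.x accept options=; very old releases use chrome_options= instead
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://huaban.com")
    print(driver.title)                          # sanity check that the page loaded
finally:
    driver.quit()
```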
41 | Test result:
42 |
43 | 
44 |
45 | ### Plain-text mode
46 |
47 | Saves all image information inside a board, including the images themselves and their text descriptions.
48 |
49 | Using the board [淡然小笺赋箴言](http://huaban.com/boards/13448395/) as an example:
50 |
51 | 
52 |
53 | While the board data is being saved, a standalone `index.html` preview page is generated in the same directory (a usage sketch follows below).
54 |
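The preview page comes from `PreviewHtmlTool.saveIndexHtmlFile`, which fills a built-in HTML template with the board title, the file-name prefix, and the number of images. A minimal usage sketch (the argument values are illustrative):

```python
import PreviewHtmlTool

# Writes ./淡然小笺赋箴言/index.html for a 40-image board whose files are
# named 13448395_1.jpg / 13448395_1.txt, 13448395_2.jpg, ...
PreviewHtmlTool.saveIndexHtmlFile(
    save_path="./淡然小笺赋箴言/index.html",  # where to write the preview page
    title="淡然小笺赋箴言",                   # board title shown in the page
    border_id="13448395",                     # prefix of the saved image/text files
    max_page=40,                              # number of images in the board
)
```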
55 | I uploaded part of the crawled data (40 images) to this repo; click [here](https://petterobam.github.io/learn-scrapy/huaban.com/%E6%B7%A1%E7%84%B6%E5%B0%8F%E7%AC%BA%E8%B5%8B%E7%AE%B4%E8%A8%80/index.html) to preview it.
56 |
57 | 
58 |
--------------------------------------------------------------------------------
/huaban.com/huaban-border-text.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from selenium import webdriver
3 | import time
4 | import os
5 | import requests
6 | import PreviewHtmlTool
7 |
8 |
9 | class Huaban():
10 | def __init__(self, username, password):
11 | self.username = username
12 | self.password = password
13 |
14 |     # fetch the images and their text descriptions, and save them to files
15 | def get_picture_info_by_border_url(self, border_url):
16 |
17 |         # Drive the page with the Chrome browser; the downloaded chromedriver has to sit on python's path.
18 |         # Once debugged this can be switched to PhantomJS, which should be a bit faster.
19 | # driver = webdriver.PhantomJs()
20 | # driver = webdriver.PhantomJS('../plugin/phantomjs-2.1.1-macosx/bin/phantomjs')
21 | driver = webdriver.Chrome('../plugin/chromedriver')
22 |         # maximize the browser window
23 | driver.maximize_window()
24 |
25 |         if self.username != None and len(self.username) > 0:
26 | url = "http://huaban.com"
27 | driver.get(url)
28 | time.sleep(8)
29 |
30 |             # click "log in" to bring up the login dialog
31 | driver.find_elements_by_xpath('//a[@class="login bounce btn wbtn"]')[0].click()
32 |             # enter the username
33 | try:
34 | driver.find_elements_by_xpath('//input[@name="email"]')[0].send_keys(self.username)
35 | print('用户名输入OK!')
36 | except:
37 | print('用户名输入异常!')
38 | time.sleep(3)
39 |             # enter the password
40 | try:
41 | driver.find_elements_by_xpath('//input[@name="password"]')[0].send_keys(self.password)
42 | print('密码输入OK!')
43 | except:
44 | print('密码输入异常!')
45 | time.sleep(3)
46 |             # click the login button
47 | try:
48 | driver.find_elements_by_xpath('//a[@class="btn btn18 rbtn"]')[0].click()
49 | print('点击登陆OK!')
50 | except:
51 | print('点击登陆异常')
52 | time.sleep(3)
53 |
54 |         #visit the board, e.g. http://huaban.com/boards/13448395/
55 | driver.get(border_url)
56 | time.sleep(5)
57 | i = 0
58 | page = 1
59 | global name
60 | global store_path
61 | global path
62 |         # get the board title  //div[@id="board_card"]/div[@class="inner"]/div[@class="head-line"]/h1
63 | content = driver.find_elements_by_xpath('//div[@id="board_card"]/div[@class="inner"]/div[@class="head-line"]/h1')[0].text
64 | path = "./" + content
65 | # hash_content = str(hash(content))
66 | # hash_content = border_url[-9:-1]
67 | url_split_list = border_url.split("/")
68 | hash_content = url_split_list[-2] + url_split_list[-1]
69 |
70 |         # images are saved into the folder `path`, created under the script's working directory by default
71 | if not os.path.exists(path):
72 | os.makedirs(path)
73 |         #get the total number of images  //div[@id="board_card"]/div[@class="bar"]/div[@class="tabs"]/a
74 | pictures_count = driver.find_elements_by_xpath('//div[@id="board_card"]/div[@class="bar"]/div[@class="tabs"]/a')[0].text.replace('采集', '')
75 | print(pictures_count)
76 |
77 |         # generate the HTML preview page
78 | PreviewHtmlTool.saveIndexHtmlFile(path + "/index.html", content, hash_content, pictures_count)
79 |
80 | pages = int(int(pictures_count) / 20)
81 | print(pages)
82 |         #locate the elements that hold the image urls
83 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
84 |         #locate the matching text descriptions of the images
85 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]')
86 |
87 | while page <= pages:
88 | while len(url_elements) < 20 * page:
89 | driver.execute_script("window.scrollBy(0,1000)")
90 | time.sleep(3)
91 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
92 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]')
93 |
94 | print("第%s页" % page)
95 |
96 | for url_element in url_elements[20 * (page - 1):20 * page]:
97 | download_url = url_element.get_attribute("src")[:-3] + "658"
98 | pic_info = pic_info_elements[i].get_attribute("data-raw")
99 | i += 1
100 | store_path = hash_content + "_" + str(i)
101 | self.store(download_url, pic_info)
102 |
103 | page += 1
104 |
105 |         #last page
106 | print("第%s页" % int(page))
107 |
108 | while len(url_elements) < int(pictures_count):
109 | driver.execute_script("window.scrollBy(0,1000)")
110 | time.sleep(3)
111 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
112 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]')
113 |
114 | for url_element in url_elements[20 * (page - 1):]:
115 | download_url = url_element.get_attribute("src")[:-3] + "658"
116 | pic_info = pic_info_elements[i].get_attribute("data-raw")
117 | i += 1
118 | store_path = hash_content + "_" + str(i)
119 | self.store(download_url, pic_info)
120 |
121 |     #save an image and its text to disk
122 | def store(self, picture_url, picture_info):
123 | pic_path = path + '/'+ store_path
124 |
125 | with open(pic_path + '.jpg', 'wb') as f:
126 | picture = requests.get(picture_url)
127 | f.write(picture.content)
128 | print('正在保存图片:' + picture_url)
129 | print(f'文件:{pic_path}.jpg')
130 |
131 | with open(pic_path + '.txt', 'w', encoding='UTF-8') as f:
132 | f.write(picture_info)
133 | print('正在保存图片文字信息:' + picture_url)
134 | print(f'文件:{pic_path}.txt')
135 |
136 | if __name__ == "__main__":
137 | username = input('请输入花瓣账号名:') # '花瓣账号'
138 | password = input('请输入账号对应密码:') # '账号密码'
139 | huaban = Huaban(username, password)
140 |     #crawl the board [淡然小笺赋箴言] http://huaban.com/boards/13448395/
141 | border_url = 'http://huaban.com/boards/13448395/'
142 | huaban.get_picture_info_by_border_url(border_url)
143 |
--------------------------------------------------------------------------------
/huaban.com/huaban-simple.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from selenium import webdriver
3 | import time
4 | import os
5 | import requests
6 |
7 |
8 | class Huaban():
9 | def __init__(self, username, password):
10 | self.username = username
11 | self.password = password
12 |
13 |     # collect the image urls into urls_list
14 | def get_picture_url(self, content):
15 | global path
16 | path = "./" + content
17 |         # images are saved into the folder `path`, created under the script's working directory by default
18 | if not os.path.exists(path):
19 | os.makedirs(path)
20 | url = "http://huaban.com"
21 |         # Drive the page with the Chrome browser; the downloaded chromedriver has to sit on python's path.
22 |         # Once debugged this can be switched to PhantomJS, which should be a bit faster.
23 | # driver = webdriver.PhantomJs()
24 | # driver = webdriver.PhantomJS('../plugin/phantomjs-2.1.1-macosx/bin/phantomjs')
25 | driver = webdriver.Chrome('../plugin/chromedriver')
26 |         # maximize the browser window
27 | driver.maximize_window()
28 | driver.get(url)
29 | time.sleep(8)
30 |
31 |         # click "log in" to bring up the login dialog
32 | driver.find_elements_by_xpath('//a[@class="login bounce btn wbtn"]')[0].click()
33 |         # enter the username
34 | try:
35 | driver.find_elements_by_xpath('//input[@name="email"]')[0].send_keys(self.username)
36 | print('用户名输入OK!')
37 | except:
38 | print('用户名输入异常!')
39 | time.sleep(3)
40 |         # enter the password
41 | try:
42 | driver.find_elements_by_xpath('//input[@name="password"]')[0].send_keys(self.password)
43 | print('密码输入OK!')
44 | except:
45 | print('密码输入异常!')
46 | time.sleep(3)
47 |         # click the login button
48 | try:
49 | driver.find_elements_by_xpath('//a[@class="btn btn18 rbtn"]')[0].click()
50 | print('点击登陆OK!')
51 | except:
52 | print('点击登陆异常')
53 | time.sleep(3)
54 |         #search for images
55 | driver.find_elements_by_xpath('//input[@placeholder="搜索你喜欢的"]')[0].send_keys(content)
56 | driver.find_elements_by_xpath('//form[@id="search_form"]/a')[0].click()
57 | time.sleep(5)
58 | i = 0
59 | page = 1
60 | global name
61 | global store_path
62 | global urls_list
63 | urls_list = []
64 |         #get the total number of images
65 | pictures_count = driver.find_elements_by_xpath('//a[@class="selected"]/i')[0].text
66 | print(pictures_count)
67 | pages = int(int(pictures_count) / 20)
68 | print(pages)
69 |         #locate the elements that hold the image urls
70 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
71 |         #walk the list of image elements and collect their urls
72 | for url_element in url_elements:
73 | picture_url = url_element.get_attribute("src")[:-3] + "658"
74 |             #avoid collecting duplicate image urls
75 | if picture_url not in urls_list:
76 | urls_list.append(picture_url)
77 | while page <= pages:
78 | while len(urls_list) < 20*page:
79 | driver.execute_script("window.scrollBy(0,1000)")
80 | time.sleep(3)
81 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
82 | for url_element in url_elements:
83 | picture_url = url_element.get_attribute("src")[:-3] + "658"
84 | if picture_url not in urls_list:
85 | urls_list.append(picture_url)
86 | print("第%s页" % page)
87 |
88 | for download_url in urls_list[20*(page-1):20*page]:
89 | i += 1
90 | name = content + "_" + str(i)
91 | store_path = name + '.jpg'
92 | self.store(download_url)
93 | page += 1
94 |         #last page
95 | print("第%s页" % int(page))
96 |
97 | while len(urls_list) < int(pictures_count):
98 | driver.execute_script("window.scrollBy(0,1000)")
99 | time.sleep(3)
100 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
101 | for url_element in url_elements:
102 | picture_url = url_element.get_attribute("src")[:-3] + "658"
103 | if picture_url not in urls_list:
104 | urls_list.append(picture_url)
105 | for download_url in urls_list[20*(page-1): ]:
106 | i += 1
107 | name = content + "_" + str(i)
108 | store_path = name + '.jpg'
109 | self.store(download_url)
110 |
111 |     #save an image to disk
112 |     def store(self, picture_url):
113 |         picture = requests.get(picture_url)
114 |         with open(os.path.join(path, store_path), 'wb') as f:  # close the file after writing
115 |             f.write(picture.content)
116 |         print('正在保存图片:' + picture_url)
117 |         print('文件:' + name)
118 |
119 | if __name__ == "__main__":
120 | content = '赵丽颖'
121 | username = input('请输入花瓣账号名:') # '花瓣账号'
122 | password = input('请输入账号对应密码:') # '账号密码'
123 | huaban = Huaban(username, password)
124 | huaban.get_picture_url(content)
125 |
--------------------------------------------------------------------------------
/huaban.com/img/huaban-border-txt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-border-txt.png
--------------------------------------------------------------------------------
/huaban.com/img/huaban-preview-border.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-preview-border.png
--------------------------------------------------------------------------------
/huaban.com/img/huaban-simple-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-simple-1.png
--------------------------------------------------------------------------------
/huaban.com/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | adfaf
7 |
106 |
107 |
108 |
109 | adfaf - 1/123
110 |
111 |
112 |
113 |
114 |

115 |
116 |
117 |
118 |
119 |
120 |
176 |
177 |
178 |
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_1.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_1.txt:
--------------------------------------------------------------------------------
1 | 入夏偏宜澹薄妆,越罗衣褪郁金黄,翠钿檀注助容光。
相见无言还有恨,几回判却又思量,月窗香径梦悠飏。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_10.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_10.txt:
--------------------------------------------------------------------------------
1 | 初心已恨花期晚。别后相思长在眼。兰衾犹有旧时香,每到梦回珠泪满。
多应不信人肠断。几夜夜寒谁共暖。欲将恩爱结来生,只恐来生缘又短。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_11.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_11.txt:
--------------------------------------------------------------------------------
1 | 年年此夕东城见,欢意匆匆。明日还重。却在楼台缥缈中。
垂螺拂黛清歌女,曾唱相逢。秋月春风。醉枕香衾一岁同。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_12.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_12.txt:
--------------------------------------------------------------------------------
1 | 四十年来家国,三千里地山河。凤阁龙楼连霄汉,玉树琼枝作烟萝,几曾识干戈?
一旦归为臣虏,沈腰潘鬓消磨。最是仓皇辞庙日,教坊犹奏别离歌,垂泪对宫娥。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_13.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_13.txt:
--------------------------------------------------------------------------------
1 | 彩袖殷勤捧玉钟,当年拚却醉颜红。舞低杨柳楼心月,歌尽桃花扇底风。
从别后,忆相逢,几回魂梦与君同。今宵剩把银釭照,犹恐相逢是梦中。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_14.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_14.txt:
--------------------------------------------------------------------------------
1 | 可怜白雪曲,未遇知音人。
恓惶戎旅下,蹉跎淮海滨。
涧树含朝雨,山鸟哢馀春。
我有一瓢酒,可以慰风尘。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_15.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_15.txt:
--------------------------------------------------------------------------------
1 | 罗带惹香,犹系别时红豆。泪痕新,金缕旧,断离肠。
一双娇燕语雕梁,还是去年时节。绿杨浓,芳草歇,柳花狂。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_16.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_16.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_16.txt:
--------------------------------------------------------------------------------
1 | 天涯旧恨,独自凄凉人不问。欲见回肠,断尽金炉小篆香。
黛蛾长敛,任是春风吹不展。困倚危楼,过尽飞鸿字字愁。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_17.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_17.txt:
--------------------------------------------------------------------------------
1 | 人生愁恨何能免,销魂独我情何限!故国梦重归,觉来双泪垂。
高楼谁与上?长记秋晴望。往事已成空,还如一梦中。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_18.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_18.txt:
--------------------------------------------------------------------------------
1 | 恨君不似江楼月,南北东西,南北东西,只有相随无别离。
恨君却似江楼月,暂满还亏,暂满还亏,待得团圆是几时?
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_19.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_19.txt:
--------------------------------------------------------------------------------
1 | 多少恨,昨夜梦魂中。还似旧时游上苑,车如流水马如龙。花月正春风。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_2.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_2.txt:
--------------------------------------------------------------------------------
1 | 身外闲愁空满,眼中欢事常稀。明年应赋送君诗。细从今夜数,相会几多时。
浅酒欲邀谁劝,深情惟有君知。东溪春近好同归。柳垂江上影,梅谢雪中枝。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_20.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_20.txt:
--------------------------------------------------------------------------------
1 | 涉江玩秋水,爱此红蕖鲜。
攀荷弄其珠,荡漾不成圆。
佳人彩云里,欲赠隔远天。
相思无因见,怅望凉风前。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_21.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_21.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_21.txt:
--------------------------------------------------------------------------------
1 | 急景流年真一箭。残雪声中,省识东风面。风里垂杨千万线,昨宵染就鹅黄浅。
又是廉纤春雨暗。倚遍危楼,高处人难见。已恨平芜随雁远,暝烟更界平芜断。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_22.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_22.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_22.txt:
--------------------------------------------------------------------------------
1 | 淡烟飘薄。莺花谢、清和院落。树阴翠、密叶成幄。麦秋霁景,夏云忽变奇峰、倚寥廊。波暖银塘,涨新萍绿鱼跃。想端忧多暇,陈王是日,嫩苔生阁。
正铄石天高,流金昼永,楚榭光风转蕙,披襟处、波翻翠幕。以文会友,沈李浮瓜忍轻诺。别馆清闲,避炎蒸、岂须河朔。但尊前随分,雅歌艳舞,尽成欢乐。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_23.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_23.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_23.txt:
--------------------------------------------------------------------------------
1 | 春到南楼雪尽。惊动灯期花信。小雨一番寒。倚阑干。
莫把栏干频倚。一望几重烟水。何处是京华。暮云遮。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_24.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_24.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_24.txt:
--------------------------------------------------------------------------------
1 | 柳色披衫金缕凤,纤手轻拈红豆弄,翠蛾双敛正含情。桃花洞,瑶台梦,一片春愁谁与共?
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_25.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_25.txt:
--------------------------------------------------------------------------------
1 | 阑珊心绪,醉倚绿琴相伴住。一枕新愁,残夜花香月满楼。
繁笙脆管,吹得锦屏春梦远。只有垂杨,不放秋千影过墙。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_26.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_26.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_26.txt:
--------------------------------------------------------------------------------
1 | 卷尽愁云,素娥临夜新梳洗。暗尘不起。酥润凌波地。
辇路重来,仿佛灯前事。情如水。小楼熏被。春梦笙歌里。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_27.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_27.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_27.txt:
--------------------------------------------------------------------------------
1 | 杨柳丝丝弄轻柔,烟缕织成愁。海棠未雨,梨花先雪,一半春休。
而今往事难重省,归梦绕秦楼。相思只在:丁香枝上,豆蔻梢头。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_28.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_28.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_28.txt:
--------------------------------------------------------------------------------
1 | 秋池阁。风傍晓庭帘幕。霜叶未衰吹未落。半惊鸦喜鹊。
自笑浮名情薄。似与世人疏略。一片懒心双懒脚。好教闲处著。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_29.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_29.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_29.txt:
--------------------------------------------------------------------------------
1 | 谢却荼蘼,一片月明如水。篆香消,犹未睡,早鸦啼。
嫩寒无赖罗衣薄,休傍阑干角。最愁人,灯欲落,雁还飞。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_3.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_3.txt:
--------------------------------------------------------------------------------
1 | 背庭缘恐花羞坠。心事遥山里。小帘愁卷月笼明。一寸秋怀禁得、几蛩声。
井梧不放西风起。供与离人睡。梦和新月未圆时。起看檐蛛结网、又寻思。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_30.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_30.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_30.txt:
--------------------------------------------------------------------------------
1 | 雨后春容清更丽。只有离人,幽恨终难洗。北固山前三面水。碧琼梳拥青螺髻。
一纸乡书来万里。问我何年,真个成归计。白首送春拚一醉。东风吹破千行泪。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_31.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_31.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_31.txt:
--------------------------------------------------------------------------------
1 | 如花貌。当来便约,永结同心偕老。为妙年、俊格聪明,凌厉多方怜爱,何期养成心性近,元来都不相表。渐作分飞计料。
稍觉因情难供,恁殛恼。争克罢同欢笑。已是断弦尤续,覆水难收,常向人前诵谈,空遣时传音耗。漫悔懊。此事何时坏了。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_32.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_32.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_32.txt:
--------------------------------------------------------------------------------
1 | 西施宜笑复宜颦,丑女效之徒累身。
君王虽爱蛾眉好,无奈宫中妒杀人!
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_33.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_33.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_33.txt:
--------------------------------------------------------------------------------
1 | 十里春风,二分明月,蕊仙飞下琼楼。看冰花翦翦,拥碎玉成毬。想长日、云阶伫立,太真肌骨,飞燕风流。敛群芳、清丽精神,都付扬州。
雨窗数朵,梦惊回、天际香浮。似阆苑花神,怜人冷落,骑鹤来游。为问竹西风景,长空淡、烟水悠悠。又黄昏,羌管孤城,吹起新愁。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_34.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_34.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_34.txt:
--------------------------------------------------------------------------------
1 | 记得来时春未暮,执手攀花,袖染花梢露。暗卜春心共花语,争寻双朵争先去。
多情因甚相辜负,轻拆轻离,欲向谁分诉。泪湿海棠花枝处,东君空把奴分付。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_35.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_35.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_35.txt:
--------------------------------------------------------------------------------
1 | 名花倾国两相欢,常得君王带笑看。
解释春风无限恨,沉香亭北倚栏杆。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_36.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_36.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_36.txt:
--------------------------------------------------------------------------------
1 | 无情最是江头柳。长条折尽还依旧。木叶下平湖。雁来书有无。雁无书尚可。妙语凭谁和。风雨断肠时。小山生桂枝。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_37.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_37.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_37.txt:
--------------------------------------------------------------------------------
1 | 画楼影蘸清溪水。歌声响彻行云里。帘幕燕双双。绿杨低映窗。
曲中特地误。要试周郎顾。醉里客魂消。春风大小乔。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_38.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_38.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_38.txt:
--------------------------------------------------------------------------------
1 | 燕语莺啼人乍远。却恨西园,依旧莺和燕。笑语十分愁一半。翠园特地春光暖。
只道书来无过雁。不道柔肠,近日无肠断。柄玉莫摇湘泪点。怕君唤作秋风扇。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_39.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_39.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_39.txt:
--------------------------------------------------------------------------------
1 | 可怜今夕月,向何处、去悠悠?
是别有人间,那边才见,光影东头?
是天外空汗漫,但长风、浩浩送中秋?
飞镜无根谁系?嫦娥不嫁谁留?
谓经海底问无由,恍惚使人愁。
怕万里长鲸,纵横触破,玉殿琼楼。
虾蟆故堪浴水,问云何、玉兔解沉浮?
若道都齐无恙,云何渐渐如钩?
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_4.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_4.txt:
--------------------------------------------------------------------------------
1 | 香叆雕盘,寒生冰箸,画堂别是风光。主人情重,开宴出红妆。腻玉圆搓素颈,藕丝嫩、新织仙裳。双歌罢,虚檐转月,余韵尚悠扬。
人间,何处有,司空见惯,应谓寻常。坐中有狂客,恼乱愁肠。报道金钗坠也,十指露、春笋纤长。亲曾见,全胜宋玉,想像赋高唐。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_40.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_40.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_40.txt:
--------------------------------------------------------------------------------
1 | 风骨萧然,称独立、群仙首。春江雪、一枝梅秀。小样香檀,映朗玉、纤纤手。未久。转新声、泠泠山溜。
曲里传情,更浓似、尊中酒。信倾盖、相逢如旧。别后相思,记敏政堂前柳。知否。又拚了、一场消瘦。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_5.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_5.txt:
--------------------------------------------------------------------------------
1 | 雕阴无树水南流,雉堞连云古帝州。
带雨晚驼鸣远戍,望乡孤客倚高楼。
明妃去日花应笑,蔡琰归时鬓已秋。
一曲单于暮烽起,扶苏城上月如钩。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_6.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_6.txt:
--------------------------------------------------------------------------------
1 | 美女妖且闲,采桑歧路间。
柔条纷冉冉,叶落何翩翩。
攘袖见素手,皓腕约金环。
头上金爵钗,腰佩翠琅玕。
明珠交玉体,珊瑚间木难。
罗衣何飘飘,轻裾随风还。
顾盼遗光彩,长啸气若兰。
行徒用息驾,休者以忘餐。
借问女安居,乃在城南端。
青楼临大路,高门结重关。
容华耀朝日,谁不希令颜?
媒氏何所营?玉帛不时安。
佳人慕高义,求贤良独难。
众人徒嗷嗷,安知彼所观?
盛年处房室,中夜起长叹。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_7.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_7.txt:
--------------------------------------------------------------------------------
1 | 斑骓路与阳台近。前度无题初借问。暖风鞭袖尽闲垂,微月帘栊曾暗认。
梅花未足凭芳信。弦语岂堪传素恨。翠眉饶似远山长,寄与此愁颦不尽。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_8.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_8.txt:
--------------------------------------------------------------------------------
1 | 留春不住。恰似年光无味处。满眼飞英。弹指东风太浅情。
筝弦未稳。学得新声难破恨。转枕花前。且占香红一夜眠。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_9.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_9.txt:
--------------------------------------------------------------------------------
1 | 出墙花,当路柳。借问芳心谁有。红解笑,绿能颦。千般恼乱春。
北来人,南去客。朝暮等闲攀折。怜晚芳,惜残阳。情知枉断肠。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | 淡然小笺赋箴言
11 |
110 |
111 |
112 |
113 | 淡然小笺赋箴言 - 1/40
114 |
115 |
116 |
117 |
118 |

119 |
120 |
121 |
122 |
123 |
124 |
180 |
181 |
182 |
--------------------------------------------------------------------------------
/jjwxk.net/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/jjwxk.net/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | I recently wanted to learn how to write novels, so I crawled some material to analyze.
4 |
5 | ## Jinjiang Literature City (jjwxc.net)
6 |
7 | Crawling the free novels
8 |
9 | 1. Simple mode
10 | 2. Static HTML mode
11 | 3. Sqlite mode
12 | 4. ES mode
13 | 5. ES + ECHARTS mode
14 |
15 | ### Simple mode
16 |
17 | How to run: cd into this directory, then
18 |
19 | ```
20 | python jjwxk-free-simple.py
21 | ```
22 |
23 | 1. Simple mode uses plain text files as the data medium; folders provide the storage hierarchy, one sub-folder per novel
24 | 2. Crawling the novel list and crawling each novel's content are separate steps, so the free_list and book_list methods can run in two threads; neither method supports multiple processes
25 | 3. Simple checkpoint/resume crawling is implemented by recording progress in text files: the list-page progress, the number of finished novels, and the chapter progress of the current novel (see the sketch after this list)
26 |
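The checkpoint idea boils down to a tiny progress file that is overwritten after every finished row and re-read on startup. A stand-alone sketch of the same `page-line` format that `jjwxk-free-simple.py` keeps in `jjwxk_free_simple/total.txt` (using the stdlib here instead of the project's FileTool helpers):

```python
import os

PROGRESS_FILE = "jjwxk_free_simple/total.txt"  # "page-line" of the last finished list row

def read_progress():
    # returns (page, line) of the last checkpoint, or (0, 0) on a fresh start
    if not os.path.exists(PROGRESS_FILE):
        return 0, 0
    with open(PROGRESS_FILE, encoding="utf-8") as f:
        page, line = f.read().strip().split("-")
    return int(page), int(line)

def save_progress(page, line):
    # overwrite so the file always holds only the newest checkpoint
    with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
        f.write(f"{page}-{line}")
```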
27 | Some screenshots:
28 |
29 | 
30 |
31 | 
32 |
33 | >PS: this project is for learning and sharing only; please do not use it commercially
--------------------------------------------------------------------------------
/jjwxk.net/img/jjwxk-free-simple-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/jjwxk.net/img/jjwxk-free-simple-1.png
--------------------------------------------------------------------------------
/jjwxk.net/img/jjwxk-free-simple-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/jjwxk.net/img/jjwxk-free-simple-2.png
--------------------------------------------------------------------------------
/jjwxk.net/jjwxk-free-simple.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | from BaseTools.MyDownload import request
5 | from BaseTools.MyUtil import FileTool
6 | import time
7 |
8 | class jjwxk_free_simple():
9 | def __init__(self):
10 | self.headers = {
11 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
12 | # 'Accept-Encoding': 'gzip, deflate',
13 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
14 | 'Host': 'www.jjwxc.net',
15 | 'Upgrade-Insecure-Requests': '1',
16 | 'User-Agent':"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
17 | }
18 | self.basePath = "jjwxk_free_simple/"
19 | FileTool.mkdir(self.basePath)
20 | self.baseListFilePath = self.basePath + "book-list.txt"
21 | self.baseUrlFilePath = self.basePath + "book-url.txt"
22 | self.basePageFilePath = self.basePath + "book-page.txt"
23 | self.totalFinishFilePath = self.basePath + "total.txt"
24 | self.finishBookLineCountFilePath = self.basePath + "book-total.txt"
25 | self.globalPageCount = 0
26 | self.pageCount = 0
27 | self.lineCount = 0
28 | self.readFinishCountInfo()
29 |
30 |     # crawl entry point; default list page: http://www.jjwxc.net/bookbase_slave.php?booktype=free
31 | def free_list(self, limitPage=1, url="http://www.jjwxc.net/bookbase_slave.php?booktype=free"):
32 |         html_content = self.request_content(url) ##request_content returns the page html for us
33 | FileTool.write_behind(self.basePageFilePath, url)
34 | html_ele = BeautifulSoup(html_content, 'lxml')
35 | self.globalPageCount = self.globalPageCount + 1
36 | if self.globalPageCount > limitPage:
37 | return
38 |
39 | if self.globalPageCount >= self.pageCount:
40 |             # if the current page is beyond the recorded page, restart line counting from the first row; otherwise keep the recorded line
41 | if(self.globalPageCount > self.pageCount):
42 | self.lineCount = 0
43 | self.pageCount = self.globalPageCount
44 |
45 |             # get the book table element
46 | book_table = html_ele.find("table", class_="cytable")
47 | if book_table == None:
48 | return
49 | list_tr = book_table.find_all("tr")
50 | count = -1
51 | for tr in list_tr:
52 | count = count + 1
53 | if count == 0 or self.lineCount >= count:
54 | continue
55 | list_td = tr.find_all("td")
56 | book_list_url = None
57 | book_info_arr = []
58 | count_td = 0
59 | for td in list_td:
60 | book_info_arr.append(td.get_text().replace('\n', '').replace(' ', ''))
61 | if count_td == 1:
62 | book_list_url = "http://www.jjwxc.net/" + td.find("a")['href']
63 | count_td = count_td + 1
64 | FileTool.write_behind(self.baseUrlFilePath, book_list_url)
65 | book_list_info = " | ".join(book_info_arr)
66 | FileTool.write_behind(self.baseListFilePath, book_list_info)
67 | self.lineCount = count
68 |                 # after each finished row, persist the count so crawling can resume from here later
69 | self.saveFinishCountInfo()
70 | else:
71 | self.globalPageCount = self.pageCount - 1
72 |
73 | # page_next = "http://www.jjwxc.net/" + html_ele.find_all("div", class_="controlbar")[1].find_all("a")[2]["href"]
74 | page_next = "http://www.jjwxc.net/bookbase_slave.php?booktype=free&opt=&endstr=&orderstr=4&page=" + str(self.globalPageCount + 1)
75 | if page_next == None or "" == page_next:
76 | return
77 | print("书籍清单第", self.globalPageCount, "页信息:[", url, "]抓取完毕")
78 |         # pause one second so the crawler is less likely to be noticed
79 | # time.sleep(1)
80 | self.headers['Referer'] = url
81 |         # go on to fetch the next page
82 | self.free_list(limitPage, page_next)
83 |
84 |     # crawl every book from the saved list of book links
85 | def book_list(self):
86 | book_count = 0
87 | book_finish_count = self.readSimpleFinishCountInfo(self.finishBookLineCountFilePath)
88 | for line in open(self.baseUrlFilePath):
89 |             # read the previously crawled book links line by line, stripping the trailing newline
90 | url = line.replace("\n", "")
91 | book_count = book_count + 1
92 | if book_count <= book_finish_count:
93 | print("[", url, "],该本书已经抓取过!")
94 | continue
95 | self.book_one(url)
96 |             # record how many books are done, giving simple checkpoint/resume
97 | FileTool.overwrite(self.finishBookLineCountFilePath, str(book_count))
98 | print("[", url, "],该本书所有章节已经抓取完毕!")
99 |
100 |     # save the content of one book
101 | def book_one(self, url="http://www.jjwxc.net/onebook.php?novelid=3468871"):
102 |         html_content = self.request_content(url) ##request_content returns the page html for us
103 | html_ele = BeautifulSoup(html_content, 'lxml')
104 |         # get the book table element
105 | book_table = html_ele.find("table", id="oneboolt")
106 | list_tr = book_table.find_all("tr")
107 | self.headers['Referer'] = url
108 | if len(list_tr) > 0:
109 | book_title = list_tr[0].find("h1").get_text()
110 |             # strip characters that are invalid in folder names, since novel titles may contain them
111 | book_floder = self.basePath + FileTool.replace_invalid_filename(book_title) + "/"
112 | FileTool.mkdir(book_floder)
113 | book_chapter_file = book_floder + "0.chapter_list.txt"
114 | book_chapter_url_file = book_floder + "0.chapter_url_list.txt"
115 | book_chapter_finish_count_file = book_floder + "0.current_count.txt"
116 | chapter_count = 0
117 | chapter_finish_count = self.readSimpleFinishCountInfo(book_chapter_finish_count_file)
118 | for tr in list_tr:
119 | if "itemprop" in tr.attrs:
120 | chapter_count = chapter_count + 1
121 | if chapter_count <= chapter_finish_count:
122 | print("第", chapter_count, "章,该章节已经抓取过!")
123 | continue
124 | list_td = tr.find_all("td")
125 | count_td = 0
126 | chapter_info_arr = []
127 | chapter_url = None
128 | chapter_title = None
129 | for td in list_td:
130 | chapter_info_arr.append(td.get_text().replace('\n', '').replace(' ', ''))
131 | if count_td == 1:
132 | chapter_a = td.find("a")
133 | if chapter_a != None:
134 | chapter_url = chapter_a['href']
135 | chapter_title = chapter_a.get_text()
136 | count_td = count_td + 1
137 | if chapter_url == None:
138 | print("第", chapter_count, "章,该章节已丢失!")
139 | chapter_url = "第" + str(chapter_count) + "章,该章节已丢失!"
140 | else:
141 |                         # strip characters that are invalid in file names
142 | curr_filename = FileTool.replace_invalid_filename(str(chapter_count) + "." + chapter_title + ".txt")
143 | curr_chapter_file_path = book_floder + curr_filename
144 | self.save_chapter(curr_chapter_file_path, chapter_url)
145 | FileTool.write_behind(book_chapter_url_file, chapter_url)
146 | chapter_info = " | ".join(chapter_info_arr)
147 | FileTool.write_behind(book_chapter_file, chapter_info)
148 |                     # record the number of finished chapters, giving simple checkpoint/resume
149 | FileTool.overwrite(book_chapter_finish_count_file, str(chapter_count))
150 | print("第", chapter_count, "章,该章节已经抓取完毕!")
151 |
152 |     # save the content of one chapter
153 | def save_chapter(self, path, chapter_url):
154 |         html_content = self.request_content(chapter_url) ##request_content returns the page html for us
155 | html_ele = BeautifulSoup(html_content, 'lxml')
156 | novelDiv = html_ele.find("div", class_="noveltext")
157 | if novelDiv == None:
158 | return
159 | novelHtmls = novelDiv.contents
160 | novelTextArr = []
161 |         # normalize the novel text so it keeps simple line breaks and basic formatting
162 | for novelHtml in novelHtmls:
163 | if novelHtml.name == "div" or novelHtml.name == "br":
164 | continue
165 | else:
166 | text = novelHtml.string
167 | if text == None:
168 | continue
169 | text = text.replace('\n', '').replace("\r", "").replace(" ", "")
170 | if len(text) > 0:
171 | novelTextArr.append(text)
172 | novelText = "\n\n".join(novelTextArr)
173 | FileTool.overwrite(path, novelText)
174 |
175 |
176 |     # read a simple number from a file
177 | def readSimpleFinishCountInfo(self, path):
178 | isExists = FileTool.isExit(path)
179 | if isExists:
180 | countTxt = FileTool.read_utf8(path)
181 | return int(countTxt)
182 | else:
183 | return 0
184 |
185 |     # save the finished-count info
186 | def saveFinishCountInfo(self):
187 | FileTool.overwrite(self.totalFinishFilePath, str(self.pageCount) + "-" + str(self.lineCount))
188 |
189 |     # read the finished-count info
190 | def readFinishCountInfo(self):
191 | isExists = FileTool.isExit(self.totalFinishFilePath)
192 | if isExists:
193 | countTxt = FileTool.read_utf8(self.totalFinishFilePath)
194 | countStrArr = countTxt.split("-")
195 | self.pageCount = int(countStrArr[0])
196 | self.lineCount = int(countStrArr[1])
197 | else:
198 | self.pageCount = 0
199 | self.lineCount = 0
200 |
201 |     # fetch the html text of a page
202 | def request_content(self, url):
203 | try:
204 | return request.get_utf8_content(url, headers=self.headers)
205 | except:
206 | return ""
207 |
208 |
209 | jjwxk = jjwxk_free_simple()
210 | jjwxk.free_list()
211 | # while jjwxk.globalPageCount < 10000:
212 | # try:
213 | # jjwxk.free_list()
214 | # except Exception as e:
215 | # print('except:', e)
216 | # finally:
217 | # print('finally...')
218 | jjwxk.book_list()
--------------------------------------------------------------------------------
/jjwxk.net/simple-http-server.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # in python 2 this was SimpleHTTPServer
3 | import http.server
4 | # in python 2 this was SocketServer
5 | import socketserver
6 | # custom port
7 | PORT = 8888
8 | # request handler definition
9 | Handler = http.server.SimpleHTTPRequestHandler
10 | # TCP server
11 | httpd = socketserver.TCPServer(("", PORT), Handler)
12 | # start the web server
13 | print("Web服务端口为:", PORT)
14 | httpd.serve_forever()
--------------------------------------------------------------------------------
/mzitu.com/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/mzitu.com/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | Crawls the images from the mzitu site.
4 | A practice example for learning python, taken from https://cuiqingcai.com/4352.html
5 |
6 | ## Runnable scripts
7 |
8 | 1. scrapy-mzitu-no-es.py: plain folder-based storage, simple checkpoint/resume, no database
9 | 2. scrapy-mzitu-es.py: storage backed by ES, image paths stored relative to this directory, checkpoint/resume
10 | 3. mzitu-crawler-es.py: simple multi-threaded crawling, storage backed by ES, image paths stored relative to this directory, checkpoint/resume (the ES bookkeeping is sketched below)
11 |
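A rough sketch of how the ES-backed scripts use the shared MzituEs helper from `mzitu_es.py` to hand out work and mark progress; the theme values below are illustrative:

```python
from mzitu_es import mzitu_es  # module-level MzituEs instance defined in mzitu_es.py

# queue a theme page for crawling, unless its url is already indexed
theme = {"imgThemeTitle": "示例图集", "imgThemeUrl": "https://www.mzitu.com/12345"}
if not mzitu_es.exit_es(theme["imgThemeUrl"]):
    mzitu_es.save_es(theme)  # stored with scrapyStatus = 0 (waiting to be crawled)

# a worker later asks ES for one document whose scrapyStatus is still 0
todo = mzitu_es.get_one_need_scrapy_es()
print(todo)
```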
12 | ## How to run
13 |
14 | cd into this directory in a console, then
15 |
16 | >python filename.py
17 |
18 | >PS: this project is for learning and sharing only; please do not use it commercially
--------------------------------------------------------------------------------
/mzitu.com/mzitu-crawler-es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import time
3 | import threading
4 | import multiprocessing
5 | from mzitu_for_thread import MzituThread
6 | from mzitu_es import mzitu_es
7 |
8 | SLEEP_TIME = 1
9 | def mzitu_crawler(max_threads=5):
10 | def pageurl_crawler():
11 | mzituThread = MzituThread(mzitu_es)
12 | while True:
13 | if mzituThread.scrapy_one() is not True:
14 | time.sleep(SLEEP_TIME)
15 |
16 | threads = []
17 | while True:
18 | """
19 | As long as worker threads are still alive we are not done downloading, so this loop keeps running: dead threads are pruned and the pool is topped back up to max_threads.
20 | """
21 | for thread in threads:
22 | if not thread.is_alive(): ##is_alive() tells whether the thread is still running; finished threads are removed from the pool
23 | threads.remove(thread)
24 | while len(threads) < max_threads: ##top the pool back up to max_threads workers
25 | thread = threading.Thread(target=pageurl_crawler) ##create a worker thread
26 | thread.daemon = True ##daemon thread, so it exits with the main process (setDaemon() is deprecated)
27 | thread.start() ##start the worker
28 | threads.append(thread) ##track it in the pool
29 | time.sleep(SLEEP_TIME)
30 |
31 | def process_crawler():
32 | process = []
33 | num_cpus = multiprocessing.cpu_count()
34 | print('将会启动进程数为:', num_cpus)
35 | for i in range(num_cpus):
36 | p = multiprocessing.Process(target=mzitu_crawler) ##create a process
37 | p.start() ##start it
38 | process.append(p) ##track it in the process list
39 | for p in process:
40 | p.join() ##wait for every process to finish
41 |
42 | if __name__ == "__main__":
43 | #mzituThread = MzituThread(mzitu_es)
44 | #mzituThread.all_url() # 抓取所有需要带处理的链接
45 | process_crawler()
--------------------------------------------------------------------------------
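The hand-rolled pool above prunes dead worker threads and tops the pool back up on every pass. For comparison, a similar fan-out can be written with the standard library's ThreadPoolExecutor; in this sketch crawl_once() is only a placeholder standing in for MzituThread(mzitu_es).scrapy_one():

```python
# Rough equivalent of the manual thread pool above, using concurrent.futures.
# crawl_once() is a placeholder for MzituThread(mzitu_es).scrapy_one().
import time
from concurrent.futures import ThreadPoolExecutor

SLEEP_TIME = 1

def crawl_once():
    # Placeholder: fetch and process one pending theme; return True on success.
    return False

def worker():
    # Same loop as pageurl_crawler(): back off briefly whenever nothing was crawled.
    while True:
        if crawl_once() is not True:
            time.sleep(SLEEP_TIME)

def run(max_threads=5):
    # The executor keeps max_threads workers alive; no manual pruning is needed.
    with ThreadPoolExecutor(max_workers=max_threads) as pool:
        for _ in range(max_threads):
            pool.submit(worker)
```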
/mzitu.com/mzitu_es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from DBTools.MyES import MyESClient
4 | from datetime import datetime
5 |
6 | class MzituEs():
7 | def __init__(self):
8 | self.init_es()
9 |
10 | def init_es(self):
11 | self.esindex = "mzitu"
12 | self.estype = "mzitu_imgs"
13 | index_mappings = {
14 | "mappings": {
15 | self.estype: {
16 | "properties": {
17 | "imgThemeTitle": {
18 | "type": "text",
19 | "index": True,
20 | "analyzer": "ik_max_word",
21 | "search_analyzer": "ik_max_word"
22 | },
23 | "imgThemeUrl": {
24 | "type": "keyword",
25 | "index": True
26 | },
27 | "createTime": {
28 | "type": "date",
29 | "index": True
30 | },
31 | "scrapyStatus":{
32 | "type": "integer",
33 | "index": True,
34 | # 0 = pending, 1 = in progress, 2 = done (lifecycle sketched after this file)
35 | "null_value": 0
36 | }
37 | }
38 | }
39 | }
40 | }
41 | self.es = MyESClient(self.esindex, self.estype)
42 | self.es.createIndex(index_mappings)
43 | self.currdata = {}
44 | self.currdata["imgUrlList"] = []
45 |
46 | def save_es(self, data=None):
47 | '''
48 | Persist the given (or current) data to ES and reset the local buffer
49 | :return:
50 | '''
51 | if data is None:
52 | data = self.currdata
53 | data["createTime"] = datetime.now()
54 | data["scrapyStatus"] = 0
55 | self.currdata = {}
56 | self.currdata["imgUrlList"] = []
57 | self.es.indexData(data, data["imgThemeUrl"])
58 |
59 | def get_one_need_scrapy_es(self):
60 | '''
61 | Fetch one pending (scrapyStatus == 0) item from the ES index
62 | '''
63 | queryBody = {
64 | "query": {
65 | "bool": {
66 | "must": [
67 | {
68 | "term": {
69 | "scrapyStatus": {
70 | "value": 0
71 | }
72 | }
73 | }
74 | ]
75 | }
76 | }
77 | }
78 | res = self.es.getOneByBody(queryBody)
79 | return res
80 |
81 | def get_by_themeId_es(self, themeId):
82 | res = self.es.getDataSourceById(themeId)
83 | return res
84 |
85 | def exit_es(self, themeurl):
86 | queryBody = {
87 | "query": {
88 | "bool": {
89 | "must": [
90 | {
91 | "term": {
92 | "imgThemeUrl": {
93 | "value": themeurl
94 | }
95 | }
96 | }
97 | ]
98 | }
99 | }
100 | }
101 | if self.es.exit(queryBody):
102 | print("ES数据库里面已经存在!!")
103 | return True
104 | else:
105 | return False
106 |
107 | mzitu_es = MzituEs()
--------------------------------------------------------------------------------
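The scrapyStatus field drives the resume behaviour: 0 means pending, 1 in progress, 2 done. A minimal sketch of that lifecycle using the helper defined above (it assumes theme URLs have already been seeded into the index):

```python
# Sketch of the scrapyStatus lifecycle (0 = pending, 1 = in progress, 2 = done)
# using the mzitu_es helper defined above. Assumes themes were already seeded.
from mzitu_es import mzitu_es

doc = mzitu_es.get_one_need_scrapy_es()   # fetch one document with scrapyStatus == 0
if doc is not None:
    doc["scrapyStatus"] = 1               # claim it: mark as "in progress"
    mzitu_es.save_es(doc)
    # ... download the images behind doc["imgThemeUrl"] here ...
    doc["scrapyStatus"] = 2               # finally mark it as done
    mzitu_es.save_es(doc)
```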
/mzitu.com/mzitu_for_thread.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | import datetime
4 | from bs4 import BeautifulSoup
5 | import os
6 | # import lxml
7 | from BaseTools.MyDownload import request ##the download helper now comes from the shared BaseTools module
8 |
9 | class MzituThread(object):
10 | def __init__(self, mzitu_es):
11 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
12 | self.currPath = "./mzitu/"
13 | self.currdata = {}
14 | self.currdata["imgUrlList"] = []
15 | self.es = mzitu_es
16 |
17 | def all_url(self, url='http://www.mzitu.com/all'):
18 | html = self.request(url)##request() returns the response for the full theme list page
19 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a')
20 | for a in all_a:
21 | try:
22 | title = a.get_text()
23 | href = a['href']
24 | print(title, href) ##a little progress output
25 | if self.es.exit_es(href):
26 | continue
27 | self.currdata["imgThemeTitle"] = title
28 | self.currdata["imgThemeUrl"] = href
29 | self.es.save_es(self.currdata)
30 | except Exception as e:
31 | print(e)
32 | continue
33 |
34 | def scrapy_one(self, url=None):
35 | try:
36 | data = None
37 | if url is None:
38 | data = self.es.get_one_need_scrapy_es()
39 | else:
40 | data = self.es.get_by_themeId_es(url)
41 |
42 | if data is None:
43 | return False
44 | else:
45 | data["scrapyStatus"]=1
46 | self.es.save_es(data) ## mark the theme as "in progress"
47 | href = data["imgThemeUrl"]
48 | self.mkdir(href) ##create the download folder for this theme
49 | self.html(href, data) ##crawl every image page of this theme
50 | data["scrapyStatus"]=2
51 | self.es.save_es(data) ## save the data and mark the theme as done
52 | return True
53 | except Exception as e:
54 | print(e)
55 | return False
56 |
57 |
58 | def html(self, href, data=None): ##turn a theme URL into the URLs of its per-image pages
59 | try:
60 | html = self.request(href)
61 | self.headers['referer'] = href
62 | ## max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
63 | # max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
64 | max_span = 100
65 | pageDiv = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi')
66 | if pageDiv is not None and len(pageDiv) > 1:
67 | max_span = pageDiv.find_all('span')[-2].get_text()
68 | for page in range(1, int(max_span) + 1):
69 | page_url = href + '/' + str(page)
70 | self.img(page_url, data) ##download the image on this page
71 | except Exception as e:
72 | print('发生了异常:', e)
73 |
74 | def img(self, page_url, data=None): ##turn an image page URL into the actual image URL
75 | img_html = self.request(page_url)
76 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
77 | print(img_url)
78 | self.saveImg(img_url, data)
79 |
80 | def saveImg(self, img_url, data=None): ##download and save one image
81 | name = img_url[-9:-4]
82 | currUrl = self.currPath + name + '.jpg'
83 | isExists = os.path.exists(currUrl)
84 | if not isExists:
85 | img = self.request(img_url)
86 | f = open(currUrl, 'ab')
87 | f.write(img.content)
88 | f.close()
89 | print('该图片下载完毕')
90 | if data is None:
91 | self.currdata["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl})
92 | else:
93 | data["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl})
94 | else:
95 | print('该图片已经存在')
96 |
97 | def mkdir(self, path): ##create the folder the images will be saved into
98 | if USE_ONE_DIR:
99 | path = ""
100 | elif USE_DEF_DIR:
101 | if path is None:
102 | path = self.currdata["imgThemeUrl"]
103 | index = path.rindex("/")
104 | path = path[index + 1:]
105 | else:
106 | path = path.strip()
107 | isExists = os.path.exists(os.path.join("./mzitu", path))
108 | if not isExists:
109 | print('建了一个名字叫做', path, '的文件夹!')
110 | os.makedirs(os.path.join("./mzitu", path))
111 | self.currPath = "./mzitu/" + path + "/"
112 | ## os.chdir(os.path.join("./mzitu", path)) ##切换到目录
113 | return True
114 | else:
115 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
116 | return False
117 |
118 | def request(self, url): ##fetch the page and return the response
119 | content = request.get(url, headers=self.headers, timeout=3)
120 | return content
121 |
122 |
123 |
124 | USE_ONE_DIR = False
125 | USE_DEF_DIR = True
126 |
127 | #mzituThread = MzituThread() ##实例化
128 | #mzituThread.all_url()
129 | #mzituThread.scrapy_one()
--------------------------------------------------------------------------------
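Besides pulling pending themes from ES, scrapy_one() also accepts an explicit theme URL: documents are indexed with imgThemeUrl as their id, so the URL doubles as the lookup key. A small usage sketch (the URL below is a made-up example and must already exist in the index):

```python
# Re-crawl one specific, already-indexed theme by its URL.
# Documents are indexed with imgThemeUrl as the document id (see mzitu_es.save_es),
# so the theme URL acts as the key. The URL below is a made-up example.
from mzitu_es import mzitu_es
from mzitu_for_thread import MzituThread

worker = MzituThread(mzitu_es)
ok = worker.scrapy_one("http://www.mzitu.com/12345")
print("crawled" if ok else "not found or failed")
```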
/mzitu.com/scrapy-mzitu-es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | import datetime
4 | from bs4 import BeautifulSoup
5 | import os
6 | # import lxml
7 | from BaseTools.MyDownload import request ##the download helper now comes from the shared BaseTools module
8 | from mzitu_es import mzitu_es
9 |
10 | class mzitu():
11 |
12 | def __init__(self):
13 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
14 | self.currPath = "./mzitu/"
15 | self.currdata = {}
16 | self.currdata["imgUrlList"] = []
17 |
18 | def all_url(self, url):
19 | html = self.request(url)##request() returns the response for the full album list page
20 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a')
21 | for a in all_a:
22 | title = a.get_text()
23 | href = a['href']
24 | print(title, href) ##a little progress output
25 | if mzitu_es.exit_es(href):
26 | continue
27 | self.currdata["imgThemeTitle"] = title
28 | self.currdata["imgThemeUrl"] = href
29 | #path = str(title).replace("?", '_') ##some titles contain "?", which Windows cannot use in a folder name, so it would need replacing
30 | self.mkdir(title) ##create the folder; here the path passed in is the album title
31 | self.html(href) ##crawl every image page; href is the album URL
32 | mzitu_es.save_es(self.currdata)
33 |
34 | def html(self, href): ##turn an album URL into the URLs of its per-image pages
35 | try:
36 | html = self.request(href)
37 | self.headers['referer'] = href
38 | #max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
39 | max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
40 | for page in range(1, int(max_span) + 1):
41 | page_url = href + '/' + str(page)
42 | self.img(page_url) ##download the image on this page
43 | except Exception as e:
44 | print('发生了异常:', e)
45 |
46 | def img(self, page_url): ##turn an image page URL into the actual image URL
47 | img_html = self.request(page_url)
48 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
49 | print(img_url)
50 | self.saveImg(img_url)
51 |
52 | def saveImg(self, img_url): ##download and save one image
53 | name = img_url[-9:-4]
54 | currUrl = self.currPath + name + '.jpg'
55 | isExists = os.path.exists(currUrl)
56 | if not isExists:
57 | img = self.request(img_url)
58 | f = open(currUrl, 'ab')
59 | f.write(img.content)
60 | f.close()
61 | print('该图片下载完毕')
62 | self.currdata["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl})
63 | else:
64 | print('该图片已经存在')
65 |
66 | def mkdir(self, path): ##create the folder the images will be saved into
67 | if USE_ONE_DIR:
68 | path = ""
69 | elif USE_DEF_DIR:
70 | path = self.currdata["imgThemeUrl"]
71 | index = path.rindex("/")
72 | path = path[index + 1:]
73 | else:
74 | path = path.strip()
75 | isExists = os.path.exists(os.path.join("./mzitu", path))
76 | if not isExists:
77 | print('建了一个名字叫做', path, '的文件夹!')
78 | os.makedirs(os.path.join("./mzitu", path))
79 | self.currPath = "./mzitu/" + path + "/"
80 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录
81 | return True
82 | else:
83 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
84 | return False
85 |
86 | def request(self, url): ##fetch the page and return the response
87 | content = request.get(url, headers=self.headers, timeout=3)
88 | return content
89 |
90 | USE_ONE_DIR = False
91 | USE_DEF_DIR = True
92 | Mzitu = mzitu() ##instantiate the crawler
93 | Mzitu.all_url('http://www.mzitu.com/all') ##entry point: start crawling from the full album list
94 |
--------------------------------------------------------------------------------
/mzitu.com/scrapy-mzitu-no-es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | import os
5 | from BaseTools.MyDownload import request
6 |
7 | class mzitu():
8 | def __init__(self):
9 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
10 | self.basePath = "./mzitu-no-es/"
11 | self.currPath = self.basePath
12 | self.mkdir(self.basePath)
13 | self.totalFinishPath = "./mzitu-no-es/totalPage.txt"
14 | self.totalFinish = self.getTotalFinish()
15 |
16 | def all_url(self, url):
17 | html = self.request(url)##request() returns the response for the full album list page
18 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a')
19 | count = 0
20 | for a in all_a:
21 | count = count + 1
22 | if count > self.totalFinish:
23 | self.overwriteTotalFinish(count)
24 | else:
25 | print("第", count, "页已经抓取过,跳过!")
26 | continue
27 | title = a.get_text()
28 | href = a['href']
29 | print(title, href) ##a little progress output
30 | #path = str(title).replace("?", '_') ##some titles contain "?", which Windows cannot use in a folder name, so it would need replacing
31 | self.mkdir(title) ##create the folder; here the path passed in is the album title
32 | self.html(href) ##crawl every image page; href is the album URL
33 | self.totalFinish = count
34 |
35 | def html(self, href): ##turn an album URL into the URLs of its per-image pages
36 | try:
37 | html = self.request(href)
38 | self.headers['referer'] = href
39 | # max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
40 | # max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
41 | max_span = 100
42 | pageDiv = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi')
43 | if pageDiv is not None and len(pageDiv) > 1:
44 | max_span = pageDiv.find_all('span')[-2].get_text()
45 | for page in range(1, int(max_span) + 1):
46 | page_url = href + '/' + str(page)
47 | self.img(page_url) ##download the image on this page
48 | except Exception as e:
49 | print('发生了异常:', e)
50 |
51 | def img(self, page_url): ##turn an image page URL into the actual image URL
52 | img_html = self.request(page_url)
53 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
54 | print(img_url)
55 | self.saveImg(img_url)
56 |
57 | def saveImg(self, img_url): ##download and save one image
58 | name = img_url[-9:-4]
59 | imgPath = self.currPath + name + '.jpg'
60 | isExists = os.path.exists(imgPath)
61 | if not isExists:
62 | img = self.request(img_url)
63 | f = open(imgPath, 'ab')
64 | f.write(img.content)
65 | f.close()
66 | print('该图片下载完毕')
67 | else:
68 | print('该图片已经存在')
69 |
70 | def mkdir(self, path): ##create the folder the images will be saved into
71 | if USE_ONE_DIR:
72 | path = ""
73 | elif USE_DEF_DIR:
74 | index = path.rindex("/")
75 | path = path[index + 1:]
76 | else:
77 | path = path.strip()
78 | self.currPath = os.path.join(self.basePath, path)
79 | isExists = os.path.exists(self.currPath)
80 | if not isExists:
81 | print('建了一个名字叫做', path, '的文件夹!')
82 | os.makedirs(self.currPath)
83 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录
84 | return True
85 | else:
86 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
87 | return False
88 |
89 |
90 | def request(self, url): ##fetch the page and return the response
91 | content = request.get(url, headers=self.headers, timeout=3)
92 | return content
93 |
94 | def getTotalFinish(self):
95 | isExists = os.path.exists(self.totalFinishPath)
96 | if isExists:
97 | with open(self.totalFinishPath, 'r', encoding='UTF-8') as f:
98 | return int(f.read())
99 | else:
100 | return 0
101 |
102 | def overwriteTotalFinish(self, count):
103 | with open(self.totalFinishPath, 'w', encoding='UTF-8') as f:
104 | f.write(str(count))
105 |
106 | USE_ONE_DIR = True
107 | USE_DEF_DIR = False
108 | Mzitu = mzitu() ##instantiate the crawler
109 | Mzitu.all_url('http://www.mzitu.com/all') ##entry point: start crawling from the full album list
110 |
--------------------------------------------------------------------------------
/wallhaven.cc/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/wallhaven.cc/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | A friend recently shared this wallpaper site, and I could not resist spending ten-odd minutes writing a crawler for it.
4 |
5 | 
6 |
7 | Supports simple page-by-page resume (a checkpoint sketch follows this file)
8 |
9 | ## How to run
10 |
11 | cd into this directory in a console, then run
12 |
13 | >python wallpic_scrapy.py
14 |
15 | >PS: this project is for learning and sharing only; please do not use it commercially
--------------------------------------------------------------------------------
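The page-by-page resume works by keeping the number of the last finished page in wallpic/totalPage.txt; page numbers up to that value are skipped on the next run. A small sketch for inspecting or resetting that checkpoint (path taken from wallpic_scrapy.py below):

```python
# Inspect or reset the page checkpoint used by wallpic_scrapy.py below.
# The crawler skips every page number up to the value stored in this file.
import os

CHECKPOINT = "./wallpic/totalPage.txt"

def finished_pages():
    # 0 means "nothing crawled yet", mirroring wallpic.getTotalFinish().
    if not os.path.exists(CHECKPOINT):
        return 0
    with open(CHECKPOINT, "r", encoding="utf-8") as f:
        return int(f.read())

def reset():
    # Remove the checkpoint to force a full re-crawl from page 1.
    if os.path.exists(CHECKPOINT):
        os.remove(CHECKPOINT)

if __name__ == "__main__":
    print("pages already finished:", finished_pages())
```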
/wallhaven.cc/img/20210623210831.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/wallhaven.cc/img/20210623210831.png
--------------------------------------------------------------------------------
/wallhaven.cc/wallpic_scrapy.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | import os
5 | from BaseTools.MyDownload import request
6 |
7 | class wallpic():
8 | def __init__(self):
9 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
10 | self.basePath = "./wallpic/"
11 | self.currPath = self.basePath
12 | self.mkdir(self.basePath)
13 | self.totalFinishPath = "./wallpic/totalPage.txt"
14 | self.totalFinish = self.getTotalFinish()
15 |
16 | def all_get(self, totalPage):
17 | count = 0
18 | while count < totalPage:
19 | count = count + 1
20 | if count > self.totalFinish:
21 | self.overwriteTotalFinish(count)
22 | else:
23 | print("第", count, "页已经抓取过,跳过!")
24 | continue
25 | title = '第' + str(count) + '页/'
26 | href = 'https://wallhaven.cc/toplist?page=' + str(count)
27 | print(title, href) ##a little progress output
28 | ##create a folder per toplist page; here the path is the page title
29 | path = title
30 | self.mkdir(path)
31 | self.html(href) ##crawl every wallpaper page; href is the toplist page URL
32 | self.totalFinish = count
33 |
34 | def html(self, href): ##turn a toplist page URL into the URLs of its wallpaper pages
35 | try:
36 | html = self.request(href)
37 | self.headers['referer'] = href
38 | figures = BeautifulSoup(html.text, 'lxml').find('section', class_='thumb-listing-page').find_all('figure')
39 | for figure in figures:
40 | page_url = figure.find_all('a')[0]['href']
41 | self.img(page_url) ##download the wallpaper on this page
42 | except Exception as e:
43 | print('发生了异常:', e)
44 |
45 | def img(self, page_url): ##turn a wallpaper page URL into the actual image URL
46 | img_html = self.request(page_url)
47 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='scrollbox').find_all('img')[0]['src']
48 | print(img_url)
49 | self.saveImg(img_url)
50 |
51 | def saveImg(self, img_url): ##download and save one image
52 | name = img_url[-9:-4]
53 | imgPath = self.currPath + name + '.jpg'
54 | isExists = os.path.exists(imgPath)
55 | if not isExists:
56 | img = self.request(img_url)
57 | f = open(imgPath, 'ab')
58 | f.write(img.content)
59 | f.close()
60 | print('该图片下载完毕')
61 | else:
62 | print('该图片已经存在')
63 |
64 | def mkdir(self, path): ##create the folder the images will be saved into
65 | if USE_ONE_DIR:
66 | path = ""
67 | elif USE_DEF_DIR:
68 | index = path.rindex("/")
69 | path = path[index + 1:]
70 | else:
71 | path = path.strip()
72 | self.currPath = os.path.join(self.basePath, path)
73 | isExists = os.path.exists(self.currPath)
74 | if not isExists:
75 | print('建了一个名字叫做', path, '的文件夹!')
76 | os.makedirs(self.currPath)
77 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录
78 | return True
79 | else:
80 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
81 | return False
82 |
83 |
84 | def request(self, url): ##fetch the page and return the response
85 | content = request.get(url, headers=self.headers, timeout=3)
86 | return content
87 |
88 | def getTotalFinish(self):
89 | isExists = os.path.exists(self.totalFinishPath)
90 | if isExists:
91 | with open(self.totalFinishPath, 'r', encoding='UTF-8') as f:
92 | return int(f.read())
93 | else:
94 | return 0
95 |
96 | def overwriteTotalFinish(self, count):
97 | with open(self.totalFinishPath, 'w', encoding='UTF-8') as f:
98 | f.write(str(count))
99 |
100 | USE_ONE_DIR = False
101 | USE_DEF_DIR = False
102 | WallPic = wallpic() ##instantiate the crawler
103 |
104 | if __name__ == "__main__":
105 | ## entry point: pass the number of toplist pages to crawl
106 | WallPic.all_get(11)
107 |
--------------------------------------------------------------------------------