├── .gitattributes
├── .gitignore
├── BaseTools
├── CompareUtil.py
├── MyDownload.py
├── MyUtil.py
├── ScreenShotUtil.py
├── __init__.py
└── test
│ ├── Parent.py
│ ├── data
│ │ ├── es-query-lean.md
│ │ ├── result.md
│ │ └── test-file.html
│ ├── edit-distance-test.py
│ ├── util-test.py
│ ├── util-test2.py
│ └── util-test3.py
├── DBTools
├── MyES.py
├── MyMongoDB.py
├── MySqlite.py
├── __init__.py
└── test
│ ├── Parent.py
│ ├── es-test.py
│ ├── sqlite-test.py
│ └── sqlite-test
│ │ └── test.db
├── README.md
├── __init__.py
├── ctrip.com-visa
├── Parent.py
├── README.md
├── img
│ ├── ctrip-visa-gqtp.png
│ └── ctrip-visa-lsgxx.png
└── xc-visa-lqxx.py
├── framework
└── base_scrapy
│ ├── README.md
│ ├── base_scrapy
│ │ ├── __init__.py
│ │ ├── entrypoint.py
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ │ ├── __init__.py
│ │ │ └── base_spider.py
│ └── scrapy.cfg
├── huaban.com
├── PreviewHtmlTool.py
├── README.md
├── huaban-border-text.py
├── huaban-simple.py
├── img
│ ├── huaban-border-txt.png
│ ├── huaban-preview-border.png
│ └── huaban-simple-1.png
├── test.html
└── 淡然小笺赋箴言
│ ├── 13448395_1.jpg
│ ├── 13448395_1.txt
│ ├── 13448395_10.jpg
│ ├── 13448395_10.txt
│ ├── 13448395_11.jpg
│ ├── 13448395_11.txt
│ ├── 13448395_12.jpg
│ ├── 13448395_12.txt
│ ├── 13448395_13.jpg
│ ├── 13448395_13.txt
│ ├── 13448395_14.jpg
│ ├── 13448395_14.txt
│ ├── 13448395_15.jpg
│ ├── 13448395_15.txt
│ ├── 13448395_16.jpg
│ ├── 13448395_16.txt
│ ├── 13448395_17.jpg
│ ├── 13448395_17.txt
│ ├── 13448395_18.jpg
│ ├── 13448395_18.txt
│ ├── 13448395_19.jpg
│ ├── 13448395_19.txt
│ ├── 13448395_2.jpg
│ ├── 13448395_2.txt
│ ├── 13448395_20.jpg
│ ├── 13448395_20.txt
│ ├── 13448395_21.jpg
│ ├── 13448395_21.txt
│ ├── 13448395_22.jpg
│ ├── 13448395_22.txt
│ ├── 13448395_23.jpg
│ ├── 13448395_23.txt
│ ├── 13448395_24.jpg
│ ├── 13448395_24.txt
│ ├── 13448395_25.jpg
│ ├── 13448395_25.txt
│ ├── 13448395_26.jpg
│ ├── 13448395_26.txt
│ ├── 13448395_27.jpg
│ ├── 13448395_27.txt
│ ├── 13448395_28.jpg
│ ├── 13448395_28.txt
│ ├── 13448395_29.jpg
│ ├── 13448395_29.txt
│ ├── 13448395_3.jpg
│ ├── 13448395_3.txt
│ ├── 13448395_30.jpg
│ ├── 13448395_30.txt
│ ├── 13448395_31.jpg
│ ├── 13448395_31.txt
│ ├── 13448395_32.jpg
│ ├── 13448395_32.txt
│ ├── 13448395_33.jpg
│ ├── 13448395_33.txt
│ ├── 13448395_34.jpg
│ ├── 13448395_34.txt
│ ├── 13448395_35.jpg
│ ├── 13448395_35.txt
│ ├── 13448395_36.jpg
│ ├── 13448395_36.txt
│ ├── 13448395_37.jpg
│ ├── 13448395_37.txt
│ ├── 13448395_38.jpg
│ ├── 13448395_38.txt
│ ├── 13448395_39.jpg
│ ├── 13448395_39.txt
│ ├── 13448395_4.jpg
│ ├── 13448395_4.txt
│ ├── 13448395_40.jpg
│ ├── 13448395_40.txt
│ ├── 13448395_5.jpg
│ ├── 13448395_5.txt
│ ├── 13448395_6.jpg
│ ├── 13448395_6.txt
│ ├── 13448395_7.jpg
│ ├── 13448395_7.txt
│ ├── 13448395_8.jpg
│ ├── 13448395_8.txt
│ ├── 13448395_9.jpg
│ ├── 13448395_9.txt
│ └── index.html
├── jjwxk.net
├── Parent.py
├── README.md
├── img
│ ├── jjwxk-free-simple-1.png
│ └── jjwxk-free-simple-2.png
├── jjwxk-free-simple.py
└── simple-http-server.py
├── mzitu.com
├── Parent.py
├── README.md
├── mzitu-crawler-es.py
├── mzitu_es.py
├── mzitu_for_thread.py
├── scrapy-mzitu-es.py
└── scrapy-mzitu-no-es.py
└── wallhaven.cc
├── Parent.py
├── README.md
├── img
│ └── 20210623210831.png
└── wallpic_scrapy.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.js linguist-language=python
2 | *.css linguist-language=python
3 | *.html linguist-language=python
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__/
2 | .idea/
3 | .vscode/
4 | *.png
5 | *.jpg
6 | *.csv
7 | *.wpr
8 | *.txt
9 | *.log
10 | *.json
11 | *.exe
12 | plugin/*
13 | .DS_Store
14 | .scrapy/
--------------------------------------------------------------------------------
/BaseTools/CompareUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 |
3 | class EditDistance():
4 | @classmethod
5 | def minEditDist(cls, sm, sn):
6 | '''
7 | 计算两个字符串的最小莱温斯坦距离
8 | '''
9 | m,n = len(sm)+1,len(sn)+1
10 |
11 | # create a matrix (m*n)
12 | matrix = [[0]*n for i in range(m)]
13 |
14 | matrix[0][0]=0
15 | for i in range(1,m):
16 | matrix[i][0] = matrix[i-1][0] + 1
17 |
18 | for j in range(1,n):
19 | matrix[0][j] = matrix[0][j-1]+1
20 |
21 |
22 | for i in range(m):
23 | print(matrix[i])
24 |
25 | print("********************")
26 |
27 | cost = 0
28 |
29 | for i in range(1,m):
30 | for j in range(1,n):
31 | if sm[i-1]==sn[j-1]:
32 | cost = 0
33 | else:
34 | cost = 1
35 |
36 | matrix[i][j]=min(matrix[i-1][j]+1,matrix[i][j-1]+1,matrix[i-1][j-1]+cost)
37 |
38 | for i in range(m):
39 | print(matrix[i])
40 |
41 | return matrix[m-1][n-1]
42 |
43 | @classmethod
44 | def similarityDegree(cls, str1, str2):
45 | '''
46 | 计算两个字符串的相似度
47 | '''
48 |         mindist = 0
49 |         if str1 == None and str2 != None:
50 |             return 0
51 |         elif str1 != None and str2 == None:
52 |             return 0
53 |         elif str1 != None and str2 != None:
54 |             mindist = cls.minEditDist(str1, str2)
55 |         else:
56 |             return 0
57 |         # normalize by the longer string so the result stays within [0, 1]
58 |         maxLength = max(len(str1), len(str2))
59 |         if maxLength == 0:
60 |             return 1
61 |         similarityDegree = 1 - mindist / maxLength
62 |         print(str1, "和", str2, "的相似度为:", similarityDegree)
63 |         return similarityDegree
--------------------------------------------------------------------------------
/BaseTools/MyDownload.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import requests
3 | import re
4 | import random
5 | import time
6 | from bs4 import BeautifulSoup
7 |
8 |
9 | class download():
10 | def __init__(self):
11 | self.iplist = [] ##初始化一个list用来存放我们获取到的IP
12 | # self.get_ip_list()
13 | self.get_ip_list3()
14 | print(self.iplist)
15 | self.user_agent_list = [
16 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
17 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
19 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
20 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
21 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
22 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
23 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
24 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
25 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
26 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
27 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
28 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
29 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
30 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
31 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
32 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
33 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
34 | ]
35 |
36 | #功能:爬取IP存入ip_list列表
37 | def get_ip_list(self):
38 |         # Legacy scraper for a free-proxy listing site (not called; __init__ uses get_ip_list3 below).
39 |         # html = requests.get("http://haoip.cc/tiqu.htm")  # page listing free proxy IPs
40 |         # iplistn = re.findall(r'r/>(.*?)(.*?)@HTTP', html.text, re.S)  # pull the proxy entries out of the page text
41 |         # Table-based variant: keep rows whose validity column (tds[6]) mentions '天' (days):
42 |         # if len(tds) > 6:
43 |         #     if not tds[6].text.find('天') == -1:
44 |         #         self.iplist.append(tds[1].text + ':' + tds[2].text)
45 |         pass
46 |
64 | #功能:爬取IP存入ip_list列表
65 | def get_ip_list3(self):
66 | web_data = requests.get("https://www.kuaidaili.com/free/", headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'})
67 | soup = BeautifulSoup(web_data.text, 'lxml')
68 | ips = soup.find_all('tr')
69 | for i in range(1, len(ips)):
70 | ip_info = ips[i]
71 | tds = ip_info.find_all('td')
72 | currIp = ''
73 | if len(tds) > 1:
74 | for item in tds:
75 | if item["data-title"] == 'IP':
76 | currIp = item.text
77 | if item["data-title"] == 'PORT':
78 | currIp += ':' + item.text
79 | break
80 | self.iplist.append(currIp)
81 |
82 | def get(self, url, headers, timeout, proxy=None, num_retries=10): ##给函数一个默认参数proxy为空
83 | UA = random.choice(self.user_agent_list) ##从self.user_agent_list中随机取出一个字符串
84 | headers['User-Agent'] = UA ##构造成一个完整的User-Agent (UA代表的是上面随机取出来的字符串哦)
85 |
86 | if proxy == None: ##当代理为空时,不使用代理获取response(别忘了response啥哦!之前说过了!!)
87 | try:
88 | return requests.get(url, headers=headers, timeout=timeout)##这样服务器就会以为我们是真的浏览器了
89 | except:##如过上面的代码执行报错则执行下面的代码
90 | if num_retries > 0: ##num_retries是我们限定的重试次数
91 | time.sleep(10) ##延迟十秒
92 | print('获取网页出错,10S后将获取倒数第:', num_retries, '次')
93 |                     return self.get(url, headers, timeout, num_retries=num_retries - 1)  ## retry with num_retries reduced by 1 (keyword argument so the count is not passed as proxy)
94 | else:
95 | print('开始使用代理')
96 | time.sleep(10)
97 | IP = ''.join(str(random.choice(self.iplist)).strip()) ##下面有解释哦
98 | proxy = {'http': IP}
99 | return self.get(url, headers, timeout, proxy) ##代理不为空的时候
100 | else: ##当代理不为空
101 | try:
102 | IP = ''.join(str(random.choice(self.iplist)).strip()) ##将从self.iplist中获取的字符串处理成我们需要的格式(处理了些什么自己看哦,这是基础呢)
103 | proxy = {'http': IP} ##构造成一个代理
104 | return requests.get(url, headers=headers, proxies=proxy, timeout=timeout) ##使用代理获取response
105 | except:
106 | if num_retries > 0:
107 | time.sleep(10)
108 | IP = ''.join(str(random.choice(self.iplist)).strip())
109 | proxy = {'http': IP}
110 | print('正在更换代理,10S后将重新获取倒数第', num_retries, '次')
111 | print('当前代理是:', proxy)
112 | return self.get(url, headers, timeout, proxy, num_retries - 1)
113 | else:
114 | print('代理也不好使了!取消代理')
115 | return self.get(url, headers, 3)
116 |
117 | # 获取文本编码
118 | def get_encoding(self, text):
119 | return requests.utils.get_encodings_from_content(text)
120 |
121 | # 获取非中文乱码的文本
122 | def get_utf8_content(self, url, headers):
123 | req = request.get(url, headers, timeout=3)
124 | if req.content == None:
125 | return ""
126 | encoding = "utf-8"
127 | if req.encoding == 'ISO-8859-1':
128 | encodings = request.get_encoding(req.text)
129 | if encodings:
130 | encoding = encodings[0]
131 | else:
132 | encoding = req.apparent_encoding
133 | # encode_content = req.content.decode(encoding, 'replace').encode('utf-8', 'replace')
134 | return req.content.decode(encoding, 'replace') #如果设置为replace,则会用?取代非法字符;
135 | return req.content
136 |
137 |
138 | request = download()
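139 |
140 | # Minimal usage sketch: other scripts in this repo import the module-level `request`
141 | # instance (e.g. ctrip.com-visa/xc-visa-lqxx.py does `from BaseTools.MyDownload import request`)
142 | # and call get(url, headers, timeout). The URL below is just an example target.
143 | if __name__ == '__main__':
144 |     demo_headers = {}  # get() fills in a random User-Agent by itself
145 |     resp = request.get('https://httpbin.org/get', demo_headers, timeout=3)
146 |     print(resp.status_code, len(resp.text))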
--------------------------------------------------------------------------------
/BaseTools/MyUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import random,string
3 | from hashlib import md5
4 | import os
5 | import codecs
6 | import tomd
7 |
8 | class MyStr():
9 | @classmethod
10 | def getRandomPsw(cls, length=6):
11 | src = string.ascii_letters + string.digits
12 | if length < 6:
13 | length = 6
14 |         list_passwd_all = random.sample(src, length - 3)  # pick length-3 random characters from letters and digits
15 |         list_passwd_all.extend(random.sample(string.digits, 1))  # make sure the password contains a digit
16 |         list_passwd_all.extend(random.sample(string.ascii_lowercase, 1))  # make sure it contains a lowercase letter
17 |         list_passwd_all.extend(random.sample(string.ascii_uppercase, 1))  # make sure it contains an uppercase letter
18 |         random.shuffle(list_passwd_all)  # shuffle the character order
19 |         return ''.join(list_passwd_all)
20 | @classmethod
21 | def getFileMd5(cls, name):
22 | m = md5()
23 | a_file = open(name, 'rb') #需要使用二进制格式读取文件内容
24 | m.update(a_file.read())
25 | a_file.close()
26 | return m.hexdigest()
27 |
28 | @classmethod
29 | def getMd5(cls, instr, length=32):
30 |         m = md5()
31 |         m.update(instr.encode('utf-8') if isinstance(instr, str) else instr)  # md5 needs bytes, so encode str input
32 |         res = m.hexdigest()
33 |         if length < 32:
34 |             res = res[:length]  # truncate the 32-char hex digest to the requested length
35 |         return res
36 |
37 | @classmethod
38 | def html2markdown(cls, html):
39 | mdTxt = tomd.Tomd(html).markdown
40 | return mdTxt
41 |
42 | class FileTool(object):
43 | #追加写入:写一个写入数据的接口
44 | @classmethod
45 | def write_behind(cls, filename, content, split='\n'):
46 | '''''
47 | :param content: 要写入的数据
48 | :param split: 每条数据之间的分隔符
49 | :return:
50 | '''
51 | if content == None:
52 | return
53 | # 判断传入的参数是否字符串类型,如果是,写入 . 如果不是,抛出异常
54 | if isinstance(content, str):
55 | #1.打开文件
56 | f = codecs.open(filename, 'a', 'utf-8')
57 | #2.写入数据
58 | f.write(content)
59 | f.write(split)
60 | #3.关闭文件
61 | f.close()
62 | else:
63 | raise TypeError('content must be a str!')
64 |
65 | #追加写入:写入多行数据
66 | @classmethod
67 | def write_behind_muti(cls, filename, str_list, split='\n'):
68 | #判断某个对象是否是某个类型,若是,返回True;否则,返回False
69 | rs = isinstance(str_list, list)
70 | #如果为True
71 | if rs:
72 | #for循环遍历列表,取出每一数据,判断数据类型是否为字符串
73 | for content in str_list:
74 | #如果不是字符串类型
75 | if isinstance(content,str) == False:
76 | #抛出异常
77 | raise TypeError('str_list must be a list of "str",ex:["str1","str2"...]')
78 | #如果没有异常,就可以写入数据了
79 | #1.打开文件
80 |             f = codecs.open(filename, 'a', 'utf-8')
81 | #2.写入数据 str1\nstr2\nstr3...
82 | string = split.join(str_list)
83 | f.write(string)
84 | #3.关闭文件
85 | f.close()
86 | else:
87 | #如果传入的不是列表,抛出异常
88 | raise TypeError('str_list must be a list of "str",ex:["str1","str2"...]')
89 | #创建文件夹
90 | @classmethod
91 | def mkdir(cls, path): ##这个函数创建文件夹
92 | isExists = os.path.exists(path)
93 | if not isExists:
94 | print('建了一个名字叫做', path, '的文件夹!')
95 | os.makedirs(path)
96 | return True
97 | else:
98 | print('名字叫做', path, '的文件夹已经存在了!')
99 | return False
100 | #读取文件内容
101 | @classmethod
102 | def read_utf8(cls, path):
103 | isExists = os.path.exists(path)
104 | if isExists:
105 | with open(path, 'r', encoding='UTF-8') as f:
106 | return str(f.read())
107 | else:
108 | return ''
109 | # 覆盖写入
110 | @classmethod
111 | def overwrite(cls, path, text):
112 | with open(path, 'w', encoding='UTF-8') as f:
113 | f.write(text)
114 |
115 | # 判断文件是否存在
116 | @classmethod
117 | def isExit(cls, path):
118 | return os.path.exists(path)
119 |
120 | # 检查文件名是否合理,替换特殊字符
121 | @classmethod
122 | def replace_invalid_filename(cls, filename, replaced_char='_'):
123 | '''
124 | 替换有特殊字符的文件名中的特殊字符,默认将特殊字符替换为'_'.
125 | 例如 C/C++ -> C_C++
126 | '''
127 | valid_filename = filename
128 | invalid_characaters = '\\/:*?"<>|'
129 | for c in invalid_characaters:
130 | #print 'c:', c
131 | valid_filename = valid_filename.replace(c, replaced_char)
132 | return valid_filename
133 |
134 |
135 | class DateTool(object):
136 | #日期格式化工具类,用类执行一个函数,返回一个对象,对象分别有year\month\day
137 | '''
138 | 2018-2-1 2018.2.1 2018/2/1
139 | date.year = 2018
140 | date.month = 2
141 | date.day = 1
142 | '''
143 | #初始化函数
144 | def __init__(self,year=1970,month=1,day=1):
145 | self.year = year
146 | self.month = month
147 | self.day = day
148 | #类函数,传递进来一个日期,返回一个该类的对象
149 | @classmethod
150 | def get_date(cls,date):
151 | #判断date是否为str类型
152 | if not isinstance(date,str):
153 | #不是str类型,直接触发异常
154 | raise TypeError('date must be a str!')
155 | #转换
156 | #判断是-还是.还是空格
157 | if '-' in date:
158 | #分别将2018赋值year 2赋值给month 1赋值给day
159 | # year, month, day = [2018,2,1]
160 | year,month,day = list(map(int,date.split('-')))
161 | elif '.' in date:
162 | year,month,day = list(map(int,date.split('.')))
163 | elif ' ' in date:
164 | year,month,day = list(map(int,date.split(' ')))
165 | elif '/' in date:
166 | year,month,day = list(map(int,date.split('/')))
167 | #创建对象
168 | # obj = DateTool(year,month,day)
169 | obj = cls(year,month,day)
170 | #返回对象
171 | return obj
--------------------------------------------------------------------------------
/BaseTools/ScreenShotUtil.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import tagui as t
3 | import uuid
4 |
5 | def url2png(url):
6 | t.init()
7 | t.url(url)
8 | # t.type('q', 'decentralization[enter]')
9 | t.snap('page', 'results-' + str(uuid.uuid1()) + '.png')
10 | t.close()
11 |
12 |
--------------------------------------------------------------------------------
/BaseTools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/BaseTools/__init__.py
--------------------------------------------------------------------------------
/BaseTools/test/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/BaseTools/test/data/es-query-lean.md:
--------------------------------------------------------------------------------
1 |
2 | 1. query string search
3 | 2. query DSL
4 | 3. query filter
5 | 4. full-text search
6 | 5. phrase search
7 | 6. highlight search
8 |
9 | ## query string search
10 |
11 | 1. took:耗费了几毫秒
12 | 1. timed_out:是否超时,这里是没有
13 | 1. _shards:数据拆成了5个分片,所以对于搜索请求,会打到所有的primary shard(或者是它的某个replica shard也可以)
14 | 1. hits.total:查询结果的数量,3个document
15 | 1. hits.max_score:score的含义,就是document对于一个search的相关度的匹配分数,越相关,就越匹配,分数也高
16 | 1. hits.hits:包含了匹配搜索的document的详细数据
17 |
18 | 搜索全部
19 |
20 | ```json
21 | GET /nginx/log_base/_search
22 |
23 | 结果如下:
24 | {
25 | "took" : 18,
26 | "timed_out" : false,
27 | "_shards" : {
28 | "total" : 5,
29 | "successful" : 5,
30 | "skipped" : 0,
31 | "failed" : 0
32 | },
33 | "hits" : {
34 | "total" : 143405,
35 | "max_score" : 1.0,
36 |     "hits" : [
37 | {
38 | "_index" : "nginx",
39 | "_type" : "log_base",
40 | "_id" : "swZwhmwB82qtm9SxinXv",
41 | "_score" : 10.191514,
42 | "_source" : {
43 | "ip" : "10.95.30.42",
44 | "timestamp" : "17/Jul/2019:00:00:29 +0800",
45 | "url" : "GET /v-dist/static/js/vendor.min.js HTTP/1.1",
46 | "status" : "200",
47 | "bytes" : "782353"
48 | }
49 | },
50 | {...},
51 | {...}
52 | ]
53 | }
54 | }
55 | ```
56 |
57 | query string search 的由来,因为 search 参数都是以 http 请求的 query string 来附带的
58 |
59 | 查询 nginx 日志中包含指定 ip 的记录,并按照 timestamp 降序排序:
60 |
61 | ```json
62 | # 查询 所有字段 包含 10.95 的数据集
63 | GET /bookdb_index/book/_search?q=10.95
64 | # 查询 ip 包含 10.95.30.42 的数据集
65 | GET nginx/log_base/_search?q=ip:10.95.30.42
66 | # 使用 sort 功能需要定义 timestamp 属性 fielddata=true 有可排序功能
67 | # 出现该错误是因为 5.x 之后,Elasticsearch对排序、聚合所依据的字段用单独的数据结构(fielddata)缓存到内存里了,
68 | # 但是在text字段上默认是禁用的,如果有需要单独开启,这样做的目的是为了节省内存空间。
69 | GET nginx/log_base/_search?q=ip:10.95.30.42&sort=timestamp:desc
70 | # 使用 _mapping 查看结构定义
71 | GET nginx/_mapping/log_base
72 | # 改变某个属性结构
73 | PUT nginx/_mapping/log_base
74 | {
75 | "properties": {
76 | "timestamp":{
77 | "type": "text",
78 | "fielddata": true
79 | }
80 | }
81 | }
82 | ```
83 |
84 | 适用于临时的在命令行使用一些工具,比如curl,快速的发出请求,来检索想要的信息;
85 |
86 | 但是如果查询请求很复杂,是很难去构建的在生产环境中,几乎很少使用 query string search
87 |
88 | ## query DSL
89 |
90 | DSL:Domain Specific Language,特定领域的语言
91 | http request body:请求体,可以用json的格式来构建查询语法,比较方便,可以构建各种复杂的语法,比query string search肯定强大多了
92 |
93 | **查询所有**
94 |
95 | ```
96 | GET nginx/log_base/_search
97 | {
98 | "query": { "match_all": {} }
99 | }
100 | ```
101 |
102 | **查询 ip 包含 10.95.30.42,同时按照 timestamp 降序排序**
103 |
104 | ```json
105 | GET nginx/log_base/_search
106 | {
107 | "query" : {
108 | "match" : {
109 | "ip" : "10.95.30.42"
110 | }
111 | },
112 | "sort": [
113 | { "timestamp": "desc" }
114 | ]
115 | }
116 | ```
117 |
118 | **分页查询**
119 |
120 | ```json
121 | # from:从第几个开始,es 从 0 开始计数的
122 | # size:往后查询 100 个
123 | GET nginx/log_base/_search
124 | {
125 | "query": { "match_all": {} },
126 | "from": 1,
127 | "size": 100
128 | }
129 | ```
130 |
131 | **指定要查询展示的属性**
132 |
133 | ```json
134 | GET nginx/log_base/_search
135 | {
136 | "query": { "match_all": {} },
137 | "_source": ["ip", "status"]
138 | }
139 | ```
140 |
141 | 更加适合生产环境的使用,可以构建复杂的查询
142 |
143 | ## query filter
144 |
145 | **结果集里面过滤**
146 |
147 | ```json
148 | GET nginx/log_base/_search
149 | {
150 | "query": {
151 | "bool": {
152 | "must": {
153 | "match":{
154 | "ip" : "10.95.30.42"
155 | }
156 | },
157 | "filter": {
158 | "match":{
159 | "status" : "302"
160 | }
161 | }
162 | }
163 | }
164 | }
165 | ```
166 |
167 | ## full-text search(全文检索)
168 |
169 | ```json
170 | GET nginx/log_base/_search
171 | {
172 | "query" : {
173 | "match" : {
174 | "url" : ".js"
175 | }
176 | }
177 | }
178 | ```
179 |
180 | ## phrase search(短语搜索)
181 |
182 | 跟全文检索相对应,相反,全文检索会将输入的搜索串拆解开来,去倒排索引里面去一一匹配,只要能匹配上任意一个拆解后的单词,就可以作为结果返回
183 | phrase search,要求输入的搜索串,必须在指定的字段文本中,完全包含一模一样的短语(空格等其他非数字字母分隔开的字符),才可以算匹配,才能作为结果返回
184 |
185 | ```json
186 | GET nginx/log_base/_search
187 | {
188 | "query" : {
189 | "match_phrase" : {
190 | "ip" : "10.94.53.32"
191 | }
192 | }
193 | }
194 | ```
195 |
196 | ## highlight search(高亮搜索结果)
197 |
198 | ```json
199 | GET nginx/log_base/_search
200 | {
201 | "query" : {
202 | "match" : {
203 | "ip" : "10.94.53.32"
204 | }
205 | },
206 | "highlight": {
207 | "fields" : {
208 | "ip" : {}
209 | }
210 | }
211 | }
212 |
213 | {
214 | "took" : 295,
215 | "timed_out" : false,
216 | "_shards" : {
217 | "total" : 5,
218 | "successful" : 5,
219 | "skipped" : 0,
220 | "failed" : 0
221 | },
222 | "hits" : {
223 | "total" : 29977,
224 | "max_score" : 1.5757076,
225 | "hits" : [
226 | {
227 | "_index" : "nginx",
228 | "_type" : "log_base",
229 | "_id" : "yAZwhmwB82qtm9SxinXv",
230 | "_score" : 1.5757076,
231 | "_source" : {
232 | "ip" : "10.94.53.32",
233 | "timestamp" : "17/Jul/2019:00:01:20 +0800",
234 | "url" : "GET /v-dist/static/css/app.min.css HTTP/1.1",
235 | "status" : "200",
236 | "bytes" : "217190"
237 | },
238 | "highlight" : {
239 | "ip" : [
240 | "10.94.53.32"
241 | ]
242 | }
243 | },
244 | {...}
245 | ]
246 | }
247 | }
248 | ```
249 |
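250 | ## Running these queries from Python
251 |
252 | The same DSL bodies can be sent from Python. Below is a minimal sketch that uses this repo's `DBTools/MyES.py` wrapper (it assumes an Elasticsearch node on 127.0.0.1:9200 and the `nginx`/`log_base` index used in the notes above, and that it is run from the DBTools directory, as the scripts in DBTools/test do):
253 |
254 | ```python
255 | from MyES import MyESClient
256 |
257 | # thin wrapper around elasticsearch-py; print=True echoes every response
258 | es = MyESClient("nginx", "log_base", print=True)
259 |
260 | query_body = {
261 |     "query": {"match": {"ip": "10.95.30.42"}},
262 |     # sorting on timestamp needs the fielddata mapping change shown earlier
263 |     "sort": [{"timestamp": "desc"}]
264 | }
265 | res = es.getDataByBody(query_body)
266 | print(res["hits"]["total"])
267 | ```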
--------------------------------------------------------------------------------
/BaseTools/test/edit-distance-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from CompareUtil import EditDistance
4 | def main():
5 | EditDistance.similarityDegree("黄鹤楼","i黄鹤楼van2")
6 | EditDistance.similarityDegree("黄鹤楼","黄黄鹤鹤楼")
7 | EditDistance.similarityDegree("黄鹤楼","鹤楼黄楼黄楼")
8 | EditDistance.similarityDegree("黄鹤楼","鹤鹤楼")
9 | EditDistance.similarityDegree("黄鹤楼","汤逊湖")
10 | EditDistance.similarityDegree("黄鹤楼","岳阳楼")
11 |
12 | if __name__ == '__main__':
13 | main()
--------------------------------------------------------------------------------
/BaseTools/test/util-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MyUtil import FileTool
4 | from MyUtil import DateTool
5 | def main():
6 | # 指定写入文件的名称
7 | filename = 'test.txt'
8 | # 执行写入功能函数
9 | FileTool.write_behind(filename, 'hello')
10 | FileTool.write_behind(filename, 'world')
11 | print("1.追加单行写\n", FileTool.read_utf8(filename))
12 |
13 | FileTool.write_behind(filename, '你好!')
14 | print("1.1.追加写中文\n", FileTool.read_utf8(filename))
15 |
16 | FileTool.write_behind_muti(filename, ['hello', 'world', 'zhangzhang'])
17 | print("2.追加多行写\n", FileTool.read_utf8(filename))
18 |
19 | FileTool.overwrite(filename, "hello_world!")
20 | print("1.覆写\n", FileTool.read_utf8(filename))
21 |
22 | FileTool.write_behind(filename, '你好,世界!')
23 | print("1.1.覆写写中文\n", FileTool.read_utf8(filename))
24 |
25 |
26 |
27 | # 开始进行日期转换
28 | # 转换之后 返回一个结果对象
29 | date = DateTool.get_date('2020 2 22')
30 | #date有三个属性 分别为year,month,day
31 | print("日期转换")
32 | print(date.year)
33 | print(date.month)
34 | print(date.day)
35 |
36 |
37 | if __name__ == '__main__':
38 | main()
--------------------------------------------------------------------------------
/BaseTools/test/util-test2.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MyUtil import FileTool
4 | from MyUtil import MyStr
5 | def html2markdown(input_file_path, output_file_path):
6 | html = FileTool.read_utf8(input_file_path)
7 | mdTxt = MyStr.html2markdown(html)
8 | FileTool.overwrite(output_file_path, mdTxt)
9 |
10 |
11 | if __name__ == '__main__':
12 | html2markdown('data/test-file.html', 'data/result.md')
--------------------------------------------------------------------------------
/BaseTools/test/util-test3.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | import ScreenShotUtil as screenshot
4 |
5 | def main():
6 | screenshot.url2png("https://www.baidu.com/")
7 |
8 | if __name__ == '__main__':
9 | main()
--------------------------------------------------------------------------------
/DBTools/MyES.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import os
3 | import time
4 | import csv
5 | from os import walk
6 | from datetime import datetime
7 | from elasticsearch import Elasticsearch
8 | from elasticsearch.helpers import bulk
9 |
10 | class MyESClient(object):
11 | def __init__(self, index_name, index_type, ip ="127.0.0.1", print=False):
12 | '''
13 | :param index_name: 索引名称
14 | :param index_type: 索引类型
15 | '''
16 | self.index_name =index_name
17 | self.index_type = index_type
18 | # 无用户名密码状态
19 | self.es = Elasticsearch([ip], port=9200)
20 | #用户名密码状态
21 | self.es = Elasticsearch([ip], http_auth=('elastic', 'password'), port=9200)
22 | self.show_es_result = print
23 |
24 | def createIndex(self, index_mappings):
25 | '''
26 | 创建索引,创建索引名称为ott,类型为ott_type的索引
27 | :param ex: Elasticsearch对象
28 | :return:
29 | '''
30 | #创建映射
31 | if self.es.indices.exists(index=self.index_name) is not True:
32 | res = self.es.indices.create(index=self.index_name, body=index_mappings)
33 | if self.show_es_result:
34 | print(res)
35 |
36 | def indexDataFromCvsDir(self, cloumnDict):
37 | csvdir = './ElasticSearch/exportExcels'
38 | filenamelist = []
39 | for (dirpath, dirnames, filenames) in walk(csvdir):
40 | filenamelist.extend(filenames)
41 | break
42 | for file in filenamelist:
43 | csvfile = csvdir + '/' + file
44 | self.indexDataFromCSV(csvfile, cloumnDict)
45 | time.sleep(10)
46 |
47 | def indexDataFromCSV(self, csvfile, cloumnList=None):
48 | '''
49 | 从CSV文件中读取数据,并存储到es中
50 | :param csvfile: csv文件,包括完整路径
51 | :return:
52 | '''
53 | with open(csvfile) as f:
54 | reader = csv.reader(f)
55 | # 读取一行,下面的reader中已经没有该行了
56 | index = 0
57 | if cloumnList == None:
58 | cloumnList = next(reader)
59 | index = 1
60 | doc = {}
61 | cloumnLength = len(cloumnList)
62 | for item in reader:
63 | if index > 0:#第一行是标题
64 | if cloumnLength <= len(item):
65 | for i in range(cloumnLength):
66 | doc[cloumnList[i]] = item[i]
67 | self.es.index(index=self.index_name, doc_type=self.index_type, body=doc)
68 | index += 1
69 |
70 | def getDataExportCSV(self, csvfile, query={'query': {'match_all': {}}}, cloumnList=None):
71 | '''
72 | 从数据库导出csv表格
73 | :param csvfile:
74 | :param query:
75 | :param cloumnList:
76 | :return:
77 | '''
78 | res = self.getDataByBody(query)
79 | if res is not None and len(res['hits']['hits']) > 0:
80 | # fobj = open(csvfile, 'w+')
81 | with open(csvfile, 'w', newline='') as fobj:
82 | if cloumnList == None:
83 | cloumnList = res['hits']['hits'][0]["_source"].keys()
84 | writer = csv.DictWriter(fobj, fieldnames=cloumnList)
85 | writer.writeheader()
86 | for hit in res['hits']['hits']:
87 | writer.writerow(hit["_source"])
88 |
89 | def indexDataList(self, list=[]):
90 | '''
91 | 数据存储到es
92 | :return:
93 | '''
94 | for item in list:
95 | res = self.es.index(index=self.index_name, doc_type=self.index_type, body=item)
96 | if self.show_es_result:
97 | print(res)
98 |
99 | def indexData(self, data, id=None):
100 | '''
101 | 单条数据添加
102 | :param data:
103 | :return:
104 | '''
105 | res = self.es.index(index=self.index_name, doc_type=self.index_type, body=data, id=id)
106 | if self.show_es_result:
107 | print(res)
108 | return res
109 |
110 | def bulkIndexData(self, list=[]):
111 | '''
112 | 用bulk将批量数据存储到es
113 | :return:
114 | '''
115 | ACTIONS = []
116 | for line in list:
117 | action = {
118 | "_index": self.index_name,
119 | "_type": self.index_type,
120 | "_source": line
121 | }
122 | ACTIONS.append(action)
123 | # 批量处理
124 | success, _ = bulk(self.es, ACTIONS, index=self.index_name, raise_on_error=True)
125 | if self.show_es_result:
126 | print('Performed %d actions' % success)
127 | return success
128 |
129 | def deleteDataById(self,id):
130 | '''
131 | 删除索引中的一条
132 | :param id:
133 | :return:
134 | '''
135 | res = self.es.delete(index=self.index_name, doc_type=self.index_type, id=id)
136 | if self.show_es_result:
137 | print(res)
138 | return res
139 |
140 | def getDataId(self,id):
141 | res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
142 | # 输出查询到的结果
143 | if self.show_es_result:
144 | print(res)
145 | return res
146 |
147 | def getDataSourceById(self,id):
148 | res = self.es.get(index=self.index_name, doc_type=self.index_type, id=id)
149 | # 输出查询到的结果
150 | if self.show_es_result:
151 | print(res)
152 |         if res is not None and res.get('found'):
153 |             return res["_source"]  # es.get() returns a single document, not a hits list
154 |         else:
155 |             return None
156 |
157 | def exit(self, queryBody):
158 | if queryBody == None:
159 | return False
160 | res = self.getDataByBody(queryBody)
161 | if res is not None and len(res['hits']['hits']) > 0:
162 | return True
163 | else:
164 | return False
165 |
166 | def getOneByBody(self, query):
167 | params = {"size":1}
168 | res = self.getDataByBody(query, params)
169 | if res is not None and len(res['hits']['hits']) > 0:
170 | return res['hits']['hits'][0]["_source"]
171 | else:
172 | return None
173 |
174 | def getDataByBody(self, queryBody={'query': {'match_all': {}}}, params=None):
175 | # queryBody = {'query': {'match_all': {}}}
176 | _searched = None
177 | if params == None:
178 | _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=queryBody)
179 | else:
180 | _searched = self.es.search(index=self.index_name, doc_type=self.index_type, body=queryBody, params=params)
181 |
182 | if self.show_es_result:
183 | print(_searched)
184 | return _searched
--------------------------------------------------------------------------------
/DBTools/MyMongoDB.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from pymongo import MongoClient
3 |
4 | class MyMongoClient(object):
5 | def __init__(self, dbname=None, setname=None):
6 | self.dbname = dbname
7 | self.setname = setname
8 | self.client = MongoClient() ##与MongDB建立连接(这是默认连接本地MongDB数据库)
9 | self.db = self.client[dbname] ## 选择一个数据库
10 | self.collection = self.db[setname] ##在这个数据库中,选择一个集合
11 |
12 | def save(self, data):
13 | res = self.collection.save(data)
14 | if SHOW_RESULT:
15 | print(res)
16 | return res
17 |
18 | def getOne(self, query):
19 | res = self.collection.find_one(query)
20 | if SHOW_RESULT:
21 | print(res)
22 | return res
23 |
24 | def isExit(self, query):
25 | if self.getOne(query):
26 | return True
27 | else:
28 | return False
29 |
30 | def get(self, query):
31 | res = self.collection.find(query)
32 | if SHOW_RESULT:
33 | print(res)
34 | return res
35 |
36 | SHOW_RESULT = True
37 |
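38 | # Minimal usage sketch (assumes a MongoDB instance on the default localhost port; the db and
39 | # collection names are just examples). Note: Collection.save() above was removed in pymongo 4.0,
40 | # so this helper targets pymongo 3.x.
41 | if __name__ == '__main__':
42 |     client = MyMongoClient('test_db', 'test_set')
43 |     client.save({'keyword': 'demo', 'title': 'demo document'})
44 |     print(client.isExit({'keyword': 'demo'}))
45 |     for doc in client.get({'keyword': 'demo'}):
46 |         print(doc)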
--------------------------------------------------------------------------------
/DBTools/MySqlite.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | #python sqlite
3 | #DB-API 2.0 interface for SQLite databases
4 |
5 | import sqlite3
6 | import os
7 |
8 | '''
9 | SQLite数据库是一款非常小巧的嵌入式开源数据库软件,也就是说
10 | 没有独立的维护进程,所有的维护都来自于程序本身。
11 | 在python中,使用sqlite3创建数据库的连接,当我们指定的数据库文件不存在的时候
12 | 连接对象会自动创建数据库文件;如果数据库文件已经存在,则连接对象不会再创建
13 | 数据库文件,而是直接打开该数据库文件。
14 | 连接对象可以是硬盘上面的数据库文件,也可以是建立在内存中的,在内存中的数据库
15 | 执行完任何操作后,都不需要提交事务的(commit)
16 |
17 | 创建在硬盘上面: conn = sqlite3.connect('c:\\test\\test.db')
18 | 创建在内存上面: conn = sqlite3.connect(':memory:')
19 |
20 | 下面我们以硬盘上面创建数据库文件为例来具体说明:
21 | conn = sqlite3.connect('c:\\test\\hongten.db')
22 | 其中conn对象是数据库链接对象,而对于数据库链接对象来说,具有以下操作:
23 |
24 | commit() --事务提交
25 | rollback() --事务回滚
26 | close() --关闭一个数据库链接
27 | cursor() --创建一个游标
28 |
29 | cu = conn.cursor()
30 | 这样我们就创建了一个游标对象:cu
31 | 在sqlite3中,所有sql语句的执行都要在游标对象的参与下完成
32 | 对于游标对象cu,具有以下具体操作:
33 |
34 | execute() --执行一条sql语句
35 | executemany() --执行多条sql语句
36 | close() --游标关闭
37 | fetchone() --从结果中取出一条记录
38 | fetchmany() --从结果中取出多条记录
39 | fetchall() --从结果中取出所有记录
40 | scroll() --游标滚动
41 |
42 | '''
43 |
44 | class MySqlite(object):
45 | def __init__(self, dbpath, tablename, print=False):
46 | self.dbpath = dbpath
47 | self.tablename = tablename
48 | #是否打印sql
49 | self.show_sql = print
50 | #是否打印sql结果
51 | self.show_sql_result = print
52 |
53 | def get_conn(self,path=None):
54 | '''获取到数据库的连接对象,参数为数据库文件的绝对路径
55 | 如果传递的参数是存在,并且是文件,那么就返回硬盘上面改
56 | 路径下的数据库文件的连接对象;否则,返回内存中的数据接
57 | 连接对象'''
58 | if path == None:
59 | path = self.dbpath
60 | if os.path.exists(path) and os.path.isfile(path):
61 | print('硬盘上面:[{}]'.format(path))
62 | conn = sqlite3.connect(path)
63 | conn.text_factory = str ##!!!
64 | return conn
65 | else:
66 | conn = None
67 | print('内存上面:[:memory:]')
68 | return sqlite3.connect(':memory:')
69 |
70 | def get_cursor(self, conn=None):
71 | '''该方法是获取数据库的游标对象,参数为数据库的连接对象
72 | 如果数据库的连接对象不为None,则返回数据库连接对象所创
73 | 建的游标对象;否则返回一个游标对象,该对象是内存中数据
74 | 库连接对象所创建的游标对象'''
75 | if conn is not None:
76 | return conn.cursor()
77 | else:
78 | return self.get_conn().cursor()
79 |
80 | ###############################################################
81 | #### 创建|删除表操作 START
82 | ###############################################################
83 | def dropTable(self, table=None, conn=None):
84 | if table == None:
85 | table = self.tablename
86 | if conn == None:
87 | conn = self.get_conn()
88 | '''如果表存在,则删除表,如果表中存在数据的时候,使用该
89 | 方法的时候要慎用!'''
90 | if table is not None and table != '':
91 | sql = 'DROP TABLE IF EXISTS ' + table
92 | if self.show_sql:
93 | print('执行sql:[{}]'.format(sql))
94 | cu = self.get_cursor(conn)
95 | cu.execute(sql)
96 | conn.commit()
97 | if self.show_sql_result:
98 | print('删除数据库表[{}]成功!'.format(table))
99 | self.close_all(conn, cu)
100 | else:
101 | print('the [{}] is empty or equal None!'.format(sql))
102 |
103 | def createTable(self, sql, conn=None):
104 | if conn == None:
105 | conn = self.get_conn()
106 | '''创建数据库表'''
107 | if sql is not None and sql != '':
108 | cu = self.get_cursor(conn)
109 | if self.show_sql:
110 | print('执行sql:[{}]'.format(sql))
111 | cu.execute(sql)
112 | conn.commit()
113 | if self.show_sql_result:
114 | print('创建数据库表成功!')
115 | self.close_all(conn, cu)
116 | else:
117 | print('the [{}] is empty or equal None!'.format(sql))
118 | ###############################################################
119 | #### 创建|删除表操作 END
120 | ###############################################################
121 |
122 | def close_all(self, conn, cu):
123 | '''关闭数据库游标对象和数据库连接对象'''
124 | try:
125 | if cu is not None:
126 | cu.close()
127 |         finally:
128 |             if conn is not None:
129 |                 conn.close()  # close the connection too, instead of closing the cursor twice
130 |
131 | ###############################################################
132 | #### 数据库操作CRUD START
133 | ###############################################################
134 | def insert(self, sql, data, conn=None):
135 | if conn == None:
136 | conn = self.get_conn()
137 | '''插入数据'''
138 | if sql is not None and sql != '':
139 | if data is not None:
140 | cu = self.get_cursor(conn)
141 | for d in data:
142 | if self.show_sql:
143 | print('执行sql:[{}],参数:[{}]'.format(sql, d))
144 | cu.execute(sql, d)
145 | conn.commit()
146 | self.close_all(conn, cu)
147 | else:
148 | print('the [{}] is empty or equal None!'.format(sql))
149 |
150 | def selectAll(self, sql, conn=None):
151 | if conn == None:
152 | conn = self.get_conn()
153 | '''查询所有数据'''
154 | if sql is not None and sql != '':
155 | cu = self.get_cursor(conn)
156 | if self.show_sql:
157 | print('执行sql:[{}]'.format(sql))
158 | cu.execute(sql)
159 | r = cu.fetchall()
160 | if self.show_sql_result:
161 | if len(r) > 0:
162 | for e in range(len(r)):
163 | print(r[e])
164 | return r
165 | else:
166 | print('the [{}] is empty or equal None!'.format(sql))
167 | return None
168 |
169 | def selectOne(self, sql, data, conn=None):
170 | if conn == None:
171 | conn = self.get_conn()
172 | '''查询一条数据'''
173 | if sql is not None and sql != '':
174 | if data is not None:
175 | #Do this instead
176 | d = (data,)
177 | cu = self.get_cursor(conn)
178 | if self.show_sql:
179 | print('执行sql:[{}],参数:[{}]'.format(sql, data))
180 | cu.execute(sql, d)
181 | r = cu.fetchall()
182 | if self.show_sql_result:
183 | if len(r) > 0:
184 | for e in range(len(r)):
185 | print(r[e])
186 | return r
187 | else:
188 | print('the [{}] equal None!'.format(data))
189 | else:
190 | print('the [{}] is empty or equal None!'.format(sql))
191 | return None
192 |
193 | def update(self, sql, data, conn=None):
194 | if conn == None:
195 | conn = self.get_conn()
196 | '''更新数据'''
197 | if sql is not None and sql != '':
198 | if data is not None:
199 | cu = self.get_cursor(conn)
200 | for d in data:
201 | if self.show_sql:
202 | print('执行sql:[{}],参数:[{}]'.format(sql, d))
203 | cu.execute(sql, d)
204 | conn.commit()
205 | self.close_all(conn, cu)
206 | else:
207 | print('the [{}] is empty or equal None!'.format(sql))
208 |
209 | def delete(self, sql, data, conn=None):
210 | if conn == None:
211 | conn = self.get_conn()
212 | '''删除数据'''
213 | if sql is not None and sql != '':
214 | if data is not None:
215 | cu = self.get_cursor(conn)
216 | for d in data:
217 | if self.show_sql:
218 | print('执行sql:[{}],参数:[{}]'.format(sql, d))
219 | cu.execute(sql, d)
220 | conn.commit()
221 | self.close_all(conn, cu)
222 | else:
223 | print('the [{}] is empty or equal None!'.format(sql))
224 | ###############################################################
225 | #### 数据库操作CRUD END
226 | ###############################################################
227 |
228 | def setDbPath(self, dbpath):
229 | self.dbpath = dbpath
230 |
231 | def setTableName(self, tablename):
232 | self.tablename = tablename
233 |
234 | def openPrint(self):
235 | self.show_sql = True
236 | print('self.show_sql : {}'.format(self.show_sql))
237 | self.show_sql_result = True
238 | print('self.show_sql_result : {}'.format(self.show_sql_result))
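239 |
240 | # Minimal usage sketch (mirrors DBTools/test/sqlite-test.py). Note that get_conn() falls back to
241 | # an in-memory database when the file does not exist yet, so the db file is created first.
242 | if __name__ == '__main__':
243 |     demo_path = os.getcwd() + '/sqlite-test/test.db'  # same path the test script uses
244 |     if not os.path.exists(demo_path):
245 |         os.makedirs(os.path.dirname(demo_path), exist_ok=True)
246 |         sqlite3.connect(demo_path).close()  # create an empty db file so get_conn() opens it
247 |     db = MySqlite(demo_path, 'student', True)
248 |     db.createTable('CREATE TABLE IF NOT EXISTS student (id int PRIMARY KEY, name varchar(20))')
249 |     db.insert('INSERT OR REPLACE INTO student values (?, ?)', [(1, 'Hongten')])
250 |     db.selectAll('SELECT * FROM student')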
--------------------------------------------------------------------------------
/DBTools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/DBTools/__init__.py
--------------------------------------------------------------------------------
/DBTools/test/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/DBTools/test/es-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MyES import MyESClient
4 |
5 | list = [
6 | {"date": "2017-09-13",
7 | "source": "慧聪网",
8 | "link": "http://info.broadcast.hc360.com/2017/09/130859749974.shtml",
9 | "keyword": "电视",
10 | "title": "付费 电视 行业面临的转型和挑战"
11 | },
12 | {"date": "2017-09-13",
13 | "source": "中国文明网",
14 | "link": "http://www.wenming.cn/xj_pd/yw/201709/t20170913_4421323.shtml",
15 | "keyword": "电视",
16 | "title": "电视 专题片《巡视利剑》广获好评:铁腕反腐凝聚党心民心"
17 | },
18 | {"date": "2017-09-13",
19 | "source": "人民电视",
20 | "link": "http://tv.people.com.cn/BIG5/n1/2017/0913/c67816-29533981.html",
21 | "keyword": "电视",
22 | "title": "中国第21批赴刚果(金)维和部隊启程--人民 电视 --人民网"
23 | },
24 | {"date": "2017-09-13",
25 | "source": "站长之家",
26 | "link": "http://www.chinaz.com/news/2017/0913/804263.shtml",
27 | "keyword": "电视",
28 | "title": "电视 盒子 哪个牌子好? 吐血奉献三大选购秘笈"
29 | }
30 | ]
31 |
32 | # 提前给elasticsearch安装对应版本的中文分词器 https://github.com/medcl/elasticsearch-analysis-ik
33 | index_mappings = {
34 | "mappings": {
35 | "ott_type": {
36 | "properties": {
37 | "title": {
38 | "type": "text",
39 | "index": True,
40 | "analyzer": "ik_max_word",
41 | "search_analyzer": "ik_max_word"
42 | },
43 | "date": {
44 | "type": "text",
45 | "index": True
46 | },
47 | "keyword": {
48 | "type": "text",
49 | "index": False
50 | },
51 | "source": {
52 | "type": "text",
53 | "index": False
54 | },
55 | "link": {
56 | "type": "text",
57 | "index": False
58 | }
59 | }
60 | }
61 | }
62 | }
63 |
64 | es = MyESClient("ott", "ott_type", print=True)
65 |
66 | es.createIndex(index_mappings)
67 |
68 | es.indexDataList(list)
69 |
70 | queryBody = {
71 | "query": {
72 | "match": {
73 | "title": "电视"
74 | }
75 | }
76 | }
77 |
78 | es.getDataByBody(queryBody)
79 |
80 | es.getDataExportCSV('es-test/ott.csv')
81 |
82 | es.indexDataFromCSV("es-test/ott.csv")
83 |
--------------------------------------------------------------------------------
/DBTools/test/sqlite-test.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from MySqlite import MySqlite
4 | import os
5 |
6 |
7 | ###############################################################
8 | #### 测试操作 START
9 | ###############################################################
10 | def drop_table_test():
11 | '''删除数据库表测试'''
12 | print('删除数据库表测试...')
13 | sqlite.dropTable(TABLE_NAME)
14 |
15 |
16 | def create_table_test():
17 | '''创建数据库表测试'''
18 | print('创建数据库表测试...')
19 | create_table_sql = '''CREATE TABLE `student` (
20 | `id` int(11) NOT NULL,
21 | `name` varchar(20) NOT NULL,
22 | `gender` varchar(4) DEFAULT NULL,
23 | `age` int(11) DEFAULT NULL,
24 | `address` varchar(200) DEFAULT NULL,
25 | `phone` varchar(20) DEFAULT NULL,
26 | PRIMARY KEY (`id`)
27 | )'''
28 | sqlite.createTable(create_table_sql)
29 |
30 |
31 | def save_test():
32 | '''保存数据测试...'''
33 | print('保存数据测试...')
34 | save_sql = '''INSERT INTO student values (?, ?, ?, ?, ?, ?)'''
35 | data = [(1, 'Hongten', '男', 20, '广东省广州市',
36 | '13423****62'), (2, 'Tom', '男', 22, '美国旧金山', '15423****63'),
37 | (3, 'Jake', '女', 18, '广东省广州市',
38 | '18823****87'), (4, 'Cate', '女', 21, '广东省广州市', '14323****32')]
39 | sqlite.insert(save_sql, data)
40 |
41 |
42 | def fetchall_test():
43 | '''查询所有数据...'''
44 | print('查询所有数据...')
45 | fetchall_sql = '''SELECT * FROM student'''
46 | sqlite.selectAll(fetchall_sql)
47 |
48 |
49 | def fetchone_test():
50 | '''查询一条数据...'''
51 | print('查询一条数据...')
52 | fetchone_sql = 'SELECT * FROM student WHERE ID = ? '
53 | data = 1
54 | sqlite.selectOne(fetchone_sql, data)
55 |
56 | def update_test():
57 | '''更新数据...'''
58 | print('更新数据...')
59 | update_sql = 'UPDATE student SET name = ? WHERE ID = ? '
60 | data = [('HongtenAA', 1), ('HongtenBB', 2), ('HongtenCC', 3), ('HongtenDD',
61 | 4)]
62 | sqlite.update(update_sql, data)
63 |
64 |
65 | def delete_test():
66 | '''删除数据...'''
67 | print('删除数据...')
68 | delete_sql = 'DELETE FROM student WHERE NAME = ? AND ID = ? '
69 | data = [('HongtenAA', 1), ('HongtenCC', 3)]
70 | sqlite.delete(delete_sql, data)
71 |
72 |
73 | ###############################################################
74 | #### 测试操作 END
75 | ###############################################################
76 |
77 |
78 | def init():
79 | '''初始化方法'''
80 |     #数据库文件绝对路径
81 | global DB_FILE_PATH
82 | DB_FILE_PATH = os.getcwd() + '/sqlite-test/test.db'
83 | #数据库表名称
84 | global TABLE_NAME
85 | TABLE_NAME = 'student'
86 |
87 | global sqlite
88 | sqlite = MySqlite(DB_FILE_PATH, TABLE_NAME, True)
89 | #如果存在数据库表,则删除表
90 | drop_table_test()
91 | #创建数据库表student
92 | create_table_test()
93 | #向数据库表中插入数据
94 | save_test()
95 |
96 |
97 | def main():
98 | init()
99 | fetchall_test()
100 | print('#' * 50)
101 | fetchone_test()
102 | print('#' * 50)
103 | update_test()
104 | fetchall_test()
105 | print('#' * 50)
106 | delete_test()
107 | fetchall_test()
108 |
109 |
110 | if __name__ == '__main__':
111 | main()
--------------------------------------------------------------------------------
/DBTools/test/sqlite-test/test.db:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/DBTools/test/sqlite-test/test.db
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Personal notes and practice projects from learning Python web scraping
2 |
3 | 1. [Ctrip visa: country images and consulate info](./ctrip.com-visa)
4 | 2. [Mzitu image gallery crawling](./mzitu.com)
5 | 3. [Jinjiang (jjwxc) free novel crawling](./jjwxk.net)
6 | 4. [Huaban board crawling (async)](./huaban.com)
7 | 5. [wallhaven The best wallpapers on the Net!](./wallhaven.cc)
8 |
9 | >PS: This project is for learning and sharing only; please do not use it for commercial purposes.
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/__init__.py
--------------------------------------------------------------------------------
/ctrip.com-visa/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/ctrip.com-visa/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | Our visa product at work needed country images and consulate information (the source page now returns 404; noticed on 2019-06-16)
4 |
5 | ## Run
6 |
7 | In a terminal, cd into this directory, then run:
8 |
9 | >python xc-visa-lqxx.py
10 |
11 | Screenshots:
12 |
13 | 
14 |
15 | 
--------------------------------------------------------------------------------
/ctrip.com-visa/img/ctrip-visa-gqtp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/ctrip.com-visa/img/ctrip-visa-gqtp.png
--------------------------------------------------------------------------------
/ctrip.com-visa/img/ctrip-visa-lsgxx.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/ctrip.com-visa/img/ctrip-visa-lsgxx.png
--------------------------------------------------------------------------------
/ctrip.com-visa/xc-visa-lqxx.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | import os
5 | import re
6 | # from BaseTools.MyUtil import FileTool
7 | from BaseTools.MyDownload import request
8 | import csv
9 | ## http://vacations.ctrip.com/visa/lsg
10 | ## div.c_con a
11 | ## table.sin_lis td
12 | # lqmc: h4
13 | # lsgmc: p[0]
14 | # lsgdz: p[1]
15 | # lsggzsj: p[3]
16 | class VisaLqxxCrawler():
17 | def __init__(self):
18 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
19 | self.gqtpPath = "./gqtp/"
20 | self.mkdir(self.gqtpPath)
21 | self.mkdir("./lsgxx/")
22 | self.lsgxxFilePath = "./lsgxx/lsgxx.txt"
23 | self.lsgxxCsvPath = "./lsgxx/lsgxx.csv"
24 | self.lsgxxList = []
25 | def all_url(self, url="http://vacations.ctrip.com/visa/lsg"):
26 | html = self.request(url)##调用request函数把套图地址传进去会返回给我们一个response
27 | all_div = BeautifulSoup(html.text, 'lxml').find_all('div', class_='c_con')
28 | print("一共有 %d 个州" % len(all_div))
29 | for div in all_div:
30 | all_a = div.find_all('a')
31 | print("该洲一共有 %d 个国家" % len(all_a))
32 | for a in all_a:
33 | img = a.find("img")
34 | self.headers['referer'] = url
35 | self.save(img["src"])
36 | href = "http://vacations.ctrip.com" + a['href']
37 | title = a["title"]
38 | self.currGjmc = title
39 | print(title, href)
40 | self.headers['referer'] = href
41 | self.html(href)
42 | self.exportCsv(self.lsgxxCsvPath)
43 | def html(self, href): ##这个函数是处理套图地址获得图片的页面地址
44 | try:
45 | html = self.request(href)
46 | #max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
47 | tds = BeautifulSoup(html.text, 'lxml').find('table', class_="sin_lis").find_all('td')
48 | for td in tds:
49 | lsgInfo = {}
50 | lsgInfo["gjmc"] = self.currGjmc
51 | h4 = td.find("h4").get_text()
52 | lsgInfo["lqmc"] = self.trim(h4)
53 | ps = td.find_all('p')
54 | lsgInfo["lqgmc"] = self.trim(ps[0].get_text())
55 | lsgInfo["lqgdz"] = self.trim(ps[1].get_text())
56 | lsgInfo["lsggzsj"] = self.trim(ps[2].get_text())
57 | print(lsgInfo)
58 | self.lsgxxList.append(lsgInfo)
59 | # FileTool.write(self.lsgxxFilePath,lsgInfo.encode("utf-8"))
60 | except Exception as e:
61 | print('发生了异常:', e)
62 |
63 | def exportCsv(self,csvfile, list=None, cloumnList=None):
64 | if list == None:
65 | list = self.lsgxxList
66 | if cloumnList == None and len(list) > 0:
67 | cloumnList = list[0].keys()
68 | # fobj = open(csvfile, 'w+')
69 | # fobj = open(csvfile, 'ab+')
70 | with open(csvfile, 'w', newline='') as fobj:
71 | writer = csv.DictWriter(fobj, fieldnames=cloumnList)
72 | writer.writeheader()
73 | for item in list:
74 | writer.writerow(item)
75 |
76 | def trim(self, myStr):
77 | myStr = re.sub('\n', '', myStr)
78 | myStr = re.sub(' ', '', myStr)
79 | myStr = re.sub('\ufffd', ' ', myStr)
80 | return myStr
81 |
82 | def save(self, img_url): ##这个函数保存图片
83 | try:
84 | index = img_url.rindex("/")
85 | name = img_url[index:]
86 | img = self.request(img_url)
87 | f = open(self.gqtpPath + name, 'ab')
88 | f.write(img.content)
89 | f.close()
90 | except Exception as e:
91 | print('发生了 异常:', e)
92 |
93 | def mkdir(self, path=""): ##这个函数创建文件夹
94 | path = path.strip()
95 | isExists = os.path.exists(path)
96 | if not isExists:
97 | print('建了一个名字叫做', path, '的文件夹!')
98 | os.makedirs(path)
99 | #os.chdir(os.path.join(self.gqtpPath, path)) ##切换到目录
100 | return True
101 | else:
102 | print('名字叫做', path, '的文件夹已经存在了!')
103 | return False
104 |
105 | def request(self, url): ##这个函数获取网页的response 然后返回
106 | content = request.get(url, headers=self.headers, timeout=3)
107 | return content
108 |
109 | visaLqxxCrawler = VisaLqxxCrawler()
110 | visaLqxxCrawler.all_url()
--------------------------------------------------------------------------------
/framework/base_scrapy/README.md:
--------------------------------------------------------------------------------
1 | ## Install the Scrapy framework
2 |
3 | ```bash
4 | pip install Scrapy
5 | ```
6 |
7 | Most guides online recommend installing it through Anaconda instead; beginners may want to install Anaconda first (see its installation guide)
8 |
9 | ```bash
10 | conda install scrapy
11 | # or, more explicitly:
12 | conda install -c conda-forge scrapy
13 | ```
14 |
15 | I run Python 3 alongside Python 2, so in my case:
16 |
17 | ```bash
18 | pip3 install Scrapy
19 | ```
20 |
21 | ## Create a Scrapy project
22 |
23 | ```bash
24 | scrapy startproject base_scrapy
25 |
26 | PS: base_scrapy is the project name; use whatever name you like
27 |
28 | This generates the following directories and files:
29 |
30 | base_scrapy
31 | ├── base_scrapy
32 | │   ├── __init__.py
33 | │   ├── __pycache__
34 | │   ├── items.py
35 | │   ├── middlewares.py
36 | │   ├── pipelines.py
37 | │   ├── settings.py
38 | │   └── spiders
39 | │       ├── __init__.py
40 | │       └── __pycache__
41 | └── scrapy.cfg
42 | ```
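43 |
44 | ## A minimal spider
45 |
46 | For reference, a spider in this project might look like the sketch below (an illustrative example; the actual `spiders/base_spider.py` may differ). It fills in the `BaseScrapyItem` fields defined in `items.py`:
47 |
48 | ```python
49 | # base_scrapy/spiders/example_spider.py  (illustrative file name)
50 | import scrapy
51 | from base_scrapy.items import BaseScrapyItem
52 |
53 |
54 | class ExampleSpider(scrapy.Spider):
55 |     name = 'base_scrapy'  # the spider name that entrypoint.py runs: scrapy crawl base_scrapy
56 |     start_urls = ['http://quotes.toscrape.com/']  # example start page
57 |
58 |     def parse(self, response):
59 |         item = BaseScrapyItem()
60 |         item['url'] = response.url
61 |         item['status'] = response.status
62 |         item['body'] = response.text
63 |         yield item
64 | ```
65 |
66 | Run it from the project root with `scrapy crawl base_scrapy`, or with `python base_scrapy/entrypoint.py`.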
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/framework/base_scrapy/base_scrapy/__init__.py
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/entrypoint.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | from scrapy.cmdline import execute
3 | # This file is used for debugging: it runs the crawler via Scrapy's command line; the third list element is the spider name
4 | execute(['scrapy', 'crawl', 'base_scrapy'])
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | # 在这里定义你的 items,可以定义很多个 class,不同的 spiders 里面引用不同的
5 | # See documentation in:
6 | # https://docs.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class BaseScrapyItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | url = scrapy.Field()
15 | status = scrapy.Field()
16 | # headers = scrapy.Field()
17 | body = scrapy.Field()
18 | pass
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class BaseScrapySpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Request, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class BaseScrapyDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class BaseScrapyPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for base_scrapy project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://docs.scrapy.org/en/latest/topics/settings.html
9 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'base_scrapy'
13 |
14 | SPIDER_MODULES = ['base_scrapy.spiders']
15 | NEWSPIDER_MODULE = 'base_scrapy.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'base_scrapy (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | #ROBOTSTXT_OBEY = True
23 | # Do not obey robots.txt rules
24 | ROBOTSTXT_OBEY = False
25 |
26 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
27 | #CONCURRENT_REQUESTS = 32
28 |
29 | # Configure a delay for requests for the same website (default: 0)
30 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
31 | # See also autothrottle settings and docs
32 | #DOWNLOAD_DELAY = 3
33 | # The download delay setting will honor only one of:
34 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
35 | #CONCURRENT_REQUESTS_PER_IP = 16
36 |
37 | # Disable cookies (enabled by default)
38 | #COOKIES_ENABLED = False
39 |
40 | # Disable Telnet Console (enabled by default)
41 | #TELNETCONSOLE_ENABLED = False
42 |
43 | # Override the default request headers:
44 | #DEFAULT_REQUEST_HEADERS = {
45 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
46 | # 'Accept-Language': 'en',
47 | #}
48 |
49 | # Enable or disable spider middlewares
50 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'base_scrapy.middlewares.BaseScrapySpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'base_scrapy.middlewares.BaseScrapyDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See https://docs.scrapy.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
69 | #ITEM_PIPELINES = {
70 | # 'base_scrapy.pipelines.BaseScrapyPipeline': 300,
71 | #}
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | HTTPCACHE_ENABLED = True
89 | HTTPCACHE_EXPIRATION_SECS = 0
90 | HTTPCACHE_DIR = 'httpcache'
91 | HTTPCACHE_IGNORE_HTTP_CODES = []
92 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/framework/base_scrapy/base_scrapy/spiders/base_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # import the scrapy package
3 | import os
4 | import scrapy
5 | from bs4 import BeautifulSoup
6 | # the standalone Request module; needed when the spider has to follow URLs
7 | from scrapy.http import Request
8 | # the fields I defined to save (imports the BaseScrapyItem class from the project's items.py)
9 | from base_scrapy.items import BaseScrapyItem
10 |
11 | # From the Scrapy project root, run in a console: scrapy crawl base_spider -o data/base_spider/item.json
12 | class BaseSpider(scrapy.Spider):
13 |     # spider name; once defined, run the spider from the project root with: scrapy crawl {name}
14 | name = 'base_spider'
15 |     # define some constants
16 | data_dir = 'data'
17 | allowed_domains = ['baidu.com']
18 | bash_url = 'https://www.baidu.com/s?wd='
19 |
20 | def start_requests(self):
21 | for i in range(1, 10):
22 | url = self.bash_url + str(i)
23 |             # hand the crawled page to the parse method for processing
24 | yield Request(url, self.parse)
25 |
26 | def parse(self, response):
27 |         '''
28 |         start_requests has already fetched the page; how to extract the content we want is defined in this method,
29 |         i.e. extract it with XPath, regular expressions, or CSS selectors. This example just shows Scrapy's flow:
30 |         1. define the links;
31 |         2. crawl (download) the pages through those links;
32 |         3. define rules, then extract the data. (this step)
33 |         '''
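        # Illustrative only (not used below): with Scrapy >= 1.8 the response object
        # exposes selector shortcuts for this extraction step, e.g.
        #   page_title = response.xpath('//title/text()').get()
        #   first_link = response.css('a::attr(href)').get()
        # (older Scrapy releases use .extract_first() instead of .get())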
34 |         # # derive a count from the link above; file name: baidu.com-{n}.txt
35 |         # file_name = self.allowed_domains[0] + '-' + response.url.split("=")[-1] + '.txt'
36 |         # # file path
37 |         # file_path = os.path.join(self.data_dir, self.name)
38 |         # # create the folder
39 |         # if not os.path.exists(file_path):
40 |         #     os.makedirs(file_path)
41 |         # # build the full file name
42 |         # file_full_name = os.path.join(file_path, file_name)
43 |         # with open(file_full_name, 'wb') as f:
44 |         #     # plain python file handling, nothing special
45 |         #     f.write(response.body)
46 |         #     # log it
47 |         #     self.log('保存文件: %s' % file_full_name)
48 | item = BaseScrapyItem()
49 | item['url'] = response.url
50 | item['status'] = response.status
51 | # item['headers'] = str(response.headers, encoding='utf8')
52 | item['body'] = str(response.body, encoding='utf8')
53 | yield item
--------------------------------------------------------------------------------
/framework/base_scrapy/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = base_scrapy.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = base_scrapy
12 |
--------------------------------------------------------------------------------
/huaban.com/PreviewHtmlTool.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | def saveIndexHtmlFile(save_path, title, border_id, max_page):
3 | template = '''
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | %(title)s
13 |
112 |
113 |
114 |
115 | %(title)s - 1/%(max_page)s
116 |
117 |
118 |
119 |
120 |
s_1.jpg)
121 |
122 |
123 |
124 |
125 |
126 |
182 |
183 |
184 | '''
185 | # html = template % {'title':title, 'border_id':border_id, 'max_page':str(max_page)}
186 | html = template.replace("%(title)s", title).replace("%(border_id)s", border_id).replace("%(max_page)s", str(max_page))
187 | with open(save_path, 'w', encoding='UTF-8') as f:
188 | f.write(html)
189 |
190 | saveIndexHtmlFile("./test.html", "adfaf", "12341", 123)
--------------------------------------------------------------------------------
/huaban.com/README.md:
--------------------------------------------------------------------------------
1 |
2 | ## Background
3 |
4 | I recently came across a nice Huaban board (淡然小笺赋箴言) and wanted to crawl the board's images together with their accompanying text.
5 |
6 | ## Huaban (huaban.com)
7 |
8 | 0. Asynchronous crawler test
9 | 1. Plain-text mode
10 | 2. Static HTML mode
11 |
12 |
13 | ### Asynchronous crawler test
14 |
15 | All images on Huaban are loaded asynchronously, so the crawler has to drive a real browser; this is a quick exercise in learning and testing selenium.
16 |
17 | Code source:
18 | Author: 疯魔的小咸鱼
19 | Link: https://www.jianshu.com/p/554c6d5af3ca
20 |
21 | PS: notes on installing selenium
22 |
23 | - Issue 1: selenium has dropped PhantomJS; headless Firefox or Chrome is recommended instead (see the sketch after this list).
24 |
25 |   Workaround used here: downgrade selenium.
26 |   `pip show selenium` reports that the default installed version is 3.14.0.
27 |   Uninstall it with `pip uninstall selenium`, then reinstall a pinned version with `pip install selenium==2.48.0`.
28 |
29 | - Issue 2: Unable to start phantomjs with ghostdriver: [WinError 2] The system cannot find the file specified
30 |
31 |   Workaround: download phantomjs into that directory, or add the phantomjs directory to the PATH environment variable
32 |
33 |   Download page: http://phantomjs.org/download.html , pick the build for your operating system
34 |
35 | - Issue 3: when using chromedriver, the download mirror and the driver/browser version table
36 |
37 |   Download mirror: http://npm.taobao.org/mirrors/chromedriver
38 |
39 |   Version table: https://blog.csdn.net/yoyocat915/article/details/80580066
40 |
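As an alternative to downgrading selenium for PhantomJS, headless Chrome works with current selenium releases. A minimal sketch, assuming chromedriver is on PATH and its major version matches the installed Chrome (illustrative only, not part of the scripts in this folder):

```python
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")               # run Chrome without a visible window
options.add_argument("--window-size=1920,1080")
# selenium 3.14+/4.x accept options=; very old releases use chrome_options= instead
driver = webdriver.Chrome(options=options)
try:
    driver.get("https://huaban.com")
    print(driver.title)                          # sanity check that the page loaded
finally:
    driver.quit()
```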
41 | Test result:
42 |
43 | 
44 |
45 | ### Plain-text mode
46 |
47 | Saves all image information inside a board, including the images themselves and their text descriptions.
48 |
49 | Using the board [淡然小笺赋箴言](http://huaban.com/boards/13448395/) as an example:
50 |
51 | 
52 |
53 | While the board data is being saved, a standalone `index.html` preview page is generated in the same directory (a usage sketch follows below).
54 |
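The preview page comes from `PreviewHtmlTool.saveIndexHtmlFile`, which fills a built-in HTML template with the board title, the file-name prefix, and the number of images. A minimal usage sketch (the argument values are illustrative):

```python
import PreviewHtmlTool

# Writes ./淡然小笺赋箴言/index.html for a 40-image board whose files are
# named 13448395_1.jpg / 13448395_1.txt, 13448395_2.jpg, ...
PreviewHtmlTool.saveIndexHtmlFile(
    save_path="./淡然小笺赋箴言/index.html",  # where to write the preview page
    title="淡然小笺赋箴言",                   # board title shown in the page
    border_id="13448395",                     # prefix of the saved image/text files
    max_page=40,                              # number of images in the board
)
```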
55 | I uploaded part of the crawled data (40 images) to this repo; click [here](https://petterobam.github.io/learn-scrapy/huaban.com/%E6%B7%A1%E7%84%B6%E5%B0%8F%E7%AC%BA%E8%B5%8B%E7%AE%B4%E8%A8%80/index.html) to preview it.
56 |
57 | 
58 |
--------------------------------------------------------------------------------
/huaban.com/huaban-border-text.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from selenium import webdriver
3 | import time
4 | import os
5 | import requests
6 | import PreviewHtmlTool
7 |
8 |
9 | class Huaban():
10 | def __init__(self, username, password):
11 | self.username = username
12 | self.password = password
13 |
14 |     # fetch the images and their text descriptions, and save them to files
15 | def get_picture_info_by_border_url(self, border_url):
16 |
17 |         # Drive the page with the Chrome browser; the downloaded chromedriver has to sit on python's path.
18 |         # Once debugged this can be switched to PhantomJS, which should be a bit faster.
19 | # driver = webdriver.PhantomJs()
20 | # driver = webdriver.PhantomJS('../plugin/phantomjs-2.1.1-macosx/bin/phantomjs')
21 | driver = webdriver.Chrome('../plugin/chromedriver')
22 |         # maximize the browser window
23 | driver.maximize_window()
24 |
25 |         if self.username != None and len(self.username) > 0:
26 | url = "http://huaban.com"
27 | driver.get(url)
28 | time.sleep(8)
29 |
30 |             # click "log in" to bring up the login dialog
31 | driver.find_elements_by_xpath('//a[@class="login bounce btn wbtn"]')[0].click()
32 |             # enter the username
33 | try:
34 | driver.find_elements_by_xpath('//input[@name="email"]')[0].send_keys(self.username)
35 | print('用户名输入OK!')
36 | except:
37 | print('用户名输入异常!')
38 | time.sleep(3)
39 |             # enter the password
40 | try:
41 | driver.find_elements_by_xpath('//input[@name="password"]')[0].send_keys(self.password)
42 | print('密码输入OK!')
43 | except:
44 | print('密码输入异常!')
45 | time.sleep(3)
46 |             # click the login button
47 | try:
48 | driver.find_elements_by_xpath('//a[@class="btn btn18 rbtn"]')[0].click()
49 | print('点击登陆OK!')
50 | except:
51 | print('点击登陆异常')
52 | time.sleep(3)
53 |
54 |         #visit the board, e.g. http://huaban.com/boards/13448395/
55 | driver.get(border_url)
56 | time.sleep(5)
57 | i = 0
58 | page = 1
59 | global name
60 | global store_path
61 | global path
62 |         # get the board title  //div[@id="board_card"]/div[@class="inner"]/div[@class="head-line"]/h1
63 | content = driver.find_elements_by_xpath('//div[@id="board_card"]/div[@class="inner"]/div[@class="head-line"]/h1')[0].text
64 | path = "./" + content
65 | # hash_content = str(hash(content))
66 | # hash_content = border_url[-9:-1]
67 | url_split_list = border_url.split("/")
68 | hash_content = url_split_list[-2] + url_split_list[-1]
69 |
70 |         # images are saved into the folder `path`, created under the script's working directory by default
71 | if not os.path.exists(path):
72 | os.makedirs(path)
73 |         #get the total number of images  //div[@id="board_card"]/div[@class="bar"]/div[@class="tabs"]/a
74 | pictures_count = driver.find_elements_by_xpath('//div[@id="board_card"]/div[@class="bar"]/div[@class="tabs"]/a')[0].text.replace('采集', '')
75 | print(pictures_count)
76 |
77 |         # generate the HTML preview page
78 | PreviewHtmlTool.saveIndexHtmlFile(path + "/index.html", content, hash_content, pictures_count)
79 |
80 | pages = int(int(pictures_count) / 20)
81 | print(pages)
82 |         #locate the elements that hold the image urls
83 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
84 |         #locate the matching text descriptions of the images
85 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]')
86 |
87 | while page <= pages:
88 | while len(url_elements) < 20 * page:
89 | driver.execute_script("window.scrollBy(0,1000)")
90 | time.sleep(3)
91 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
92 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]')
93 |
94 | print("第%s页" % page)
95 |
96 | for url_element in url_elements[20 * (page - 1):20 * page]:
97 | download_url = url_element.get_attribute("src")[:-3] + "658"
98 | pic_info = pic_info_elements[i].get_attribute("data-raw")
99 | i += 1
100 | store_path = hash_content + "_" + str(i)
101 | self.store(download_url, pic_info)
102 |
103 | page += 1
104 |
105 |         #last page
106 | print("第%s页" % int(page))
107 |
108 | while len(url_elements) < int(pictures_count):
109 | driver.execute_script("window.scrollBy(0,1000)")
110 | time.sleep(3)
111 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
112 | pic_info_elements = driver.find_elements_by_xpath('//div[@id="waterfall"]//p[@class="description"]')
113 |
114 | for url_element in url_elements[20 * (page - 1):]:
115 | download_url = url_element.get_attribute("src")[:-3] + "658"
116 | pic_info = pic_info_elements[i].get_attribute("data-raw")
117 | i += 1
118 | store_path = hash_content + "_" + str(i)
119 | self.store(download_url, pic_info)
120 |
121 |     #save an image and its text to disk
122 | def store(self, picture_url, picture_info):
123 | pic_path = path + '/'+ store_path
124 |
125 | with open(pic_path + '.jpg', 'wb') as f:
126 | picture = requests.get(picture_url)
127 | f.write(picture.content)
128 | print('正在保存图片:' + picture_url)
129 | print(f'文件:{pic_path}.jpg')
130 |
131 | with open(pic_path + '.txt', 'w', encoding='UTF-8') as f:
132 | f.write(picture_info)
133 | print('正在保存图片文字信息:' + picture_url)
134 | print(f'文件:{pic_path}.txt')
135 |
136 | if __name__ == "__main__":
137 | username = input('请输入花瓣账号名:') # '花瓣账号'
138 | password = input('请输入账号对应密码:') # '账号密码'
139 | huaban = Huaban(username, password)
140 |     #crawl the board [淡然小笺赋箴言] http://huaban.com/boards/13448395/
141 | border_url = 'http://huaban.com/boards/13448395/'
142 | huaban.get_picture_info_by_border_url(border_url)
143 |
--------------------------------------------------------------------------------
/huaban.com/huaban-simple.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | from selenium import webdriver
3 | import time
4 | import os
5 | import requests
6 |
7 |
8 | class Huaban():
9 | def __init__(self, username, password):
10 | self.username = username
11 | self.password = password
12 |
13 |     # collect the image urls into urls_list
14 | def get_picture_url(self, content):
15 | global path
16 | path = "./" + content
17 |         # images are saved into the folder `path`, created under the script's working directory by default
18 | if not os.path.exists(path):
19 | os.makedirs(path)
20 | url = "http://huaban.com"
21 |         # Drive the page with the Chrome browser; the downloaded chromedriver has to sit on python's path.
22 |         # Once debugged this can be switched to PhantomJS, which should be a bit faster.
23 | # driver = webdriver.PhantomJs()
24 | # driver = webdriver.PhantomJS('../plugin/phantomjs-2.1.1-macosx/bin/phantomjs')
25 | driver = webdriver.Chrome('../plugin/chromedriver')
26 |         # maximize the browser window
27 | driver.maximize_window()
28 | driver.get(url)
29 | time.sleep(8)
30 |
31 |         # click "log in" to bring up the login dialog
32 | driver.find_elements_by_xpath('//a[@class="login bounce btn wbtn"]')[0].click()
33 |         # enter the username
34 | try:
35 | driver.find_elements_by_xpath('//input[@name="email"]')[0].send_keys(self.username)
36 | print('用户名输入OK!')
37 | except:
38 | print('用户名输入异常!')
39 | time.sleep(3)
40 |         # enter the password
41 | try:
42 | driver.find_elements_by_xpath('//input[@name="password"]')[0].send_keys(self.password)
43 | print('密码输入OK!')
44 | except:
45 | print('密码输入异常!')
46 | time.sleep(3)
47 |         # click the login button
48 | try:
49 | driver.find_elements_by_xpath('//a[@class="btn btn18 rbtn"]')[0].click()
50 | print('点击登陆OK!')
51 | except:
52 | print('点击登陆异常')
53 | time.sleep(3)
54 |         #search for images
55 | driver.find_elements_by_xpath('//input[@placeholder="搜索你喜欢的"]')[0].send_keys(content)
56 | driver.find_elements_by_xpath('//form[@id="search_form"]/a')[0].click()
57 | time.sleep(5)
58 | i = 0
59 | page = 1
60 | global name
61 | global store_path
62 | global urls_list
63 | urls_list = []
64 |         #get the total number of images
65 | pictures_count = driver.find_elements_by_xpath('//a[@class="selected"]/i')[0].text
66 | print(pictures_count)
67 | pages = int(int(pictures_count) / 20)
68 | print(pages)
69 |         #locate the elements that hold the image urls
70 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
71 |         #walk the list of image elements and collect their urls
72 | for url_element in url_elements:
73 | picture_url = url_element.get_attribute("src")[:-3] + "658"
74 |             #avoid collecting duplicate image urls
75 | if picture_url not in urls_list:
76 | urls_list.append(picture_url)
77 | while page <= pages:
78 | while len(urls_list) < 20*page:
79 | driver.execute_script("window.scrollBy(0,1000)")
80 | time.sleep(3)
81 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
82 | for url_element in url_elements:
83 | picture_url = url_element.get_attribute("src")[:-3] + "658"
84 | if picture_url not in urls_list:
85 | urls_list.append(picture_url)
86 | print("第%s页" % page)
87 |
88 | for download_url in urls_list[20*(page-1):20*page]:
89 | i += 1
90 | name = content + "_" + str(i)
91 | store_path = name + '.jpg'
92 | self.store(download_url)
93 | page += 1
94 |         #last page
95 | print("第%s页" % int(page))
96 |
97 | while len(urls_list) < int(pictures_count):
98 | driver.execute_script("window.scrollBy(0,1000)")
99 | time.sleep(3)
100 | url_elements = driver.find_elements_by_xpath('//span[@class="stop"]/../img')
101 | for url_element in url_elements:
102 | picture_url = url_element.get_attribute("src")[:-3] + "658"
103 | if picture_url not in urls_list:
104 | urls_list.append(picture_url)
105 | for download_url in urls_list[20*(page-1): ]:
106 | i += 1
107 | name = content + "_" + str(i)
108 | store_path = name + '.jpg'
109 | self.store(download_url)
110 |
111 |     #save an image to disk
112 |     def store(self, picture_url):
113 |         picture = requests.get(picture_url)
114 |         with open(os.path.join(path, store_path), 'wb') as f:  # close the file after writing
115 |             f.write(picture.content)
116 |         print('正在保存图片:' + picture_url)
117 |         print('文件:' + name)
118 |
119 | if __name__ == "__main__":
120 | content = '赵丽颖'
121 | username = input('请输入花瓣账号名:') # '花瓣账号'
122 | password = input('请输入账号对应密码:') # '账号密码'
123 | huaban = Huaban(username, password)
124 | huaban.get_picture_url(content)
125 |
--------------------------------------------------------------------------------
/huaban.com/img/huaban-border-txt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-border-txt.png
--------------------------------------------------------------------------------
/huaban.com/img/huaban-preview-border.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-preview-border.png
--------------------------------------------------------------------------------
/huaban.com/img/huaban-simple-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/img/huaban-simple-1.png
--------------------------------------------------------------------------------
/huaban.com/test.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | adfaf
7 |
106 |
107 |
108 |
109 | adfaf - 1/123
110 |
111 |
112 |
113 |
114 |

115 |
116 |
117 |
118 |
119 |
120 |
176 |
177 |
178 |
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_1.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_1.txt:
--------------------------------------------------------------------------------
1 | 入夏偏宜澹薄妆,越罗衣褪郁金黄,翠钿檀注助容光。
相见无言还有恨,几回判却又思量,月窗香径梦悠飏。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_10.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_10.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_10.txt:
--------------------------------------------------------------------------------
1 | 初心已恨花期晚。别后相思长在眼。兰衾犹有旧时香,每到梦回珠泪满。
多应不信人肠断。几夜夜寒谁共暖。欲将恩爱结来生,只恐来生缘又短。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_11.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_11.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_11.txt:
--------------------------------------------------------------------------------
1 | 年年此夕东城见,欢意匆匆。明日还重。却在楼台缥缈中。
垂螺拂黛清歌女,曾唱相逢。秋月春风。醉枕香衾一岁同。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_12.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_12.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_12.txt:
--------------------------------------------------------------------------------
1 | 四十年来家国,三千里地山河。凤阁龙楼连霄汉,玉树琼枝作烟萝,几曾识干戈?
一旦归为臣虏,沈腰潘鬓消磨。最是仓皇辞庙日,教坊犹奏别离歌,垂泪对宫娥。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_13.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_13.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_13.txt:
--------------------------------------------------------------------------------
1 | 彩袖殷勤捧玉钟,当年拚却醉颜红。舞低杨柳楼心月,歌尽桃花扇底风。
从别后,忆相逢,几回魂梦与君同。今宵剩把银釭照,犹恐相逢是梦中。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_14.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_14.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_14.txt:
--------------------------------------------------------------------------------
1 | 可怜白雪曲,未遇知音人。
恓惶戎旅下,蹉跎淮海滨。
涧树含朝雨,山鸟哢馀春。
我有一瓢酒,可以慰风尘。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_15.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_15.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_15.txt:
--------------------------------------------------------------------------------
1 | 罗带惹香,犹系别时红豆。泪痕新,金缕旧,断离肠。
一双娇燕语雕梁,还是去年时节。绿杨浓,芳草歇,柳花狂。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_16.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_16.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_16.txt:
--------------------------------------------------------------------------------
1 | 天涯旧恨,独自凄凉人不问。欲见回肠,断尽金炉小篆香。
黛蛾长敛,任是春风吹不展。困倚危楼,过尽飞鸿字字愁。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_17.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_17.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_17.txt:
--------------------------------------------------------------------------------
1 | 人生愁恨何能免,销魂独我情何限!故国梦重归,觉来双泪垂。
高楼谁与上?长记秋晴望。往事已成空,还如一梦中。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_18.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_18.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_18.txt:
--------------------------------------------------------------------------------
1 | 恨君不似江楼月,南北东西,南北东西,只有相随无别离。
恨君却似江楼月,暂满还亏,暂满还亏,待得团圆是几时?
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_19.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_19.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_19.txt:
--------------------------------------------------------------------------------
1 | 多少恨,昨夜梦魂中。还似旧时游上苑,车如流水马如龙。花月正春风。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_2.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_2.txt:
--------------------------------------------------------------------------------
1 | 身外闲愁空满,眼中欢事常稀。明年应赋送君诗。细从今夜数,相会几多时。
浅酒欲邀谁劝,深情惟有君知。东溪春近好同归。柳垂江上影,梅谢雪中枝。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_20.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_20.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_20.txt:
--------------------------------------------------------------------------------
1 | 涉江玩秋水,爱此红蕖鲜。
攀荷弄其珠,荡漾不成圆。
佳人彩云里,欲赠隔远天。
相思无因见,怅望凉风前。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_21.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_21.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_21.txt:
--------------------------------------------------------------------------------
1 | 急景流年真一箭。残雪声中,省识东风面。风里垂杨千万线,昨宵染就鹅黄浅。
又是廉纤春雨暗。倚遍危楼,高处人难见。已恨平芜随雁远,暝烟更界平芜断。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_22.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_22.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_22.txt:
--------------------------------------------------------------------------------
1 | 淡烟飘薄。莺花谢、清和院落。树阴翠、密叶成幄。麦秋霁景,夏云忽变奇峰、倚寥廊。波暖银塘,涨新萍绿鱼跃。想端忧多暇,陈王是日,嫩苔生阁。
正铄石天高,流金昼永,楚榭光风转蕙,披襟处、波翻翠幕。以文会友,沈李浮瓜忍轻诺。别馆清闲,避炎蒸、岂须河朔。但尊前随分,雅歌艳舞,尽成欢乐。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_23.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_23.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_23.txt:
--------------------------------------------------------------------------------
1 | 春到南楼雪尽。惊动灯期花信。小雨一番寒。倚阑干。
莫把栏干频倚。一望几重烟水。何处是京华。暮云遮。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_24.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_24.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_24.txt:
--------------------------------------------------------------------------------
1 | 柳色披衫金缕凤,纤手轻拈红豆弄,翠蛾双敛正含情。桃花洞,瑶台梦,一片春愁谁与共?
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_25.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_25.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_25.txt:
--------------------------------------------------------------------------------
1 | 阑珊心绪,醉倚绿琴相伴住。一枕新愁,残夜花香月满楼。
繁笙脆管,吹得锦屏春梦远。只有垂杨,不放秋千影过墙。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_26.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_26.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_26.txt:
--------------------------------------------------------------------------------
1 | 卷尽愁云,素娥临夜新梳洗。暗尘不起。酥润凌波地。
辇路重来,仿佛灯前事。情如水。小楼熏被。春梦笙歌里。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_27.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_27.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_27.txt:
--------------------------------------------------------------------------------
1 | 杨柳丝丝弄轻柔,烟缕织成愁。海棠未雨,梨花先雪,一半春休。
而今往事难重省,归梦绕秦楼。相思只在:丁香枝上,豆蔻梢头。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_28.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_28.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_28.txt:
--------------------------------------------------------------------------------
1 | 秋池阁。风傍晓庭帘幕。霜叶未衰吹未落。半惊鸦喜鹊。
自笑浮名情薄。似与世人疏略。一片懒心双懒脚。好教闲处著。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_29.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_29.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_29.txt:
--------------------------------------------------------------------------------
1 | 谢却荼蘼,一片月明如水。篆香消,犹未睡,早鸦啼。
嫩寒无赖罗衣薄,休傍阑干角。最愁人,灯欲落,雁还飞。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_3.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_3.txt:
--------------------------------------------------------------------------------
1 | 背庭缘恐花羞坠。心事遥山里。小帘愁卷月笼明。一寸秋怀禁得、几蛩声。
井梧不放西风起。供与离人睡。梦和新月未圆时。起看檐蛛结网、又寻思。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_30.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_30.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_30.txt:
--------------------------------------------------------------------------------
1 | 雨后春容清更丽。只有离人,幽恨终难洗。北固山前三面水。碧琼梳拥青螺髻。
一纸乡书来万里。问我何年,真个成归计。白首送春拚一醉。东风吹破千行泪。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_31.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_31.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_31.txt:
--------------------------------------------------------------------------------
1 | 如花貌。当来便约,永结同心偕老。为妙年、俊格聪明,凌厉多方怜爱,何期养成心性近,元来都不相表。渐作分飞计料。
稍觉因情难供,恁殛恼。争克罢同欢笑。已是断弦尤续,覆水难收,常向人前诵谈,空遣时传音耗。漫悔懊。此事何时坏了。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_32.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_32.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_32.txt:
--------------------------------------------------------------------------------
1 | 西施宜笑复宜颦,丑女效之徒累身。
君王虽爱蛾眉好,无奈宫中妒杀人!
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_33.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_33.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_33.txt:
--------------------------------------------------------------------------------
1 | 十里春风,二分明月,蕊仙飞下琼楼。看冰花翦翦,拥碎玉成毬。想长日、云阶伫立,太真肌骨,飞燕风流。敛群芳、清丽精神,都付扬州。
雨窗数朵,梦惊回、天际香浮。似阆苑花神,怜人冷落,骑鹤来游。为问竹西风景,长空淡、烟水悠悠。又黄昏,羌管孤城,吹起新愁。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_34.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_34.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_34.txt:
--------------------------------------------------------------------------------
1 | 记得来时春未暮,执手攀花,袖染花梢露。暗卜春心共花语,争寻双朵争先去。
多情因甚相辜负,轻拆轻离,欲向谁分诉。泪湿海棠花枝处,东君空把奴分付。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_35.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_35.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_35.txt:
--------------------------------------------------------------------------------
1 | 名花倾国两相欢,常得君王带笑看。
解释春风无限恨,沉香亭北倚栏杆。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_36.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_36.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_36.txt:
--------------------------------------------------------------------------------
1 | 无情最是江头柳。长条折尽还依旧。木叶下平湖。雁来书有无。雁无书尚可。妙语凭谁和。风雨断肠时。小山生桂枝。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_37.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_37.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_37.txt:
--------------------------------------------------------------------------------
1 | 画楼影蘸清溪水。歌声响彻行云里。帘幕燕双双。绿杨低映窗。
曲中特地误。要试周郎顾。醉里客魂消。春风大小乔。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_38.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_38.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_38.txt:
--------------------------------------------------------------------------------
1 | 燕语莺啼人乍远。却恨西园,依旧莺和燕。笑语十分愁一半。翠园特地春光暖。
只道书来无过雁。不道柔肠,近日无肠断。柄玉莫摇湘泪点。怕君唤作秋风扇。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_39.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_39.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_39.txt:
--------------------------------------------------------------------------------
1 | 可怜今夕月,向何处、去悠悠?
是别有人间,那边才见,光影东头?
是天外空汗漫,但长风、浩浩送中秋?
飞镜无根谁系?嫦娥不嫁谁留?
谓经海底问无由,恍惚使人愁。
怕万里长鲸,纵横触破,玉殿琼楼。
虾蟆故堪浴水,问云何、玉兔解沉浮?
若道都齐无恙,云何渐渐如钩?
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_4.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_4.txt:
--------------------------------------------------------------------------------
1 | 香叆雕盘,寒生冰箸,画堂别是风光。主人情重,开宴出红妆。腻玉圆搓素颈,藕丝嫩、新织仙裳。双歌罢,虚檐转月,余韵尚悠扬。
人间,何处有,司空见惯,应谓寻常。坐中有狂客,恼乱愁肠。报道金钗坠也,十指露、春笋纤长。亲曾见,全胜宋玉,想像赋高唐。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_40.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_40.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_40.txt:
--------------------------------------------------------------------------------
1 | 风骨萧然,称独立、群仙首。春江雪、一枝梅秀。小样香檀,映朗玉、纤纤手。未久。转新声、泠泠山溜。
曲里传情,更浓似、尊中酒。信倾盖、相逢如旧。别后相思,记敏政堂前柳。知否。又拚了、一场消瘦。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_5.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_5.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_5.txt:
--------------------------------------------------------------------------------
1 | 雕阴无树水南流,雉堞连云古帝州。
带雨晚驼鸣远戍,望乡孤客倚高楼。
明妃去日花应笑,蔡琰归时鬓已秋。
一曲单于暮烽起,扶苏城上月如钩。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_6.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_6.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_6.txt:
--------------------------------------------------------------------------------
1 | 美女妖且闲,采桑歧路间。
柔条纷冉冉,叶落何翩翩。
攘袖见素手,皓腕约金环。
头上金爵钗,腰佩翠琅玕。
明珠交玉体,珊瑚间木难。
罗衣何飘飘,轻裾随风还。
顾盼遗光彩,长啸气若兰。
行徒用息驾,休者以忘餐。
借问女安居,乃在城南端。
青楼临大路,高门结重关。
容华耀朝日,谁不希令颜?
媒氏何所营?玉帛不时安。
佳人慕高义,求贤良独难。
众人徒嗷嗷,安知彼所观?
盛年处房室,中夜起长叹。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_7.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_7.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_7.txt:
--------------------------------------------------------------------------------
1 | 斑骓路与阳台近。前度无题初借问。暖风鞭袖尽闲垂,微月帘栊曾暗认。
梅花未足凭芳信。弦语岂堪传素恨。翠眉饶似远山长,寄与此愁颦不尽。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_8.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_8.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_8.txt:
--------------------------------------------------------------------------------
1 | 留春不住。恰似年光无味处。满眼飞英。弹指东风太浅情。
筝弦未稳。学得新声难破恨。转枕花前。且占香红一夜眠。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_9.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/huaban.com/淡然小笺赋箴言/13448395_9.jpg
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/13448395_9.txt:
--------------------------------------------------------------------------------
1 | 出墙花,当路柳。借问芳心谁有。红解笑,绿能颦。千般恼乱春。
北来人,南去客。朝暮等闲攀折。怜晚芳,惜残阳。情知枉断肠。
--------------------------------------------------------------------------------
/huaban.com/淡然小笺赋箴言/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 | 淡然小笺赋箴言
11 |
110 |
111 |
112 |
113 | 淡然小笺赋箴言 - 1/40
114 |
115 |
116 |
117 |
118 |

119 |
120 |
121 |
122 |
123 |
124 |
180 |
181 |
182 |
--------------------------------------------------------------------------------
/jjwxk.net/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/jjwxk.net/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | I recently wanted to learn how to write novels, so I crawled some material to analyze.
4 |
5 | ## Jinjiang Literature City (jjwxc.net)
6 |
7 | Crawling the free novels
8 |
9 | 1. Simple mode
10 | 2. Static HTML mode
11 | 3. Sqlite mode
12 | 4. ES mode
13 | 5. ES + ECHARTS mode
14 |
15 | ### Simple mode
16 |
17 | How to run: cd into this directory, then
18 |
19 | ```
20 | python jjwxk-free-simple.py
21 | ```
22 |
23 | 1. Simple mode uses plain text files as the data medium; folders provide the storage hierarchy, one sub-folder per novel
24 | 2. Crawling the novel list and crawling each novel's content are separate steps, so the free_list and book_list methods can run in two threads; neither method supports multiple processes
25 | 3. Simple checkpoint/resume crawling is implemented by recording progress in text files: the list-page progress, the number of finished novels, and the chapter progress of the current novel (see the sketch after this list)
26 |
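The checkpoint idea boils down to a tiny progress file that is overwritten after every finished row and re-read on startup. A stand-alone sketch of the same `page-line` format that `jjwxk-free-simple.py` keeps in `jjwxk_free_simple/total.txt` (using the stdlib here instead of the project's FileTool helpers):

```python
import os

PROGRESS_FILE = "jjwxk_free_simple/total.txt"  # "page-line" of the last finished list row

def read_progress():
    # returns (page, line) of the last checkpoint, or (0, 0) on a fresh start
    if not os.path.exists(PROGRESS_FILE):
        return 0, 0
    with open(PROGRESS_FILE, encoding="utf-8") as f:
        page, line = f.read().strip().split("-")
    return int(page), int(line)

def save_progress(page, line):
    # overwrite so the file always holds only the newest checkpoint
    with open(PROGRESS_FILE, "w", encoding="utf-8") as f:
        f.write(f"{page}-{line}")
```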
27 | Some screenshots:
28 |
29 | 
30 |
31 | 
32 |
33 | >PS: this project is for learning and sharing only; please do not use it commercially
--------------------------------------------------------------------------------
/jjwxk.net/img/jjwxk-free-simple-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/jjwxk.net/img/jjwxk-free-simple-1.png
--------------------------------------------------------------------------------
/jjwxk.net/img/jjwxk-free-simple-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/jjwxk.net/img/jjwxk-free-simple-2.png
--------------------------------------------------------------------------------
/jjwxk.net/jjwxk-free-simple.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | from BaseTools.MyDownload import request
5 | from BaseTools.MyUtil import FileTool
6 | import time
7 |
8 | class jjwxk_free_simple():
9 | def __init__(self):
10 | self.headers = {
11 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
12 | # 'Accept-Encoding': 'gzip, deflate',
13 | 'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
14 | 'Host': 'www.jjwxc.net',
15 | 'Upgrade-Insecure-Requests': '1',
16 | 'User-Agent':"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
17 | }
18 | self.basePath = "jjwxk_free_simple/"
19 | FileTool.mkdir(self.basePath)
20 | self.baseListFilePath = self.basePath + "book-list.txt"
21 | self.baseUrlFilePath = self.basePath + "book-url.txt"
22 | self.basePageFilePath = self.basePath + "book-page.txt"
23 | self.totalFinishFilePath = self.basePath + "total.txt"
24 | self.finishBookLineCountFilePath = self.basePath + "book-total.txt"
25 | self.globalPageCount = 0
26 | self.pageCount = 0
27 | self.lineCount = 0
28 | self.readFinishCountInfo()
29 |
30 |     # crawl entry point; default list page: http://www.jjwxc.net/bookbase_slave.php?booktype=free
31 | def free_list(self, limitPage=1, url="http://www.jjwxc.net/bookbase_slave.php?booktype=free"):
32 |         html_content = self.request_content(url) ##request_content returns the page html for us
33 | FileTool.write_behind(self.basePageFilePath, url)
34 | html_ele = BeautifulSoup(html_content, 'lxml')
35 | self.globalPageCount = self.globalPageCount + 1
36 | if self.globalPageCount > limitPage:
37 | return
38 |
39 | if self.globalPageCount >= self.pageCount:
40 |             # if the current page is beyond the recorded page, restart line counting from the first row; otherwise keep the recorded line
41 | if(self.globalPageCount > self.pageCount):
42 | self.lineCount = 0
43 | self.pageCount = self.globalPageCount
44 |
45 |             # get the book table element
46 | book_table = html_ele.find("table", class_="cytable")
47 | if book_table == None:
48 | return
49 | list_tr = book_table.find_all("tr")
50 | count = -1
51 | for tr in list_tr:
52 | count = count + 1
53 | if count == 0 or self.lineCount >= count:
54 | continue
55 | list_td = tr.find_all("td")
56 | book_list_url = None
57 | book_info_arr = []
58 | count_td = 0
59 | for td in list_td:
60 | book_info_arr.append(td.get_text().replace('\n', '').replace(' ', ''))
61 | if count_td == 1:
62 | book_list_url = "http://www.jjwxc.net/" + td.find("a")['href']
63 | count_td = count_td + 1
64 | FileTool.write_behind(self.baseUrlFilePath, book_list_url)
65 | book_list_info = " | ".join(book_info_arr)
66 | FileTool.write_behind(self.baseListFilePath, book_list_info)
67 | self.lineCount = count
68 |                 # after each finished row, persist the count so crawling can resume from here later
69 | self.saveFinishCountInfo()
70 | else:
71 | self.globalPageCount = self.pageCount - 1
72 |
73 | # page_next = "http://www.jjwxc.net/" + html_ele.find_all("div", class_="controlbar")[1].find_all("a")[2]["href"]
74 | page_next = "http://www.jjwxc.net/bookbase_slave.php?booktype=free&opt=&endstr=&orderstr=4&page=" + str(self.globalPageCount + 1)
75 | if page_next == None or "" == page_next:
76 | return
77 | print("书籍清单第", self.globalPageCount, "页信息:[", url, "]抓取完毕")
78 |         # pause one second so the crawler is less likely to be noticed
79 | # time.sleep(1)
80 | self.headers['Referer'] = url
81 |         # go on to fetch the next page
82 | self.free_list(limitPage, page_next)
83 |
84 |     # crawl every book from the saved list of book links
85 | def book_list(self):
86 | book_count = 0
87 | book_finish_count = self.readSimpleFinishCountInfo(self.finishBookLineCountFilePath)
88 | for line in open(self.baseUrlFilePath):
89 |             # read the previously crawled book links line by line, stripping the trailing newline
90 | url = line.replace("\n", "")
91 | book_count = book_count + 1
92 | if book_count <= book_finish_count:
93 | print("[", url, "],该本书已经抓取过!")
94 | continue
95 | self.book_one(url)
96 |             # record how many books are done, giving simple checkpoint/resume
97 | FileTool.overwrite(self.finishBookLineCountFilePath, str(book_count))
98 | print("[", url, "],该本书所有章节已经抓取完毕!")
99 |
100 |     # save the content of one book
101 | def book_one(self, url="http://www.jjwxc.net/onebook.php?novelid=3468871"):
102 |         html_content = self.request_content(url) ##request_content returns the page html for us
103 | html_ele = BeautifulSoup(html_content, 'lxml')
104 |         # get the book table element
105 | book_table = html_ele.find("table", id="oneboolt")
106 | list_tr = book_table.find_all("tr")
107 | self.headers['Referer'] = url
108 | if len(list_tr) > 0:
109 | book_title = list_tr[0].find("h1").get_text()
110 |             # strip characters that are invalid in folder names, since novel titles may contain them
111 | book_floder = self.basePath + FileTool.replace_invalid_filename(book_title) + "/"
112 | FileTool.mkdir(book_floder)
113 | book_chapter_file = book_floder + "0.chapter_list.txt"
114 | book_chapter_url_file = book_floder + "0.chapter_url_list.txt"
115 | book_chapter_finish_count_file = book_floder + "0.current_count.txt"
116 | chapter_count = 0
117 | chapter_finish_count = self.readSimpleFinishCountInfo(book_chapter_finish_count_file)
118 | for tr in list_tr:
119 | if "itemprop" in tr.attrs:
120 | chapter_count = chapter_count + 1
121 | if chapter_count <= chapter_finish_count:
122 | print("第", chapter_count, "章,该章节已经抓取过!")
123 | continue
124 | list_td = tr.find_all("td")
125 | count_td = 0
126 | chapter_info_arr = []
127 | chapter_url = None
128 | chapter_title = None
129 | for td in list_td:
130 | chapter_info_arr.append(td.get_text().replace('\n', '').replace(' ', ''))
131 | if count_td == 1:
132 | chapter_a = td.find("a")
133 | if chapter_a != None:
134 | chapter_url = chapter_a['href']
135 | chapter_title = chapter_a.get_text()
136 | count_td = count_td + 1
137 | if chapter_url == None:
138 | print("第", chapter_count, "章,该章节已丢失!")
139 | chapter_url = "第" + str(chapter_count) + "章,该章节已丢失!"
140 | else:
141 |                         # strip characters that are invalid in file names
142 | curr_filename = FileTool.replace_invalid_filename(str(chapter_count) + "." + chapter_title + ".txt")
143 | curr_chapter_file_path = book_floder + curr_filename
144 | self.save_chapter(curr_chapter_file_path, chapter_url)
145 | FileTool.write_behind(book_chapter_url_file, chapter_url)
146 | chapter_info = " | ".join(chapter_info_arr)
147 | FileTool.write_behind(book_chapter_file, chapter_info)
148 |                     # record the number of finished chapters, giving simple checkpoint/resume
149 | FileTool.overwrite(book_chapter_finish_count_file, str(chapter_count))
150 | print("第", chapter_count, "章,该章节已经抓取完毕!")
151 |
152 |     # save the content of one chapter
153 | def save_chapter(self, path, chapter_url):
154 |         html_content = self.request_content(chapter_url) ##request_content returns the page html for us
155 | html_ele = BeautifulSoup(html_content, 'lxml')
156 | novelDiv = html_ele.find("div", class_="noveltext")
157 | if novelDiv == None:
158 | return
159 | novelHtmls = novelDiv.contents
160 | novelTextArr = []
161 |         # normalize the novel text so it keeps simple line breaks and basic formatting
162 | for novelHtml in novelHtmls:
163 | if novelHtml.name == "div" or novelHtml.name == "br":
164 | continue
165 | else:
166 | text = novelHtml.string
167 | if text == None:
168 | continue
169 | text = text.replace('\n', '').replace("\r", "").replace(" ", "")
170 | if len(text) > 0:
171 | novelTextArr.append(text)
172 | novelText = "\n\n".join(novelTextArr)
173 | FileTool.overwrite(path, novelText)
174 |
175 |
176 |     # read a simple number from a file
177 | def readSimpleFinishCountInfo(self, path):
178 | isExists = FileTool.isExit(path)
179 | if isExists:
180 | countTxt = FileTool.read_utf8(path)
181 | return int(countTxt)
182 | else:
183 | return 0
184 |
185 |     # save the finished-count info
186 | def saveFinishCountInfo(self):
187 | FileTool.overwrite(self.totalFinishFilePath, str(self.pageCount) + "-" + str(self.lineCount))
188 |
189 |     # read the finished-count info
190 | def readFinishCountInfo(self):
191 | isExists = FileTool.isExit(self.totalFinishFilePath)
192 | if isExists:
193 | countTxt = FileTool.read_utf8(self.totalFinishFilePath)
194 | countStrArr = countTxt.split("-")
195 | self.pageCount = int(countStrArr[0])
196 | self.lineCount = int(countStrArr[1])
197 | else:
198 | self.pageCount = 0
199 | self.lineCount = 0
200 |
201 |     # fetch the html text of a page
202 | def request_content(self, url):
203 | try:
204 | return request.get_utf8_content(url, headers=self.headers)
205 | except:
206 | return ""
207 |
208 |
209 | jjwxk = jjwxk_free_simple()
210 | jjwxk.free_list()
211 | # while jjwxk.globalPageCount < 10000:
212 | # try:
213 | # jjwxk.free_list()
214 | # except Exception as e:
215 | # print('except:', e)
216 | # finally:
217 | # print('finally...')
218 | jjwxk.book_list()
--------------------------------------------------------------------------------
/jjwxk.net/simple-http-server.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # in python 2 this was SimpleHTTPServer
3 | import http.server
4 | # in python 2 this was SocketServer
5 | import socketserver
6 | # custom port
7 | PORT = 8888
8 | # request handler definition
9 | Handler = http.server.SimpleHTTPRequestHandler
10 | # TCP server
11 | httpd = socketserver.TCPServer(("", PORT), Handler)
12 | # start the web server
13 | print("Web服务端口为:", PORT)
14 | httpd.serve_forever()
--------------------------------------------------------------------------------
/mzitu.com/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/mzitu.com/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | Crawls the images from the mzitu site.
4 | A practice example for learning python, taken from https://cuiqingcai.com/4352.html
5 |
6 | ## Runnable scripts
7 |
8 | 1. scrapy-mzitu-no-es.py: plain folder-based storage, simple checkpoint/resume, no database
9 | 2. scrapy-mzitu-es.py: storage backed by ES, image paths stored relative to this directory, checkpoint/resume
10 | 3. mzitu-crawler-es.py: simple multi-threaded crawling, storage backed by ES, image paths stored relative to this directory, checkpoint/resume (the ES bookkeeping is sketched below)
11 |
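A rough sketch of how the ES-backed scripts use the shared MzituEs helper from `mzitu_es.py` to hand out work and mark progress; the theme values below are illustrative:

```python
from mzitu_es import mzitu_es  # module-level MzituEs instance defined in mzitu_es.py

# queue a theme page for crawling, unless its url is already indexed
theme = {"imgThemeTitle": "示例图集", "imgThemeUrl": "https://www.mzitu.com/12345"}
if not mzitu_es.exit_es(theme["imgThemeUrl"]):
    mzitu_es.save_es(theme)  # stored with scrapyStatus = 0 (waiting to be crawled)

# a worker later asks ES for one document whose scrapyStatus is still 0
todo = mzitu_es.get_one_need_scrapy_es()
print(todo)
```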
12 | ## How to run
13 |
14 | cd into this directory in a console, then
15 |
16 | >python filename.py
17 |
18 | >PS: this project is for learning and sharing only; please do not use it commercially
--------------------------------------------------------------------------------
/mzitu.com/mzitu-crawler-es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import time
3 | import threading
4 | import multiprocessing
5 | from mzitu_for_thread import MzituThread
6 | from mzitu_es import mzitu_es
7 |
8 | SLEEP_TIME = 1
9 | def mzitu_crawler(max_threads=5):
10 | def pageurl_crawler():
11 | mzituThread = MzituThread(mzitu_es)
12 | while True:
13 | if mzituThread.scrapy_one() is not True:
14 | time.sleep(SLEEP_TIME)
15 |
16 | threads = []
17 | while True:
18 | """
19 | As long as worker threads are still alive we are not done downloading, so this loop keeps running: dead threads are pruned and the pool is topped back up to max_threads.
20 | """
21 | for thread in threads:
22 | if not thread.is_alive(): ##is_alive() tells whether the thread is still running; finished threads are removed from the pool
23 | threads.remove(thread)
24 | while len(threads) < max_threads: ##top the pool back up to max_threads workers
25 | thread = threading.Thread(target=pageurl_crawler) ##create a worker thread
26 | thread.daemon = True ##daemon thread, so it exits with the main process (setDaemon() is deprecated)
27 | thread.start() ##start the worker
28 | threads.append(thread) ##track it in the pool
29 | time.sleep(SLEEP_TIME)
30 |
31 | def process_crawler():
32 | process = []
33 | num_cpus = multiprocessing.cpu_count()
34 | print('将会启动进程数为:', num_cpus)
35 | for i in range(num_cpus):
36 | p = multiprocessing.Process(target=mzitu_crawler) ##create a process
37 | p.start() ##start it
38 | process.append(p) ##track it in the process list
39 | for p in process:
40 | p.join() ##wait for every process to finish
41 |
42 | if __name__ == "__main__":
43 | #mzituThread = MzituThread(mzitu_es)
44 | #mzituThread.all_url() # 抓取所有需要带处理的链接
45 | process_crawler()
--------------------------------------------------------------------------------
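The hand-rolled pool above prunes dead worker threads and tops the pool back up on every pass. For comparison, a similar fan-out can be written with the standard library's ThreadPoolExecutor; in this sketch crawl_once() is only a placeholder standing in for MzituThread(mzitu_es).scrapy_one():

```python
# Rough equivalent of the manual thread pool above, using concurrent.futures.
# crawl_once() is a placeholder for MzituThread(mzitu_es).scrapy_one().
import time
from concurrent.futures import ThreadPoolExecutor

SLEEP_TIME = 1

def crawl_once():
    # Placeholder: fetch and process one pending theme; return True on success.
    return False

def worker():
    # Same loop as pageurl_crawler(): back off briefly whenever nothing was crawled.
    while True:
        if crawl_once() is not True:
            time.sleep(SLEEP_TIME)

def run(max_threads=5):
    # The executor keeps max_threads workers alive; no manual pruning is needed.
    with ThreadPoolExecutor(max_workers=max_threads) as pool:
        for _ in range(max_threads):
            pool.submit(worker)
```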
/mzitu.com/mzitu_es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from DBTools.MyES import MyESClient
4 | from datetime import datetime
5 |
6 | class MzituEs():
7 | def __init__(self):
8 | self.init_es()
9 |
10 | def init_es(self):
11 | self.esindex = "mzitu"
12 | self.estype = "mzitu_imgs"
13 | index_mappings = {
14 | "mappings": {
15 | self.estype: {
16 | "properties": {
17 | "imgThemeTitle": {
18 | "type": "text",
19 | "index": True,
20 | "analyzer": "ik_max_word",
21 | "search_analyzer": "ik_max_word"
22 | },
23 | "imgThemeUrl": {
24 | "type": "keyword",
25 | "index": True
26 | },
27 | "createTime": {
28 | "type": "date",
29 | "index": True
30 | },
31 | "scrapyStatus":{
32 | "type": "integer",
33 | "index": True,
34 | # 0 = pending, 1 = in progress, 2 = done (lifecycle sketched after this file)
35 | "null_value": 0
36 | }
37 | }
38 | }
39 | }
40 | }
41 | self.es = MyESClient(self.esindex, self.estype)
42 | self.es.createIndex(index_mappings)
43 | self.currdata = {}
44 | self.currdata["imgUrlList"] = []
45 |
46 | def save_es(self, data=None):
47 | '''
48 | Persist the given (or current) data to ES and reset the local buffer
49 | :return:
50 | '''
51 | if data is None:
52 | data = self.currdata
53 | data["createTime"] = datetime.now()
54 | data["scrapyStatus"] = 0
55 | self.currdata = {}
56 | self.currdata["imgUrlList"] = []
57 | self.es.indexData(data, data["imgThemeUrl"])
58 |
59 | def get_one_need_scrapy_es(self):
60 | '''
61 | Fetch one pending (scrapyStatus == 0) item from the ES index
62 | '''
63 | queryBody = {
64 | "query": {
65 | "bool": {
66 | "must": [
67 | {
68 | "term": {
69 | "scrapyStatus": {
70 | "value": 0
71 | }
72 | }
73 | }
74 | ]
75 | }
76 | }
77 | }
78 | res = self.es.getOneByBody(queryBody)
79 | return res
80 |
81 | def get_by_themeId_es(self, themeId):
82 | res = self.es.getDataSourceById(themeId)
83 | return res
84 |
85 | def exit_es(self, themeurl):
86 | queryBody = {
87 | "query": {
88 | "bool": {
89 | "must": [
90 | {
91 | "term": {
92 | "imgThemeUrl": {
93 | "value": themeurl
94 | }
95 | }
96 | }
97 | ]
98 | }
99 | }
100 | }
101 | if self.es.exit(queryBody):
102 | print("ES数据库里面已经存在!!")
103 | return True
104 | else:
105 | return False
106 |
107 | mzitu_es = MzituEs()
--------------------------------------------------------------------------------
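The scrapyStatus field drives the resume behaviour: 0 means pending, 1 in progress, 2 done. A minimal sketch of that lifecycle using the helper defined above (it assumes theme URLs have already been seeded into the index):

```python
# Sketch of the scrapyStatus lifecycle (0 = pending, 1 = in progress, 2 = done)
# using the mzitu_es helper defined above. Assumes themes were already seeded.
from mzitu_es import mzitu_es

doc = mzitu_es.get_one_need_scrapy_es()   # fetch one document with scrapyStatus == 0
if doc is not None:
    doc["scrapyStatus"] = 1               # claim it: mark as "in progress"
    mzitu_es.save_es(doc)
    # ... download the images behind doc["imgThemeUrl"] here ...
    doc["scrapyStatus"] = 2               # finally mark it as done
    mzitu_es.save_es(doc)
```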
/mzitu.com/mzitu_for_thread.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | import datetime
4 | from bs4 import BeautifulSoup
5 | import os
6 | # import lxml
7 | from BaseTools.MyDownload import request ##the download helper now comes from the shared BaseTools module
8 |
9 | class MzituThread(object):
10 | def __init__(self, mzitu_es):
11 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
12 | self.currPath = "./mzitu/"
13 | self.currdata = {}
14 | self.currdata["imgUrlList"] = []
15 | self.es = mzitu_es
16 |
17 | def all_url(self, url='http://www.mzitu.com/all'):
18 | html = self.request(url)##request() returns the response for the full theme list page
19 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a')
20 | for a in all_a:
21 | try:
22 | title = a.get_text()
23 | href = a['href']
24 | print(title, href) ##a little progress output
25 | if self.es.exit_es(href):
26 | continue
27 | self.currdata["imgThemeTitle"] = title
28 | self.currdata["imgThemeUrl"] = href
29 | self.es.save_es(self.currdata)
30 | except Exception as e:
31 | print(e)
32 | continue
33 |
34 | def scrapy_one(self, url=None):
35 | try:
36 | data = None
37 | if url is None:
38 | data = self.es.get_one_need_scrapy_es()
39 | else:
40 | data = self.es.get_by_themeId_es(url)
41 |
42 | if data is None:
43 | return False
44 | else:
45 | data["scrapyStatus"]=1
46 | self.es.save_es(data) ## mark the theme as "in progress"
47 | href = data["imgThemeUrl"]
48 | self.mkdir(href) ##create the download folder for this theme
49 | self.html(href, data) ##crawl every image page of this theme
50 | data["scrapyStatus"]=2
51 | self.es.save_es(data) ## save the data and mark the theme as done
52 | return True
53 | except Exception as e:
54 | print(e)
55 | return False
56 |
57 |
58 | def html(self, href, data=None): ##turn a theme URL into the URLs of its per-image pages
59 | try:
60 | html = self.request(href)
61 | self.headers['referer'] = href
62 | ## max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
63 | # max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
64 | max_span = 100
65 | pageDiv = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi')
66 | if pageDiv is not None and len(pageDiv) > 1:
67 | max_span = pageDiv.find_all('span')[-2].get_text()
68 | for page in range(1, int(max_span) + 1):
69 | page_url = href + '/' + str(page)
70 | self.img(page_url, data) ##download the image on this page
71 | except Exception as e:
72 | print('发生了异常:', e)
73 |
74 | def img(self, page_url, data=None): ##turn an image page URL into the actual image URL
75 | img_html = self.request(page_url)
76 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
77 | print(img_url)
78 | self.saveImg(img_url, data)
79 |
80 | def saveImg(self, img_url, data=None): ##download and save one image
81 | name = img_url[-9:-4]
82 | currUrl = self.currPath + name + '.jpg'
83 | isExists = os.path.exists(currUrl)
84 | if not isExists:
85 | img = self.request(img_url)
86 | f = open(currUrl, 'ab')
87 | f.write(img.content)
88 | f.close()
89 | print('该图片下载完毕')
90 | if data is None:
91 | self.currdata["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl})
92 | else:
93 | data["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl})
94 | else:
95 | print('该图片已经存在')
96 |
97 | def mkdir(self, path): ##create the folder the images will be saved into
98 | if USE_ONE_DIR:
99 | path = ""
100 | elif USE_DEF_DIR:
101 | if path is None:
102 | path = self.currdata["imgThemeUrl"]
103 | index = path.rindex("/")
104 | path = path[index + 1:]
105 | else:
106 | path = path.strip()
107 | isExists = os.path.exists(os.path.join("./mzitu", path))
108 | if not isExists:
109 | print('建了一个名字叫做', path, '的文件夹!')
110 | os.makedirs(os.path.join("./mzitu", path))
111 | self.currPath = "./mzitu/" + path + "/"
112 | ## os.chdir(os.path.join("./mzitu", path)) ##切换到目录
113 | return True
114 | else:
115 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
116 | return False
117 |
118 | def request(self, url): ##fetch the page and return the response
119 | content = request.get(url, headers=self.headers, timeout=3)
120 | return content
121 |
122 |
123 |
124 | USE_ONE_DIR = False
125 | USE_DEF_DIR = True
126 |
127 | #mzituThread = MzituThread() ##实例化
128 | #mzituThread.all_url()
129 | #mzituThread.scrapy_one()
--------------------------------------------------------------------------------
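Besides pulling pending themes from ES, scrapy_one() also accepts an explicit theme URL: documents are indexed with imgThemeUrl as their id, so the URL doubles as the lookup key. A small usage sketch (the URL below is a made-up example and must already exist in the index):

```python
# Re-crawl one specific, already-indexed theme by its URL.
# Documents are indexed with imgThemeUrl as the document id (see mzitu_es.save_es),
# so the theme URL acts as the key. The URL below is a made-up example.
from mzitu_es import mzitu_es
from mzitu_for_thread import MzituThread

worker = MzituThread(mzitu_es)
ok = worker.scrapy_one("http://www.mzitu.com/12345")
print("crawled" if ok else "not found or failed")
```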
/mzitu.com/scrapy-mzitu-es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | import datetime
4 | from bs4 import BeautifulSoup
5 | import os
6 | # import lxml
7 | from BaseTools.MyDownload import request ##the download helper now comes from the shared BaseTools module
8 | from mzitu_es import mzitu_es
9 |
10 | class mzitu():
11 |
12 | def __init__(self):
13 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
14 | self.currPath = "./mzitu/"
15 | self.currdata = {}
16 | self.currdata["imgUrlList"] = []
17 |
18 | def all_url(self, url):
19 | html = self.request(url)##request() returns the response for the full album list page
20 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a')
21 | for a in all_a:
22 | title = a.get_text()
23 | href = a['href']
24 | print(title, href) ##a little progress output
25 | if mzitu_es.exit_es(href):
26 | continue
27 | self.currdata["imgThemeTitle"] = title
28 | self.currdata["imgThemeUrl"] = href
29 | #path = str(title).replace("?", '_') ##some titles contain "?", which Windows cannot use in a folder name, so it would need replacing
30 | self.mkdir(title) ##create the folder; here the path passed in is the album title
31 | self.html(href) ##crawl every image page; href is the album URL
32 | mzitu_es.save_es(self.currdata)
33 |
34 | def html(self, href): ##turn an album URL into the URLs of its per-image pages
35 | try:
36 | html = self.request(href)
37 | self.headers['referer'] = href
38 | #max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
39 | max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
40 | for page in range(1, int(max_span) + 1):
41 | page_url = href + '/' + str(page)
42 | self.img(page_url) ##download the image on this page
43 | except Exception as e:
44 | print('发生了异常:', e)
45 |
46 | def img(self, page_url): ##turn an image page URL into the actual image URL
47 | img_html = self.request(page_url)
48 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
49 | print(img_url)
50 | self.saveImg(img_url)
51 |
52 | def saveImg(self, img_url): ##download and save one image
53 | name = img_url[-9:-4]
54 | currUrl = self.currPath + name + '.jpg'
55 | isExists = os.path.exists(currUrl)
56 | if not isExists:
57 | img = self.request(img_url)
58 | f = open(currUrl, 'ab')
59 | f.write(img.content)
60 | f.close()
61 | print('该图片下载完毕')
62 | self.currdata["imgUrlList"].append({"originUrl":img_url, "currentUrl": currUrl})
63 | else:
64 | print('该图片已经存在')
65 |
66 | def mkdir(self, path): ##create the folder the images will be saved into
67 | if USE_ONE_DIR:
68 | path = ""
69 | elif USE_DEF_DIR:
70 | path = self.currdata["imgThemeUrl"]
71 | index = path.rindex("/")
72 | path = path[index + 1:]
73 | else:
74 | path = path.strip()
75 | isExists = os.path.exists(os.path.join("./mzitu", path))
76 | if not isExists:
77 | print('建了一个名字叫做', path, '的文件夹!')
78 | os.makedirs(os.path.join("./mzitu", path))
79 | self.currPath = "./mzitu/" + path + "/"
80 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录
81 | return True
82 | else:
83 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
84 | return False
85 |
86 | def request(self, url): ##fetch the page and return the response
87 | content = request.get(url, headers=self.headers, timeout=3)
88 | return content
89 |
90 | USE_ONE_DIR = False
91 | USE_DEF_DIR = True
92 | Mzitu = mzitu() ##instantiate the crawler
93 | Mzitu.all_url('http://www.mzitu.com/all') ##entry point: start crawling from the full album list
94 |
--------------------------------------------------------------------------------
/mzitu.com/scrapy-mzitu-no-es.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | import os
5 | from BaseTools.MyDownload import request
6 |
7 | class mzitu():
8 | def __init__(self):
9 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
10 | self.basePath = "./mzitu-no-es/"
11 | self.currPath = self.basePath
12 | self.mkdir(self.basePath)
13 | self.totalFinishPath = "./mzitu-no-es/totalPage.txt"
14 | self.totalFinish = self.getTotalFinish()
15 |
16 | def all_url(self, url):
17 | html = self.request(url)##request() returns the response for the full album list page
18 | all_a = BeautifulSoup(html.text, 'lxml').find('div', class_='all').find('ul', class_="archives").find_all('a')
19 | count = 0
20 | for a in all_a:
21 | count = count + 1
22 | if count > self.totalFinish:
23 | self.overwriteTotalFinish(count)
24 | else:
25 | print("第", count, "页已经抓取过,跳过!")
26 | continue
27 | title = a.get_text()
28 | href = a['href']
29 | print(title, href) ##a little progress output
30 | #path = str(title).replace("?", '_') ##some titles contain "?", which Windows cannot use in a folder name, so it would need replacing
31 | self.mkdir(title) ##create the folder; here the path passed in is the album title
32 | self.html(href) ##crawl every image page; href is the album URL
33 | self.totalFinish = count
34 |
35 | def html(self, href): ##turn an album URL into the URLs of its per-image pages
36 | try:
37 | html = self.request(href)
38 | self.headers['referer'] = href
39 | # max_span = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi').find_all('span')[-2].get_text()
40 | # max_span = BeautifulSoup(html.text, 'lxml').find_all('span')[10].get_text()
41 | max_span = 100
42 | pageDiv = BeautifulSoup(html.text, 'lxml').find('div', class_='pagenavi')
43 | if pageDiv is not None and len(pageDiv) > 1:
44 | max_span = pageDiv.find_all('span')[-2].get_text()
45 | for page in range(1, int(max_span) + 1):
46 | page_url = href + '/' + str(page)
47 | self.img(page_url) ##download the image on this page
48 | except Exception as e:
49 | print('发生了异常:', e)
50 |
51 | def img(self, page_url): ##turn an image page URL into the actual image URL
52 | img_html = self.request(page_url)
53 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='main-image').find('img')['src']
54 | print(img_url)
55 | self.saveImg(img_url)
56 |
57 | def saveImg(self, img_url): ##download and save one image
58 | name = img_url[-9:-4]
59 | imgPath = self.currPath + name + '.jpg'
60 | isExists = os.path.exists(imgPath)
61 | if not isExists:
62 | img = self.request(img_url)
63 | f = open(imgPath, 'ab')
64 | f.write(img.content)
65 | f.close()
66 | print('该图片下载完毕')
67 | else:
68 | print('该图片已经存在')
69 |
70 | def mkdir(self, path): ##create the folder the images will be saved into
71 | if USE_ONE_DIR:
72 | path = ""
73 | elif USE_DEF_DIR:
74 | index = path.rindex("/")
75 | path = path[index + 1:]
76 | else:
77 | path = path.strip()
78 | self.currPath = os.path.join(self.basePath, path)
79 | isExists = os.path.exists(self.currPath)
80 | if not isExists:
81 | print('建了一个名字叫做', path, '的文件夹!')
82 | os.makedirs(self.currPath)
83 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录
84 | return True
85 | else:
86 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
87 | return False
88 |
89 |
90 | def request(self, url): ##fetch the page and return the response
91 | content = request.get(url, headers=self.headers, timeout=3)
92 | return content
93 |
94 | def getTotalFinish(self):
95 | isExists = os.path.exists(self.totalFinishPath)
96 | if isExists:
97 | with open(self.totalFinishPath, 'r', encoding='UTF-8') as f:
98 | return int(f.read())
99 | else:
100 | return 0
101 |
102 | def overwriteTotalFinish(self, count):
103 | with open(self.totalFinishPath, 'w', encoding='UTF-8') as f:
104 | f.write(str(count))
105 |
106 | USE_ONE_DIR = True
107 | USE_DEF_DIR = False
108 | Mzitu = mzitu() ##instantiate the crawler
109 | Mzitu.all_url('http://www.mzitu.com/all') ##entry point: start crawling from the full album list
110 |
--------------------------------------------------------------------------------
/wallhaven.cc/Parent.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
--------------------------------------------------------------------------------
/wallhaven.cc/README.md:
--------------------------------------------------------------------------------
1 | ## Background
2 |
3 | A friend recently shared this wallpaper site, and I could not resist spending ten-odd minutes writing a crawler for it.
4 |
5 | 
6 |
7 | Supports simple page-by-page resume (a checkpoint sketch follows this file)
8 |
9 | ## How to run
10 |
11 | cd into this directory in a console, then run
12 |
13 | >python wallpic_scrapy.py
14 |
15 | >PS: this project is for learning and sharing only; please do not use it commercially
--------------------------------------------------------------------------------
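The page-by-page resume works by keeping the number of the last finished page in wallpic/totalPage.txt; page numbers up to that value are skipped on the next run. A small sketch for inspecting or resetting that checkpoint (path taken from wallpic_scrapy.py below):

```python
# Inspect or reset the page checkpoint used by wallpic_scrapy.py below.
# The crawler skips every page number up to the value stored in this file.
import os

CHECKPOINT = "./wallpic/totalPage.txt"

def finished_pages():
    # 0 means "nothing crawled yet", mirroring wallpic.getTotalFinish().
    if not os.path.exists(CHECKPOINT):
        return 0
    with open(CHECKPOINT, "r", encoding="utf-8") as f:
        return int(f.read())

def reset():
    # Remove the checkpoint to force a full re-crawl from page 1.
    if os.path.exists(CHECKPOINT):
        os.remove(CHECKPOINT)

if __name__ == "__main__":
    print("pages already finished:", finished_pages())
```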
/wallhaven.cc/img/20210623210831.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/petterobam/learn-scrapy/e216ac40ddf5f8c595f170b2f79f7f86b686089c/wallhaven.cc/img/20210623210831.png
--------------------------------------------------------------------------------
/wallhaven.cc/wallpic_scrapy.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | import Parent
3 | from bs4 import BeautifulSoup
4 | import os
5 | from BaseTools.MyDownload import request
6 |
7 | class wallpic():
8 | def __init__(self):
9 | self.headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"}
10 | self.basePath = "./wallpic/"
11 | self.currPath = self.basePath
12 | self.mkdir(self.basePath)
13 | self.totalFinishPath = "./wallpic/totalPage.txt"
14 | self.totalFinish = self.getTotalFinish()
15 |
16 | def all_get(self, totalPage):
17 | count = 0
18 | while count < totalPage:
19 | count = count + 1
20 | if count > self.totalFinish:
21 | self.overwriteTotalFinish(count)
22 | else:
23 | print("第", count, "页已经抓取过,跳过!")
24 | continue
25 | title = '第' + str(count) + '页/'
26 | href = 'https://wallhaven.cc/toplist?page=' + str(count)
27 | print(title, href) ##a little progress output
28 | ##create a folder per toplist page; here the path is the page title
29 | path = title
30 | self.mkdir(path)
31 | self.html(href) ##crawl every wallpaper page; href is the toplist page URL
32 | self.totalFinish = count
33 |
34 | def html(self, href): ##turn a toplist page URL into the URLs of its wallpaper pages
35 | try:
36 | html = self.request(href)
37 | self.headers['referer'] = href
38 | figures = BeautifulSoup(html.text, 'lxml').find('section', class_='thumb-listing-page').find_all('figure')
39 | for figure in figures:
40 | page_url = figure.find_all('a')[0]['href']
41 | self.img(page_url) ##download the wallpaper on this page
42 | except Exception as e:
43 | print('发生了异常:', e)
44 |
45 | def img(self, page_url): ##turn a wallpaper page URL into the actual image URL
46 | img_html = self.request(page_url)
47 | img_url = BeautifulSoup(img_html.text, 'lxml').find('div', class_='scrollbox').find_all('img')[0]['src']
48 | print(img_url)
49 | self.saveImg(img_url)
50 |
51 | def saveImg(self, img_url): ##download and save one image
52 | name = img_url[-9:-4]
53 | imgPath = self.currPath + name + '.jpg'
54 | isExists = os.path.exists(imgPath)
55 | if not isExists:
56 | img = self.request(img_url)
57 | f = open(imgPath, 'ab')
58 | f.write(img.content)
59 | f.close()
60 | print('该图片下载完毕')
61 | else:
62 | print('该图片已经存在')
63 |
64 | def mkdir(self, path): ##create the folder the images will be saved into
65 | if USE_ONE_DIR:
66 | path = ""
67 | elif USE_DEF_DIR:
68 | index = path.rindex("/")
69 | path = path[index + 1:]
70 | else:
71 | path = path.strip()
72 | self.currPath = os.path.join(self.basePath, path)
73 | isExists = os.path.exists(self.currPath)
74 | if not isExists:
75 | print('建了一个名字叫做', path, '的文件夹!')
76 | os.makedirs(self.currPath)
77 | #os.chdir(os.path.join("./mzitu", path)) ##切换到目录
78 | return True
79 | else:
80 | print('名字叫做', self.currPath, '的文件夹已经存在了!')
81 | return False
82 |
83 |
84 | def request(self, url): ##fetch the page and return the response
85 | content = request.get(url, headers=self.headers, timeout=3)
86 | return content
87 |
88 | def getTotalFinish(self):
89 | isExists = os.path.exists(self.totalFinishPath)
90 | if isExists:
91 | with open(self.totalFinishPath, 'r', encoding='UTF-8') as f:
92 | return int(f.read())
93 | else:
94 | return 0
95 |
96 | def overwriteTotalFinish(self, count):
97 | with open(self.totalFinishPath, 'w', encoding='UTF-8') as f:
98 | f.write(str(count))
99 |
100 | USE_ONE_DIR = False
101 | USE_DEF_DIR = False
102 | WallPic = wallpic() ##instantiate the crawler
103 |
104 | if __name__ == "__main__":
105 | ## entry point: pass the number of toplist pages to crawl
106 | WallPic.all_get(11)
107 |
--------------------------------------------------------------------------------