├── README.md ├── algorithms ├── others │ └── base_algorithm_que.py ├── sorted │ └── sorted_algorithms.py └── test.py ├── basics ├── decorator.py ├── os.py ├── test_bloomfilter.py ├── test_test_demo.py └── zlib.py ├── blockchains ├── blockchain.py ├── blockchain_node1.py ├── blockchain_node2.py ├── 待解决问题.txt └── 模拟挖矿过程.txt ├── data_process ├── db_operation.py ├── divide_words.py ├── pandas_operation.py └── word_cloud.py ├── interesting ├── __init__.py ├── apscheduler │ ├── a.ico │ └── testApscheduler.py ├── dingding_push │ ├── __init__.py │ ├── demo.py │ └── dingding_push_msg.py └── hongzha.py ├── other_files └── chromedriver │ └── chromedriver.exe ├── practice ├── __init__.py ├── leetcode │ ├── 237.py │ ├── 709.py │ ├── 771.py │ ├── 832.py │ └── 977.py └── technique │ ├── __init__.py │ ├── code_technique.py │ ├── config │ ├── __init__.py │ ├── ip_pool.txt │ ├── random_ip.py │ ├── test_comfig.py │ └── user_agent.py │ ├── db_operate.py │ ├── excel_operate.py │ ├── file_operate.py │ ├── pdf_operate.py │ ├── selenium_template.py │ ├── test_data │ ├── test.py │ └── test_excel.xlsx │ ├── word_operate.py │ └── zip_file_operate.py ├── project_directory_structure.txt └── spider ├── __init__.py ├── config ├── __init__.py ├── ipPool.txt ├── random_ip.py ├── test_comfig.py └── user_agent.py ├── configure ├── __init__.py ├── ipPool.txt ├── log.py ├── randomIp.py └── test.py ├── coroutine.py ├── down_doc_png.py ├── down_video ├── down_film.py ├── 参考.txt ├── 查看.ts文件.png └── 查看m3u8文件.png ├── ebook └── down_history_books.py ├── get_heml.py ├── get_html_new.py ├── get_meizi_image.py ├── get_url_data.py ├── gzh ├── GZH.py └── ip_pool.txt ├── kcb_info.py ├── movieReview ├── cleanData.py ├── cleanData.txt ├── film.py ├── filmComments.py ├── ipPool.txt ├── stopWords.txt ├── temp.txt ├── temp2.txt ├── test.csv └── test2.csv ├── news ├── __init__.py ├── get_news_url.py ├── new_pengpai.py ├── new_url.txt ├── re.txt └── wallstreetcn.py ├── pachong ├── geckodriver.log ├── 
get_dynamic_data.py ├── page_source.txt ├── result.csv ├── screenshot │ ├── 1.png │ └── bottom.jpg └── 笔记.txt ├── spider.zip ├── wechat ├── __init__.py ├── get_wechat_data_old.py └── get_wechat_data_simple.py └── weibo ├── __init__.py └── weibo_api.py /README.md: -------------------------------------------------------------------------------- 1 | # Python 2 | Python Related Work 3 | 1. algorithms 算法 4 | 2. basics 基础 5 | 3. blockchains 区块链 6 | 4. data_process 数据处理 7 | 5. practice 练习 8 | technique 9 | db_operation 数据库相关 10 | code_technique 代码技巧 11 | doc_operate 常规文件操作 12 | pdf_operate pdf操作 13 | word_operate word操作 14 | 6. spider 爬虫 15 | gzh 公众号 16 | wechat_data 微信数据 17 | config 爬虫一般配置 18 | down_video 下载视频 19 | get_news 下载新闻 20 | movie_review 爬取影评 21 | pachong 爬取直播 22 | weibo 测试微博 23 | 7. interesting 有趣内容 24 | apscheduler 定时器相关 25 | 8. otherfiles 其他 26 | -------------------------------------------------------------------------------- /algorithms/others/base_algorithm_que.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 基本经典算法 6 | 1. 菲波拉契数列问题 7 | 2. 判断素数问题 8 | 3. 判断水仙花数问题 9 | 4. 获取分数评级(嵌套条件运算符) 10 | 5. 正整数分解质因数 11 | 6. 最大公约数和最小公倍数 12 | 7. 统计字符串内容 13 | 8. 计算特殊表达式的和 求s=a+aa+aaa+aaaa+aa…a的值 14 | 9. 判断一个数是否是完数(一个数如果恰好等于它的因子之和,如6=1+2+3) 15 | 10.阶乘 求1+2!+3!+…+20!的和 16 | 11.判断回文数 17 | 12.四数取三,排列组合 18 | -------------------------------- 19 | @Time : 2018/11/26 20:30 20 | @File : base_algorithm_que.py 21 | @Software: PyCharm 22 | -------------------------------- 23 | @Author : lixj 24 | @contact : lixj_zj@163.com 25 | """ 26 | 27 | import math 28 | 29 | 30 | ######### 1. 
######### 1. Fibonacci sequence #########
class fibonacci(object):
    def __init__(self, num):
        # num: how many terms of the sequence to produce
        self.num = num

    # Build the full list -- O(n) extra space.
    def getFibonacciList_three(self):
        """Print the first ``num`` Fibonacci terms as a list."""
        result = [1, 1]
        for i in range(2, self.num):
            result.append(result[i - 1] + result[i - 2])
        # Fix: for num < 2 the old code always printed both seed terms.
        print(result[:self.num])

    # Print directly, bottom-up -- O(1) extra space, O(n) time.
    def getFibonacciList_two(self):
        one, two = 1, 1
        print(one, two, end="\t")
        for i in range(2, self.num):
            three = one + two
            one, two = two, three
            print(three, end="\t")


# Plain recursion -- exponential time, kept for comparison.
def getFibonacciList_one(num):
    """Return the num-th Fibonacci term (1, 1, 2, 3, ...)."""
    if num in [0, 1]:
        return 1
    return getFibonacciList_one(num - 1) + getFibonacciList_one(num - 2)


######### 2. Primality test #########
def isPrime(num):
    """Return True when ``num`` is a prime number.

    Fix: the original returned True for 0, 1 and negative values because
    the trial-division range is empty for them.
    """
    if num < 2:
        return False
    # +1 so that sqrt(num) itself is tested as a divisor
    for i in range(2, int(math.sqrt(num)) + 1):
        if num % i == 0:
            return False
    return True


######### 3. Narcissistic (Armstrong) numbers #########
def getDaffodil():
    """Print every three-digit number equal to the sum of the cubes of its digits."""
    beginNum = 101
    endNum = 1000
    for i in range(beginNum, endNum):
        a = i % 10          # units digit
        b = i // 10 % 10    # tens digit
        c = i // 100        # hundreds digit
        if a ** 3 + b ** 3 + c ** 3 == i:
            print(i)


######### 4. Score grading (nested conditional expression) #########
def getScoreSign(score):
    """Map a numeric score to a letter grade: >=90 A, >=60 B, else C."""
    return "A" if score >= 90 else "B" if score >= 60 else "C"


######### 5. Prime factorisation of a positive integer #########
def getPrimeNum(num):
    """Return the list of prime factors of ``num`` in ascending order."""
    n = 2
    result = []
    while num >= n:
        if num == n:
            # num itself is the last (largest) prime factor
            result.append(num)
            break
        elif num % n == 0:  # num can still be divided by n
            result.append(n)
            num //= n       # integer division keeps num an int
        else:
            n += 1          # advance to the next candidate divisor
    return result
######### 6. Greatest common divisor / least common multiple #########
# Euclid's algorithm: repeatedly replace the pair (small, large) by
# (large % small, small) until the smaller value reaches 0; the survivor
# is the GCD.  LCM = product of the two numbers / GCD.
def getMaxComDivisorAndMinComMultiple(num_one, num_two):
    """Return the greatest common divisor of two positive integers."""
    # Keep num_two as the larger of the pair.
    if num_one > num_two:
        num_two, num_one = num_one, num_two
    while num_one != 0:
        if num_one == num_two:  # equal numbers: the GCD is the number itself
            return num_two
        # The remainder is always smaller than the divisor, so it becomes
        # the new "small" value for the next round.
        num_one, num_two = num_two % num_one, num_one
    return num_two


######### 7. Character statistics #########
def countNum(string):
    """Count Chinese characters, letters, spaces, digits and other characters.

    :param string: input text
    :return: list of counts [chinese, alpha, space, digit, other]
    """
    result = [0, 0, 0, 0, 0]
    for char in string:
        # CJK range checked before isalpha(): isalpha() is True for Chinese too
        if u'\u4e00' <= char <= u'\u9fa5':
            result[0] += 1
        elif char.isalpha():
            result[1] += 1
        elif char.isspace():
            result[2] += 1
        elif char.isdigit():
            result[3] += 1
        else:
            result[4] += 1
    return result


######### 8. Sum of the series s = a + aa + aaa + aaaa + ... #########
def getNumSum(num, count):
    """Return (and print) a + aa + aaa + ... with ``count`` terms built from digit ``num``."""
    term = 0
    total = 0  # fix: renamed from ``sum``, which shadowed the builtin
    for i in range(count):
        term += num * 10 ** i  # term grows 3 -> 33 -> 333 -> ...
        total += term
    print(total)
    return total


######### 9. Perfect numbers (equal to the sum of their proper divisors, e.g. 6 = 1 + 2 + 3) #########
def isCompleteNum(num):
    """Return True when ``num`` equals the sum of its proper divisors."""
    divisor_sum = 0
    # Proper divisors of num never exceed num // 2.
    for i in range(1, num // 2 + 1):
        if num % i == 0:
            divisor_sum += i
    # Fix: drop the redundant ``True if ... else False``.
    return divisor_sum == num
######### 10. Sum 1 + 2! + 3! + ... + num! #########
def factorial(num):
    """Return the sum of factorials 1! + 2! + ... + num!."""
    total = 0
    running = 1  # invariant: running == one_num! after each multiplication
    for one_num in range(1, num + 1):
        running *= one_num
        total += running
    return total


def factorial_two(num):
    """Same series via functools.reduce; prints the result (demo variant)."""
    from functools import reduce
    total = 0
    for i in range(2, num + 2):  # reduce over range(1, i) yields (i-1)!
        total += reduce(lambda x, y: x * y, range(1, i))
    print(total)


######### 11. Palindrome test #########
def isNumberOfTracts(num):
    """Return True when ``num`` reads the same forwards and backwards.

    Fix: the original compared digit pairs in a loop whose ``else`` branch
    returned True as soon as one pair matched, so e.g. 1231 was reported
    as a palindrome.  Comparing against the reversed string checks every
    digit.
    """
    # Negative numbers and non-zero multiples of 10 can never be palindromes.
    if num < 0 or (num % 10 == 0 and num != 0):
        return False
    digits = str(num)
    return digits == digits[::-1]


######### 12. Pick three of four digits: permutations #########
# With digits 1..num, count the three-digit numbers whose digits are
# pairwise distinct (zero not involved).
def permutations(num):
    count = 0
    for i in range(1, num + 1):
        for j in range(1, num + 1):
            for k in range(1, num + 1):
                if i != j and j != k and i != k:
                    count += 1
                    print(i * 100 + j * 10 + k, i, j, k)
    print(count)


# itertools.combinations() for combinations; itertools.permutations() lists every ordering
def permutations_one(tar, num):
    """
    Print all ``num``-length permutations of the digits/characters of ``tar``.

    :param tar: target to permute (int or str)
    :param num: how many elements to pick
    """
    import itertools
    result = []
    for permutation in itertools.permutations(str(tar), num):
        # Fix: str.join instead of quadratic += on a name shadowing sum()
        joined = "".join(permutation)
        # isinstance() is the idiomatic type test (was ``tar.__class__ is int``)
        result.append(int(joined) if isinstance(tar, int) else joined)
    print(result)
菲波拉契数列问题 ######### 235 | num = 10 236 | fibonacci = fibonacci(num) 237 | for i in range(num): 238 | print(getFibonacciList_one(i), end="\t") 239 | fibonacci.getFibonacciList_two() 240 | fibonacci.getFibonacciList_three() 241 | 242 | ######### 2. isPrime ######### 243 | beginNum, endNum = 101, 200 244 | count = 0 245 | for i in range(beginNum, endNum): 246 | if isPrime(i): 247 | count += 1 248 | print(i, end=" ") 249 | print("总数:", count) 250 | 251 | ######### 3. 判断水仙花数 ######### 252 | getDaffodil() 253 | 254 | ######### 4. 获取分数评级(嵌套条件运算符-python无三目运算) ######### 255 | score = 60 256 | getScoreSign(score) 257 | 258 | ######### 5. 正整数分解质因数 ######### 259 | num = 12 260 | getPrimeNum(num) 261 | 262 | ######### 6. 最大公约数和最小公倍数 ######### 263 | maxComDivisor = getMaxComDivisorAndMinComMultiple(200, 12) 264 | minComMultiple = int(200 * 12 / maxComDivisor) 265 | print(maxComDivisor, minComMultiple) 266 | 267 | ######### 7. 统计字符串内容 ######### 268 | countNum("123test 哈哈 #@*") 269 | 270 | ######### 8. 计算特殊表达式的和 求s=a+aa+aaa+aaaa+aa…a的值 ######### 271 | getNumSum(num=3, count=6) 272 | 273 | ######### 9. 判断一个数是否是完数(一个数如果恰好等于它的因子之和,如6=1+2+3) ######### 274 | isCompleteNum(num=6) 275 | 276 | ######### 10. 求1+2!+3!+…+20!的和 ######### 277 | factorial(20) 278 | 279 | ######### 11. 判断回文数 ######### 280 | isNumberOfTracts(12345687654321) 281 | 282 | ######### 12. 四数取三,排列组合 ######### 283 | permutations(4) 284 | permutations_one(1230, 3) 285 | """ 286 | -------------------------------------------------------------------------------- /algorithms/sorted/sorted_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 排序算法集合 6 | 1. 插入排序 7 | 2. 8 | 3. 9 | 4. 
class SortedAlgorithms(object):
    """A small collection of sorting algorithms."""

    def __init__(self):
        pass

    def straight_insertion_sort(self, ints):
        """
        Straight insertion sort.

        Idea:
            Take elements from the unsorted region one by one and slide
            each into its place inside the already-sorted prefix.
        Complexity:
            Worst case O(n^2); stable; efficient on small inputs.
        :param ints: list sorted in place
        :return: the sorted list (same object)
        """
        for idx in range(1, len(ints)):
            pivot = ints[idx]
            pos = idx
            # Shift every larger element of the sorted prefix one slot right.
            while pos > 0 and ints[pos - 1] > pivot:
                ints[pos] = ints[pos - 1]
                pos -= 1
            # Drop the pivot into the gap that opened up.
            ints[pos] = pivot
        return ints

    def straight_insertion_sort_optimization(self):
        """
        Insertion sort optimised with binary search -- not implemented yet.
        :return:
        """
        pass
""" 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 装饰器常见用法 6 | 参考网址:https://mp.weixin.qq.com/s/8z92pbhJV1ybfE6YZfvOuw?scene=25#wechat_redirect 7 | -------------------------------- 8 | @Time : 2019/8/14 14:10 9 | @File : decorator.py 10 | @Software: PyCharm 11 | -------------------------------- 12 | @Author : lixj 13 | @contact : lixj_zj@163.com 14 | """ 15 | 16 | import time 17 | 18 | 19 | ## 1. 简单 demo 20 | def decorator(func): 21 | """ 22 | 定义装饰器。调用含有该装饰器的函数时,先将这个函数做为参数传入该装饰器。 23 | :param func: 含有该装饰器的函数 24 | :return: 25 | """ 26 | 27 | def wrapper(*args, **kwargs): 28 | return func() 29 | 30 | return wrapper 31 | 32 | 33 | @decorator 34 | def function(): 35 | print("hello decorator") 36 | 37 | 38 | ## 2. 日志打印装饰器 39 | def logger(func): 40 | def wrapper(*args, **kwargs): 41 | print("开始执行:{}函数".format(func.__name__)) 42 | # 真正执行 43 | func(*args, **kwargs) 44 | print("执行:{}函数完毕".format(func.__name__)) 45 | 46 | return wrapper 47 | 48 | 49 | @logger 50 | def add(x, y): 51 | print("{}+{}={}".format(x, y, x + y)) 52 | 53 | 54 | ## 3. 时间计时器 55 | def timer(func): 56 | def wrapper(*args, **kwargs): 57 | begin_time = time.time() 58 | # 执行函数 59 | func(*args, **kwargs) 60 | cost_time = time.time() - begin_time 61 | print("程序耗时:{}秒".format(cost_time)) 62 | 63 | return wrapper 64 | 65 | 66 | @timer 67 | def test_timer_sleep(sleep_time): 68 | time.sleep(sleep_time) 69 | 70 | 71 | ## 4. 带参数的函数装饰器--两层嵌套 72 | def say_hello(contry): 73 | def wrapper(func): 74 | def deco(*args, **kwargs): 75 | if contry == "china": 76 | print("你好!") 77 | elif contry == "america": 78 | print("hello.") 79 | else: 80 | return 81 | # 真正执行函数 82 | func(*args, **kwargs) 83 | 84 | return deco 85 | 86 | return wrapper 87 | 88 | 89 | @say_hello("china") 90 | def xiaoming(): 91 | pass 92 | 93 | 94 | @say_hello("america") 95 | def jack(): 96 | pass 97 | 98 | 99 | ## 5. 
## 5. Advanced: class decorator without arguments
# A class-based decorator must implement __init__ and __call__:
# __init__ receives the decorated function, __call__ runs the wrapping logic.
class Logger(object):
    def __init__(self, func):
        # The decorated function.
        self.func = func

    def __call__(self, *args, **kwargs):
        print("[INFO]: the function {}() is running...".format(self.func.__name__))
        # Run the real function and propagate its result.
        return self.func(*args, **kwargs)


@Logger
def say_no_parameter(something):
    print("say {}!".format(something))


## 6. Advanced: class decorator with arguments
# Fix: this class was also named ``Logger`` and silently shadowed the class
# above, making the argument-less variant unreachable from this point on.
# Renamed to ``LevelLogger``.
# __init__ receives the decorator arguments; __call__ receives the function.
class LevelLogger(object):
    def __init__(self, level='INFO'):
        # Log-level label used in the message prefix.
        self.level = level

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            print("[{}]: the function {}() is running...".format(self.level, func.__name__))
            # Fix: propagate the wrapped function's return value.
            return func(*args, **kwargs)

        return wrapper


@LevelLogger(level='WARNNING')
def say_with_parameter(something):
    print("say {}!".format(something))
## 7. Decorator built from a partial function and a class
# Python only requires a decorator to be a callable.  Plain functions,
# classes implementing __call__, and -- easily overlooked -- partial
# functions all qualify.
import time
import functools


class DelayFunc:
    def __init__(self, duration, func):
        # Seconds to wait before each call.
        self.duration = duration
        # The function decorated with @delay(duration=...).
        self.func = func

    def __call__(self, *args, **kwargs):
        print(f'Wait for {self.duration} seconds...')
        time.sleep(self.duration)
        return self.func(*args, **kwargs)


def delay(duration):
    """
    Decorator: postpone a function call by ``duration`` seconds.
    :param duration: delay in seconds
    :return: callable that wraps the target in a DelayFunc
    """
    # functools.partial saves defining yet another nested function here.
    return functools.partial(DelayFunc, duration)


# duration=2: wait two seconds before executing
@delay(duration=2)
def add_partial(a, b):
    return a + b


## 8. Decorating a class: singleton via a decorator
# ``singleton`` wraps instantiation so every call returns one shared
# instance per class name.

# Cache of created instances, keyed by class name.
instances = {}


def singleton(cls):
    """
    Turn ``cls`` into a singleton.
    :param cls: the class decorated with @singleton
    :return: factory returning the single shared instance
    """

    def get_instance(*args, **kw):
        cls_name = cls.__name__
        # Idiom fix: ``not in`` instead of ``not cls_name in``.
        if cls_name not in instances:
            # First call: build the real instance and cache it.
            instances[cls_name] = cls(*args, **kw)
        return instances[cls_name]

    return get_instance


@singleton
class User:
    def __init__(self, name):
        self.name = name
## 9. The ``wraps`` decorator
# functools provides ``wraps``, which copies metadata attributes from the
# wrapped function onto the wrapper so introspection (__name__ etc.) matches
# intuition.  ``wraps`` itself is actually a partial object.
from functools import update_wrapper

# Attributes copied from the wrapped function onto the wrapper.
WRAPPER_ASSIGNMENTS = ('__module__', '__name__', '__qualname__', '__doc__',
                       '__annotations__')


def wrapper(func):
    def inner_function():
        pass

    # Copy func's metadata onto inner_function (what functools.wraps does).
    update_wrapper(inner_function, func, assigned=WRAPPER_ASSIGNMENTS)
    return inner_function


@wrapper
def wrapped():
    pass


print(wrapped.__name__)  # prints "wrapped" thanks to update_wrapper


## 10. Built-in decorator: property
# A hand-rolled re-implementation of ``property`` showing the descriptor
# protocol underneath: @TestProperty turns a method into an attribute whose
# value is the method's return value, and exposes getter/setter/deleter
# to build new descriptors around replacement accessors.
class TestProperty(object):

    def __init__(self, fget=None, fset=None, fdel=None, doc=None):
        # fget/fset/fdel: accessor callables; doc: docstring of the attribute.
        self.fget = fget
        self.fset = fset
        self.fdel = fdel
        self.__doc__ = doc

    def __get__(self, obj, objtype=None):
        print("in __get__")
        if obj is None:
            # Accessed on the class, not an instance: return the descriptor.
            return self
        if self.fget is None:
            raise AttributeError
        return self.fget(obj)

    def __set__(self, obj, value):
        print("in __set__")
        if self.fset is None:
            raise AttributeError
        self.fset(obj, value)

    def __delete__(self, obj):
        print("in __delete__")
        if self.fdel is None:
            raise AttributeError
        self.fdel(obj)

    def getter(self, fget):
        print("in getter")
        # Return a new descriptor with the getter replaced.
        return type(self)(fget, self.fset, self.fdel, self.__doc__)

    def setter(self, fset):
        print("in setter")
        # Return a new descriptor with the setter replaced.
        return type(self)(self.fget, fset, self.fdel, self.__doc__)

    def deleter(self, fdel):
        print("in deleter")
        # Return a new descriptor with the deleter replaced.
        return type(self)(self.fget, self.fset, fdel, self.__doc__)
## 11. Decorators in practice: a call-timeout decorator
# Aborts the decorated call with TimeoutException when it runs longer than
# the allowed number of seconds.  Relies on SIGALRM, so it is Unix-only and
# only works in the main thread -- TODO confirm that matches deployment.
import signal


class TimeoutException(Exception):
    def __init__(self, error='Timeout waiting for response from Cloud'):
        Exception.__init__(self, error)


def timeout_limit(timeout_time):
    """
    Decorator factory: limit the wrapped call to ``timeout_time`` seconds.

    :param timeout_time: timeout in whole seconds
    :raises TimeoutException: when the alarm fires before the call returns
    """

    def wraps(func):
        def handler(signum, frame):
            raise TimeoutException()

        def deco(*args, **kwargs):
            # Remember the previous SIGALRM handler so we can restore it.
            previous = signal.signal(signal.SIGALRM, handler)
            signal.alarm(timeout_time)
            try:
                # Fix: propagate the wrapped function's return value.
                return func(*args, **kwargs)
            finally:
                # Fix: always cancel the alarm and restore the previous
                # handler, even when func raises; the original left a
                # pending alarm armed on any exception.
                signal.alarm(0)
                signal.signal(signal.SIGALRM, previous)

        return deco

    return wraps
import os
import shutil  # NOTE(review): imported but unused in the visible demos -- confirm before removing

# NOTE(review): this module is named ``os.py``; imported by name it can
# shadow the stdlib ``os`` module on sys.path -- consider renaming the file.

# Fix: raw string -- "\Z", "\P", "\B" are invalid escape sequences in a
# normal string literal and raise DeprecationWarning on modern Pythons.
current_path = r"D:\ZX_workspace\Python\Basics\osRelated.py"

# 1. Absolute path of the current file
print(os.path.abspath(__file__))
print(os.path.abspath("osRelated1.py"))

# 2. Absolute path of the directory containing the current file
print(os.path.dirname(__file__))
print(os.path.dirname(os.path.abspath(__file__)))
# os.path.sep is the platform separator ("\\" on Windows)
print(os.path.abspath(os.path.dirname(__file__) + os.path.sep + "."))
print(os.path.abspath(os.path.dirname(current_path) + os.path.sep + "."))

# 3. Change the working directory -- takes a directory path, not a file path
print(os.chdir(os.path.dirname(current_path)))

# 4. Join a directory path with a file name
print(os.path.join(os.path.abspath(os.path.dirname(__file__)), "aa.txt"))

# 5. Markers such as ``.``, ``..`` and separators are defined in ntpath.py
print(os.path.pardir)
class BloomFilter(object):
    """Simple Bloom filter over a fixed-size bit vector with three hash seeds."""

    def __init__(self, bit_size):
        # Number of bits in the vector.
        self.bit_size = bit_size
        # Allocate the bit vector and clear every bit.
        self.bit_array = bitarray(bit_size)
        self.bit_array.setall(0)

    def add_data(self, url):
        """
        Add ``url`` to the filter: set the bit at each of its hash positions.
        :param url: the URL to record
        """
        for position in self.get_positions(url):
            self.bit_array[position] = 1

    def is_contained(self, url):
        """
        Check whether ``url`` may already be in the filter.

        Fix: the original accumulated ``result and bit`` over every position,
        so it returned a raw bit value (0/1) instead of a bool and never
        short-circuited.  ``all()`` stops at the first clear bit and always
        returns a bool.
        :param url: the URL to test
        :return: True when every hash position is set (may be a false positive)
        """
        return all(self.bit_array[position] for position in self.get_positions(url))

    def get_positions(self, url):
        """
        Hash ``url`` with three different seeds and map each digest into the
        bit vector via modulo ``bit_size``.

        Different seeds act as three independent hash functions, reducing
        collisions between distinct inputs.
        mmh3.hash(key, seed=0, signed=True); the seed randomises the hash --
        see https://stackoverflow.com/questions/9241230 for the seed's role.
        :param url: the value to hash
        :return: list of three bit positions
        """
        position_one = mmh3.hash(url, 60) % self.bit_size
        position_two = mmh3.hash(url, 61) % self.bit_size
        position_three = mmh3.hash(url, 62) % self.bit_size
        return [position_one, position_two, position_three]
# NOTE(review): this module is named ``zlib.py``; run as a script it can
# shadow the stdlib ``zlib`` module on sys.path -- consider renaming the file.

# zlib.compress works on the bytes of a whole string at once
def str_zlib():
    """Fetch a page and print sizes before/after zlib compression."""
    req = requests.get("http://python.jobbole.com/81513/")
    message = req.text
    bytes_message = str.encode(message)
    compressed = zlib.compress(bytes_message, zlib.Z_BEST_COMPRESSION)
    decompressed = zlib.decompress(compressed)
    print("original string:", len(message))
    print("original bytes:", len(bytes_message))
    print("compressed:", len(compressed))
    print("decompressed:", len(decompressed))


# zlib.compressobj compresses a stream, suitable for file transfer
def file_compress(beginFile, zlibFile, level):
    """
    Stream-compress ``beginFile`` into ``zlibFile``.

    Fix: the original never closed either file handle; ``with`` guarantees
    both are closed even when an error occurs mid-stream.
    :param beginFile: source file path
    :param zlibFile: destination (compressed) file path
    :param level: zlib compression level (0-9)
    """
    compressobj = zlib.compressobj(level)
    with open(beginFile, "rb") as infile, open(zlibFile, "wb") as zfile:
        data = infile.read(1024)  # read in 1 KiB chunks
        while data:
            zfile.write(compressobj.compress(data))
            data = infile.read(1024)
        # flush() yields any bytes still buffered inside the compressor
        zfile.write(compressobj.flush())


def file_decompress(zlibFile, endFile):
    """
    Stream-decompress ``zlibFile`` into ``endFile``.

    Fix: the original never closed either file handle.
    :param zlibFile: compressed source file path
    :param endFile: destination file path
    """
    decompressobj = zlib.decompressobj()
    with open(zlibFile, "rb") as src, open(endFile, "wb") as dst:
        data = src.read(1024)
        while data:
            dst.write(decompressobj.decompress(data))
            data = src.read(1024)
        dst.write(decompressobj.flush())
"./endFile.txt" 69 | file_decompress(zlibFile, endFile) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | 75 | 76 | -------------------------------------------------------------------------------- /blockchains/blockchain.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 模拟区块链 6 | -------------------------------------- 7 | @File : blockchain.py 8 | @Time : 2018/4/24 16:51 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import hashlib 17 | import json 18 | from time import time 19 | from urllib import parse 20 | from typing import Any, Dict, List, Optional 21 | from urllib.parse import urlparse 22 | from uuid import uuid4 23 | import requests 24 | from argparse import ArgumentParser 25 | 26 | # blockChain类用来管理链条,负责存储交易、加入新块等 27 | class blockChain(object): 28 | def __init__(self): 29 | self.chain = [] # 用于存储区块链 30 | self.current_transactions = [] # 用于存储交易记录 31 | self.nodes = set() # 用set来存储节点,避免重复添加节点(利用set的属性) 32 | 33 | # Create the genesis block 创建创世区块 34 | self.new_block(previous_hash = '1', proof = 100) 35 | 36 | 37 | # Creates a new block and adds it to the chain 生成新块并添加到区块链中 38 | # param proof: The proof given by the proof of work algorithm 39 | # param previous_hash: (Optional 可选) Hash of previous block 40 | # return: New block 41 | def new_block(self, proof, previous_hash: Optional[str]) -> Dict[str, Any]: 42 | # block 43 | block = { 44 | 'index': len(self.chain) + 1, 45 | 'timestamp': time(), 46 | 'transactions': self.current_transactions, 47 | 'proof': proof, 48 | 'previous_hash': previous_hash or self.hash(self.chain[-1]), 49 | } 50 | 51 | # Reset the current list of transactions 将交易加入到区块后重置交易记录。目的:在新的区块中打包记录新的交易情况,不能含有之前的交易记录 52 | self.current_transactions = 
[] 53 | 54 | # Add the new block to the list of chain 55 | self.chain.append(block) 56 | 57 | return block 58 | 59 | 60 | # Adds a new transaction to the list of transactions 61 | # 生成新的交易记录添加到交易列表中,新的交易记录将加入到下一个待挖的区块中,并返回该交易记录将被添加到的下一个待挖区块的索引 62 | # param sender: Address of the sender 发送者地址 63 | # param recipient: Address of the recipient 接收者地址 64 | # param amount: Amount 发送比特币的数量 65 | # return: The index of the block that will hold this transaction 66 | def new_transaction(self, sender: str, recipient: str, amount: int) -> int: 67 | self.current_transactions.append({ 68 | "sender": sender, 69 | "recipient": recipient, 70 | "amount": amount 71 | }) 72 | return self.last_block['index'] + 1 73 | 74 | 75 | # Hashes a block 生成块的SHA-256 hash值 76 | # param block: block 77 | # return: Hash 78 | @staticmethod # 静态方法 79 | def hash(block: Dict[str, Any]) -> str: 80 | block_string = json.dumps(block, sort_keys=True).encode() # 序列化列表为json字符串格式 81 | return hashlib.sha256(block_string).hexdigest() # hash.hexdigest()生成十六进制数据字符串值 82 | 83 | 84 | # Returns the last block in the chain 85 | @property # 将一个方法变成属性调用 86 | def last_block(self) -> Dict[str, Any]: 87 | return self.chain[-1] 88 | 89 | 90 | # 简单工作量证明: 寻找一个数proof,使得它与前一个区块的工作量证明值(last_proof)拼接成的字符串的 Hash 值(hash(last_proof, proof))以 4 个零开头。 91 | # param last_proof: 前一个区块的工作量证明值 92 | # return: proof 算力,工作量证明 93 | def proof_of_work(self, last_proof: int) -> int: 94 | proof = 0 95 | while self.valid_proof(last_proof, proof) is False: 96 | proof += 1 97 | return proof 98 | 99 | 100 | # 验证证明: hash(last_proof, proof)是否以四个零开头? 
101 | # param last_proof: Previous proof 前一个区块的工作量证明值 102 | # param proof: Current Proof 当前区块的工作量证明值 103 | # return: True is current, false if not 104 | @staticmethod 105 | def valid_proof(last_proof: int, proof: int) -> bool: 106 | guess = f'{last_proof}{proof}'.encode() 107 | guess_hash = hashlib.sha256(guess).hexdigest() 108 | return guess_hash[:4] == "0000" 109 | 110 | 111 | # 一致性(共识):注册节点,在多个节点中添加一个新的节点 112 | # param address: Address of node. Eg. 'http://192.168.0.5:5000' 113 | # return: None 114 | def register_node(self, address): 115 | parsed_url = urlparse(address) 116 | self.nodes.add(parsed_url.netloc) # 添加节点的地址和端口号,根据urlparse解析结果 117 | 118 | 119 | # Determine if a given blockchain is valid 检查是否是有效链,遍历每个块验证hash和proof 120 | # param chain: A blockchain 区块链列表 121 | # return: True if valid, false if not 122 | def valid_chain(self, chain: List[Dict[str, Any]]) -> bool: 123 | last_block = chain[0] 124 | current_index = 1 125 | 126 | while current_index < len(chain): 127 | block = chain[current_index] 128 | print(f'{last_block}') 129 | print(f'{block}') 130 | print("\n------------------\n") 131 | 132 | # Check that the hash of the block is correct 检测当前块是否正确可用。依据:当前块的previous_hash值是否等于前一个块的hash值 133 | if block['previous_hash'] != self.hash(last_block): 134 | return False 135 | 136 | # Check that the proof of work is correct 检测工作量证明是否正确 137 | if not self.valid_proof(last_block['proof'], block['proof']): 138 | return False 139 | 140 | last_block = block 141 | current_index += 1 142 | 143 | return True 144 | 145 | 146 | # 共识算发解决冲突,使用网络中最长的链 147 | # return: 如果链被取代返回True,否则返回False 148 | def resolve_conflicts(self) -> bool: 149 | # 获取所有临近节点,nodes包含网络中的所有节点内容 150 | neighbours = self.nodes 151 | new_chain = None 152 | 153 | # We're only looking for chains longer than ours 154 | max_length = len(self.chain) 155 | 156 | # Grab and verify the chains from all the nodes in our network 157 | # 遍历所有邻居节点,并验证链的有效性,如果发现有效更长链,则替换掉自己的链 158 | for node in neighbours: 159 | response = 
requests.get(f'http://{node}/chain') 160 | 161 | # 对于节点池nodes中可用的节点 162 | if response.status_code == 200: 163 | length = response.json()['length'] 164 | chain = response.json()['chain'] 165 | 166 | # Check if the length of the chain is longer and the chain is valid 167 | if length > max_length and self.valid_chain(chain): 168 | max_length = length 169 | new_chain = chain 170 | 171 | # Replace our chain if we discovered a new, valid chain longer than ours 172 | if new_chain: 173 | self.chain = new_chain 174 | return True 175 | 176 | return False 177 | 178 | 179 | # ---------------------------------------------------------------------------------- 180 | 181 | 182 | # Instantiate our node 创建节点 183 | app = Flask(__name__) 184 | 185 | # Generate a globally unique address for this node 186 | node_identifier = str(uuid4()).replace('-', '') 187 | 188 | # Instantiate our node 189 | blockChain = blockChain() 190 | 191 | # 创建/transactions/new POST接口,可以给接口发送交易数据. 创建一个交易并添加到区块 192 | @app.route("/transactions/new", methods = ["POST"]) 193 | def new_transaction(): 194 | # values = request.get_json() # 无交易数据返回 195 | 196 | # test 设置静态交易返回数据 197 | values = { 198 | 'sender': '123', 199 | 'recipient': '456', 200 | 'amount': 5 201 | } 202 | 203 | # 检查必填字段是否在POST中 204 | required = ['sender', 'recipient', 'amount'] 205 | if not all(k in values for k in required): 206 | return "Missing values", 400 207 | 208 | # 新建一笔交易 209 | index = blockChain.new_transaction(values['sender'], values['recipient'], values['amount']) 210 | 211 | response = {'message': f'Transaction will be added to block {index}'} 212 | return jsonify(response), 201 213 | 214 | 215 | # 创建/chain接口,返回整个区块链 216 | @app.route("/chain", methods = ["GET"]) 217 | def full_chain(): 218 | response = { 219 | 'chain': blockChain.chain, 220 | 'length': len(blockChain.chain) 221 | } 222 | return jsonify(response), 200 223 | 224 | 225 | # 创建/mine GET接口. 
告诉服务器去挖掘新的区块 226 | @app.route('/mine', methods = ['GET']) 227 | def mine(): 228 | ## 1. 运行工作证明算法以获得下一个证明,即计算工作量证明PoW。验证区块是否合格 229 | last_block = blockChain.last_block 230 | last_proof = last_block['proof'] 231 | proof = blockChain.proof_of_work(last_proof) 232 | 233 | ## 2. 系统给拥有工作量证明的节点提供奖励, 即挖到合格的区块,授予矿工比特币奖励 234 | # 发送者为"0"表明是新挖出的币 235 | blockChain.new_transaction( 236 | sender="0", 237 | recipient=node_identifier, 238 | amount=1, 239 | ) 240 | 241 | ## 3. 构造新区块并将其添加到区块链中 242 | block = blockChain.new_block(proof, None) 243 | response = { 244 | 'message': "New block forged", 245 | 'index': block['index'], 246 | 'transactions': block['transactions'], 247 | 'proof': block['proof'], 248 | 'previous_hash': block['previous_hash'], 249 | } 250 | return jsonify(response), 200 251 | 252 | 253 | # 添加路由/nodes/register POST接口,注册节点 254 | @app.route('/nodes/register', methods = ['POST']) 255 | def register_nodes(): 256 | print("begin register...") 257 | # values = request.get_json() 258 | # nodes = values.get('nodes') 259 | 260 | # test 设置假定的端口号为5001、5002 261 | nodes = ['http://192.168.2.111:5001', 262 | 'http://192.168.2.111:5000'] 263 | 264 | if nodes is None: 265 | return "Error: Please supply a valid list of nodes", 400 266 | 267 | for node in nodes: 268 | blockChain.register_node(node) 269 | 270 | response = { 271 | 'message': 'New nodes have been added', 272 | 'total_nodes': list(blockChain.nodes) 273 | } 274 | return jsonify(response), 201 275 | 276 | 277 | # 添加路由/nodes/resolve GET接口,解决冲突 278 | @app.route('/nodes/resolve', methods = ['GET']) 279 | def consensus(): 280 | print("begin resolve...") 281 | replaced = blockChain.resolve_conflicts() 282 | 283 | if replaced: 284 | response = { 285 | 'message': 'Our chain was replaced', 286 | 'new_chain': blockChain.chain, 287 | 'length': len(blockChain.chain) 288 | } 289 | else: 290 | response = { 291 | 'message': 'Our chain is authoritative', 292 | 'chain': blockChain.chain, 293 | 'length': len(blockChain.chain) 294 | } 295 | 
296 | return jsonify(response), 200 297 | 298 | 299 | if __name__ == "__main__": 300 | parser = ArgumentParser() 301 | parser.add_argument('-p', '--port', default=5000, type=int, help='port to listen on') 302 | args = parser.parse_args() 303 | port = args.port 304 | 305 | # 服务运行在端口5000上 306 | app.run(host='192.168.2.111', port=5000) 307 | -------------------------------------------------------------------------------- /blockchains/blockchain_node2.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 模拟区块链结点2 6 | -------------------------------------- 7 | @File : blockchain_node1.py 8 | @Time : 2018/4/24 16:51 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import hashlib 17 | import json 18 | from time import time 19 | from urllib import parse 20 | from typing import Any, Dict, List, Optional 21 | from urllib.parse import urlparse 22 | from uuid import uuid4 23 | from flask import Flask, jsonify, request 24 | import requests 25 | from argparse import ArgumentParser 26 | 27 | # blockChain类用来管理链条,负责存储交易、加入新块等 28 | class blockChain(object): 29 | def __init__(self): 30 | self.chain = [] # 用于存储区块链 31 | self.current_transactions = [] # 用于存储交易记录 32 | self.nodes = set() # 用set来存储节点,避免重复添加节点(利用set的属性) 33 | 34 | # Create the genesis block 创建创世区块 35 | self.new_block(previous_hash = '1', proof = 100) 36 | 37 | 38 | # Creates a new block and adds it to the chain 生成新块并添加到区块链中 39 | # param proof: The proof given by the proof of work algorithm 40 | # param previous_hash: (Optional 可选) Hash of previous block 41 | # return: New block 42 | def new_block(self, proof, previous_hash: Optional[str]) -> Dict[str, Any]: 43 | # block 44 | block = { 45 | 'index': len(self.chain) + 1, 46 | 'timestamp': 
time(), 47 | 'transactions': self.current_transactions, 48 | 'proof': proof, 49 | 'previous_hash': previous_hash or self.hash(self.chain[-1]), 50 | } 51 | 52 | # Reset the current list of transactions 将交易加入到区块后重置交易记录。目的:在新的区块中打包记录新的交易情况,不能含有之前的交易记录 53 | self.current_transactions = [] 54 | 55 | # Add the new block to the list of chain 56 | self.chain.append(block) 57 | 58 | return block 59 | 60 | 61 | # Adds a new transaction to the list of transactions 62 | # 生成新的交易记录添加到交易列表中,新的交易记录将加入到下一个待挖的区块中,并返回该交易记录将被添加到的下一个待挖区块的索引 63 | # param sender: Address of the sender 发送者地址 64 | # param recipient: Address of the recipient 接收者地址 65 | # param amount: Amount 发送比特币的数量 66 | # return: The index of the block that will hold this transaction 67 | def new_transaction(self, sender: str, recipient: str, amount: int) -> int: 68 | self.current_transactions.append({ 69 | "sender": sender, 70 | "recipient": recipient, 71 | "amount": amount 72 | }) 73 | return self.last_block['index'] + 1 74 | 75 | 76 | # Hashes a block 生成块的SHA-256 hash值 77 | # param block: block 78 | # return: Hash 79 | @staticmethod # 静态方法 80 | def hash(block: Dict[str, Any]) -> str: 81 | block_string = json.dumps(block, sort_keys=True).encode() # 序列化列表为json字符串格式 82 | return hashlib.sha256(block_string).hexdigest() # hash.hexdigest()生成十六进制数据字符串值 83 | 84 | 85 | # Returns the last block in the chain 86 | @property # 将一个方法变成属性调用 87 | def last_block(self) -> Dict[str, Any]: 88 | return self.chain[-1] 89 | 90 | 91 | # 简单工作量证明: 寻找一个数p,使得它与前一个区块的工作量证明值(proof)拼接成的字符串的 Hash 值(hash(last_proof, proof))以 4 个零开头。 92 | # param last_proof: 前一个区块的工作量证明值 93 | # return: proof 算力,工作量证明 94 | def proof_of_work(self, last_proof: int) -> int: 95 | proof = 0 96 | while self.valid_proof(last_proof, proof) is False: 97 | proof += 1 98 | return proof 99 | 100 | 101 | # 验证证明: hash(last_proof, proof)是否以四个零开头? 
102 | # param last_proof: Previous proof 前一个区块的工作量证明值 103 | # param proof: Current Proof 当前区块的工作量证明值 104 | # return: True is current, false if not 105 | @staticmethod 106 | def valid_proof(last_proof: int, proof: int) -> bool: 107 | guess = f'{last_proof}{proof}'.encode() 108 | guess_hash = hashlib.sha256(guess).hexdigest() 109 | return guess_hash[:4] == "0000" 110 | 111 | 112 | # 一致性(共识):注册节点,在多个节点中添加一个新的节点 113 | # param address: Address of node. Eg. 'http://192.168.0.5:5000' 114 | # return: None 115 | def register_node(self, address): 116 | parsed_url = urlparse(address) 117 | self.nodes.add(parsed_url.netloc) # 添加节点的地址和端口号,根据urlparse解析结果 118 | 119 | 120 | # Determine if a given blockchain is valid 检查是否是有效链,遍历每个块验证hash和proof 121 | # param chain: A blockchain 区块链列表 122 | # return: True if valid, false if not 123 | def valid_chain(self, chain: List[Dict[str, Any]]) -> bool: 124 | last_block = chain[0] 125 | current_index = 1 126 | 127 | while current_index < len(chain): 128 | block = chain[current_index] 129 | print(f'{last_block}') 130 | print(f'{block}') 131 | print("\n------------------\n") 132 | 133 | # Check that the hash of the block is correct 检测当前块是否正确可用。依据:当前块的previous_hash值是否等于前一个块的hash值 134 | if block['previous_hash'] != self.hash(last_block): 135 | return False 136 | 137 | # Check that the proof of work is correct 检测工作量证明是否正确 138 | if not self.valid_proof(last_block['proof'], block['proof']): 139 | return False 140 | 141 | last_block = block 142 | current_index += 1 143 | 144 | return True 145 | 146 | 147 | # 共识算发解决冲突,使用网络中最长的链 148 | # return: 如果链被取代返回True,否则返回False 149 | def resolve_conflicts(self) -> bool: 150 | # 获取所有临近节点,nodes包含网络中的所有节点内容 151 | neighbours = self.nodes 152 | new_chain = None 153 | 154 | # We're only looking for chains longer than ours 155 | max_length = len(self.chain) 156 | 157 | # Grab and verify the chains from all the nodes in our network 158 | # 遍历所有邻居节点,并验证链的有效性,如果发现有效更长链,则替换掉自己的链 159 | for node in neighbours: 160 | response = 
requests.get(f'http://{node}/chain') 161 | 162 | # 对于节点池nodes中可用的节点 163 | if response.status_code == 200: 164 | length = response.json()['length'] 165 | chain = response.json()['chain'] 166 | 167 | # Check if the length of the chain is longer and the chain is valid 168 | if length > max_length and self.valid_chain(chain): 169 | max_length = length 170 | new_chain = chain 171 | 172 | # Replace our chain if we discovered a new, valid chain longer than ours 173 | if new_chain: 174 | self.chain = new_chain 175 | return True 176 | 177 | return False 178 | 179 | 180 | # ---------------------------------------------------------------------------------- 181 | 182 | 183 | # Instantiate our node 创建节点 184 | app = Flask(__name__) 185 | 186 | # Generate a globally unique address for this node 187 | node_identifier = str(uuid4()).replace('-', '') 188 | 189 | # Instantiate our node 190 | blockChain = blockChain() 191 | 192 | # 创建/transactions/new POST接口,可以给接口发送交易数据. 创建一个交易并添加到区块 193 | @app.route("/transactions/new", methods = ["POST"]) 194 | def new_transaction(): 195 | # values = request.get_json() # 无交易数据返回 196 | 197 | # test 设置静态交易返回数据 198 | values = { 199 | 'sender': '123', 200 | 'recipient': '456', 201 | 'amount': 5 202 | } 203 | 204 | # 检查必填字段是否在POST中 205 | required = ['sender', 'recipient', 'amount'] 206 | if not all(k in values for k in required): 207 | return "Missing values", 400 208 | 209 | # 新建一笔交易 210 | index = blockChain.new_transaction(values['sender'], values['recipient'], values['amount']) 211 | 212 | response = {'message': f'Transaction will be added to block {index}'} 213 | return jsonify(response), 201 214 | 215 | 216 | # 创建/chain接口,返回整个区块链 217 | @app.route("/chain", methods = ["GET"]) 218 | def full_chain(): 219 | response = { 220 | 'chain': blockChain.chain, 221 | 'length': len(blockChain.chain) 222 | } 223 | return jsonify(response), 200 224 | 225 | 226 | # 创建/mine GET接口. 
告诉服务器去挖掘新的区块 227 | @app.route('/mine', methods = ['GET']) 228 | def mine(): 229 | ## 1. 运行工作证明算法以获得下一个证明,即计算工作量证明PoW。验证区块是否合格 230 | last_block = blockChain.last_block 231 | last_proof = last_block['proof'] 232 | proof = blockChain.proof_of_work(last_proof) 233 | 234 | ## 2. 系统给拥有工作量证明的节点提供奖励, 即挖到合格的区块,授予矿工比特币奖励 235 | # 发送者为"0"表明是新挖出的币 236 | blockChain.new_transaction( 237 | sender="0", 238 | recipient=node_identifier, 239 | amount=1, 240 | ) 241 | 242 | ## 3. 构造新区块并将其添加到区块链中 243 | block = blockChain.new_block(proof, None) 244 | response = { 245 | 'message': "New block forged", 246 | 'index': block['index'], 247 | 'transactions': block['transactions'], 248 | 'proof': block['proof'], 249 | 'previous_hash': block['previous_hash'], 250 | } 251 | return jsonify(response), 200 252 | 253 | 254 | # 添加路由/nodes/register POST接口,注册节点 255 | @app.route('/nodes/register', methods = ['POST']) 256 | def register_nodes(): 257 | print("begin register...") 258 | # values = request.get_json() 259 | # nodes = values.get('nodes') # 无返回结果 260 | 261 | # test 设置假定的端口号为5021、5022;假定端口号第三位的2对应此文件node2 262 | nodes = ['http://192.168.2.111:5001', 263 | 'http://192.168.2.111:5002', 264 | 'http://192.168.2.111:5003'] 265 | 266 | if nodes is None: 267 | return "Error: Please supply a valid list of nodes", 400 268 | 269 | for node in nodes: 270 | blockChain.register_node(node) 271 | 272 | response = { 273 | 'message': 'New nodes have been added', 274 | 'total_nodes': list(blockChain.nodes) 275 | } 276 | return jsonify(response), 201 277 | 278 | 279 | # 添加路由/nodes/resolve GET接口,解决冲突 280 | @app.route('/nodes/resolve', methods = ['GET']) 281 | def consensus(): 282 | print("begin resolve...") 283 | replaced = blockChain.resolve_conflicts() 284 | 285 | if replaced: 286 | response = { 287 | 'message': 'Our chain was replaced', 288 | 'new_chain': blockChain.chain, 289 | 'length': len(blockChain.chain) 290 | } 291 | else: 292 | response = { 293 | 'message': 'Our chain is authoritative', 294 | 'chain': 
blockChain.chain, 295 | 'length': len(blockChain.chain) 296 | } 297 | 298 | return jsonify(response), 200 299 | 300 | 301 | if __name__ == "__main__": 302 | parser = ArgumentParser() 303 | parser.add_argument('-p', '--port', default=5000, type=int, help='port to listen on') 304 | args = parser.parse_args() 305 | port = args.port 306 | 307 | # 服务运行在端口5002上 308 | app.run(host='192.168.2.111', port=5003) 309 | -------------------------------------------------------------------------------- /blockchains/待解决问题.txt: -------------------------------------------------------------------------------- 1 | 0. 解决已有问题 2 | 问题:request没有内容 3 | 方法:设置静态的返回内容(解决方式暂定) 4 | 5 | 包含技术点: 6 | 1. @staticmethod @property 7 | 2. hash.hexdigest() hash.sha256 8 | 3. json.dumps 9 | 4. uuid4() 10 | 5. flask框架 11 | 12 | set()属性存储节点 13 | 14 | from typing import Any, Dict, List, Optional 15 | new_block(self, proof, previous_hash: Optional[str]) -> Dict[str, Any] 16 | def new_transaction(self, sender: str, recipient: str, amount: int) -> int 17 | 18 | guess = f'{last_proof}{proof}'.encode() f'{}{}' 19 | 20 | urlparse 21 | 22 | jsonify(response) 23 | 24 | 400 201 200 25 | 26 | node_identifier = str(uuid4()).replace('-', '') 27 | -------------------------------------------------------------------------------- /blockchains/模拟挖矿过程.txt: -------------------------------------------------------------------------------- 1 | 2 | 模拟挖矿的整体过程: 3 | 1. 
访问链接 http://192.168.2.111:5000/chain,查看创世区块的主要内容,如下: 4 | 其中,包括创世区块的索引为1,设定的hash、proof,时间戳,为空的交易记录。 5 | { 6 | "chain": [ 7 | { 8 | "index": 1, 9 | "previous_hash": "1", 10 | "proof": 100, 11 | "timestamp": 1524207612.618698, 12 | "transactions": [] 13 | } 14 | ], 15 | "length": 1 16 | } 17 | 18 | 2,访问链接 http://192.168.2.111:5000/mine,查看新建区块的主要内容,如下: 19 | 其中,包括新建区块的索引为2,新加入的区块消息"New block forged",上一个区块的hash值,计算出当前区块的算力(工作量证明)proof=35293, 20 | 交易记录为transactions,由于只是模拟新建区块的情况,没有添加具体的交易内容,只有完成挖矿对矿工的奖励交易信息,即sender=0的交易记录。 21 | { 22 | "index": 2, 23 | "previous_hash": "b3bdbdb4b95b3f7a96b17bf3843af0ac22ebabc993490b594ccf10d82fa90b2e", 24 | "proof": 35293, 25 | "timestamp": 1524207627.0737095, 26 | "transactions": [ 27 | { 28 | "amount": 1, 29 | "recipient": "fc30a002579d4c159538353a9a267d97", 30 | "sender": "0" 31 | } 32 | ] 33 | } 34 | 35 | 3. 再次访问链接 http://192.168.2.111:5000/chain,再次查看区块链的主要内容,结果如下: 36 | 可以看到,此时的区块链已经包含创世区块(index=1)以及刚添加的区块(index=2),区块链的长度"length": 2 37 | { 38 | "chain": [ 39 | { 40 | "index": 1, 41 | "previous_hash": "1", 42 | "proof": 100, 43 | "timestamp": 1524207612.618698, 44 | "transactions": [] 45 | }, 46 | { 47 | "index": 2, 48 | "previous_hash": "b3bdbdb4b95b3f7a96b17bf3843af0ac22ebabc993490b594ccf10d82fa90b2e", 49 | "proof": 35293, 50 | "timestamp": 1524207627.0737095, 51 | "transactions": [ 52 | { 53 | "amount": 1, 54 | "recipient": "fc30a002579d4c159538353a9a267d97", 55 | "sender": "0" 56 | } 57 | ] 58 | } 59 | ], 60 | "length": 2 61 | } 62 | 63 | 4. 访问链接 http://192.168.2.111:5000/transactions/new,即新建交易,结果如下: 64 | { 65 | "message": "Transaction will be added to block 3" 66 | } 67 | 即新建的交易信息会被添加到block 3 的区块中包装。 68 | 69 | 5. 继续访问链接 http://192.168.2.111:5000/transactions/new,再次新建交易,结果如下: 70 | { 71 | "message": "Transaction will be added to block 3" 72 | } 73 | 即新建的不同的交易信息,会被再次添加到block 3 的区块中包装。 74 | 75 | 6. 
访问链接 http://192.168.2.111:5000/mine,将第四步、第五步生成的交易信息添加到区块中,即block 3 中。结果如下: 76 | 其中,包括新建区块的索引为3,新加入的区块消息"New block forged"。上一个区块的hash值,计算出当前区块的算力(工作量证明)proof=35089, 77 | 此时的交易记录中,包含第四步、第五步生成的交易信息,即transactions中的前两个交易记录。 78 | 最后也包含系统对矿工的奖励交易信息,即sender=0交易记录。 79 | { 80 | "index": 3, 81 | "message": "New block forged", 82 | "previous_hash": "cc8e7dfd9c448a401468384a83667669bcd8f654722882cd0106c26de5c96fbe", 83 | "proof": 35089, 84 | "transactions": [ 85 | { 86 | "amount": 5, 87 | "recipient": "456", 88 | "sender": "123" 89 | }, 90 | { 91 | "amount": 5, 92 | "recipient": "456", 93 | "sender": "123" 94 | }, 95 | { 96 | "amount": 1, 97 | "recipient": "fc30a002579d4c159538353a9a267d97", 98 | "sender": "0" 99 | } 100 | ] 101 | } 102 | 103 | 7. 访问链接 http://192.168.2.111:5000/chain,查看此时的区块链的内容,结果如下: 104 | 可以看到,经过创建创世区块(index=1)、添加无交易信息的空区块(index=2)、添加两个交易信息的区块(index=3),整个区块链的长度达到3. 105 | { 106 | "chain": [ 107 | { 108 | "index": 1, 109 | "previous_hash": "1", 110 | "proof": 100, 111 | "timestamp": 1524207612.618698, 112 | "transactions": [] 113 | }, 114 | { 115 | "index": 2, 116 | "previous_hash": "b3bdbdb4b95b3f7a96b17bf3843af0ac22ebabc993490b594ccf10d82fa90b2e", 117 | "proof": 35293, 118 | "timestamp": 1524207627.0737095, 119 | "transactions": [ 120 | { 121 | "amount": 1, 122 | "recipient": "fc30a002579d4c159538353a9a267d97", 123 | "sender": "0" 124 | } 125 | ] 126 | }, 127 | { 128 | "index": 3, 129 | "previous_hash": "cc8e7dfd9c448a401468384a83667669bcd8f654722882cd0106c26de5c96fbe", 130 | "proof": 35089, 131 | "timestamp": 1524207930.704413, 132 | "transactions": [ 133 | { 134 | "amount": 5, 135 | "recipient": "456", 136 | "sender": "123" 137 | }, 138 | { 139 | "amount": 5, 140 | "recipient": "456", 141 | "sender": "123" 142 | }, 143 | { 144 | "amount": 1, 145 | "recipient": "fc30a002579d4c159538353a9a267d97", 146 | "sender": "0" 147 | } 148 | ] 149 | } 150 | ], 151 | "length": 3 152 | } 153 | 154 | # 添加路由解决冲突问题 155 | 8. 
在三个文件(blockchain.py, blockchain_node1.py, blockchain_node2.py)中,设定对应的节点及其端口号。 156 | 在pycharm中,依次运行三个文件,结果如下: 157 | * Running on http://192.168.2.111:5000/ (Press CTRL+C to quit) 158 | * Running on http://192.168.2.111:5001/ (Press CTRL+C to quit) 159 | * Running on http://192.168.2.111:5002/ (Press CTRL+C to quit) 160 | 161 | 9. 在Postman中依次打开三个窗口,模拟三个节点的发送请求。 162 | http://192.168.2.111:5000/chain GET方式; 163 | http://192.168.2.111:5001/chain GET方式; 164 | http://192.168.2.111:5002/chain GET方式; 165 | 166 | 10. 在第一个窗口(5001)中,模拟挖出三个区块,即 http://192.168.2.111:5001/mine GET方式 发送三次请求。 167 | 在访问http://192.168.2.111:5001/chain,得到结果如下: 168 | { 169 | "chain": [ 170 | { 171 | "index": 1, 172 | "previous_hash": "1", 173 | "proof": 100, 174 | "timestamp": 1524636695.0677617, 175 | "transactions": [] 176 | }, 177 | { 178 | "index": 2, 179 | "previous_hash": "413186ba30bc4706a94212b8837afc9a06064eabeb8d999d53f74b0280c9949a", 180 | "proof": 35293, 181 | "timestamp": 1524636713.2794747, 182 | "transactions": [ 183 | { 184 | "amount": 1, 185 | "recipient": "db044e87361d42d6bb36a118da23cc4a", 186 | "sender": "0" 187 | } 188 | ] 189 | }, 190 | { 191 | "index": 3, 192 | "previous_hash": "5adc8d4cef47df9969d0d62a99290e5b0779b57b8c3582031974bf102b422f2c", 193 | "proof": 35089, 194 | "timestamp": 1524636714.5593677, 195 | "transactions": [ 196 | { 197 | "amount": 1, 198 | "recipient": "db044e87361d42d6bb36a118da23cc4a", 199 | "sender": "0" 200 | } 201 | ] 202 | }, 203 | { 204 | "index": 4, 205 | "previous_hash": "3b552f1fbc39f5cd8a8327d41351d0c083211e7943b2f8ef97b6155654912898", 206 | "proof": 119678, 207 | "timestamp": 1524636715.4139643, 208 | "transactions": [ 209 | { 210 | "amount": 1, 211 | "recipient": "db044e87361d42d6bb36a118da23cc4a", 212 | "sender": "0" 213 | } 214 | ] 215 | } 216 | ], 217 | "length": 4 218 | } 219 | 220 | 11. 
在第二个窗口(5002)中,模拟挖出两个区块,即 http://192.168.2.111:5002/mine GET方式 发送两次请求。 221 | 在访问http://192.168.2.111:5002/chain,得到结果如下: 222 | { 223 | "chain": [ 224 | { 225 | "index": 1, 226 | "previous_hash": "1", 227 | "proof": 100, 228 | "timestamp": 1524636760.722611, 229 | "transactions": [] 230 | }, 231 | { 232 | "index": 2, 233 | "previous_hash": "acb63047662a741fda3195fc765b6d2fb5c124e5c4e90172ec9b437a148969bf", 234 | "proof": 35293, 235 | "timestamp": 1524636768.9283345, 236 | "transactions": [ 237 | { 238 | "amount": 1, 239 | "recipient": "6cbb4b7f8738471b9c33b0db0bddf8d8", 240 | "sender": "0" 241 | } 242 | ] 243 | }, 244 | { 245 | "index": 3, 246 | "previous_hash": "d14b77ee468e18f30da685bf7e8caa9ff201b20bd791d6174d7eaba898f59013", 247 | "proof": 35089, 248 | "timestamp": 1524636769.9380262, 249 | "transactions": [ 250 | { 251 | "amount": 1, 252 | "recipient": "6cbb4b7f8738471b9c33b0db0bddf8d8", 253 | "sender": "0" 254 | } 255 | ] 256 | } 257 | ], 258 | "length": 3 259 | } 260 | 261 | 12. 在第三个窗口(5003)中,模拟挖出一个区块,即 http://192.168.2.111:5003/mine GET方式 发送一次请求。 262 | 在访问http://192.168.2.111:5003/chain,得到结果如下: 263 | { 264 | "chain": [ 265 | { 266 | "index": 1, 267 | "previous_hash": "1", 268 | "proof": 100, 269 | "timestamp": 1524636812.4287012, 270 | "transactions": [] 271 | }, 272 | { 273 | "index": 2, 274 | "previous_hash": "257905fe9635c5c301ab08e43a6fb7d35302714d2632ab544731fa68556dac1e", 275 | "proof": 35293, 276 | "timestamp": 1524636819.811855, 277 | "transactions": [ 278 | { 279 | "amount": 1, 280 | "recipient": "5743e8ae7f744261a5d394b67d9b528e", 281 | "sender": "0" 282 | } 283 | ] 284 | } 285 | ], 286 | "length": 2 287 | } 288 | 289 | 13. 
此时,5001端口挖到三个区块,5002端口挖到两个区块,5003端口挖到一个区块。 290 | 即节点5001对应的区块链是最长的链,节点5002对应的区块链次之,节点5003对应的区块链最短。 291 | 将三个节点进行注册,添加到节点nodes = set()中。 292 | 在第一个窗口(5001),访问连接 http://192.168.2.111:5001/nodes/register POST方式,注册当前的节点,结果如下: 293 | { 294 | "message": "New nodes have been added", 295 | "total_nodes": [ 296 | "192.168.2.111:5003", 297 | "192.168.2.111:5002", 298 | "192.168.2.111:5001" 299 | ] 300 | } 301 | 在第二个窗口(5002),访问连接 http://192.168.2.111:5002/nodes/register POST方式,注册当前的节点,结果如下: 302 | { 303 | "message": "New nodes have been added", 304 | "total_nodes": [ 305 | "192.168.2.111:5001", 306 | "192.168.2.111:5002", 307 | "192.168.2.111:5003" 308 | ] 309 | } 310 | 在第三个窗口(5003),访问连接 http://192.168.2.111:5003/nodes/register POST方式,注册当前的节点,结果如下: 311 | { 312 | "message": "New nodes have been added", 313 | "total_nodes": [ 314 | "192.168.2.111:5003", 315 | "192.168.2.111:5001", 316 | "192.168.2.111:5002" 317 | ] 318 | } 319 | 320 | 14. 验证共识算发,用最长的区块链替换短的链,操作如下: 321 | 在第二个窗口(5002),访问连接 http://192.168.2.111:5002/nodes/resolve GET方式,检查当前的节点,结果如下: 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 注: 331 | 1. 在交易的模拟过程中,设定的交易信息静态消息,主要内容为: 332 | { 333 | "amount": 5, 334 | "recipient": "456", 335 | "sender": "123" 336 | } 337 | 包括:交易的数量、发送方、接收方。 338 | 所以在区块链的区块中,交易内容相同。 339 | 在request.get_json()中,无接受数据,此点尚且作为遗留问题。 340 | 341 | 2. 
在模拟挖矿的过程中,前一个区块的工作量证明last_proof,与本区块的工作量证明proof,共同组成的字符串的hash值hash(last_proof proof), 342 | 以其是否以四个零开头,作为是否为合格区块的标志。即valid_proof()函数的作用。 343 | 此处慎重调整开头零的个数,当添加两个零,即判定是否以六个零作为开头时,计算的工作量证明为49259370(四个零开头为35089),工作量提升了1403.84倍。 344 | 345 | -------------------------------------------------------------------------------- /data_process/db_operation.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 数据库相关操作 6 | -------------------------------------- 7 | @File : db_operation.py 8 | @Time : 2018/8/25 12:28 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import re 17 | import traceback 18 | import json 19 | from pymongo import MongoClient 20 | import pymysql 21 | import csv 22 | import cx_Oracle 23 | 24 | class trans(): 25 | def __init__(self): 26 | self.jsonPath = "json2mongo.json" 27 | self.mysqlPath = "csv2mysql.csv" 28 | self.oraclePath = "csv2oracle.csv" 29 | self.mongodb_localhost = "mongodb://localhost:27017" # 修改的host为locaohost,或具体的连接地址(在MongoDB安装目录bin下,输入cmd,输入mongo查看具体的连接信息) 30 | self.mysql_localhost = pymysql.connect( 31 | host = 'localhost', 32 | port = 3306, 33 | user = 'root', 34 | passwd = '123456789', 35 | db = 'demo', 36 | charset = 'utf8' # utf8 not utf-8 37 | ) 38 | self.oracle_localhost = cx_Oracle.connect('scott/123456789@localhost:1521/orcl') # 链接信息:localhost:1521/orcl,在数据库中右键属性,查看链接详细信息 39 | 40 | def json2mongodb(self): 41 | print("begin process json...") 42 | try: 43 | # 1. 连接MongoDB 44 | conn = MongoClient(self.mongodb_localhost) 45 | db = conn.demo # 连接数据库demo,没有自动创建 46 | demo_json = db.demo_json # 使用demo_json集合,没有自动创建 47 | 48 | # 2. 插入数据 49 | # demo_json.insert([{"name":"lxj", "age":"18"}, {"sex":"man"}]) 50 | 51 | # 3. 
查找数据 52 | # one_json = demo_json.find_one({"name":"lxj"}) 53 | 54 | # obj = demo_json.find_one() 55 | # obj_id = obj["_id"] # ObjectId类型,直接根据ObjectId用于定向查找 56 | # print(demo_json.find_one({"_id": obj_id})) 57 | 58 | # 4. 修改数据 59 | # demo_json.update_one({"name":"lxj"}, {"$set":{"age":"20"}}) 60 | 61 | # 5. 遍历数据 62 | # print(db.demo_json.count()) 63 | # for i in demo_json.find(): 64 | # print(i) 65 | 66 | # 6. 删除数据 67 | # db.demo_json.remove() # 全部删除 68 | 69 | # 7. 插入json文件 70 | # with open(self.jsonPath, "r", encoding="utf-8") as f: 71 | # jsonFile = json.load(f) 72 | # demo_json.insert(jsonFile) 73 | except: 74 | traceback.print_exception 75 | 76 | def csv2mysql(self): 77 | print("begin mysql...") 78 | try: 79 | # 1. 连接MySQL 80 | conn = self.mysql_localhost 81 | cursor = conn.cursor() 82 | # findall_sql = "select * from test" 83 | # cursor.execute(findall_sql) 84 | 85 | # 2. 查看数据 86 | # all_row = cursor.fetchall() 87 | # print(all_row) 88 | 89 | # 3. 插入数据 90 | # insert_sql = "insert into test values ('2','zz','7')" 91 | # cursor.execute(insert_sql) 92 | 93 | # 4. 修改数据 94 | # update_sql = "update test set age = '33' where name = 'aaa'" 95 | # cursor.execute(update_sql) 96 | 97 | # 5. 删除数据 98 | # delete_sql = "delete from test where age<10" 99 | # cursor.execute(delete_sql) 100 | 101 | # 6. 
新建表 102 | # header = ['id', '主题', '用户ID', '用户名', '推荐力度', '评论时间', '评论标题', '评论内容'] 103 | # createTable_sqll = """ 104 | # CREATE TABLE IF NOT EXISTS`testtest` ( 105 | # `%s` INT PRIMARY KEY AUTO_INCREMENT NOT NULL, 106 | # `%s` varchar(128) DEFAULT NULL, 107 | # `%s` varchar(128) DEFAULT NULL, 108 | # `%s` varchar(128) DEFAULT NULL, 109 | # `%s` varchar(30) DEFAULT NULL, 110 | # `%s` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 111 | # `%s` varchar(500) DEFAULT NULL, 112 | # `%s` varchar(65533) DEFAULT NULL 113 | # ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 114 | # """ % (header[0], header[1], header[2], header[3], header[4], header[5], header[6], header[7]) 115 | # cursor.execute(createTable_sqll) 116 | 117 | # 7. 写入csv文件--采用csv方式 118 | # utf-8_sig编码,去掉多余字符BOM(打开utf-8文件时开头的一个多余字符,用来声明编码信息) 119 | with open(self.mysqlPath, "r", encoding="utf-8_sig") as f: 120 | csv_reader = csv.reader(f) 121 | headers = next(csv_reader) 122 | headers[0] = "id" # 当列名为空时替换 123 | 124 | # 0. 遍历csv文件中的数据 125 | # for rows in csv_reader: 126 | # print(rows) 127 | 128 | # 1. 新建表 129 | createTable_sql = """ 130 | CREATE TABLE IF NOT EXISTS`test2` ( 131 | `%s` INT PRIMARY KEY AUTO_INCREMENT NOT NULL, 132 | `%s` varchar(128) DEFAULT NULL, 133 | `%s` varchar(128) DEFAULT NULL, 134 | `%s` varchar(128) DEFAULT NULL, 135 | `%s` varchar(30) DEFAULT NULL, 136 | `%s` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 137 | `%s` varchar(500) DEFAULT NULL, 138 | `%s` text(65533) DEFAULT NULL, 139 | `%s` varchar(30) DEFAULT NULL, 140 | `%s` varchar(30) DEFAULT NULL, 141 | `%s` varchar(50) DEFAULT NULL 142 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 143 | """ % (headers[0], headers[1], headers[2], headers[3], headers[4], headers[5], headers[6], headers[7], headers[8], headers[9], headers[10]) 144 | # cursor.execute(createTable_sql) 145 | 146 | # 2. 
插入数据--注意数据清洗 147 | for i,rows in enumerate(csv_reader): # enumerate为python内置函数,用于既要遍历索引又要遍历元素 148 | 149 | # 涉及数据清洗,对存入数据库的数据清洁度的要求较高,双引号影响数据插入 150 | name = rows[3] 151 | comment = rows[7] 152 | 153 | # 数据清洗 154 | name = name.replace("\'", "") 155 | comment = comment.replace("\"", "").replace(".,", ".").replace(",,", ",").replace("..,", "..").replace(":,", ":") 156 | 157 | insert_sql = """INSERT INTO test2 VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )""" % (rows[0], "'" + rows[1] + "'", "'" + rows[2] + "'", "'" + name + "'", "'" + rows[4] + "'", "'" + rows[5] + "'", "'" + rows[6] + "'", "'" + comment + "'", "'" + rows[8] + "'", "'" + rows[9] + "'", "'" + rows[10] + "'") 158 | # cursor.execute(insert_sql) # 插入数据 159 | 160 | conn.commit() # 提交 161 | cursor.close() 162 | conn.close() 163 | except: 164 | conn.rollback() # 发生错误则全部回滚 165 | traceback.print_exception 166 | 167 | def csv2oracle(self): 168 | print("connect to oracle...") 169 | try: 170 | # 1. 链接Oracle数据库 171 | conn = self.oracle_localhost 172 | cursor = conn.cursor() 173 | 174 | # 2. 查询数据 175 | sql = "select * from EMP" 176 | cursor.execute(sql) 177 | allData = cursor.fetchall() # cursor.fetchone() 178 | for data in allData: 179 | print(data) 180 | 181 | # # 3. 
插入、更新、删除 主要区别在于sql不同 182 | # def sqlDML(sql, conn): 183 | # cursor = conn.cursor() 184 | # cursor.execute(sql) 185 | # cursor.close() 186 | # conn.commit() 187 | 188 | conn.commit() 189 | cursor.close() 190 | conn.close() 191 | except: 192 | conn.rollback() 193 | traceback.print_exception 194 | 195 | trans = trans() 196 | # trans.json2mongodb() 197 | # trans.csv2mysql() 198 | # trans.csv2oracle() 199 | -------------------------------------------------------------------------------- /data_process/divide_words.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : HanLP, baiduNLP, jieba分词对比 6 | -------------------------------------- 7 | @File : divide_words.py 8 | @Time : 2018/8/25 22:10 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | # import hanlp 17 | from aip import AipNlp 18 | from jieba import * 19 | 20 | class baidu_nlp: 21 | def __init__(self): 22 | self.APP_ID = "020e0df2b55441d9b90861ea2b457ddf" 23 | self.API_KEY = "51fa55f6feb94a0fb7d4de49f111d6c2" 24 | self.SECRET_KEY = "129ba31afdaa439da5cf9ab0cd07d8f4" 25 | self.client = AipNlp(self.APP_ID, self.API_KEY, self.SECRET_KEY) 26 | 27 | def cifa(self, text): 28 | cifa = self.client.lexer(text) 29 | print(cifa) 30 | 31 | class jieba: 32 | pass 33 | 34 | 35 | if __name__ == '__main__': 36 | text = "你好,欢迎在Python中使用百度NLP" 37 | baidu_nlp = baidu_nlp() 38 | baidu_nlp.cifa(text) 39 | 40 | # print(HanLP.segment("你好,欢迎在Python中调用HanLP的API")) 41 | # 42 | # testCase = [ 43 | # "商品和服务", 44 | # "结婚的和尚未结婚的确实在干扰分词啊", 45 | # "买水果然后来世博园最后去世博会", 46 | # "中国的首都是北京", 47 | # "欢迎新老师生前来就餐", 48 | # "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 49 | # "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 50 | # for sentence in testCase: 51 | # 
print(HanLP.segment(sentence)) 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /data_process/pandas_operation.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/8/27 20:20 8 | @File : pandas_operation.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import pandas as pd 16 | import numpy as np 17 | from copy import deepcopy 18 | 19 | ## 1. read_cvs 20 | # 当读取的数据量很大时,请尝试添加这个参数:nrows 21 | pd.read_csv(nrows=5) 22 | # dtype 声明列的类型 23 | df = pd.DataFrame(dtype={'col1': str, 'c2': int}) 24 | 25 | 26 | ## 2. select_dtypes 在读取表之后,每个列的默认数据类型可以是bool、int64、float64、object、category、timedelta64或datetime64。 27 | df.select_dtypes(include=['float64', 'int64']) 28 | 29 | 30 | ## 3. copy 复制 df 31 | df1 = pd.DataFrame({'a': [0, 0, 0], 'b': [1, 1, 1]}) 32 | df2 = df1 33 | df2['a'] = df2['a'] + 1 34 | df1.head() 35 | 36 | # df2 = df1不是复制df1并将其赋值给df2,而是设置一个指向df1的指针。所以df2的任何变化都会导致df1的变化 37 | df2 = df1.copy() 38 | # 或者 39 | df3 = deepcopy(df1) 40 | 41 | 42 | ## 4. map 43 | # 数据转换。keys 是旧值,values 是新值 44 | level_map = {1: 'high', 2: 'medium', 3: 'low'} 45 | df['c_level'] = df['c'].map(level_map) 46 | 47 | 48 | ## 5. apply 49 | # 创建一个新列,其中包含其他列内容作为输入 50 | # 缺点:速度慢 51 | def rule(x, y): 52 | if x == 'high' and y > 10: 53 | return 1 54 | else: 55 | return 0 56 | 57 | df = pd.DataFrame({'c1': ['high', 'high', 'low', 'low'], 'c2': [0, 23, 17, 4]}) 58 | df['new'] = df.apply(lambda x: rule(x['c1'], x['c2']), axis=1) 59 | df.show() 60 | 61 | 62 | ## 6. 
value_counts 查看值分布 63 | df['col1'].value_counts() 64 | # normalize = True:如果你想查看频率而不是计数。 65 | # dropna = False:如果你还想在统计中包含缺失值。 66 | # sort = False:按值而不是按计数排序的统计结果。 67 | # df['c'].value_counts().reset_index():如果你想将stats表转换为pandas dataframe并对其进行操作。 68 | 69 | 70 | ## 7. 缺失值数量 71 | # .isnull() 和 .sum() 来计算指定列中缺失值的数量。 72 | df = pd.DataFrame({'id': [1, 2, 3], 'c1': [0, 0, np.nan], 'c2': [np.nan, 1, 1]}) 73 | df = df[['id', 'c1', 'c2']] 74 | df['num_nulls'] = df[['c1', 'c2']].isnull().sum(axis=1) 75 | df.head() 76 | 77 | 78 | ## 8. 选择特定多个 ID 的行 79 | df_filter = df['ID'].isin(['A001', 'C022']) 80 | print(df[df_filter]) 81 | 82 | 83 | ## 9. 百分位组 将一列的值分类为几组。 84 | # 比如前5%的值分为组1,5-20%的值分为组2,20-50%的值分为组3,底部50%的值分为组4 85 | cut_points = [np.percentile(df['col'], i) for i in [50, 80, 95]] 86 | df['group'] = 1 87 | for i in range(3): 88 | df['group'] = df['group'] + (df['col'] < cut_points[i]) 89 | 90 | 91 | ## 10. to_csv 92 | # 准确地打印出写入文件的前五行 93 | print(df[:5].to_csv()) 94 | 95 | # 处理混合在一起的整数和缺失值。 96 | # 如果一个列同时包含缺失值和整数,那么数据类型仍然是float而不是int。 97 | # 导出表时,可以添加 float_format='%.0f' ,将所有浮点数化为整数。 98 | # 如果你只想要所有列的整数输出,请使用此技巧。 99 | -------------------------------------------------------------------------------- /data_process/word_cloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/python3 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @File : word_cloud.py 6 | @Time : 2018/8/26 0:37 7 | @Software : PyCharm 8 | -------------------------------------- 9 | @Description : 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | # WordCloud + 统计图表 17 | ''' 18 | 关键点和难点在于: 19 | 对于网上用户的评论+文字做分析,提取出关键点作为列表 20 | ''' 21 | 22 | import os 23 | from pyecharts import WordCloud 24 | from pyecharts import Bar, Pie, Line, Scatter3D 25 | from pyecharts import Page 26 | import random 27 | 28 | 29 | # 词云图 30 | def 
wordCloud(x, y, label): 31 | wordCloud = WordCloud(width=1300, height=620) 32 | 33 | # word_size_ragne限定字体大小范围 34 | # shape参数用来调整词云形状('circle', 'cardioid', 'diamond', 'triangle-forward', 'triangle', 'pentagon', 'star') 35 | wordCloud.add("", x, y, word_size_range=[20, 100], shape="circle") 36 | wordCloud.render() 37 | os.system(r"render.html") # 默认内容输出到根目录 38 | 39 | 40 | # 统计图表 41 | def get_charts(x, y, label, type): 42 | if type == 1: 43 | c = Pie("饼状图") 44 | elif type == 2: 45 | c = Bar3D("条形图") 46 | elif type == 3: 47 | c = Line("折线图") 48 | print(c) 49 | c.add(label, x, y, is_more_utils=True) 50 | # 打印输出图表的所有配置项 51 | c.show_config() 52 | c.render() 53 | os.system(r"render.html") 54 | 55 | 56 | # 多个统计图 57 | def get_otherCharts(page): 58 | attr = ["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"] 59 | v1 = [5, 20, 36, 10, 75, 90] 60 | v2 = [10, 25, 8, 60, 20, 80] 61 | bar = Bar("柱状图数据堆叠示例") 62 | bar.add("商家A", attr, v1, is_stack=True) 63 | bar.add("商家B", attr, v2, is_stack=True) 64 | page.add(bar) 65 | page.render() 66 | os.system(r"render.html") 67 | 68 | 69 | # Scatter3D 70 | def get_scatter3D(page): 71 | data = [[random.randint(0, 100), random.randint(0, 100), random.randint(0, 100)] for _ in range(80)] 72 | range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf', 73 | '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026'] 74 | scatter3D = Scatter3D("3D散点示例", width=1200, height=600) 75 | scatter3D.add("", data, is_visualmap=True, visual_range_color=range_color) 76 | page.add(scatter3D) 77 | page.render() 78 | os.system(r"render.html") 79 | 80 | 81 | def main(): 82 | ''' 83 | # 测试词云图 84 | x = [ 85 | "python", "lxj", "zj", "big data", "python", "lxj", "zj", "big data", 86 | "python", "lxj", "zj", "big data", "python", "lxj", "zj", "big data" 87 | ] 88 | y = [ 89 | 10000, 8000, 6000, 3000, 10000, 8000, 6000, 3000, 90 | 10000, 8000, 6000, 3000, 10000, 8000, 6000, 3000 91 | ] 92 | label = "词云" 93 | wordCloud(x, y, label) 94 | ''' 95 | 96 | ''' 97 | # 
测试统计图表 98 | x = ["衬衫", "袜子", "高跟鞋", "羊毛衫", "裤子"] 99 | y1 = [5, 10, 38, 75, 90] 100 | y2 = [15, 4, 70, 25, 190] 101 | label = "服装" 102 | type = 2 103 | get_charts(x, y, label, type) 104 | ''' 105 | 106 | ''' 107 | # 测试多个统计图 108 | page = Page() 109 | get_otherCharts(page) 110 | ''' 111 | 112 | # 测试三维散点图 113 | page = Page() 114 | get_scatter3D(page) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /interesting/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2020/2/9 19:01 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /interesting/apscheduler/a.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/interesting/apscheduler/a.ico -------------------------------------------------------------------------------- /interesting/apscheduler/testApscheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 测试 apscheduler 6 | 使用: 7 | 1.安装pyinstaller包、pywin32包; 8 | 2.同目录下,存放xx.ico图标文件; 9 | 3.pycharm的terminal中执行打包指令: 10 | >pyinstaller -F -w -i a.ico testApscheduler.py 11 | 4.生成对应的testApscheduler.exe文件 12 | 13 | 参考链接: 14 | https://www.jianshu.com/p/4f5305e220f0 15 | https://zhuanlan.zhihu.com/p/46948464 16 | -------------------------------- 17 | @Time : 2019/5/21 10:50 18 | @File : 
testApscheduler.py 19 | @Software: PyCharm 20 | -------------------------------- 21 | @Author : lixj 22 | @contact : lixj_zj@163.com 23 | """ 24 | 25 | from multiprocessing import freeze_support 26 | from apscheduler.schedulers.blocking import BlockingScheduler 27 | from apscheduler.triggers.cron import CronTrigger 28 | 29 | 30 | def tick(): 31 | with open("aa.txt", "w", encoding="utf-8") as f: 32 | f.write("123456") 33 | print("write done") 34 | 35 | 36 | if __name__ == '__main__': 37 | freeze_support() # 防止多进程 38 | scheduler = BlockingScheduler() 39 | # 通过CronTrigger设置时间,防止pyinstall打包执行exe报错:No trigger by the name "cron" was found 40 | trigger = CronTrigger(hour='15', minute='55') 41 | scheduler.add_job(tick, trigger=trigger) 42 | 43 | try: 44 | scheduler.start() 45 | except (KeyboardInterrupt, SystemExit): 46 | pass 47 | -------------------------------------------------------------------------------- /interesting/dingding_push/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2020/2/9 18:34 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /interesting/dingding_push/demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 测试 python 推送钉钉消息 6 | -------------------------------- 7 | @Time : 2020/2/9 19:13 8 | @File : demo.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | import json 17 | import logging 18 | import pymysql 19 | 20 | 
# Logging setup: write DEBUG-and-above records to a log file.
logging.basicConfig(
    level=logging.DEBUG,  # minimum level that gets recorded
    filename='/app/mom/demo/run_log.log',  # explicit path; default would land in /root
    filemode='a',  # 'a' = append (the default); 'w' would truncate on every run
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
)


def conn_read_data():
    """Connect to MySQL and return the row count of the target table.

    Returns:
        The COUNT(id) value as an int, or None when the query fails.
    """
    conn_info = pymysql.connect(
        host='XXXX',
        port=3306,
        user='XXXX',
        passwd='XXXX',
        db='XXXX',
        charset='utf8'  # pymysql wants 'utf8', not 'utf-8'
    )
    try:
        cursor = conn_info.cursor()
        findall_sql = "select count(id) from XXXX"
        cursor.execute(findall_sql)
        # BUG FIX: cursor.execute() returns the number of result rows
        # (always 1 for a COUNT query), not the count itself.  Fetch the
        # scalar from the single result row instead.
        count = cursor.fetchone()[0]
        cursor.close()
        return count
    except Exception:
        conn_info.rollback()  # best-effort rollback on any error, as before
    finally:
        # BUG FIX: the original only closed the connection on the success
        # path; always release it.
        conn_info.close()


def ding_push_message(msg):
    """POST *msg* as a DingTalk text message to the webhook.

    NOTE(review): relies on the module-level globals ``web_url`` and
    ``header`` being assigned first (done in the __main__ block below) —
    confirm before reusing this function from another module.
    """
    # DingTalk "text" message payload; isAtAll pings everyone in the group.
    message = {
        "msgtype": "text",
        "text": {
            "content": msg
        },
        "at": {
            "isAtAll": True
        }
    }

    message_json = json.dumps(message)  # the webhook expects a JSON body
    info = requests.post(url=web_url, data=message_json, headers=header)
    logging.info(info.text)  # log the webhook's response for troubleshooting


if __name__ == "__main__":
    # DingTalk webhook URL (access token redacted).
    web_url = "https://oapi.dingtalk.com/robot/send?access_token=XXXX"

    # Request headers for the webhook call.
    header = {
        "Content-Type": "application/json",
        "Charset": "UTF-8"
    }

    # Build the message text from the live row count.
    msg = "【钉钉消息】目前人数共" + str(conn_read_data()) + "人。"

    ding_push_message(msg)
content。 9 | -------------------------------- 10 | @Time : 2020/2/9 18:34 11 | @File : dingding_push_msg.py 12 | @Software: PyCharm 13 | -------------------------------- 14 | @Author : lixj 15 | @contact : lixj_zj@163.com 16 | """ 17 | 18 | import requests 19 | import json 20 | 21 | 22 | def ding_push_message(): 23 | # 构建请求头部 24 | header = { 25 | "Content-Type": "application/json", 26 | "Charset": "UTF-8" 27 | } 28 | 29 | # 构建请求数据 30 | message = { 31 | "msgtype": "text", 32 | "text": { 33 | "content": msg 34 | }, 35 | "at": { 36 | "isAtAll": True 37 | } 38 | } 39 | 40 | # 对请求的数据进行json封装 41 | message_json = json.dumps(message) 42 | # 发送请求 43 | info = requests.post(url=web_url, data=message_json, headers=header) 44 | # 打印返回的结果 45 | print(info.text) 46 | 47 | 48 | if __name__ == "__main__": 49 | # 请求的URL,WebHook地址 50 | web_url = "https://oapi.dingtalk.com/robot/send?access_token=xxxx" 51 | # 构建请求数据 52 | msg = "钉钉,测试消息。。。" 53 | 54 | ding_push_message() 55 | -------------------------------------------------------------------------------- /interesting/hongzha.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 短信轰炸 6 | 1. chromedriver.exe 放入到指定的文件路径下 7 | 2. 
版本对应 8 | -------------------------------------- 9 | @File : financialCalculator.py 10 | @Time : 2018/8/25 12:28 11 | @Software : PyCharm 12 | -------------------------------------- 13 | @Author : lixj 14 | @Contact : lixj_zj@163.com 15 | -------------------------------------- 16 | """ 17 | 18 | import time 19 | from selenium import webdriver 20 | from threading import Thread 21 | 22 | class hongZha(): 23 | def __init__(self): 24 | self.target_phone = "13636466080" # phone 25 | self.num = 0 # number 26 | self.chrome_options = webdriver.ChromeOptions() 27 | self.chrome_options.add_argument('--headless') 28 | self.chrome_options.add_argument('--disable-gpu') 29 | self.chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30') 30 | self.driver = webdriver.Chrome(chrome_options=self.chrome_options, executable_path='../otherfiles/chromedriver/chromedriver.exe') 31 | 32 | def send_result(self, button, name): 33 | button.click() 34 | self.num += 1 35 | print("{} 第{}次发送成功 {}".format(self.target_phone, self.num, name)) 36 | time.sleep(2) 37 | 38 | # 1. 39 | def zhihu(self, name): 40 | self.driver.get("https://www.zhihu.com/signup") 41 | self.driver.find_element_by_xpath("//button[@class='Button Button--primary Button--blue']").click() 42 | time.sleep(3) 43 | tel = self.driver.find_element_by_xpath("//input[@placeholder='手机号']") 44 | tel.send_keys(self.target_phone) 45 | button = self.driver.find_element_by_xpath("//button[@class='Button CountingDownButton SignFlow-smsInputButton Button--plain']") 46 | self.send_result(button, name) 47 | self.driver.quit() 48 | 49 | # 2. 
50 | def weipinhui(self, name): 51 | self.driver.get("https://passport.vip.com/register") 52 | tel = self.driver.find_element_by_xpath("//input[@placeholder='请输入手机号码']") 53 | tel.send_keys(self.target_phone) 54 | button = self.driver.find_element_by_xpath("//a[@id='J_mobile_verifycode_btn']") 55 | self.send_result(button, name) 56 | self.driver.quit() 57 | 58 | # 3. 59 | def suning(self, name): 60 | self.driver.get("https://reg.suning.com/person.do") 61 | tel = self.driver.find_element_by_xpath("//input[@id='mobileAlias']") 62 | tel.send_keys(self.target_phone) 63 | button = self.driver.find_element_by_xpath("//a[@id='sendSmsCode']") 64 | self.send_result(button, name) 65 | self.driver.quit() 66 | 67 | # 3. 68 | def mail163(self, name): 69 | self.driver.get("http://reg.email.163.com/unireg/call.do?cmd=register.entrance&from=163mail_right") 70 | tel = self.driver.find_element_by_xpath("//input[@id='mobileIpt']") 71 | tel.send_keys(self.target_phone) 72 | button = self.driver.find_element_by_xpath("//a[@id='sendAcodeStg']") 73 | self.send_result(button, name) 74 | self.driver.quit() 75 | 76 | if __name__ == '__main__': 77 | hongZha = hongZha() 78 | 79 | zh = Thread(target=hongZha.zhihu, args=("zhihu", )) 80 | zh.start() 81 | 82 | # wph = Thread(target=hongZha.weipinhui, args=("weipinhui", )) 83 | # wph.start() 84 | 85 | # sn = Thread(target=hongZha.suning, args=("suning", )) 86 | # sn.start() 87 | 88 | # mail163 = Thread(target=hongZha.mail163, args=("mail163", )) 89 | # mail163.start() 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /other_files/chromedriver/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/other_files/chromedriver/chromedriver.exe -------------------------------------------------------------------------------- 
/practice/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/5/25 10:32 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /practice/leetcode/237.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | 请编写一个函数,使其可以删除某个链表中给定的(非末尾)节点,你将只被给定要求被删除的节点。 7 | 现有一个链表 -- head = [4,5,1,9],它可以表示为: 8 | 4->5->1->9 9 | 10 | 示例 1: 11 | 输入: head = [4,5,1,9], node = 5 12 | 输出: [4,1,9] 13 | 解释: 给定你链表中值为 5 的第二个节点,那么在调用了你的函数之后,该链表应变为 4 -> 1 -> 9. 14 | 15 | 示例 2: 16 | 输入: head = [4,5,1,9], node = 1 17 | 输出: [4,5,9] 18 | 解释: 给定你链表中值为 1 的第三个节点,那么在调用了你的函数之后,该链表应变为 4 -> 5 -> 9. 
# ---- practice/leetcode/237.py ----------------------------------------------
# LeetCode 237: delete a given (non-tail) node from a singly linked list when
# only that node -- not the head -- is handed to you.  Trick: copy the
# successor's value into the node, then splice the successor out.


class ListNode:
    """Minimal singly linked list node."""

    def __init__(self, x):
        self.val = x
        self.next = None


class Solution:
    def deleteNode(self, node):
        """Remove *node* from its list in O(1) without access to the head.

        The problem guarantees *node* is a valid non-tail node, so
        ``node.next`` always exists.

        BUG FIX: the original implementation fell through after relinking
        and unconditionally executed ``node.val = node.next.val`` /
        ``node.next = None``, truncating the list right after the deleted
        position.
        """
        node.val = node.next.val  # overwrite with the successor's value
        node.next = node.next.next  # splice the successor out


if __name__ == "__main__":
    # Demo: delete the node holding 5 from 4 -> 5 -> 1 -> 9 (expect 4, 1, 9).
    # (The original demo built a single detached node, which can never meet
    # the "non-tail node" precondition and raised AttributeError.)
    head = ListNode(4)
    head.next = ListNode(5)
    head.next.next = ListNode(1)
    head.next.next.next = ListNode(9)
    Solution().deleteNode(head.next)


# ---- practice/leetcode/709.py ----------------------------------------------
# LeetCode 709: convert a string to lower case.


class SolutionOne:
    def toLowerCase(self, str: str) -> str:
        """Lower-case via the built-in str.lower().

        NOTE: the parameter name `str` shadows the builtin; kept for
        signature compatibility with the LeetCode template.
        """
        return str.lower()


class SolutionTwo:
    def toLowerCase(self, str: str) -> str:
        """Lower-case manually: shift A-Z by 32 code points (the ASCII
        distance between upper and lower case); leave everything else."""
        result = []
        for char in str:
            if 'A' <= char <= 'Z':
                result.append(chr(ord(char) + 32))
            else:
                result.append(char)
        return "".join(result)


string = "Hello WORD"
# print(SolutionOne().toLowerCase(string))
print(SolutionTwo().toLowerCase(string))


# ---- practice/leetcode/771.py ----------------------------------------------
# LeetCode 771: count how many stones in S are jewels (characters of J).
# NOTE: these class names shadow the 709 SolutionOne/SolutionTwo above when
# this dump is read as one module; in the repo they live in separate files.


class SolutionOne:
    """O(len(J) * len(S)): compare every jewel against every stone."""

    def numJewelsInStones(self, J: str, S: str) -> int:
        num = 0
        for j in J:
            for s in S:
                if j == s:
                    num = num + 1
        return num


class SolutionTwo:
    """Single pass over S with `in J` membership tests."""

    def numJewelsInStones(self, J: str, S: str) -> int:
        num = 0
        for s in S:
            if s in J:
                num += 1
        return num


class SolutionThree:
    """O(len(J) + len(S)): hash-set membership."""

    def numJewelsInStones(self, J: str, S: str) -> int:
        jSet = set(J)
        return sum(s in jSet for s in S)


J = "aA"
S = "aAAbbbb"
print(SolutionTwo().numJewelsInStones(J, S))
# LeetCode 832: flip each row of a binary matrix horizontally (reverse it),
# then invert every bit (0 <-> 1).  Three equivalent implementations.


class Solution:
    def flipAndInvertImageOne(self, A):
        """Build a new matrix: reverse a copy of each row, then invert each
        bit via enumerate().  Does NOT mutate the input (each row is copied
        by the [::-1] slice)."""
        result = []
        for row in A:  # renamed from `list`, which shadowed the builtin
            row = row[::-1]  # reversed slice copy
            for key, value in enumerate(row):
                row[key] = 1 - value  # 1 - v flips 0<->1 (replaces if/else)
            result.append(row)
        return result

    def flipAndInvertImageTwo(self, A):
        """One-liner: reverse each row and XOR every bit with 1
        (x ^ 1 flips 0<->1 for binary values)."""
        return [[j ^ 1 for j in i[::-1]] for i in A]

    def flipAndInvertImageThree(self, A):
        """In-place two-pointer version: walk each row from both ends,
        inverting and swapping the end pair in one tuple assignment.
        Mutates A and returns it."""
        for i in A:
            start = 0
            end = len(i) - 1
            while start <= end:
                # invert both ends and swap them in a single step; when
                # start == end the middle element is simply inverted
                i[end], i[start] = 1 - i[start], 1 - i[end]
                start = start + 1
                end = end - 1
        return A


A = [[1, 1, 0], [1, 0, 1], [0, 0, 0]]
# print(Solution().flipAndInvertImageOne(A))
# print(Solution().flipAndInvertImageTwo(A))
print(Solution().flipAndInvertImageThree(A))
# ---- practice/leetcode/977.py ----------------------------------------------
# LeetCode 977: given integers sorted in non-decreasing order, return their
# squares, also in non-decreasing order.


class Solution:
    def sortedSquaresOne(self, A):
        """sorted() over a generator of squares.

        Time O(N log N), space O(N).
        """
        return sorted(i ** 2 for i in A)

    def sortedSquaresTwo(self, A):
        """Two pointers moving outward from the sign boundary.

        Find the first non-negative index, then merge the (reversed)
        negative prefix with the non-negative suffix by squared value.
        Time O(N), space O(N).
        """
        length = len(A)
        right = 0  # scans the non-negative suffix left-to-right
        # locate the first index holding a non-negative value
        while right < length and A[right] < 0:
            right += 1
        left = right - 1  # scans the negative prefix right-to-left
        result = []

        # merge while both sides still have elements
        while left >= 0 and right < length:
            if A[left] ** 2 > A[right] ** 2:
                result.append(A[right] ** 2)
                right += 1
            else:
                result.append(A[left] ** 2)
                left -= 1
        # drain whichever side is left over
        while left >= 0:
            result.append(A[left] ** 2)
            left -= 1
        while right < length:
            result.append(A[right] ** 2)
            right += 1
        return result

    def sortedSquaresThree(self, A):
        """Two pointers moving inward from both ends, filling the output
        from its largest slot downward (the largest square is always at
        one of the two ends).  Time O(N), space O(N) for the output.
        """
        left = 0
        right = len(A) - 1
        nowIndex = len(A) - 1  # next output slot to fill, largest first
        result = [0] * len(A)  # pre-sized; slots are overwritten below
        while left <= right:
            if A[left] ** 2 < A[right] ** 2:
                result[nowIndex] = A[right] ** 2
                right -= 1
            else:
                result[nowIndex] = A[left] ** 2
                left += 1
            nowIndex -= 1
        return result


nums = [-4, -1, 0, 3, 10]  # renamed from `list`, which shadowed the builtin
# print(Solution().sortedSquaresOne(nums))
# print(Solution().sortedSquaresTwo(nums))
print(Solution().sortedSquaresThree(nums))
# ---- spider/config/random_ip.py --------------------------------------------
# Harvest usable proxy IPs from xicidaili, persist them to a pool file, and
# hand out random proxies from that pool.

import requests
from bs4 import BeautifulSoup as bs
import user_agent
import logging
import os
import random
import traceback
import time

# Module-wide logging format.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')


class RandomIp():
    """Scrape candidate proxies, validate them, and serve random ones."""

    def __init__(self):
        self.XICI_URL = "https://www.xicidaili.com/nn/"
        self.TEST_URL = "https://www.baidu.com/"
        self.IP_POOL_FILE = "ip_pool.txt"
        self.MAX_PAGE_OF_XICI = 3614  # total page count on the site
        self.NUM_OF_PAGES = 300  # how many listing pages to sample per run
        # Randomized request headers (user_agent is a sibling project module).
        self.headers = user_agent.UserAgent().get_headers()

    def get_target_pages(self, page):
        """Return NUM_OF_PAGES distinct random page numbers in [1, page)."""
        return random.sample(range(1, page), self.NUM_OF_PAGES)

    def request_get(self, url):
        """GET *url* with the randomized headers (10 s timeout)."""
        return requests.get(url, headers=self.headers, timeout=10)

    def analysis_page(self, req):
        """Parse one listing page into 'scheme://host:port' strings, keeping
        only entries whose speed and connect time are both under 0.5 s."""
        result_ip_pool = []
        soup = bs(req.text, "lxml")
        ips = soup.find_all("tr")
        for i in range(1, len(ips)):  # row 0 is the table header
            try:
                tds = ips[i].find_all("td")
                temp_ip = str(tds[5].contents[0]).lower() + '://' + tds[1].contents[0] + ':' + tds[2].contents[0]
                speed = float(tds[6].div.get("title")[:-1])
                connect_time = float(tds[7].div.get("title")[:-1])
                if speed < 0.5 and connect_time < 0.5:
                    result_ip_pool.append(temp_ip)
            except Exception:
                # BUG FIX: traceback.format_exc() takes no exception argument
                # and its return value was being discarded; log it instead.
                logging.error("解析IP参数异常!\n%s", traceback.format_exc())
        return result_ip_pool

    def get_ip_pool(self):
        """Harvest proxies from NUM_OF_PAGES random listing pages."""
        result_ip_pool = []
        # BUG FIX: call helpers on self instead of re-instantiating RandomIp()
        # per call (every instantiation re-fetched random headers).
        target_pages = self.get_target_pages(self.MAX_PAGE_OF_XICI)
        for one_page in target_pages:
            one_page_url = self.XICI_URL + str(one_page)
            req = self.request_get(one_page_url)
            if req.status_code == 200:
                result_ip_pool.extend(self.analysis_page(req))
            else:
                logging.error("连接异常!异常url:%s", one_page_url)
        return result_ip_pool

    def review_ip_pool(self):
        """Re-validate every pooled proxy against TEST_URL and rewrite the
        pool file with only the proxies that still answer."""
        ip_pool_path = self.IP_POOL_FILE
        try:
            with open(ip_pool_path, "r", encoding="utf-8") as f:
                content = f.read()
            # Pool file holds str(list); strip "['" / "']" and split entries.
            ip_pool = content[2:len(content) - 2].split("', '")
            reduce_num = 0
            logging.info("IP池中待验证个数:{}".format(len(ip_pool)))
            for ip in ip_pool[:]:  # iterate a copy; entries are removed below
                proxy = {ip.split("://")[0]: ip.split("://")[1]}
                try:
                    req = requests.get(self.TEST_URL, proxies=proxy, headers=self.headers, timeout=3)
                    if req.status_code != 200:
                        ip_pool.remove(ip)
                        reduce_num += 1
                    time.sleep(1)  # be polite to the test endpoint
                except Exception as e:
                    logging.error("ip异常:{},异常信息:{}".format(ip, str(e)))
                    ip_pool.remove(ip)
                    reduce_num += 1
                    continue
            logging.info("IP池中已验证个数:{},减少个数:{}个".format(len(ip_pool), reduce_num))
            with open(ip_pool_path, "w", encoding="utf-8") as f:
                f.write(str(ip_pool))
            logging.info("IP池已更新!")
        except Exception as e:
            logging.error("重新验证IP池异常!异常信息:{}".format(e))

    def write_ip_to_file(self):
        """Append a freshly harvested batch of proxies to the pool file."""
        try:
            # os.path.join replaces the Windows-only '\\' concatenation.
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            # BUG FIX: harvest once -- the original called get_ip_pool()
            # twice (once to write, once to log the count), scraping every
            # sampled page twice and logging the size of a different batch.
            new_ips = self.get_ip_pool()
            with open(file_name, "a+", encoding="utf-8") as f:
                f.write(str(new_ips))
            logging.info("写入IP {} 个结束!".format(len(new_ips)))
        except Exception:
            # BUG FIX: logging.error("msg:", e) passed an argument with no
            # placeholder, losing the exception; log the traceback properly.
            logging.error("IP写入文件异常!异常信息:\n%s", traceback.format_exc())

    def get_one_ip(self):
        """Return one random proxy string from the pool file."""
        try:
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            with open(file_name, "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[1:len(content) - 1].split(", ")
            random_ip = random.choice(ip_list)  # choice() picks a single one
            logging.info("random_ip: %s", random_ip)
            return random_ip
        except Exception:
            logging.error("IP读入文件异常!异常信息:\n%s", traceback.format_exc())

    def get_one_proxies(self):
        """Return one random proxy as a requests-style proxies dict."""
        try:
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            with open(file_name, "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[2:len(content) - 2].split("', '")
            random_ip = random.choice(ip_list)
            proxies = {"http": random_ip}
            logging.info("proxies: %s", str(proxies))
            return proxies
        except Exception:
            logging.error("IP读入文件异常!异常信息:\n%s", traceback.format_exc())

    def get_num_of_ip(self, num_of_ip):
        """Return *num_of_ip* distinct random proxies from the pool file."""
        try:
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            with open(file_name, "r", encoding="utf-8") as f:
                content = f.read()
            cont_list = content.split("', '")
            ip_list = cont_list[1:len(cont_list) - 1]
            random_ip_list = random.sample(ip_list, num_of_ip)  # sample() picks several
            logging.info("random_ip_list: %s", random_ip_list)
            return random_ip_list
        except Exception:
            logging.error("IP读入文件异常!异常信息:\n%s", traceback.format_exc())


if __name__ == '__main__':
    RandomIp().write_ip_to_file()
    RandomIp().review_ip_pool()
201 | 202 | -------------------------------------------------------------------------------- /practice/technique/config/test_comfig.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 17:49 8 | @File : test_comfig.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import random_ip 16 | import user_agent 17 | 18 | # 文件中写入IP 19 | random_ip.RandomIp().write_ip_to_file() 20 | 21 | # 重新验证IP 22 | random_ip.RandomIp().review_ip_pool() 23 | 24 | # 获取随机一个IP 25 | random_ip.RandomIp().get_one_ip() 26 | 27 | # 获取随机一个prop 28 | random_ip.RandomIp().get_one_proxies() 29 | 30 | # 获取随机的多个IP 31 | random_ip.RandomIp().get_num_of_ip(5) 32 | 33 | # 获取随机userAgent 34 | user_agent.UserAgent().get_user_agent() 35 | 36 | # 获取随机headers 37 | user_agent.UserAgent().get_headers() 38 | -------------------------------------------------------------------------------- /practice/technique/db_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 数据库相关操作 6 | 1. 操作 MySQL 7 | 2. 操作 Oracle 8 | 3. 操作 MongoDB 9 | 4. 导出 MySQL 数据至 excel 10 | 5. 
import re
import json
import cx_Oracle
import xlrd
from pathlib import Path
import pymysql
import time


def create_mysql_connect():
    """
    Open a MySQL connection to the local `page_log` schema.
    :return: (cursor, connection); autocommit is switched on.
    """
    conn = pymysql.connect(
        host='localhost',
        user='root',
        passwd='123456789',
        db='page_log',
        port=3306,
        charset='utf8'
    )
    cur = conn.cursor()
    conn.autocommit(1)
    return cur, conn


def close_connect(conn):
    """Commit pending work on *conn* and close it."""
    conn.commit()
    conn.close()


class Mysql():
    pass


class Oracle():
    pass


class Mongodb():
    pass


class Trans():
    """Demo import/export jobs between flat files and databases."""

    def __init__(self):
        self.jsonPath = "json2mongo.json"
        self.mysqlPath = "csv2mysql.csv"
        self.oraclePath = "csv2oracle.csv"
        # Connection string user/password@host:port/service (see DB properties
        # dialog for the exact values, e.g. localhost:1521/orcl).
        self.oracle_localhost = cx_Oracle.connect(
            'app_common_service/Tebon@20180522@192.168.2.49:1521/orcl')

    def csv2oracle(self):
        """
        Query demo rows from Oracle and print them.
        :return: None
        """
        print("connect to oracle...")
        # 1. use the connection opened in __init__
        conn = self.oracle_localhost
        cursor = conn.cursor()
        try:
            # 2. query data
            sql = "select * from KCB_INFO_AND_LISTED_INFO where row_count =1"
            cursor.execute(sql)
            for data in cursor.fetchall():  # cursor.fetchone() for a single row
                print(data)
            conn.commit()
        except Exception:
            # BUGFIX: the original bare `except:` could reference `conn`
            # before assignment; now only the connection we hold is rolled back.
            conn.rollback()
        finally:
            # BUGFIX: always release the cursor/connection (the original
            # leaked both whenever the query raised).
            cursor.close()
            conn.close()


if __name__ == '__main__':
    # BUGFIX: the original ran `trans = trans()` at import time — a NameError,
    # the class is `Trans`. Guarded so importing the module is side-effect free.
    trans = Trans()
    # trans.json2mongodb()
    # trans.csv2mysql()
    trans.csv2oracle()
删除列 45 | df = df.drop(['id'], axis=1) 46 | # 删除多列(列集合) 47 | df = df.drop(columns=['B', 'C']) 48 | 49 | 50 | 51 | 52 | # # 数据统计 53 | # # 读取前五条数据 54 | # df.head() 55 | # # 读取某列 56 | # created_time = df['created_time'] 57 | # for one_time in created_time: 58 | # date = one_time.split(" ")[0] 59 | # time = one_time.split(" ")[1] 60 | # print(date, time) 61 | -------------------------------------------------------------------------------- /practice/technique/file_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 文件相关常用操作 6 | 1. 获取指定目录及其子目录下,所有指定后缀的文件的绝对路径 7 | 2. 遍历指定目录下(不包含子目录)所有文件,更新指定后缀 8 | 3. 判断指定文件是否是指定后缀的文件 9 | 4. 获取当前文件路径 10 | 5. 获取目录层级 11 | 6. 组合文件路径 12 | 7. 流式分块读取大文件 13 | 14 | 注:os 模块;path 模块;pathlib 库(重点); 15 | -------------------------------- 16 | @Time : 2019/5/25 22:28 17 | @File : file_operate.py 18 | @Software: PyCharm 19 | -------------------------------- 20 | @Author : lixj 21 | @contact : lixj_zj@163.com 22 | """ 23 | 24 | import os 25 | from pathlib import Path 26 | import pathlib 27 | import os.path 28 | from functools import partial 29 | 30 | 31 | def get_suffix_file_path(dir_path, suffix): 32 | """ 33 | 获取指定目录及其子目录下,所有指定后缀的文件的绝对路径 34 | :param dir_path: 指定目录 eg. "D:\\ZX\\temp" 35 | :param suffix: 指定后缀 eg. "txt" 36 | :return: 37 | """ 38 | file_list = [] 39 | for root, dirs, files in os.walk(dir_path): 40 | for file in files: 41 | if str(file).endswith(suffix): 42 | file_list.append(os.path.join(root, file)) 43 | return file_list 44 | 45 | 46 | def rename_file_suffix(dir_path, old_suffix, new_suffix): 47 | """ 48 | 遍历指定目录下(不包含子目录)所有文件,更新指定后缀 49 | :param dir_path: 指定路径 eg. "F:\\temp" 50 | :param old_suffix: 待修改后缀 eg. "txt" 51 | :param new_suffix: 新后缀 eg. "jpg" 52 | :return: 53 | """ 54 | for file_path in Path(dir_path).glob('*.' + old_suffix): 55 | file_path.rename(file_path.with_suffix("." 
+ new_suffix)) 56 | 57 | 58 | def is_suffix_file(file, suffix): 59 | """ 60 | 判断指定文件是否是指定后缀的文件 61 | :param file: 指定文件 eg. "F:\\temp\\img.txt" or "demo.txt" 62 | :param suffix: 指定后缀 eg. "txt" 63 | :return: True or False 64 | """ 65 | return pathlib.PurePath(file).match('*.' + suffix) 66 | 67 | 68 | def get_current_working_directory(): 69 | """ 70 | 获取当前文件路径 71 | :return: 72 | """ 73 | print(os.path.dirname(__file__)) 74 | print(os.getcwd()) 75 | print(pathlib.Path.cwd()) 76 | 77 | 78 | def get_upper_two_levels(): 79 | """ 80 | 获取目录层级 -- 获取上上层目录 81 | :return: 82 | """ 83 | print(os.path.dirname(os.path.dirname(os.getcwd()))) 84 | print(pathlib.Path.cwd().parent.parent) 85 | 86 | 87 | def combined_path(): 88 | """ 89 | 获取目录层级 -- 在上上层目录下拼接路径 90 | :return: 拼接结果 91 | """ 92 | # os 模块 93 | print(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "1", "2", "3")) 94 | 95 | # Path 模块 96 | parts = ["1", "2", "3"] 97 | print(pathlib.Path.cwd().parent.parent.joinpath(*parts)) 98 | 99 | 100 | def get_file_path(): 101 | """ 102 | 组合文件路径 103 | :return: 104 | """ 105 | # 旧方法 106 | print(os.path.join('/temp', 'foo.txt')) 107 | # output: '/temp/foo.txt' 108 | 109 | # 新方法 110 | print(Path('/temp') / 'foo.txt') 111 | 112 | 113 | def read_file(file_name): 114 | """ 115 | 快速读取文件 116 | :param file_name: 文件名 117 | :return: 118 | """ 119 | # 标准做法 120 | with open(file_name) as f: 121 | f.read() 122 | 123 | # pathlib 模块,封装了 with open() 方法 124 | Path(file_name).read_text() 125 | 126 | 127 | def read_big_file_by_line(): 128 | """ 129 | 流式逐行读取大文件(常规做法) 130 | :return: 131 | """ 132 | # with 上下文管理器会自动关闭打开的文件描述符 133 | # 在迭代文件对象时,内容是一行一行返回的,不会占用太多内存 134 | # 缺点:大文本只有一行,所有内容读入内存 135 | with open("foo.txt") as f: 136 | for line in f: 137 | print(line) 138 | 139 | 140 | def read_big_file_by_chunk(file_path): 141 | """ 142 | 流式分块读取大文件 143 | :param file_path: 文件路径 144 | :return: 145 | """ 146 | # 普通做法 147 | with open(file_path) as file: 148 | for chunk in chunked_file_reader(file): 149 | yield 
chunk 150 | 151 | # 优秀做法 152 | with open(file_path) as file: 153 | for chunk in chunked_file_reader_mod(file): 154 | yield chunk 155 | 156 | 157 | def chunked_file_reader(file, block_size=1024 * 8): 158 | """ 159 | 流式分块读取大文件(普通做法) 160 | :param file: 文件名,即 with open(file_name) as file: 161 | :param block_size: 分块大小 162 | :return: 163 | """ 164 | while True: 165 | chunk = file.read(block_size) 166 | if not chunk: 167 | break 168 | yield chunk 169 | 170 | 171 | def chunked_file_reader_mod(file, block_size=1024 * 8): 172 | """ 173 | 流式分块读取大文件(优秀做法) 174 | :param file: 文件名,with open(file_name) as file: 175 | :param block_size: 分块大小 176 | :return: 177 | """ 178 | # 首先使用 partial(fp.read, block_size) 构造一个新的无需参数的偏函数 179 | # 循环将不断返回 fp.read(block_size) 调用结果,直到其为 '' 时终止 180 | for chunk in iter(partial(file.read, block_size), ''): 181 | yield chunk 182 | -------------------------------------------------------------------------------- /practice/technique/pdf_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : pdf相关的操作 6 | -------------------------------- 7 | @Time : 2019/5/25 10:39 8 | @File : pdf_operate.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | from PyPDF2 import PdfFileWriter, PdfFileReader 16 | import os 17 | import comDocOperate 18 | 19 | 20 | class PdfOperate(): 21 | def __init__(self): 22 | pass 23 | 24 | def get_limit_page_pdf(self, pdf_path, start_page, end_page): 25 | """ 26 | 截取pdf中的几页,输出到同目录 27 | :param pdf_path: 28 | :param start_page: 29 | :param end_page: 30 | :return: 31 | """ 32 | output = PdfFileWriter() 33 | pdf_file = PdfFileReader(open(pdf_path, "rb")) 34 | 35 | # 保存input.pdf中的start_page-end_page页到output.pdf 36 | for i in range(start_page, end_page): 37 | output.addPage(pdf_file.getPage(i)) 38 | 39 | 
output_stream = open(str(os.path.dirname(pdf_path)) + os.path.sep + "output.pdf", "wb") 40 | output.write(output_stream) 41 | output_stream.close() 42 | 43 | def merge_pdf(self, file_dir, outfile): 44 | """ 45 | 合并同一目录下的所有PDF文件 46 | :param filepath: 存放PDF的原文件夹 47 | :param outfile: 输出的PDF文件的名称 48 | :return: 49 | """ 50 | output = PdfFileWriter() 51 | output_pages = 0 52 | pdf_file_name = comDocOperate.getSameEndsFileInDir(file_dir, ".pdf") 53 | 54 | if pdf_file_name: 55 | for pdf_file in pdf_file_name: 56 | # 读取源PDF文件 57 | input = PdfFileReader(open(pdf_file, "rb")) 58 | # 获得源PDF文件中页面总数 59 | page_count = input.getNumPages() 60 | output_pages += page_count 61 | print("{pdfFile}文件页数:{page_count}". 62 | format(pdfFile=pdf_file, page_count=page_count)) 63 | 64 | # 分别将page添加到输出output中 65 | for one_page in range(page_count): 66 | output.addPage(input.getPage(one_page)) 67 | print("合并后的总页数:{pages}".format(pages=output_pages)) 68 | 69 | # 写入到目标PDF文件 70 | output_stream = open(os.path.join(file_dir, outfile), "wb") 71 | output.write(output_stream) 72 | output_stream.close() 73 | print("PDF文件合并完成!") 74 | else: 75 | print("没有可以合并的PDF文件!") 76 | 77 | 78 | if __name__ == '__main__': 79 | # 开始页 80 | start_page = 0 81 | # 截止页 82 | end_page = 5 83 | # 84 | pdf_path = r"D:\ZX\temp\test\1\0.pdf" 85 | 86 | pdf_operate = PdfOperate() 87 | # pdfOper.get_limit_page_pdf(pdf_path, start_page, end_page) 88 | 89 | file_dir = r'D:\ZX\temp\test\1' # 存放PDF的原文件夹 90 | outfile = "out.pdf" # 输出的PDF文件的名称 91 | pdf_operate.merge_pdf(file_dir, outfile) 92 | -------------------------------------------------------------------------------- /practice/technique/selenium_template.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/7/20 15:03 8 | @File : selenium_template.py 9 | @Software: PyCharm 10 | 
-------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | from selenium import webdriver 16 | from comConfig.user_agent import UserAgent 17 | from comConfig.random_ip import RandomIp 18 | 19 | 20 | class SeleniumTemp: 21 | def __init__(self): 22 | self.chrome_options = webdriver.ChromeOptions() 23 | self.chrome_options.add_argument('--headless') 24 | self.chrome_options.add_argument('--disable-gpu') 25 | # 指定 chromedriver.exe 文件路径 26 | self.executable_path = "D:\ZX_workspace\Python\otherfiles\chromedriver\chromedriver.exe" 27 | 28 | def selenium_operate(self): 29 | """ 30 | 加载 chrome driver,每次加载时更新 IP 与 useragent 31 | :return: driver 32 | """ 33 | # 有代理 IP 时加载 34 | # self.chrome_options.add_argument('--proxy-server=http://{}'.format(RandomIp().get_one_ip())) 35 | self.chrome_options.add_argument('--user-agent=' + UserAgent().get_user_agent()) 36 | return webdriver.Chrome(chrome_options=self.chrome_options, executable_path=self.executable_path) 37 | 38 | def get_page_source(self, driver, url): 39 | """ 40 | 获取页面全部内容 41 | :param driver: 42 | :param url: 43 | :return: 44 | """ 45 | driver.get(url) 46 | return driver.page_source 47 | 48 | if __name__ == '__main__': 49 | url = "http://exam.sac.net.cn/pages/registration/sac-finish-person.html?r2SS_IFjjk=8E0DEB6C9FC3F295E053D651A8C05FCD" 50 | selenium_temp = SeleniumTemp() 51 | driver = selenium_temp.selenium_operate() 52 | data = selenium_temp.get_page_source(driver, url) 53 | print(data) 54 | -------------------------------------------------------------------------------- /practice/technique/test_data/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/8/4 17:04 8 | @File : test.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | 
@contact : lixj_zj@163.com 13 | """ 14 | 15 | # -*- coding:utf-8 -*- 16 | 17 | import pandas as pd 18 | import numpy as np 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | from pandas import DataFrame, Series 22 | # from sklearn.cross_validation import train_test_split 23 | from sklearn.linear_model import LinearRegression 24 | 25 | # 读取文件 26 | datafile = 'test_excel.xlsx' # 文件所在位置,u为防止路径中有中文名称,此处没有,可以省略 27 | data = pd.read_excel(datafile) # datafile是excel文件,所以用read_excel,如果是csv文件则用read_csv 28 | examDf = DataFrame(data) 29 | 30 | # 数据清洗,比如第一列有可能是日期,这样的话我们就只需要从第二列开始的数据, 31 | # 这个情况下,把下面中括号中的0改为1就好,要哪些列取哪些列 32 | new_examDf = examDf.ix[:, 1:] 33 | 34 | # 检验数据 35 | print(new_examDf.describe()) # 数据描述,会显示最值,平均数等信息,可以简单判断数据中是否有异常值 36 | print(new_examDf[new_examDf.isnull() == True].count()) # 检验缺失值,若输出为0,说明该列没有缺失值 37 | 38 | # 输出相关系数,判断是否值得做线性回归模型 39 | print(new_examDf.corr()) # 0-0.3弱相关;0.3-0.6中相关;0.6-1强相关; 40 | 41 | # 通过seaborn添加一条最佳拟合直线和95%的置信带,直观判断相关关系 42 | # sns.pairplot(data, x_vars=['visitor_id'], y_vars='created_time', height=7, aspect=0.8, kind='reg') 43 | -------------------------------------------------------------------------------- /practice/technique/test_data/test_excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/practice/technique/test_data/test_excel.xlsx -------------------------------------------------------------------------------- /practice/technique/word_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : word相关的文件操作 6 | 参考链接: 7 | https://www.cnblogs.com/ontheway703/p/5266041.html 8 | https://blog.csdn.net/xtfge0915/article/details/83479922 9 | -------------------------------- 10 | @Time : 2019/5/25 10:39 11 | @File : word_operate.py 12 | 
@Software: PyCharm 13 | -------------------------------- 14 | @Author : lixj 15 | @contact : lixj_zj@163.com 16 | """ 17 | 18 | from docx import * 19 | import re 20 | from docx.shared import Pt 21 | from docx.shared import Inches 22 | from win32com import client 23 | from docx.enum.style import WD_STYLE_TYPE 24 | import os, zipfile, shutil 25 | 26 | 27 | class WordOperate(): 28 | def __init__(self): 29 | pass 30 | 31 | def show_style(self): 32 | """ 33 | 查看word常用样式,在读取或写入时设置 34 | :return: 35 | """ 36 | doc = Document() 37 | styles = doc.styles 38 | 39 | # 查看段落样式 40 | for s in styles: 41 | if s.type == WD_STYLE_TYPE.PARAGRAPH: 42 | print("段落样式:", s.name) 43 | 44 | # 查看字符样式 45 | for s in styles: 46 | if s.type == WD_STYLE_TYPE.CHARACTER: 47 | print("字符样式:", s.name) 48 | 49 | # 查看表格样式 50 | for s in styles: 51 | if s.type == WD_STYLE_TYPE.TABLE: 52 | print("表格样式:", s.name) 53 | 54 | def set_style(self): 55 | """ 56 | word中设置样式 57 | :return: 58 | """ 59 | document = Document() 60 | 61 | # 1.段落设置样式 62 | paragraph = document.add_paragraph() 63 | paragraph.style = document.styles['Heading 1'] # style选取 64 | paragraph.style = 'Heading 1' # 用样式名称直接赋值 65 | paragraph2 = document.add_paragraph(style='Body Text') # 创建段落时赋值 66 | 67 | # 2.设置段落中的字符格式,定义样式中的字符格式后,所有运用此样式的段落都有相应的字符格式 68 | # 从样式库中选取 'Normal' 样式,并提取 'Normal' 样式的字符属性 69 | style = document.styles['Normal'] 70 | font = style.font 71 | # 设置样式中的字符属性 ,操作方法和上面改变内联对象属性方法一致 72 | font.name = "Microsoft YaHei UI" 73 | font.size = Pt(50) # 字体大小 74 | # 将设置好字符属性的样式运用到段落中 75 | p = document.add_paragraph("change font attribution", style='Normal') 76 | 77 | # 3.设置段落格式,定义样式中的段落格式后,所有运用此样式的段落都有相应的段落格式 78 | styles = document.styles 79 | # 选取 style,并设置 style 中的段落格式 80 | style = styles['Heading 2'] 81 | para_format = style.paragraph_format 82 | para_format.left_indent = Pt(20) 83 | para_format.widow_control = True 84 | # 将设置好段落格式的 style 运用到段落中 85 | p = document.add_paragraph('This is Heading, level 1', style=style) 86 | 87 | def 
get_head(self, doc_path): 88 | """ 89 | 读取 word 标题 90 | :param doc_path: 文档路径 91 | :return: 92 | """ 93 | doc = Document(doc_path) 94 | for p in doc.paragraphs: 95 | # 遍历1、2、3级标题 96 | if p.style.name == 'Heading 1': 97 | print(p.text) 98 | if p.style.name == 'Heading 2': 99 | print(p.text) 100 | if p.style.name == 'Heading 3': 101 | print(p.text) 102 | 103 | # 遍历所有标题 104 | for p in doc.paragraphs: 105 | if re.match("^Heading \d+$", p.style.name): 106 | print(p.text) 107 | 108 | def get_content(self, doc_path): 109 | """ 110 | 读取 word 内容 111 | :param doc_path: 文档路径 112 | :return: 113 | """ 114 | doc = Document(doc_path) 115 | # 读取正文 116 | for p in doc.paragraphs: 117 | if p.style.name == 'Normal': 118 | print(p.text) 119 | 120 | def write_content(self, result_doc_path): 121 | """ 122 | doc写入文件 123 | :param result_doc_path: 写入 word 文档路径 124 | :return: 125 | """ 126 | doc = Document() 127 | # 写入标题 128 | doc.add_heading("heading 1", level=1) 129 | doc.add_paragraph("heading 1", style='Heading 1') 130 | 131 | # 写入正文 132 | doc.add_paragraph("正文") 133 | 134 | # 写入分页符 135 | doc.add_page_break() 136 | 137 | # 写入表格 138 | table = doc.add_table(rows=1, cols=3, style="Light List Accent 5") 139 | hdr_cells = table.rows[0].cells 140 | hdr_cells[0].text = 'testName' 141 | hdr_cells[1].text = 'param' 142 | hdr_cells[2].text = 'exc' 143 | 144 | # 写入图片 145 | doc.add_picture("imgName", width=Inches(1.5)) # 设置宽度 146 | 147 | doc.save(result_doc_path) 148 | 149 | def get_img_from_doc(self, doc_path): 150 | """ 151 | 从word中提取图片到对应目录img文件夹 152 | word转zip-提取midea中的word-复制到新目录并重命名同名为文件夹-zip还原成word,删除word文件夹 153 | :param docdir: word 路径 154 | :return: 155 | """ 156 | # 切换路径 157 | doc_abs_path = os.path.abspath(os.path.dirname(doc_path) + os.path.sep + ".") 158 | os.chdir(doc_abs_path) 159 | 160 | # 遍历文件 161 | for file in os.listdir(doc_abs_path): 162 | if file.endswith("docx"): # 匹配docx文件 163 | doc_name = file.split(".") # 以“.”做成列表形式 164 | os.rename(file, 
"{docName}.ZIP".format(docName=doc_name[0])) # 重命名为ZIP格式 165 | f = zipfile.ZipFile("{docName}.ZIP".format(docName=doc_name[0]), 'r') 166 | for file in f.namelist(): 167 | if "word" in file: 168 | f.extract(file) # 将压缩包里的word文件夹解压 169 | f.close() 170 | old_img_dir = r"{absPath}\word\media".format(absPath=doc_abs_path) # 定义图片文件夹 171 | shutil.copytree(old_img_dir, "{absPath}\{docName}".format(absPath=doc_abs_path, 172 | docName=doc_name[0])) # 拷贝到新目录,名称为word文件的名字 173 | os.rename("{docName}.ZIP".format(docName=doc_name[0]), 174 | "{docName}.docx".format(docName=doc_name[0])) # 将ZIP名字还原为DOCX 175 | shutil.rmtree("{absPath}\word".format(absPath=doc_abs_path)) # 删除word文件夹 176 | else: 177 | print(file, "非docx文件!") 178 | 179 | def word_to_pdf(self, doc_path): 180 | """ 181 | word 转换成 pdf 182 | :param doc_path: docx 文件路径 183 | :return: 184 | """ 185 | # 切换路径 186 | doc_abs_path = os.path.abspath(os.path.dirname(doc_path) + os.path.sep + ".") 187 | os.chdir(doc_abs_path) 188 | dir_list = os.listdir(doc_abs_path) 189 | for file in dir_list: 190 | if file.endswith("docx"): # 匹配docx文件 191 | word = client.DispatchEx("Word.Application") 192 | base_name = os.path.basename(file).split('.')[0] 193 | word_file = word.Documents.Open(os.path.abspath(file), ReadOnly=1) 194 | word_file.SaveAs(doc_abs_path + os.path.sep + str(base_name) + ".pdf", FileFormat=17) 195 | word_file.Close() 196 | else: 197 | print(file, "非docx文件!") 198 | 199 | 200 | if __name__ == '__main__': 201 | root_doc_path = r'D:\ZX\temp\test\1\test.docx' 202 | result_doc_path = r"C:\Users\Tebon\Desktop\test\result.docx" 203 | word_oper = WordOperate() 204 | word_oper.get_content(root_doc_path) 205 | word_oper.write_content(result_doc_path) 206 | word_oper.show_style() 207 | word_oper.get_img_from_doc(root_doc_path) 208 | word_oper.word_to_pdf(root_doc_path) 209 | -------------------------------------------------------------------------------- /practice/technique/zip_file_operate.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : zip 文件常用操作 6 | -------------------------------- 7 | @Time : 2019/8/13 15:24 8 | @File : test_zip.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import zipfile 16 | import os 17 | 18 | def unzip_file(zip_file_name, unzip_path): 19 | """ 20 | 解压 zip 文件。(注:解压文件路径下包含同名称待解压文件,会覆盖!) 21 | :param zip_file_name: 解压文件的名称,含路径全地址 22 | :param dic_path: 解压文件路径 23 | """ 24 | if zipfile.is_zipfile(zip_file_name): 25 | archive = zipfile.ZipFile(zip_file_name, mode='r') 26 | for file in archive.namelist(): 27 | archive.extract(file, unzip_path) 28 | else: 29 | print("{} is not zip file.".format(zip_file_name)) 30 | 31 | 32 | def get_zip_file_name(dic_path): 33 | """ 34 | 将指定的 zip 文件内容解压到指定路径中 35 | :param dic_path: 指定路径 36 | :return: 压缩文件全路径 37 | """ 38 | zip_file_path = [] 39 | for root, dirs, files in os.walk(dic_path): 40 | for file in files: 41 | if os.path.splitext(file)[1] == '.zip': # 读取zip文件 42 | zip_file_path.append(os.path.join(root, file)) 43 | return zip_file_path 44 | 45 | 46 | if __name__ == '__main__': 47 | zip_file_path = "E:\\zip" # zip file 路径 48 | upzip_file_path = "E:\\zip\\res" # 解压路径 49 | 50 | fn = get_zip_file_name(zip_file_path) 51 | for file in fn: 52 | unzip_file(file, upzip_file_path) 53 | 54 | -------------------------------------------------------------------------------- /project_directory_structure.txt: -------------------------------------------------------------------------------- 1 | projectName/ 2 | |-- bin/ 或script/之类,存放项目的一些可执行文件,但bin/更直观。 3 | | |-- __init__ 4 | |  |-- start.py 启动主程序,入口文件 5 | | 6 | |-- core/ 存放项目的所有源代码(核心代码)。 7 | | (1) 源代码中的所有模块、包都应该放在此目录。不要置于顶层目录。 8 | | (2) 程序的入口最好命名为main.py。 9 | | |-- tests/ 子目录tests/存放单元测试代码; 10 | | | |-- __init__.py 11 | | | |-- 
test.main.py 12 | | | 13 | | |-- __init__.py 14 | | |-- test_main.py| 存放核心逻辑 15 | | 16 | |-- config/ 配置文件 17 | | |-- __init__.py 18 | | |-- setting.py 写上相关配置 19 | | 20 | |---db/ 数据库文件 21 | | |--db.json 写数据库文件 22 | | 23 | |-- docs/ 相关文档说明 24 | | 25 | |-- examples/ 案例或者临时文件存放目录 26 | | 27 | |-- wiki/ wiki 28 | | 29 | |-- lib/ 库文件,放自定义模块和包 30 | | |-- __init__.py 31 | | |-- common.py 放常用的功能 32 | | 33 | |-- log/ 日志文件 34 | | |-- access.log 日志 35 | | 36 | |-- __init__.py 37 | |-- README 项目说明文件(内容说明,作者清单,版权声明,编译脚本) 38 | 39 | 注:运行程序时,在bin目录下执行start.py代码,不可以直接执行core下的模块。 40 | 41 | 关于README的内容,它需要说明以下几个事项: 42 | 软件定位,软件的基本功能。 43 | 运行代码的方法: 安装环境、启动命令等。 44 | 简要的使用说明。 45 | 代码目录结构说明,更详细点可以说明软件的基本原理。 46 | 常见问题说明。 47 | -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 22:45 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/config/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/30 9:05 8 | @File : __init__.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/config/ipPool.txt: -------------------------------------------------------------------------------- 
class RandomIp():
    """Scrape free proxies from xicidaili.com, persist them to a pool file,
    and hand out random IPs / proxies dicts from that pool.

    The pool file stores the ``str()`` of a Python list, e.g.
    ``"['http://1.2.3.4:80', 'http://5.6.7.8:81']"``; the readers below
    split on ``"', '"`` and drop the bracket-bearing end entries.
    """

    def __init__(self):
        self.XICI_URL = "https://www.xicidaili.com/nn/"
        self.TEST_URL = "https://www.jd.com/"  # used only to re-validate pooled IPs
        self.IP_POOL_FILE = "ip_pool.txt"
        self.MAX_PAGE_OF_XICI = 3614  # total result pages on xicidaili
        self.NUM_OF_PAGES = 2         # pages scraped per run
        # Random request headers; instantiating UserAgent() supplies its `self`.
        self.headers = user_agent.UserAgent().get_headers()

    def _pool_file_path(self):
        """Absolute path of the pool file, next to this module.

        Uses os.path.join (portable) instead of hard-coded '\\\\'; also fixes
        review_ip_pool, which previously opened a cwd-relative path while
        every other method used the module directory.
        """
        return os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)

    def get_target_pages(self, page):
        """Return NUM_OF_PAGES distinct random page numbers in [1, page)."""
        return random.sample(range(1, page), self.NUM_OF_PAGES)

    def request_get(self, url):
        """GET `url` with the randomized headers and a 10s timeout."""
        return requests.get(url, headers=self.headers, timeout=10)

    def analysis_page(self, req):
        """Parse one xicidaili result page.

        Keeps proxies whose speed and connect time are both under 0.5s.
        Returns strings shaped like 'http://1.2.3.4:80'.
        """
        result_ip_pool = []
        soup = bs(req.text, "lxml")
        ips = soup.find_all("tr")
        for i in range(1, len(ips)):  # row 0 is the table header
            try:
                tds = ips[i].find_all("td")
                temp_ip = str(tds[5].contents[0]).lower() + '://' + tds[1].contents[0] + ':' + tds[2].contents[0]
                speed = float(tds[6].div.get("title")[:-1])
                connect_time = float(tds[7].div.get("title")[:-1])
                if speed < 0.5 and connect_time < 0.5:
                    result_ip_pool.append(temp_ip)
            except Exception:
                # traceback.format_exc() takes no exception argument; the old
                # `format_exc(e)` call raised/ignored instead of logging.
                logging.error("解析IP参数异常!%s", traceback.format_exc())
        return result_ip_pool

    def get_ip_pool(self):
        """Scrape NUM_OF_PAGES random pages and return the harvested proxies."""
        result_ip_pool = []
        # Use `self` — the original re-instantiated RandomIp() per call,
        # rebuilding headers (and re-hitting the user-agent source) each time.
        for one_page in self.get_target_pages(self.MAX_PAGE_OF_XICI):
            one_page_url = self.XICI_URL + str(one_page)
            req = self.request_get(one_page_url)
            if req.status_code == 200:
                result_ip_pool.extend(self.analysis_page(req))
            else:
                # %s placeholder added: logging.error("…:", url) discarded the url.
                logging.error("连接异常!异常url:%s", one_page_url)
        return result_ip_pool

    def review_ip_pool(self):
        """Re-validate every pooled IP against TEST_URL and rewrite the file."""
        ip_pool_path = self._pool_file_path()
        try:
            with open(ip_pool_path, "r", encoding="utf-8") as f:
                content = f.read()
            ip_pool = content[2:len(content) - 2].split("', '")
            reduce_num = 0
            logging.info("IP池中待验证个数:{}".format(len(ip_pool)))
            for ip in ip_pool[:]:  # iterate a copy: entries are removed in-loop
                proxy = {ip.split("://")[0]: ip.split("://")[1]}
                try:
                    req = requests.get(self.TEST_URL, proxies=proxy, headers=self.headers, timeout=3)
                    if req.status_code != 200:
                        ip_pool.remove(ip)
                        reduce_num += 1
                    time.sleep(1)  # be polite to the test endpoint
                except Exception as e:
                    logging.error("ip异常:{},异常信息:{}".format(ip, str(e)))
                    ip_pool.remove(ip)
                    reduce_num += 1
            logging.info("IP池中已验证个数:{},减少个数:{}个".format(len(ip_pool), reduce_num))
            with open(ip_pool_path, "w", encoding="utf-8") as f:
                f.write(str(ip_pool))
            logging.info("IP池已更新!")
        except Exception as e:
            logging.error("重新验证IP池异常!异常信息:{}".format(e))

    def write_ip_to_file(self):
        """Append freshly scraped IPs to the pool file."""
        try:
            # Scrape exactly once: the original called get_ip_pool() twice
            # (once to write, once to log), doubling the network work.
            ip_pool = self.get_ip_pool()
            with open(self._pool_file_path(), "a+", encoding="utf-8") as f:
                f.write(str(ip_pool))
            logging.info("写入IP {} 个结束!".format(len(ip_pool)))
        except Exception:
            logging.error("IP写入文件异常!异常信息:%s", traceback.format_exc())

    def get_one_ip(self):
        """Return one random IP string from the pool file (None on failure)."""
        try:
            with open(self._pool_file_path(), "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[1:len(content) - 1].split(", ")
            random_ip = random.choice(ip_list)  # choice() picks a single entry
            logging.info("random_ip: %s", random_ip)
            return random_ip
        except Exception:
            logging.error("IP读入文件异常!异常信息:%s", traceback.format_exc())

    def get_one_proxies(self):
        """Return a requests-style proxies dict from one random pooled IP."""
        try:
            with open(self._pool_file_path(), "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[2:len(content) - 2].split("', '")
            random_ip = random.choice(ip_list)
            proxies = {"http": random_ip}
            logging.info("proxies: %s", str(proxies))
            return proxies
        except Exception:
            logging.error("IP读入文件异常!异常信息:%s", traceback.format_exc())

    def get_num_of_ip(self, num_of_ip):
        """Return `num_of_ip` distinct random IPs from the pool file."""
        try:
            with open(self._pool_file_path(), "r", encoding="utf-8") as f:
                content = f.read()
            cont_list = content.split("', '")
            ip_list = cont_list[1:len(cont_list) - 1]  # drop bracket-bearing ends
            random_ip_list = random.sample(ip_list, num_of_ip)  # sample() picks several
            logging.info("random_ip_list: %s", random_ip_list)
            return random_ip_list
        except Exception:
            logging.error("IP读入文件异常!异常信息:%s", traceback.format_exc())


if __name__ == '__main__':
    RandomIp().write_ip_to_file()
    RandomIp().review_ip_pool()
"""Project logging configuration.

Defines a ``dictConfig``-style dict with a console handler (active only
while ``debug_flag`` is True, via a custom filter) and a 5 MB rotating
file handler writing ``debug.log`` under the repository base directory.
"""

import logging
import logging.config
import os

path = os.path.abspath(__file__)
# Repository base directory: two levels above this file.
BASE_DIR = os.path.dirname(os.path.dirname(path))

# Set True in development, False in production, to toggle console output.
debug_flag = True


class RequireDebugTrue(logging.Filter):
    """Filter that passes records only while ``debug_flag`` is True."""

    def filter(self, record):
        return debug_flag


logging_config = {
    # Required; 1 is the only schema version currently defined.
    'version': 1,
    # Keep loggers created before this config is applied.
    'disable_existing_loggers': False,

    'filters': {
        'require_debug_true': {
            # '()' names the factory callable used to build the filter instance.
            '()': RequireDebugTrue,
        }
    },

    'formatters': {
        'simple': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },

    'handlers': {
        # Console output, gated on debug_flag by the filter above.
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',
            'formatter': 'simple',
            'filters': ['require_debug_true', ],
        },
        # Rotating file output: 5 MB per file, 5 backups, UTF-8.
        'log': {
            'level': 'DEBUG',
            'class': 'logging.handlers.RotatingFileHandler',
            'formatter': 'simple',
            'filename': os.path.join(BASE_DIR, 'debug.log'),
            'maxBytes': 1024 * 1024 * 5,
            'backupCount': 5,
            'encoding': 'utf8',
        },
    },

    'loggers': {
        'root': {
            'handlers': ['console', 'log'],
            'level': 'DEBUG',
            'propagate': True,  # forward records to parent loggers
        },
        'simple': {
            'handlers': ['console', 'log'],
            'level': 'WARN',
            'propagate': True,  # forward records to parent loggers
        }
    }
}
def getNumIpFromFile(self, num):
    """Read the proxy pool from ipPool.txt and return `num` random entries.

    The file holds the ``str()`` of a Python list, e.g.
    ``"['http://a:1', 'http://b:2', 'http://c:3']"``; splitting on "', '"
    and dropping the bracket-bearing first/last elements recovers the
    clean middle values. Returns None when the file cannot be read.
    """
    try:
        with open("ipPool.txt", "r", encoding="utf-8") as f:
            content = f.read()
        contList = content.split("', '")
        ipList = contList[1:len(contList) - 1]  # drop bracket-bearing ends
        randomIpList = random.sample(ipList, num)  # sample() picks several distinct
        return randomIpList
    except Exception:
        # Fixed: the original logged this as a *write* error ("IP写入文件异常")
        # even though this method only reads, passed the exception to
        # logging.error with no %s (dropping it), and misused
        # traceback.format_exc(e) — format_exc takes no exception argument.
        logging.error("IP读取文件异常!%s", traceback.format_exc())
## 2. fetch and parse one page
async def getContent(url):
    """Download `url` and extract the target paragraph via XPath.

    The ClientSession is used as an async context manager, so it closes
    itself — no explicit session.close() needed (avoids the
    "unclosed client session" warning).
    """
    async with aiohttp.ClientSession() as session:
        response = await session.get(url)  # session replaces a plain requests call
        page_text = await response.text()
        tree = etree.HTML(page_text)
        return tree.xpath("/html/body/div[1]/div[3]/ul/li[1]/div[4]/p/text()")

## 3. coroutine wrapper that awaits the fetch and prints its result
async def request(url):
    extracted = await getContent(url)
    print(extracted)

## 4./5. wrap each coroutine in a task and drive them all on one event loop
def even_loop(url_list):
    """Run `request` concurrently for every url in `url_list`."""
    pending = [asyncio.ensure_future(request(one_url)) for one_url in url_list]
    asyncio.get_event_loop().run_until_complete(asyncio.wait(pending))

if __name__ == '__main__':
    start = time.time()
    # https://www.guancha.cn/society/2018_08_29_470073.shtml
    url_list = ["url1", "url2", "..."]
    even_loop(url_list)
    print("cost time:", time.time()-start)
open("C:\\Users\\lenovo\\Desktop\\png\\" + str(i) + ".png", "wb") as f: 34 | f.write(img.content) 35 | time.sleep(2) 36 | 37 | if __name__ == '__main__': 38 | chrome_options = webdriver.ChromeOptions() 39 | chrome_options.add_argument('--headless') 40 | chrome_options.add_argument('--disable-gpu') 41 | 42 | chrome_options.add_argument( 43 | '--user-agent=Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30') 44 | driver = webdriver.Chrome(chrome_options=chrome_options, 45 | executable_path="../otherfiles/chromedriver/chromedriver.exe") 46 | 47 | url_base = "http://m.360docs.net/doc/info-eef589567ed5360cba1aa8114431b90d6c85892d" 48 | 49 | for i in range(2, 95): 50 | url = url_base + "-" + str(i) + ".html" 51 | downOnePng(driver, url, i) 52 | 53 | 54 | -------------------------------------------------------------------------------- /spider/down_video/down_film.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 下载VIP视频 6 | -------------------------------------- 7 | @File : downFilm.py 8 | @Time : 2018/8/26 0:28 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | ''' 17 | IPO: 18 | input:.ts文件的url、.ts文件的个数 或者 m3u8文件 19 | process: 20 | 初始化配置参数(IP代理网页数,下载路径,视频名称,请求头部参数) 21 | 获得.ts文件的URI列表(包含文件链接地址及文件个数) 22 | 配置代理IP,选取随机IP地址进行文件下载 23 | 下载.ts文件 24 | 合并转换为MP4格式视频 25 | output:完成的MP4格式的视频 26 | 27 | 关键点: 28 | 1. 网页解析 29 | 2. m3u8解析 30 | 3. 设置动态代理 31 | 4. 调用DOS命令合并文件 32 | 33 | 难点: 34 | 1. 获得m3u8文件 35 | 2. 获取.ts文件的URI与个数 36 | 37 | 重点: 38 | 找到视频的m3u8文件(含有.ts文件的URI和个数) 39 | 40 | 注: 41 | 查看m3u8文件与.ts文件的过程: 42 | 1. 打开视频页面,审查元素。 43 | 2. 采用移动端的方式查看加载过程,即点击页面左上方的手机图标,然后刷新页面。 44 | 3. 
class VideoDownload():
    """Download a VIP video as numbered .ts segments (rotating proxy IPs)
    and merge them into a single MP4 via the DOS ``copy /b`` command."""

    def __init__(self):
        self.pageNum = 3  # pages of the proxy site to scrape; 100 IPs per page

        #################### begin: paths/urls below are per-video and may be edited ###################

        # Base URL of the numbered .ts files (iQiyi-style videos).
        self.url = "http://video2.fxsdp.com:8091/81820180315/JAVHD00054/650kb/hls/Vf6Uur2229"

        # m3u8 playlist URL (Tencent videos).
        self.TC_m3u8_url = "http://apd-983a0da8026d665ba14276af64267b05.v.smtcdns.com/vipts.tc.qq.com/A8_h69zsltM9kkOROl8Vx-l7g4JU8HQSrV-cE6aZ1uSc/SSXffv8zY6OTtSN-TvdRq_1UPxz2DDoymrbx04tr9kXcEXoqsDS-bUQNxi9ECFzDb0FC6fHlwXnBk3aN__auyD4rLuK7i9-Q7eCTkP8XE0qplasm_4UKUsyok_3nkKpoDIBP4GCk6THrBrsOST0EZxSi55wQO5Fh/0310_a0026o0eqrg.321002.ts.m3u8?ver=4"
        # .ts base URL (Tencent): combined with the .ts tails found in the
        # m3u8 file; taken from the request URL of a .ts load in devtools.
        self.TC_ts_url = "https://apd-983a0da8026d665ba14276af64267b05.v.smtcdns.com/vipts.tc.qq.com/A8_h69zsltM9kkOROl8Vx-l7g4JU8HQSrV-cE6aZ1uSc/SSXffv8zY6OTtSN-TvdRq_1UPxz2DDoymrbx04tr9kXcEXoqsDS-bUQNxi9ECFzDb0FC6fHlwXnBk3aN__auyD4rLuK7i9-Q7eCTkP8XE0qplasm_4UKUsyok_3nkKpoDIBP4GCk6THrBrsOST0EZxSi55wQO5Fh/"

        # Absolute Windows paths ("\\").
        self.m3u8Path = "E:\\delete\\temp.m3u8"      # temp location of the m3u8 file
        self.download_path = "E:\\delete\\DOWN"      # where .ts clips are downloaded
        self.final_path = "E:\\delete\\FINAL"        # where the merged video goes
        self.name = "resultFilmName"                 # merged video file name

        #################### end: paths/urls above are per-video and may be edited ###################

        self.headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }

    def getURIList(self, count=1741):
        """Build the URI list of numbered .ts files (iQiyi-style videos).

        `count` is the number of segments (default keeps the original
        hard-coded 1741; check the resolver site for the real count).

        Fixed: the original's `<10`/`>100` branches produced "0100" for
        segment 100 (four digits instead of three), breaking the naming
        scheme; zfill(3) pads uniformly.
        """
        return [self.url + str(i).zfill(3) + ".ts" for i in range(1, count + 1)]

    def get_m3u8_uri(self):
        """Download and parse the m3u8 playlist (Tencent videos).

        Returns the full .ts URLs, or None if anything fails.
        """
        m3u8Content = requests.get(self.TC_m3u8_url)
        try:
            with open(self.m3u8Path, "w") as f:
                f.write(m3u8Content.text)
            uri_list = []
            # Read back and keep only the .ts entries as HTTP request tails.
            # (The original leaked this file handle — no close().)
            with open(self.m3u8Path, "r") as m3u8Cont:
                for line in m3u8Cont.readlines():
                    if ".ts" in line:
                        uri_list.append(self.TC_ts_url + line.strip())
            return uri_list
        except Exception:
            # Original referenced traceback.print_exception without calling it (a no-op).
            traceback.print_exc()

    def getIPList(self):
        """Scrape `pageNum` pages of the proxy site; return 'host:port' strings."""
        print("获取代理IP...")
        url = 'http://www.xicidaili.com/nn/'
        ipList = []
        for i in range(self.pageNum):
            newurl = url + str(i + 1)  # pages are 1-based
            r = requests.get(newurl, headers=self.headers)
            soup = bs(r.text, "html.parser")
            ips = soup.find_all("tr")
            for j in range(1, len(ips)):  # row 0 is the header row
                tds = ips[j].find_all("td")
                ipList.append(tds[1].string + ":" + tds[2].string)
        return ipList

    def getRandomIP(self, ipList):
        """Pick one random IP and wrap it as a requests proxies dict."""
        random_ip = random.choice(ipList)
        proxy_ip = "http://" + random_ip
        proxies = {"http": proxy_ip}
        return proxies

    def downloadFilm(self):
        """Download the .ts clips into download_path, rotating proxies."""
        print("开始下载...")
        start_time = time.time()
        os.chdir(self.download_path)  # clips are written relative to this dir
        ip_list = self.getIPList()
        proxies = self.getRandomIP(ip_list)

        uri_list = self.getURIList()      # iQiyi-style numbered segments
        # uri_list = self.get_m3u8_uri()  # Tencent: parse the m3u8 instead
        print(uri_list[:3])

        num = 1  # clip counter
        for uri in uri_list[:10]:
            if num % 5 == 0:
                print("更换代理IP")
                proxies = self.getRandomIP(ip_list)
            if num % 60 == 0:
                print("休眠10s")
                time.sleep(10)
            try:
                # Some hosts reject proxies; drop `proxies=` if requests fail.
                resp = requests.get(uri, headers=self.headers, proxies=proxies)
            except Exception:
                # Fixed: the original fell through here with `resp` unbound
                # (NameError on the write below) and mis-called
                # traceback.print_exception() with no arguments.
                traceback.print_exc()
                continue
            # zfill keeps clip numbering uniform (clip001 … clip100 … clip1741);
            # the original emitted "clip0100" for num == 100.
            name = 'clip%s.ts' % str(num).zfill(3)
            with open(name, "wb") as f:
                f.write(resp.content)
            print("正在下载clip%d" % num)
            num = num + 1
        print("下载完成!总共耗时 %0.3f min " % float((time.time() - start_time) / 60.0))

    def mergeFilm(self):
        """Merge every downloaded .ts (DOS `copy /b`) into one MP4."""
        mess = input("是否进行电影合并?(y/n)")
        if mess == "y":
            try:
                os.chdir(self.download_path)  # run the DOS command from the clip dir
                allFile = ''
                for file in os.listdir(self.download_path):
                    allFile = allFile + "+" + file
                allFileName = allFile[1:]  # drop the leading '+'
                # Join all clip names with '+' and let `copy /b` concatenate them.
                command = "copy /b " + allFileName + " /y %s\%s.mp4" % (self.final_path, self.name)  # DOS path separator
                os.system(command)
                print("合并完成...")
            except Exception:
                # Original referenced traceback.print_exception() incorrectly.
                traceback.print_exc()
        else:
            print("不合并电影,程序退出。")


if __name__ == "__main__":
    videoDownload = VideoDownload()
    videoDownload.downloadFilm()
    # videoDownload.mergeFilm()
def get_book_urls(url):
    """Fetch `url` with randomized headers (after a polite 2s delay) and
    return the page HTML as a string."""
    time.sleep(2)
    resp = requests.get(url=url, headers=user_agent.UserAgent().get_headers())
    return str(resp.text)


if __name__ == '__main__':
    base_url = "http://www.shicimingju.com"
    website_url = "http://www.shicimingju.com/book/"
    index_html = get_book_urls(website_url)

    index_tree = etree.HTML(index_html)
    books_list = index_tree.xpath('//*[@class="bookmark-list"]/ul/li/h2/a/@href')
    books_name = index_tree.xpath('//*[@class="bookmark-list"]/ul/li/h2/a/text()')

    # 1. full catalogue of books
    print("共 {} 本。".format(len(books_list)))

    book_dict = dict(zip(books_list, books_name))

    for url, name in book_dict.items():
        with open(name + ".txt", "w", encoding="utf-8") as f:
            # 2. table of contents for this book
            toc_tree = etree.HTML(get_book_urls(base_url + url))

            # chapter links of the book
            chapter_links = toc_tree.xpath('//*[@class="book-mulu"]/ul/li/a/@href')

            for chapter_link in chapter_links[:1]:
                # 3. body of one chapter
                chapter_tree = etree.HTML(get_book_urls(base_url + chapter_link))

                # chapter title
                chapter_title = chapter_tree.xpath('//*/h1/text()')
                # chapter paragraphs
                paragraphs = chapter_tree.xpath('//*[@class="chapter_content"]/p/text()')

                f.write(str(chapter_title))

                for paragraph in paragraphs:
                    f.write(paragraph)
def write2mysql(num, content):
    """Persist the scraped article body for news_id `num` into docList.

    Fixed: the SQL was built with %-string interpolation, which both broke
    on any quote character in the article text and was SQL-injectable;
    cursor.execute's parameter binding handles escaping. The connection is
    now also closed (it previously leaked on every call).
    """
    conn = pymysql.connect(host="localhost", user='root', password='123456789', database='news', charset='utf8')
    cursor = conn.cursor()
    # Parameterized query — values are bound by the driver, not interpolated.
    sql = "UPDATE docList SET content = %s WHERE news_id = %s"
    try:
        cursor.execute(sql, (content[0], num))
        conn.commit()
    except Exception:
        conn.rollback()
    finally:
        conn.close()
| 108 | 109 | # beginNum = 281189000 110 | # endNum = 281189001 111 | # beginTime = time.time() 112 | # 113 | # pool = Pool(4) 114 | # for num in range(beginNum, endNum): 115 | # try: 116 | # pool.map(run, run(num)) 117 | # pool.close() # 关闭进程池,不再接受新的进程 118 | # pool.join() # 主进程阻塞等待子进程的退出 119 | # except: 120 | # pass 121 | # continue 122 | # endTime = time.time() 123 | # usedTime = endTime - beginTime 124 | # print(usedTime) 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /spider/get_html_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: get_html_new 5 | Description: 6 | Author: tebon 7 | Date: 2018/8/13 8 | ------------------------------------------------- 9 | Change Activity: 2018/8/13 10 | ------------------------------------------------- 11 | """ 12 | __author__ = 'tebon' 13 | 14 | 15 | import requests 16 | from lxml import etree 17 | import random 18 | import time 19 | from selenium import webdriver 20 | from multiprocessing import Pool 21 | import pymysql 22 | 23 | def getHTMLText(url): 24 | driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs') # phantomjs的绝对路径 25 | driver.set_page_load_timeout(1) 26 | time.sleep(1) 27 | driver.get(url) # 获取网页 28 | time.sleep(1) 29 | return driver.page_source 30 | 31 | def getContent(headers, html): 32 | struct = etree.HTML(html) 33 | # title = struct.xpath('/html/body/section/div[1]/h1/text()') 34 | content = struct.xpath('/html/body/section/div[2]/p/text()') 35 | # time = struct.xpath('/html/body/section/div[1]/p/span[2]/text()') 36 | # source = struct.xpath('/html/body/section/div[1]/p/span[1]/text()') 37 | # return title, content, time, source 38 | return content 39 | 40 | # def write2file(title, content, time, source): 41 | # conn = pymysql.connect(host="localhost", user='root', 
password='123456789', database = 'news', charset='utf8') 42 | # print(conn) 43 | # cursor = conn.cursor(); 44 | # cursor.execute("SELECT * FROM info") 45 | # data = cursor.fetchone() 46 | # print(data) 47 | # with open("./content.txt", "a+", encoding = "utf-8") as f: 48 | # f.write(''.join(title)) 49 | # f.write('\n') 50 | # f.write(''.join(content)) 51 | # f.write('\n\n') 52 | 53 | 54 | 55 | def write2mysql(num, content): 56 | conn = pymysql.connect(host="localhost", user='root', password='123456789', database = 'news', charset='utf8') 57 | cursor = conn.cursor(); 58 | sql = "UPDATE docList SET content = '%s' WHERE news_id = %s" % (content[0], num) 59 | print(sql) 60 | pass 61 | try: 62 | cursor.execute(sql) 63 | conn.commit() 64 | except: 65 | print("error: write to mysql! ") 66 | conn.rollback() 67 | 68 | def run(num): 69 | url = "http://localhost:92/zx/cont.html?id=" + str(num) + "&type=jrtt&flags=null" 70 | html = getHTMLText(url) 71 | content = getContent(headers, html) 72 | write2mysql(str(num), content) 73 | 74 | 75 | def get_news_id(): 76 | conn = pymysql.connect(host="localhost", user='root', password='123456789', database = 'news', charset='utf8') 77 | cursor = conn.cursor(); 78 | sql = "SELECT news_id FROM docList" 79 | try: 80 | cursor.execute(sql) 81 | result = cursor.fetchall() 82 | list = [] 83 | for one in result: 84 | list.append(one[0]) 85 | 86 | for newsid in list[38:39]: 87 | run(newsid) 88 | 89 | except: 90 | print("错误!") 91 | conn.rollback() 92 | 93 | 94 | 95 | 96 | if __name__ == '__main__': 97 | headers = { 98 | 'Connection': 'Keep-Alive', 99 | 'Accept': 'text/html, application/xhtml+xml, */*', 100 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 101 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 102 | } 103 | 104 | get_news_id() 105 | 106 | 107 | 108 | 109 | 110 | # beginNum = 281189000 111 | # endNum = 281189001 112 | # beginTime = 
time.time() 113 | # 114 | # pool = Pool(4) 115 | # for num in range(beginNum, endNum): 116 | # try: 117 | # pool.map(run, run(num)) 118 | # pool.close() # 关闭进程池,不再接受新的进程 119 | # pool.join() # 主进程阻塞等待子进程的退出 120 | # except: 121 | # pass 122 | # continue 123 | # endTime = time.time() 124 | # usedTime = endTime - beginTime 125 | # print(usedTime) 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /spider/get_meizi_image.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 多线程爬取网站图片 6 | -------------------------------------- 7 | @File : getMeiziImage.py 8 | @Time : 2018/8/26 0:23 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import requests 17 | from bs4 import BeautifulSoup 18 | import os 19 | from multiprocessing import Pool 20 | import sys 21 | from datetime import datetime 22 | import re 23 | import traceback 24 | 25 | headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Referer': 'http://www.mzitu.com'} 26 | Picreferer = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Referer': 'http://i.meizitu.net'} 27 | 28 | 29 | def request(url): 30 | try: 31 | html = requests.get(url, headers=headers) 32 | soup = BeautifulSoup(html.text, "lxml") # beautiful库解析 33 | return soup 34 | except: 35 | print("request请求链接异常!") 36 | traceback.print_exc() 37 | 38 | 39 | def get_MaxPage(href): 40 | try: 41 | html_soup = request(href) 42 | max_span = html_soup.find("div", class_="pagenavi").find_all("span")[-2].get_text() # 获取页数 43 | return max_span 44 | except: 45 | print("获取最大页数异常!") 46 | traceback.print_exc() 47 | 48 | def re2title(string): 49 | result = 
re.sub(r'\*|\?|\?|\:|\:|\||\&|\$|\@|\>|\<|\""|\'\'|\“”|\\|\/', "", string) 50 | return result 51 | 52 | 53 | def download(all_url, root_path, num): 54 | try: 55 | count = 0 56 | a_soup = request(all_url) 57 | all_a = a_soup.find("div", class_="all").find_all("a") # 20180308 11:50 共2693个 58 | print(len(all_a)) 59 | for a in all_a[1:int(num)]: 60 | title = a.get_text() 61 | href = a["href"] # 获取a标签的链接 62 | 63 | # makdirs 64 | newTitle = re2title(title) 65 | path = newTitle.strip() 66 | os.makedirs(os.path.join(root_path, path)) 67 | os.chdir(root_path + "\\" + path) 68 | 69 | max_span = get_MaxPage(href) 70 | 71 | for page in range(1, int(max_span) + 1): 72 | page_url = href + "/" + str(page) 73 | 74 | img_soup = request(page_url) 75 | 76 | img_url = img_soup.find("div", class_="main-image").find("img")["src"] 77 | name = img_url[-9: -4] # 截取 78 | img = requests.get(img_url, headers=Picreferer) 79 | f = open(name + ".jpg", "wb") 80 | f.write(img.content) 81 | f.close 82 | count += 1 83 | # print("完成:" + title) 多线程时无法执行 84 | print("完成:" + str(count / (num - 1) * 100) + "%") 85 | except: 86 | print("下载异常!") 87 | traceback.print_exc() 88 | 89 | 90 | def main(): 91 | all_url = 'http://www.mzitu.com/all' # 爬取链接入口 92 | root_path = "E:\\mzitu" # 本地存储根目录 93 | num = 4 # 爬取个数-1 94 | 95 | if not os.path.isdir(root_path): 96 | os.makedirs(root_path) 97 | 98 | start_time = datetime.now() 99 | print(start_time) 100 | 101 | # 方法1 102 | download(all_url, root_path, num) 103 | 104 | # 方法2 105 | ''' 106 | # 线程池个数,电脑CPU个数 107 | pool = Pool(4) 108 | for i in range(4): 109 | pool.apply_async(download, args = (all_url, root_path, num) ) # apply_async非阻塞且支持结果返回进行回调 110 | pool.close() 111 | pool.join() 112 | ''' 113 | 114 | end_time = datetime.now() 115 | print(end_time) 116 | print("程序耗时:") 117 | print(end_time - start_time) 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | 123 | -------------------------------------------------------------------------------- 
/spider/get_url_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 获取接口资讯数据 6 | -------------------------------------- 7 | @File : get_url_data.py 8 | @Time : 2018/8/26 16:10 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import requests 17 | from lxml import etree 18 | import random 19 | from time import sleep 20 | from selenium import webdriver 21 | 22 | 23 | def getHTMLText(url): 24 | driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs') # phantomjs的绝对路径 25 | driver.set_page_load_timeout(5) 26 | time.sleep(2) 27 | driver.get(url) # 获取网页 28 | time.sleep(2) 29 | return driver.page_source 30 | 31 | 32 | def getContent(headers, html): 33 | print(html) 34 | 35 | # options = webdriver.ChromeOptions() 36 | # 37 | # options.add_argument('--headless') 38 | # 39 | # driver = webdriver.Chrome(options =options) 40 | # driver.get(url) 41 | # print(url) 42 | # print(driver.page_source) 43 | 44 | 45 | 46 | 47 | if __name__ == '__main__': 48 | headers = { 49 | 'Connection': 'Keep-Alive', 50 | 'Accept': 'text/html, application/xhtml+xml, */*', 51 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 52 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 53 | } 54 | id_list = ['281239195'] 55 | 56 | url = "http://localhost:92/zx/cont.html?id=" + id_list[0] + "&type=jrtt" 57 | html = getHTMLText(url) 58 | getContent(headers, html) 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /spider/gzh/GZH.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
!/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | 1. 爬取文章 7 | 2. 下载图片 8 | 3. 替换图片 9 | 4. 输出html 10 | -------------------------------- 11 | @Time : 2019/5/29 11:54 12 | @File : GZH.py 13 | @Software: PyCharm 14 | -------------------------------- 15 | @Author : lixj 16 | @contact : lixj_zj@163.com 17 | """ 18 | 19 | import requests 20 | from lxml import etree 21 | import logging 22 | import random 23 | import re 24 | import os 25 | import user_agent as userAgent 26 | import time 27 | import uuid 28 | 29 | # logging.basicConfig函数对日志的输出格式及方式做相关配置 30 | logging.basicConfig(level=logging.INFO, 31 | format='%(asctime)s - %(filename)s[line:%(lineno)d] ' 32 | '- %(levelname)s: %(message)s') 33 | 34 | 35 | def get_random_ip(): 36 | """ 37 | 获取随机的IP地址 38 | :return: 39 | """ 40 | with open("ip_pool.txt", "r") as f: # 构建IP池 41 | content = f.read() 42 | cont_list = content.split("', '") 43 | ip_list = cont_list[1:len(cont_list) - 1] 44 | random_ip = random.choice(ip_list) 45 | proxy_ip = "http://" + random_ip 46 | proxies = {"http": proxy_ip} 47 | logging.info("random ip is {}".format(proxies)) 48 | return proxies 49 | 50 | 51 | def download_img(img_link_list, img_path): 52 | """ 53 | 下载所有图片 54 | :param img_link_list: 55 | :param img_path: 56 | :return: 57 | """ 58 | if not os.path.exists(img_path): 59 | os.makedirs(img_path) 60 | os.chdir(img_path) # 切换下载图片的目录 61 | for img_num, img_link in enumerate(img_link_list): 62 | img = requests.get(img_link, headers=headers, proxies=proxies) 63 | try: 64 | suffix = img_link.split("=")[-1] 65 | if suffix in ["jpeg","png","jpg","gif","webp"]: 66 | with open(str(img_num) + "." 
+ suffix, "wb") as f: 67 | f.write(img.content) 68 | logging.info("Download {img_num} th img succeed!".format(img_num=str(img_num))) 69 | else: 70 | continue 71 | except Exception as e: 72 | logging.error(str(e)) 73 | 74 | 75 | def getimg_link_list(url): 76 | """ 77 | 获取所有图片链接 78 | :param url: 79 | :return: 80 | """ 81 | try: 82 | req = requests.get(url, headers=headers, proxies=proxies) 83 | struct = etree.HTML(req.text) 84 | # 获取所有图片地址 85 | x_path = "//img/@data-src" # 匹配任意深度含有data-src熟悉的图片,获取链接 86 | img_link_list = struct.xpath(x_path) 87 | logging.info("get img link list succeed!") 88 | return img_link_list 89 | except Exception as e: 90 | logging.error(str(e)) 91 | 92 | 93 | def download_html(url, html_path): 94 | """ 95 | 下载html页面,命名为文章名.html 96 | :param url: 97 | :param html_path: 98 | :return: 99 | """ 100 | if not os.path.exists(html_path): 101 | os.makedirs(html_path) 102 | os.chdir(html_path) # 切换根目录 103 | try: 104 | req = requests.get(url, headers=headers, proxies=proxies) 105 | struct = etree.HTML(req.text) 106 | x_path = "//h2/text()" 107 | title = struct.xpath(x_path) 108 | html_name = title[0].replace("\\n", "").strip() 109 | with open(html_name + ".html", "w+", encoding="utf-8") as f: 110 | f.write(req.text) 111 | logging.info("download old html succeed!") 112 | return html_name 113 | except Exception as e: 114 | logging.error(str(e)) 115 | 116 | 117 | def replace_img(html_path, html_name, img_path): 118 | """ 119 | 替换图片 120 | :param html_path: 121 | :param html_name: 122 | :param img_path: 123 | :return: 124 | """ 125 | path_list = os.listdir(img_path) 126 | path_list.sort(key=lambda x: int(x.split(".")[0])) # 顺序读取 127 | os.chdir(html_path) 128 | with open(html_name + ".html", "r+", encoding="utf-8") as f: 129 | html = f.read() 130 | pattern = r'' 131 | img_re = re.compile(pattern) 132 | img_list = re.findall(img_re, html) 133 | 134 | for img, path in zip(imglist, path_list): 135 | img_tag_list = img.split(" />") 136 | fullimg_path = img_path + 
"\\" + path 137 | new_img_tag = img_tag_list[0] + "src=" + "\"" + fullimg_path + "\"" + " />" 138 | if html.__contains__(img): 139 | new_html = html.replace(img, new_img_tag) 140 | html = new_html 141 | logging.info("replace img succeed!") 142 | return html 143 | 144 | 145 | def del_file(path, html_name): 146 | """ 147 | 删除指定文件 148 | :param path: 149 | :param html_name: 150 | :return: 151 | """ 152 | try: 153 | os.remove(path + os.path.altsep + html_name + ".html") 154 | logging.info("remove file {html_name}.html done!".format(html_name=html_name)) 155 | except Exception as e: 156 | logging.error(str(e)) 157 | 158 | 159 | def write_img_to_new_html(newhtml_path, html, html_name): 160 | """ 161 | 重写文件中的图片,生成新的html 162 | :param newhtml_path: 163 | :param html: 164 | :param html_name: 165 | :return: 166 | """ 167 | os.chdir(newhtml_path) 168 | try: 169 | # 直接覆盖原来没有图片的文件 170 | with open(html_name + ".html", "w+", encoding="utf-8") as f: 171 | f.write(html) 172 | logging.info("rewrite img to new html succeed!") 173 | except Exception as e: 174 | logging.error(str(e)) 175 | 176 | 177 | def get_random_path_name(): 178 | """ 179 | 获取随机数命名文件夹 180 | :return: 181 | """ 182 | return time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + "_" + str(uuid.uuid4()) 183 | 184 | 185 | def run(url): 186 | # 定义常量 187 | root_path = os.path.dirname(__file__) + os.path.altsep + "GZH" + os.path.altsep + get_random_path_name() 188 | img_path = root_path + os.path.altsep + "img" 189 | 190 | # 下载HTML文件 191 | html_name = download_html(url, root_path) 192 | 193 | # 下载图片 194 | img_list = getimg_link_list(url) 195 | download_img(img_list, img_path) 196 | 197 | # 替换图片写入新的HTML 198 | after_replace_img_html = replace_img(root_path, html_name, img_path) 199 | write_img_to_new_html(root_path, after_replace_img_html, html_name) 200 | 201 | 202 | if __name__ == '__main__': 203 | global proxies, headers 204 | headers = userAgent.UserAgent().get_headers() 205 | # 随机IP 206 | proxies = get_random_ip() 207 | 
208 | url = "https://mp.weixin.qq.com/s/LzRn5vaNayeJ3Z41ZpLqxA" 209 | 210 | run(url) 211 | -------------------------------------------------------------------------------- /spider/movieReview/cleanData.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 数据处理 6 | -------------------------------------- 7 | @File : cleanData.py 8 | @Time : 2018/8/25 16:33 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import re 17 | 18 | def cleanData(HTMLDic): 19 | # for key in HTMLDic.keys(): 20 | print(type(HTMLDic[str(6)])) # 内容 21 | content = HTMLDic[str(6)] 22 | for i in range(len(content)): 23 | content[i] = re.sub(r'\*|\'|\ |\\|\/', "", str(content[i])) 24 | print(content) 25 | ''' 26 | jasdfj 27 | ''' 28 | 29 | 30 | def getHTMLDic(): 31 | tempFile = "./cleanData.txt" 32 | with open(tempFile, "r", encoding = "utf-8") as f: 33 | tempStr = f.read() 34 | tempDic = eval(tempStr) # str to dic 35 | return tempDic 36 | 37 | 38 | def main(): 39 | HTMLDic = getHTMLDic() 40 | cleanData(HTMLDic) 41 | 42 | main() 43 | 44 | -------------------------------------------------------------------------------- /spider/movieReview/film.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 爬取电影影评 6 | -------------------------------------- 7 | @File : film.py 8 | @Time : 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | ''' 17 | requests+bs4+jieba 18 | 19 | IPO 20 | Input: 电影网站链接 21 | Process: 网站链接 - 电影链接 - 评论处理 22 | Output: 词云图 
23 | ''' 24 | 25 | import os 26 | import sys 27 | import requests 28 | import re 29 | import jieba 30 | import pandas as pd 31 | import numpy 32 | from pyecharts import WordCloud 33 | from bs4 import BeautifulSoup as bs 34 | 35 | 36 | # 获取电影 37 | def getNowPlayingMovie(url): 38 | r = requests.get(url) 39 | html = r.text 40 | soup = bs(html, "html.parser") 41 | nowplaying_movie = soup.find_all("div", id = "nowplaying") 42 | nowplaying_movie_list = nowplaying_movie[0].find_all("li", class_ = "list-item") # [0] 43 | movieList = [] 44 | for oneMovie in nowplaying_movie_list: 45 | movieDict = {} # 列表中的元组存储数据 46 | movieDict["id"] = oneMovie["data-subject"] # 通过css属性标签直接获取属性值 47 | movieDict["name"] = oneMovie["data-title"] 48 | movieList.append(movieDict) 49 | return movieList 50 | 51 | # 获取评论 52 | def getCommentsById(moviedId, pageNum): 53 | for i in range(pageNum): 54 | url = "https://movie.douban.com/subject/" + moviedId + "/comments?start=0&limit=" + str(i) 55 | 56 | # 处理特殊文字、符号的乱码问题 57 | non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd) 58 | r = requests.get(url) 59 | html = r.text 60 | html = html.translate(non_bmp_map) 61 | soup = bs(html, "html.parser") 62 | comments = soup.find_all("div", id = "comments") 63 | comments_list = comments[0].find_all("p", class_="") 64 | 65 | commentsResult = [] 66 | for comment in comments_list: 67 | if comment.string == None or comment.string == "": 68 | continue 69 | else: 70 | commentsResult.append(comment.string.strip()) 71 | return commentsResult 72 | 73 | # 数据处理 74 | def dataWranging(dataList): 75 | ## 1. 筛选所有评论文字 76 | dataStr = "" 77 | for data in dataList: 78 | dataStr = dataStr + data 79 | patten = re.compile(r"[\u4e00-\u9fa5]+") # 匹配所有文字 80 | filterData = re.findall(patten, dataStr) 81 | wrangedData = "".join(filterData) # List to String 82 | 83 | ## 2. 分词 84 | segment = jieba.lcut(wrangedData) # 结巴分词 85 | words_df = pd.DataFrame({'segment': segment}) # pandas显示分词结果 86 | 87 | ## 3. 
除去停用词(设置chineseStopWords.txt文件为utf-8编码) 88 | stopwords = pd.read_csv(".\stopWords.txt", index_col = False, quoting = 3, sep = "\t", names = ["stopword"]) 89 | keyWords = words_df[~words_df.segment.isin(stopwords.stopword)] 90 | keyWordsList = [] 91 | temp = list(keyWords.as_matrix()) # 返回向量组成的列表 92 | for i in range(len(keyWords)): 93 | keyWordsList.append(temp[i][0]) 94 | 95 | ## 4. 词频统计 96 | keyWordDict = {} 97 | for keyWord in keyWordsList: 98 | if keyWord not in keyWordDict: 99 | keyWordDict[keyWord] = 1 100 | else: 101 | keyWordDict[keyWord] += 1 102 | keyWordDict = sorted(keyWordDict.items(), key = lambda x:x[1], reverse = True) # 按照频率排序 103 | return keyWordDict[:100] 104 | 105 | # 绘制词云图 106 | def wordCloud(keyWordDict, label): 107 | x = []; y = [] 108 | for i in range(len(keyWordDict)): 109 | x.append(keyWordDict[i][0]) 110 | y.append(keyWordDict[i][1]) 111 | wordCloud = WordCloud(label, width = 1300, height = 620) 112 | wordCloud.add("", x, y, word_size_range = [20, 100], shape = "circle") 113 | wordCloud.render() 114 | os.system(r"render.html") 115 | 116 | 117 | def main(): 118 | url = "https://movie.douban.com/cinema/nowplaying/shanghai/" 119 | # movieList = getNowPlayingMovie(url) 120 | 121 | pageNum = 5 122 | moviedId = "26363254" 123 | commentsResult = getCommentsById(moviedId, pageNum) 124 | 125 | keyWordDict = dataWranging(commentsResult) 126 | 127 | label = "词云图" 128 | wordCloud(keyWordDict, label) 129 | 130 | if __name__ == "__main__": 131 | main() 132 | 133 | 134 | -------------------------------------------------------------------------------- /spider/movieReview/filmComments.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 获取豆瓣影评 6 | -------------------------------------- 7 | @File : filmComments.py 8 | @Time : 9 | @Software : PyCharm 10 | -------------------------------------- 11 | 
@Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | 17 | """ 18 | requests + Xpath + pandas + MongoDB 19 | IPO: 20 | input: url 21 | process: 获取HTML页面内容,Xpath解析,pandas数据处理,数据写入csv文件,数据存入数据库 22 | output: csv文件、存入到mongodb 23 | 24 | 评论内容解析: 25 | id 26 | name 27 | recommend 28 | time 29 | title 30 | content https://movie.douban.com/review/5199026 + id 31 | useful 32 | useless 33 | comment 34 | res_name 35 | res_time 36 | res_content 37 | 38 | 问题: 39 | 1. 评论解析为空,适配不同的标签 √ 40 | 2. 评论文字过多,csv文件中单个单元格错乱 (设置阈值,大于则压缩或截取部分内容) 41 | 3. IP被封,调试问题 (将爬取下来的内容存入临时文件中,从文件中读取数据) 42 | 4. 数据清理,存储为文件 43 | """ 44 | 45 | import requests 46 | import re 47 | from lxml import etree 48 | import pandas as pd 49 | import csv 50 | import codecs 51 | import traceback 52 | import random 53 | 54 | # 选取随机的IP地址 55 | def getRandomIP(): 56 | with open("./ip_pool.txt", "r") as f: 57 | content = f.read() 58 | contList = content.split("', '") 59 | ipList = contList[1:len(contList)-1] 60 | random_ip = random.choice(ipList) 61 | proxy_ip = "http://" + random_ip 62 | proxies = {"http" : proxy_ip } 63 | return proxies 64 | 65 | def getHTMLContent(url, headers, proxies): 66 | res = requests.get(url, headers = headers, proxies = proxies) 67 | struct = etree.HTML(res.text) 68 | 69 | dic = {} 70 | filmName = struct.xpath('//div[@id="content"]/h1/text()') 71 | dic["0"] = filmName 72 | IDList = struct.xpath('//div[@class="main review-item"]/@id') 73 | dic["1"] = IDList 74 | nameList = struct.xpath('//a[@class="name"]/text()') 75 | dic["2"] = nameList 76 | recommendList = struct.xpath('//header[@class="main-hd"]/span/@title') 77 | dic["3"] = recommendList 78 | timeList = struct.xpath('//span[@class="main-meta"]/text()') 79 | dic["4"] = timeList 80 | titleList = struct.xpath('//div[@class="main-bd"]/h2/a/text()') 81 | dic["5"] = titleList 82 | 83 | contentList = [] 84 | for userid in IDList: 85 | contentURL = "https://movie.douban.com/review/" + userid 86 | r = 
requests.get(contentURL, headers = headers, proxies = proxies) 87 | contStruct = etree.HTML(r.text) 88 | fullContentOne = stripForList(contStruct.xpath('//div[@class="review-content clearfix"]/text()')) 89 | fullContentTwo = stripForList(contStruct.xpath('//div[@class="review-content clearfix"]/p/text()')) 90 | resultContent = fullContentOne + fullContentTwo # list合并 91 | contentList.append(resultContent) 92 | dic["6"] = contentList 93 | 94 | usefulList = stripForList(struct.xpath('//a[@title="有用"]/span/text()')) 95 | dic["7"] = usefulList 96 | uselessList = stripForList(struct.xpath('//a[@title="没用"]/span/text()')) 97 | dic["8"] = uselessList 98 | commentList = struct.xpath('//a[@class="reply"]/text()') 99 | dic["9"] = commentList 100 | 101 | with open("./temp2.txt", "w", encoding = "utf-8") as f: 102 | f.write(str(dic)) 103 | 104 | return dic 105 | 106 | def writeData2CSV(HTMLDic): 107 | csv_col_name = ["主题", "用户ID", "用户名", "推荐力度", "评论时间", "评论标题", "评论内容", "有用个数", "没用个数", "回应内容"] 108 | resultDic = {} 109 | 110 | HTMLDic = cleanData(HTMLDic) 111 | 112 | try: 113 | for i in range(1, len(csv_col_name)): 114 | resultDic[csv_col_name[i]] = HTMLDic[str(i)] 115 | dataframe = pd.DataFrame(resultDic) 116 | dataframe.to_csv("./test2.csv", sep=',', encoding = "utf_8_sig", columns = csv_col_name) # 解决中文在csv文件中乱码 117 | except: 118 | traceback.print_exception 119 | 120 | 121 | def cleanData(HTMLDic): 122 | print("begin clean...") 123 | dic = {'!,':'!', '……,':'……', '?,':'?', ',,':''} 124 | for j in range(len(HTMLDic)): 125 | if j == 6: # 清理内容 126 | for i in range(len(HTMLDic[str(j)])): 127 | HTMLDic[str(j)][i] = re.sub(r'\*|\'|\[|\]|\ |\\|\/', "", str(HTMLDic[str(j)][i])) 128 | for key,value in dic.items(): 129 | HTMLDic[str(j)][i] = HTMLDic[str(j)][i].replace(key, value) 130 | else: 131 | continue 132 | 133 | return HTMLDic 134 | 135 | 136 | def getHTMLDic(): 137 | tempFile = "./temp2.txt" 138 | tempStr = "" 139 | with open(tempFile, "r", encoding = "utf-8") as f: 140 | tempStr 
= f.read() 141 | tempDic = eval(tempStr) # str to dic 142 | return tempDic 143 | 144 | 145 | def stripForList(targetList): 146 | result = [] 147 | for target in targetList: 148 | result.append(target.strip()) 149 | return result 150 | 151 | 152 | def main(): 153 | url = "https://movie.douban.com/subject/1292212/reviews" 154 | headers = { 155 | 'Connection': 'Keep-Alive', 156 | 'Accept': 'text/html, application/xhtml+xml, */*', 157 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 158 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 159 | } 160 | #proxies = getRandomIP() 161 | #HTMLDic = getHTMLContent(url, headers, proxies) 162 | HTMLDic = getHTMLDic() 163 | writeData2CSV(HTMLDic) 164 | 165 | if __name__ == "__main__": 166 | main() 167 | 168 | 169 | -------------------------------------------------------------------------------- /spider/movieReview/stopWords.txt: -------------------------------------------------------------------------------- 1 | 的 2 | 一 3 | 不 4 | 在 5 | 人 6 | 有 7 | 是 8 | 为 9 | 以 10 | 于 11 | 上 12 | 他 13 | 而 14 | 后 15 | 之 16 | 来 17 | 及 18 | 了 19 | 因 20 | 下 21 | 可 22 | 到 23 | 由 24 | 这 25 | 与 26 | 也 27 | 此 28 | 但 29 | 并 30 | 个 31 | 其 32 | 已 33 | 无 34 | 小 35 | 我 36 | 们 37 | 起 38 | 最 39 | 再 40 | 今 41 | 去 42 | 好 43 | 只 44 | 又 45 | 或 46 | 很 47 | 亦 48 | 某 49 | 把 50 | 那 51 | 你 52 | 乃 53 | 它 54 | 吧 55 | 被 56 | 比 57 | 别 58 | 趁 59 | 当 60 | 从 61 | 到 62 | 得 63 | 打 64 | 凡 65 | 儿 66 | 尔 67 | 该 68 | 各 69 | 给 70 | 跟 71 | 和 72 | 何 73 | 还 74 | 即 75 | 几 76 | 既 77 | 看 78 | 据 79 | 距 80 | 靠 81 | 啦 82 | 了 83 | 另 84 | 么 85 | 每 86 | 们 87 | 嘛 88 | 拿 89 | 哪 90 | 那 91 | 您 92 | 凭 93 | 且 94 | 却 95 | 让 96 | 仍 97 | 啥 98 | 如 99 | 若 100 | 使 101 | 谁 102 | 虽 103 | 随 104 | 同 105 | 所 106 | 她 107 | 哇 108 | 嗡 109 | 往 110 | 哪 111 | 些 112 | 向 113 | 沿 114 | 哟 115 | 用 116 | 于 117 | 咱 118 | 则 119 | 怎 120 | 曾 121 | 至 122 | 致 123 | 着 124 | 诸 125 | 自 126 | 啊 127 | 阿 128 | 哎 129 | 哎呀 130 | 哎哟 131 | 
唉 132 | 俺 133 | 俺们 134 | 按 135 | 按照 136 | 吧 137 | 吧哒 138 | 把 139 | 罢了 140 | 被 141 | 本 142 | 本着 143 | 比 144 | 比方 145 | 比如 146 | 鄙人 147 | 彼 148 | 彼此 149 | 边 150 | 别 151 | 别的 152 | 别说 153 | 并 154 | 并且 155 | 不比 156 | 不成 157 | 不单 158 | 不但 159 | 不独 160 | 不管 161 | 不光 162 | 不过 163 | 不仅 164 | 不拘 165 | 不论 166 | 不怕 167 | 不然 168 | 不如 169 | 不特 170 | 不惟 171 | 不问 172 | 不只 173 | 朝 174 | 朝着 175 | 趁 176 | 趁着 177 | 乘 178 | 冲 179 | 除 180 | 除此之外 181 | 除非 182 | 除了 183 | 此 184 | 此间 185 | 此外 186 | 从 187 | 从而 188 | 出 189 | 打 190 | 待 191 | 但 192 | 但是 193 | 当 194 | 当着 195 | 到 196 | 得 197 | 的 198 | 的话 199 | 等 200 | 等等 201 | 地 202 | 第 203 | 对 204 | 对于 205 | 多少 206 | 而 207 | 而况 208 | 而且 209 | 而是 210 | 而外 211 | 而言 212 | 而已 213 | 尔后 214 | 反过来 215 | 反过来说 216 | 反之 217 | 非但 218 | 非徒 219 | 否则 220 | 嘎 221 | 嘎登 222 | 刚 223 | 刚刚 224 | 该 225 | 赶 226 | 个 227 | 各 228 | 各个 229 | 各位 230 | 各种 231 | 各自 232 | 给 233 | 根据 234 | 跟 235 | 故 236 | 故此 237 | 固然 238 | 关于 239 | 管 240 | 归 241 | 果然 242 | 果真 243 | 过 244 | 哈 245 | 哈哈 246 | 呵 247 | 和 248 | 何 249 | 何处 250 | 何况 251 | 何时 252 | 嘿 253 | 哼 254 | 哼唷 255 | 呼哧 256 | 乎 257 | 哗 258 | 还是 259 | 还有 260 | 换句话说 261 | 换言之 262 | 或 263 | 或是 264 | 或者 265 | 极了 266 | 及 267 | 及其 268 | 及至 269 | 即 270 | 即便 271 | 即或 272 | 即令 273 | 即若 274 | 即使 275 | 几 276 | 几时 277 | 己 278 | 既 279 | 既然 280 | 既是 281 | 继而 282 | 加之 283 | 假如 284 | 假若 285 | 假使 286 | 鉴于 287 | 将 288 | 较 289 | 较之 290 | 叫 291 | 接着 292 | 结果 293 | 借 294 | 紧接着 295 | 进而 296 | 尽 297 | 尽管 298 | 经 299 | 经过 300 | 就 301 | 就是 302 | 就是说 303 | 据 304 | 具体地说 305 | 具体说来 306 | 开始 307 | 开外 308 | 靠 309 | 咳 310 | 可 311 | 可见 312 | 可是 313 | 可以 314 | 况且 315 | 啦 316 | 来 317 | 来着 318 | 离 319 | 例如 320 | 哩 321 | 连 322 | 连同 323 | 两者 324 | 了 325 | 临 326 | 另 327 | 另外 328 | 另一方面 329 | 论 330 | 嘛 331 | 吗 332 | 慢说 333 | 漫说 334 | 冒 335 | 么 336 | 每 337 | 每当 338 | 们 339 | 莫若 340 | 某 341 | 某个 342 | 某些 343 | 拿 344 | 哪 345 | 哪边 346 | 哪儿 347 | 哪个 348 | 哪里 349 | 哪年 350 | 哪怕 351 | 哪天 352 | 哪些 353 | 哪样 354 | 那 355 | 那边 356 | 那儿 357 | 那个 358 | 那会儿 359 | 那里 360 | 那么 361 | 
那么些 362 | 那么样 363 | 那时 364 | 那些 365 | 那样 366 | 乃 367 | 乃至 368 | 呢 369 | 能 370 | 你 371 | 你们 372 | 您 373 | 宁 374 | 宁可 375 | 宁肯 376 | 宁愿 377 | 哦 378 | 呕 379 | 啪达 380 | 旁人 381 | 呸 382 | 凭 383 | 凭借 384 | 其 385 | 其次 386 | 其二 387 | 其他 388 | 其它 389 | 其一 390 | 其余 391 | 其中 392 | 却 393 | 去 394 | 起 395 | 起见 396 | 起见 397 | 岂但 398 | 恰恰相反 399 | 前后 400 | 前者 401 | 且 402 | 然而 403 | 然后 404 | 然则 405 | 让 406 | 人家 407 | 任 408 | 任何 409 | 任凭 410 | 如 411 | 如此 412 | 如果 413 | 如何 414 | 如其 415 | 如若 416 | 如上所述 417 | 若 418 | 若非 419 | 若是 420 | 啥 421 | 上下 422 | 尚且 423 | 设若 424 | 设使 425 | 甚而 426 | 甚么 427 | 甚至 428 | 省得 429 | 时候 430 | 十分 431 | 什么 432 | 什么样 433 | 使得 434 | 是 435 | 是的 436 | 首先 437 | 谁 438 | 谁知 439 | 顺 440 | 顺着 441 | 似的 442 | 虽 443 | 虽然 444 | 虽说 445 | 虽则 446 | 随 447 | 随着 448 | 所 449 | 所以 450 | 他 451 | 他们 452 | 他人 453 | 它 454 | 它们 455 | 她 456 | 她们 457 | 倘 458 | 倘或 459 | 倘然 460 | 倘若 461 | 倘使 462 | 腾 463 | 替 464 | 通过 465 | 同 466 | 同时 467 | 哇 468 | 万一 469 | 往 470 | 望 471 | 为 472 | 为何 473 | 为了 474 | 为什么 475 | 为着 476 | 喂 477 | 嗡嗡 478 | 我 479 | 我们 480 | 呜 481 | 呜呼 482 | 乌乎 483 | 无论 484 | 无宁 485 | 毋宁 486 | 嘻 487 | 吓 488 | 相对而言 489 | 像 490 | 向 491 | 向着 492 | 嘘 493 | 呀 494 | 焉 495 | 沿 496 | 沿着 497 | 要 498 | 要不 499 | 要不然 500 | 要不是 501 | 要么 502 | 要是 503 | 也 504 | 也罢 505 | 也好 506 | 一一 507 | ——— 508 | 一般 509 | 一边 510 | 一会儿 511 | 一旦 512 | 一定 513 | 一点点 514 | 一方面 515 | 一面 516 | 一来 517 | 一起 518 | 一切 519 | 一下 520 | 一下子 521 | 一样 522 | 一些 523 | 一则 524 | 一直 525 | 依 526 | 依照 527 | 矣 528 | 以 529 | 以便 530 | 以及 531 | 以免 532 | 以至 533 | 以至于 534 | 以致 535 | 抑或 536 | 因 537 | 因此 538 | 因而 539 | 因为 540 | 哟 541 | 用 542 | 由 543 | 由此可见 544 | 由于 545 | 有 546 | 有的 547 | 有关 548 | 有些 549 | 又 550 | 于 551 | 于是 552 | 于是乎 553 | 与 554 | 与此同时 555 | 与否 556 | 与其 557 | 越是 558 | 云云 559 | 哉 560 | 再说 561 | 再者 562 | 在 563 | 在下 564 | 咱 565 | 咱们 566 | 则 567 | 怎 568 | 怎么 569 | 怎么办 570 | 怎么样 571 | 怎样 572 | 咋 573 | 照 574 | 照着 575 | 者 576 | 这 577 | 这边 578 | 这儿 579 | 这个 580 | 这会儿 581 | 这就是说 582 | 这里 583 | 这么 584 | 这么点儿 585 | 这么些 586 | 这么样 587 | 这时 
588 | 这些 589 | 这样 590 | 正如 591 | 吱 592 | 之 593 | 之类 594 | 之所以 595 | 之一 596 | 只是 597 | 只限 598 | 只要 599 | 只有 600 | 至 601 | 至于 602 | 诸位 603 | 着 604 | 着呢 605 | 自 606 | 自从 607 | 自个儿 608 | 自各儿 609 | 自己 610 | 自家 611 | 自身 612 | 综上所述 613 | 总的来看 614 | 总的来说 615 | 总的说来 616 | 总而言之 617 | 总之 618 | 纵 619 | 纵令 620 | 纵然 621 | 纵使 622 | 遵照 623 | 作为 624 | 兮 625 | 呃 626 | 呗 627 | 咚 628 | 咦 629 | 喏 630 | 啐 631 | 喔唷 632 | 嗬 633 | 嗯 634 | 嗳 635 | 也许 636 | 人 637 | 前 638 | 令 639 | 份 640 | 件 641 | 伏 642 | 众 643 | 众多 644 | 会 645 | 位 646 | 做 647 | 停 648 | 顶 649 | 先 650 | 先前 651 | 全 652 | 公斤 653 | 其实 654 | 内 655 | 已 656 | 再 657 | 小 658 | 大 659 | 还 660 | 里 661 | 都 662 | 部 663 | 遍 664 | 道 665 | 说 666 | 只 667 | 后 668 | 太 669 | 看 670 | 年 671 | 很 672 | 才 673 | 时 674 | 更 675 | 最 676 | 本报 677 | 讯 678 | 演 679 | 片 -------------------------------------------------------------------------------- /spider/news/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 22:44 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/news/get_news_url.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/5 19:39 8 | @File : getNewsUrl.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | from lxml import etree 17 | import user_agent 18 | import random_ip 19 | 20 | url = 'https://news.hao123.com/wangzhi' 21 | re = 
requests.get(url, headers=user_agent.UserAgent().getRandomHeaders(), proxies = random_ip.RandomIp().getOneProxies()) 22 | html = re.text 23 | struct = etree.HTML(html) 24 | 25 | # with open("newUrl.txt",'w',encoding='utf-8') as f: 26 | # f.write(html) 27 | 28 | for i in range(1, 21): 29 | newName = struct.xpath('//*[@id="bd"]/div[1]/div/ul/li[' + str(i) + ']/h3/div/a/text()') 30 | href = struct.xpath('//*[@id="bd"]/div[1]/div/ul/li[' + str(i) + ']/h3/div/a/@href') 31 | print(newName, href) 32 | 33 | for i in range(1, 25): 34 | newName1 = struct.xpath('//*[@id="bd"]/div[2]/div/ul/li[' + str(i) + ']/h3/div/a/text()') 35 | href1 = struct.xpath('//*[@id="bd"]/div[2]/div/ul/li[' + str(i) + ']/h3/div/a/@href') 36 | print(newName1, href1) -------------------------------------------------------------------------------- /spider/news/new_pengpai.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 22:46 8 | @File : new_pengpai.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | import random_ip 17 | import user_agent 18 | from lxml import etree 19 | from bs4 import BeautifulSoup as bs 20 | 21 | # url = "https://www.thepaper.cn/channel_25951" 22 | # 23 | # proxies = randomIp.RandomIp().getOneProxies() 24 | # headers = userAgent.UserAgent().getRandomHeaders() 25 | # 26 | # re = requests.get(url, headers=headers, proxies=proxies) 27 | # html = re.text 28 | # with open("re.txt","w",encoding="utf-8") as f: 29 | # f.write(html) 30 | 31 | with open("re.txt","r",encoding="utf-8") as f: 32 | content = f.read() 33 | 34 | soup = bs(content,"html.parser") 35 | aList = soup.find_all("a") 36 | for a in aList: 37 | print(a.get('href')) 38 | 
-------------------------------------------------------------------------------- /spider/news/wallstreetcn.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 华尔街见闻 6 | -------------------------------- 7 | @Time : 2019/5/13 22:16 8 | @File : wallstreetcn.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | 17 | # 先拿到网页中要爬取部分的所有链接 18 | # url去重 19 | # scarpy-reids 20 | # 读取字段写到数据库 21 | 22 | # targetUrl = "https://wallstreetcn.com/kechuang" 23 | # 24 | # re = requests.get(targetUrl) 25 | # re.encoding=re.apparent_encoding 26 | # html = re.text 27 | # 28 | # with open("html.txt","w",encoding="utf-8") as f: 29 | # f.write(str(html)) 30 | 31 | import re 32 | import json 33 | 34 | 35 | def getWallstreetData(): 36 | with open("html.txt", "r", encoding="utf-8") as f: 37 | html = f.read() 38 | 39 | # 匹配获取文章列表页数据 40 | parten = r'' 41 | res = re.findall(parten, html) 42 | 43 | # 匹配结果转换成字典 44 | dictinfo = str2dict(res[0]) 45 | cachedResponseDic = dictinfo['cachedResponse'] 46 | 47 | # 遍历字典,匹配key 48 | for key, value in cachedResponseDic.items(): 49 | if "information-flow" in key: 50 | nextCursor = value.get('value').get('next_cursor') 51 | items = value.get('value').get('items') 52 | return nextCursor, items 53 | 54 | 55 | # json格式的字符串转换成字典(json) 56 | def str2dict(str): 57 | return json.loads(str) 58 | 59 | 60 | # dict中的数据入库 61 | def data2oracle(dict): 62 | print(dict) 63 | 64 | 65 | # 获取url的text 66 | def requestUrl(url): 67 | re = requests.get(url) 68 | re.encoding = re.apparent_encoding 69 | return re.text 70 | 71 | 72 | # 爬取指定个数的数据 73 | def getLimitData(url): 74 | returnJsonStr = requestUrl(url) 75 | jsonDict = str2dict(returnJsonStr) 76 | items = jsonDict['data']['items'] 77 | return items 78 | 79 | 80 | # 解析文章的详细信息,返回详细信息的字典 
81 | def getArticleDetail(url): 82 | html = requestUrl(url) 83 | pass 84 | 85 | 86 | if __name__ == '__main__': 87 | nextCursor, pre20Items = getWallstreetData() 88 | 89 | # limit = 10 # 返回个数 90 | # url = "https://api.wallstreetcn.com/apiv1/content/information-flow?channel=kechuang&accept=article%2Cad&cursor=" + nextCursor + "&limit=" + str(limit) 91 | # allItems = pre20Items.extend(getLimitData(url)) 92 | # print(allItems) 93 | 94 | 95 | allItems = [] 96 | 97 | # 遍历所有items,合并所有信息,入库 98 | for itemDic in allItems: 99 | print(itemDic) 100 | detailDic = getArticleDetail(itemDic['resource']['uri']) 101 | 102 | # 合并两个字典itemDic与detailDic,返回最终结果入库 103 | result = {} 104 | 105 | # 已有的信息与详细信息合并后,数据入库 106 | data2oracle(result) 107 | -------------------------------------------------------------------------------- /spider/pachong/geckodriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/pachong/geckodriver.log -------------------------------------------------------------------------------- /spider/pachong/get_dynamic_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : selenium获取页面动态内容 6 | -------------------------------------- 7 | @File : get_dynamic_data.py 8 | @Time : 2018/4/26 16:55 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | from selenium import webdriver 17 | from selenium.webdriver.chrome.options import Options 18 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 19 | from bs4 import BeautifulSoup as bs 20 | from pandas import DataFrame 21 | import time 22 | 23 | 24 | ## 1. 
动态抓取页面 25 | chrome_options = webdriver.ChromeOptions() 26 | chrome_options.add_argument('--headless') 27 | chrome_options.add_argument('--disable-gpu') 28 | 29 | 30 | ## 2. 更改user-agent 31 | # chrome_options.add_argument('--user-agent=Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3') 32 | # chrome_options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3') 33 | # chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1') 34 | chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30') 35 | 36 | driver = webdriver.Chrome(chrome_options=chrome_options) 37 | # PhantomJS 目前标记为不赞成,在未来版本中可能不支持,改用chrome的headless chrome 38 | # driver = webdriver.PhantomJS(executable_path="./phantomjs/bin/phantomjs") 39 | 40 | url = "https://www.huomao.com/channel/lol" 41 | 42 | # 获取页面全部内容 43 | driver.get(url) 44 | data = driver.page_source 45 | print(len(data)) 46 | 47 | # 获取网页截图 48 | # driver.save_screenshot("./screenshot/1.png") 49 | 50 | 51 | ## 3. 保存爬取内容到本地分析! 52 | with open("./page_source.txt", "w", encoding="utf-8") as f: 53 | f.write(driver.page_source) 54 | 55 | driver.quit() 56 | 57 | 58 | ## 4. 
分析 59 | page_source = '' # 本地文件内容代替爬取结果str类型 60 | with open("./page_source.txt", "r", encoding="utf-8") as f: 61 | page_source = f.read() 62 | 63 | # 存储最终分析结果 64 | name = [] 65 | title = [] 66 | watching = [] 67 | 68 | # 开始解析 69 | soup = bs(page_source, "html.parser") 70 | channelList = soup.find("div", attrs={'id':"channellist"}) 71 | rooms = channelList.find_all("div", attrs={'class':"list-smallbox no-logo"}) 72 | # 获取每个房间中的主播信息 73 | for room in rooms: 74 | try: 75 | this_title = room.find("a")["title"] # title当作a的属性获取 76 | this_name = room.find("span", class_="nickname").text # bs 解析标签中的值 77 | this_watching = room.find("em", attrs={"class":"flr"}).find("span").text 78 | except: 79 | this_watching = room.find("div", class_="no-playing").text 80 | name.append(this_name) 81 | title.append(this_title) 82 | watching.append(this_watching) 83 | 84 | result = DataFrame({ 85 | "主播名":name, 86 | "节目名":title, 87 | "观看人数":watching 88 | }) 89 | 90 | result.to_csv("./result.csv", encoding = "utf_8_sig") 91 | -------------------------------------------------------------------------------- /spider/pachong/result.csv: -------------------------------------------------------------------------------- 1 | ,主播名,节目名,观看人数 2 | 0,猓狐狸°,【第一女刺客】你看见过我的小熊吗~,"6,163" 3 | 1,齐天小小圣,晚上闹一会儿天宫吧~,289 4 | 2,MrIvan,老司机带飞,2 5 | 3,、小许,这个主播有点皮,主播正在休息 6 | 4,-你心中的玮哥哥,很久不见。,主播正在休息 7 | 5,鱼长官卡兹克,祖安王者局.,主播正在休息 8 | 6,慕薛丶今年15,想我了来q群找我:167569325,主播正在休息 9 | 7,宿命小磊,问道爆发,主播正在休息 10 | 8,居委会K某人,贾克斯的故事第69集,主播正在休息 11 | 9,MonicaOvO,未来人类品牌日 DRvsPDQ,主播正在休息 12 | 10,旺仔大馒头,带你们看火狐大美妞,主播正在休息 13 | 11,EWG女子电竞,EWG-LOL,主播正在休息 14 | 12,末路^_^,雨里夜里,峡谷之巅等你。,主播正在休息 15 | 13,Yas1n,网四钻石AD辅助冲分,主播正在休息 16 | 14,我的EZ会发光哟,电一王者峡谷上分 目前钻1,主播正在休息 17 | 15,DAi黛王巡山,【黛】帮我想个标题吧,主播正在休息 18 | 16,lime,【光家族】有你们是我的骄傲^_^,主播正在休息 19 | 17,魔女小娜迦,老板屋内位子随便上,主播正在休息 20 | 18,✿๓浅浅,一个不会翻车的女司机(・(ェ)・),主播正在休息 21 | 19,隔壁老王丶1,ob韩服王者局,主播正在休息 22 | 20,国服第一米老鼠,定个小目标,今天8把鸡。,主播正在休息 23 | 21,佳璐是个小仙女,皮城小仙女 佳璐璐哦,主播正在休息 24 | 22,跳儿,网一直播掉分儿,主播正在休息 25 | 23,狼狼不是娘娘酱,狼酱:别人凭本事拿的人头,为啥说我送?,主播正在休息 26 | 
24,FANGQING丶,网1钻石AD,主播正在休息 27 | 25,Da丶新,新仔的菜鸟之路~~~~,主播正在休息 28 | 26,小智齿,火猫最强辅助莫甘娜 一Q一个男朋友,主播正在休息 29 | 27,蓝花楹,蓝花楹,主播正在休息 30 | 28,江湖人称一条柴,12345678,主播正在休息 31 | 29,OopsLeo,10把定位,主播正在休息 32 | 30,我叫该隐,人马:学会上不了钻石请举报我!,主播正在休息 33 | 31,小进进i,小进进:火猫第一德莱文,主播正在休息 34 | 32,青椒ccccccccccc,绝对有你意想不到的惊喜!!,主播正在休息 35 | 33,Star.Still,新人女主播,主播正在休息 36 | 34,小安萌啊-承承,我的直播间,主播正在休息 37 | 35,主播Carry丶锋,锐文专场!,主播正在休息 38 | 36,冷锋UU,小丑皇的无情杀戮,主播正在休息 39 | 37,二娜丶,【二娜】牙疼,主播正在休息 40 | 38,一个人的游戏.,中单专场,主播正在休息 41 | 39,电竞潇洒哥,其实躺赢也是门技术,主播正在休息 42 | 40,三斧子不解释,玩会LOL,主播正在休息 43 | 41,7777智障肉肉,ADC与辅助的爱恨情仇,主播正在休息 44 | 42,火猫丶TVForsake,火猫丶TVForsake,主播正在休息 45 | 43,古力逗,没有技术可言 送仙豆的都是大哥,主播正在休息 46 | 44,蓝俊熙丶,看一波电影,主播正在休息 47 | 45,雨晨姑奶奶,手受伤过几天开摄像头,主播正在休息 48 | 46,丑的被嫉妒,丑怪:不是技术 却很皮,主播正在休息 49 | 47,温柔风亚索,.......,主播正在休息 50 | 48,泽拉图丶,泽拉图:土嗨主播上路英雄海,主播正在休息 51 | 49,An阿南。,ADC 小王子。,主播正在休息 52 | 50,李梓煜,李梓煜 新主播求订阅,主播正在休息 53 | 51,月半湾丶小帅,新人主播求订阅,主播正在休息 54 | 52,我其实很懒,一区钻5单排上分,主播正在休息 55 | 53,空城丶智少,中路,主播正在休息 56 | 54,【文帝】,阴阳判官德莱文,主播正在休息 57 | 55,bewhy!,dddd~~~,主播正在休息 58 | 56,东北昊天,东北昊天:户外,唠嗑来。,主播正在休息 59 | 57,广东丶隔壁老黄,练下JJ~,主播正在休息 60 | 58,单纯主播蓄蓄,四月活动收礼只收“萌”2角一个!,主播正在休息 61 | 59,请叫我大C,征服之海,主播正在休息 62 | -------------------------------------------------------------------------------- /spider/pachong/screenshot/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/pachong/screenshot/1.png -------------------------------------------------------------------------------- /spider/pachong/screenshot/bottom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/pachong/screenshot/bottom.jpg -------------------------------------------------------------------------------- /spider/pachong/笔记.txt: -------------------------------------------------------------------------------- 1 | 2 | Selenium + Chrome Driver 
操作爬取过程 3 | 4 | 问题: 5 | 1. PhantomJS 运行时标记为不赞成 6 | 解决方法:使用Headless Chrome替换 7 | 参考网页:https://blog.csdn.net/visual0522/article/details/79343917 8 | 9 | 2. 下载对应chrome 版本的chromedriver下载安装: 10 | 注:查看chrome版本,前两位对应版本号;将chrome driver添加到PATH系统路径中 11 | 参考网页:https://blog.csdn.net/huilan_same/article/details/51896672 12 | 13 | 3. bs解析 14 | this_title = room.find("a")["title"] # title当作a的属性获取 15 | this_name = room.find("span", class_="nickname").text # bs 解析标签中的值 16 | 17 | 主要区别: 18 | 1. 在使用动态网页爬取神器Selenium爬取网页时,获得的时网页的全部内容,包括实时显示更新的数据; 19 | 一般在静态页面中无法获取。 20 | 21 | 2. 在代码的编写上,主要区别在于最初获取页面内容上,主要使用webdriver.Chrome()获取目标页面的全部内容 22 | 其余部分的解析、提取关键内容,同静态页面的提取、解析方式相同。 23 | 24 | 参考连接: 25 | 1. 26 | python+Selenium2+chrome构建动态网页爬虫工具 https://blog.csdn.net/cjsafty/article/details/9206323 27 | 28 | 2. 29 | python中selenium操作下拉滚动条方法汇总 https://www.cnblogs.com/landhu/p/5761794.html 30 | 31 | 3. 32 | python+selenium+PhantomJS爬取网页动态加载内容 https://www.cnblogs.com/chenice/p/6994111.html 33 | (PhantomJS中的部分内容可用,仅参考driver之后的相关函数使用) 34 | 35 | -------------------------------------------------------------------------------- /spider/spider.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/spider.zip -------------------------------------------------------------------------------- /spider/wechat/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/6/7 15:21 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/weibo/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/14 18:39 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/weibo/weibo_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/14 18:40 8 | @File : weiboAPI.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | from weibo import APIClient 16 | 17 | # 1.配置 18 | APP_KEY = '4073142975' 19 | APP_SECRET = '6e8a766757e8ae11b06f0e0bfc26b291' 20 | CALLBACK_URL = 'http://apps.weibo.com/heyshheyou' # 回调授权页面,用户完成授权后返回的页面 21 | 22 | # 2.调用APIClient生成client实例 23 | client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 24 | 25 | # 3.得到授权页面的url 26 | url = client.get_authorize_url() 27 | print(url) 28 | 29 | # 4.点击访问url,在浏览器端获得code 30 | code = '6ecdbf350f0680a6f00cc8c34ae721a6' 31 | req = client.request_access_token(code) 32 | client.set_access_token(req.get('access_token'), req.get('expires_in')) 33 | 34 | # 5.调用微博普通读取接口,返回最新的公共微博。 35 | # 接口详情见 https://open.weibo.com/wiki/2/statuses/public_timeline 36 | statuses = client.statuses__public_timeline()['statuses'] 37 | print(len(statuses)) 38 | # 6.输出部分信息 39 | for i in range(0, len(statuses)): 40 | print(u'昵称:' + statuses[i]['user']['screen_name']) 41 | print(u'简单介绍:' + statuses[i]['user']['description']) 42 | print(u'位置:' + statuses[i]['user']['location']) 43 | print(u'微博:' + 
statuses[i]['text']) 44 | print(statuses[i]) 45 | --------------------------------------------------------------------------------