├── .gitattributes
├── .gitignore
├── Data-Processing
├── README.md
├── SuperCodetest.py
├── for_csv
│ ├── README.md
│ ├── csv_write.py
│ ├── read_csv.py
│ └── test.csv
├── for_docx
│ ├── creat_docx.py
│ ├── demo.docx
│ └── read_docx.py
├── for_excel
│ ├── README.md
│ ├── excel使用技巧.md
│ ├── read_excel.py
│ └── test.xlsx
├── for_file
│ ├── change_file_name.exe
│ ├── change_file_name.py
│ └── get_file_name.py
├── for_img
│ ├── README.md
│ ├── Tutorial.ipynb
│ ├── detect_line.py
│ └── lena.png
├── for_pdf
│ ├── OCR
│ │ ├── README.md
│ │ ├── easy_ocr.py
│ │ ├── pdf2ocr.py
│ │ ├── tesseract_test.py
│ │ └── transformer_ocr.py
│ ├── QR_code
│ │ ├── README.md
│ │ ├── zbar_img.py
│ │ └── zbar_pdf.py
│ ├── README.md
│ ├── Tools
│ │ ├── README.md
│ │ ├── pdf-rename.py
│ │ ├── tax_judge.py
│ │ └── test.ipynb
│ ├── pdf2excel.py
│ ├── pdf2img.py
│ └── pdf_utils.ipynb
├── for_txt
│ ├── 11.txt
│ ├── README.md
│ ├── add_content.py
│ ├── example.txt
│ ├── example1.txt
│ ├── how_many_lines.py
│ ├── read_data.py
│ ├── read_numpy.py
│ ├── read_txt.py
│ ├── test.py
│ └── write_txt.py
├── print
│ └── print_in_1line.py
└── remove_string_spaces.py
├── LICENSE
├── Messy
├── A.py
├── DataTransform.py
├── Graph.py
├── GraphAlgorithms.py
├── ListSearch.py
├── crop_morphology.py
├── labelImg.py
└── list_remove.py
├── Python+Algorithm
├── Evolutionary-Algorithm
│ ├── Match Phrase.py
│ └── genetic_algorithm.py
├── Geometric
│ ├── point_oblique_straight_point.py
│ └── two_point_straight_point.py
├── Least-Squares
│ ├── Least squares.py
│ ├── README.md
│ └── train_data.csv
├── Math
│ ├── Kalman
│ │ ├── Kalman_2D.py
│ │ ├── Kalman_3D.py
│ │ ├── README.md
│ │ ├── kalamn_unc.py
│ │ ├── kalman_1.py
│ │ ├── my_kalman_carmove.py
│ │ └── my_kalman_simple.py
│ ├── gram_schmidt.py
│ ├── math_base.py
│ ├── matrix.ipynb
│ ├── matrix.py
│ └── pareto-front.py
├── Optimization-Algorithm
│ ├── Adam.py
│ ├── BGD.py
│ ├── README.md
│ ├── SGD.py
│ ├── SGD_momentum.py
│ └── test.py
├── Search-Algorithm
│ ├── BFS.py
│ ├── DFS.py
│ ├── README.md
│ └── fig1.png
├── Sorting-Algorithm
│ ├── README.md
│ ├── bubble_sort.py
│ ├── counting_sort.py
│ ├── insertion_sort.py
│ ├── merge_sort.py
│ ├── quick_sort.py
│ ├── selection_sort.py
│ └── sleep_sort.py
├── Uncategorized
│ ├── pyramid.py
│ └── xingxingdiandeng.py
└── kalman.py
├── Python+Crawler
├── DoubanTop250.py
├── README.md
├── Web
│ ├── README.md
│ ├── fake_uragent.py
│ ├── html+save.py
│ ├── ip_get.py
│ ├── ip_test.py
│ ├── key_ua+ip.py
│ ├── test.py
│ ├── text
│ │ ├── ip.txt
│ │ └── user_agent.txt
│ ├── urlib.py
│ ├── view_ua+ip.py
│ ├── view_ua.py
│ ├── webpage_viewer.py
│ └── 刷网页.py
├── caixukun.py
├── crawler1.py
├── crawler2.py
├── crawler3.py
├── debug.log
├── form_test.py
├── ip.txt
├── ip1.txt
├── sample1.py
├── selenium_first.py
├── spider-google.py
├── test.py
├── user_agent.txt
└── 豆瓣最受欢迎的250部电影.xlsx
├── Python+HTML
├── README.md
├── test1.html
├── test2.html
├── test2.py
├── test3.html
├── test3.py
├── test4.html
└── test4.py
├── Python+Media
├── 发邮件
│ ├── 163.py
│ ├── README.md
│ ├── content.txt
│ └── outlook.py
└── 文字转音频
│ ├── demo.pcm
│ ├── new_use.py
│ ├── pcm2wav.py
│ ├── test_webtts.py
│ ├── text.txt
│ ├── text2audio.py
│ ├── tts_ws_python3_demo.py
│ └── use_old.py
├── Python+Opencv
├── README.md
├── opencv_draw.py
├── template_matching.py
├── 图像处理
│ ├── binarization.py
│ └── black-white.jpg
├── 基于颜色的物体追踪.py
├── 拾色器
│ ├── 0.jpg
│ ├── 1.jpg
│ ├── 2.png
│ ├── color_picker(取色器交互版).py
│ └── color_picker(取色器无交互版).py
├── 相机
│ └── camera_photo.py
└── 颜色_圆_相关
│ ├── 图片
│ ├── color_filtering.py
│ ├── color_list.py
│ ├── detect_picture_color.py
│ ├── detect_picture_color_circle.py
│ ├── judge_color_center.py
│ ├── judge_multi_color.py
│ ├── judge_single_color.py
│ └── multi_color_filtering.py
│ └── 视频
│ ├── color_list.py
│ ├── detect_camera_color.py
│ └── detect_camera_color_circle.py
├── Python+PC-Control
├── mouse_control.py
├── mouse_monitor.py
└── moyu.py
├── Python+Piano
└── read.py
├── Python+arXiv
├── README.md
├── arxiv.py
├── check_update.py
├── conf.json
├── conf_list.txt
├── file2md.py
└── id2md.py
├── Pythonic-Standard
├── Config文件编写
│ ├── Argparse使用指南.md
│ ├── Config文件编写.md
│ ├── YAML使用指南.md
│ ├── config.py
│ ├── config.yaml
│ ├── use_argparse.py
│ ├── use_omegaconf.py
│ └── use_yaml.py
├── Partial使用指南.md
├── Pathlib 使用指南.md
├── Python下划线含义.md
├── README.md
├── decorator装饰器.md
├── import.md
├── multi-level
│ ├── __init__.py
│ ├── file1.py
│ ├── file2.py
│ ├── folder1
│ │ ├── __init__.py
│ │ ├── file11.py
│ │ ├── file12.py
│ │ ├── folder11
│ │ │ ├── file111.py
│ │ │ └── file112.py
│ │ └── folder22
│ │ │ ├── __init__.py
│ │ │ └── file221.py
│ ├── folder2
│ │ └── file21.py
│ └── set_env.sh
├── print 输出.md
├── tqdm使用指南.md
├── 切片.md
├── 参数 传参 可变参数.md
├── 基础使用.md
├── 字典的实用.md
└── 类.md
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/settings.json
2 | .vscode
3 | Algorithm/Optimization-Algorithm/test.py
4 | **/__pycache__
5 | *.pyc
6 | .DS_Store
7 |
--------------------------------------------------------------------------------
/Data-Processing/README.md:
--------------------------------------------------------------------------------
1 | # Data Processing
2 |
3 | 整理处理txt、csv、docx、pdf 等的高效方法。这里记录一些最基本的用法,其他详细见对应子文件夹。
4 |
5 |
6 |
7 | ## 原则
8 |
9 | `逗号`隔开的文件,尽量改为csv格式,因为处理csv格式有天然的优势
10 |
11 | 可以直接由txt文件格式转为csv格式
12 |
13 |
14 |
15 | ## 文件操作
16 |
17 | 对路径进行操作,推荐使用 [Pathlib](https://docs.python.org/3/library/pathlib.html)。
18 |
19 | ```python
20 | # 获取当前文件夹所有pdf的文件名
21 |
22 | from pathlib import Path
23 |
24 | root_dir = Path('./')
25 | pdf_list = sorted(root_dir.glob('*.pdf'))
26 | ```
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/Data-Processing/for_csv/README.md:
--------------------------------------------------------------------------------
1 | # CSV
2 |
3 |
4 |
5 | python 自带就有一个 csv 库
6 |
7 | ```python
8 | import csv
9 | with open('some.csv', newline='') as f:
10 | reader = csv.reader(f)
11 | for row in reader:
12 | print(row)
13 | ```
14 |
15 |
16 |
17 | ```python
18 | import csv
19 | for row in csv.reader(['one,two,three']):
20 | print(row)
21 | ```
22 |
23 |
24 |
25 | 也可以借助 pandas
26 |
27 | ```python
28 | import pandas
29 |
30 | df = pandas.read_csv('test.csv', encoding='utf-8')
31 | print(df['Price'][0])
32 | ```
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
--------------------------------------------------------------------------------
/Data-Processing/for_csv/csv_write.py:
--------------------------------------------------------------------------------
"""OCR every image in a folder with Tesseract and append the results to a CSV.

For each entry in ``path`` the external ``tesseract.exe`` program is run with
the ``chi_sim+chi_sim1`` language data and ``--psm 7``.  Tesseract is asked to
write its result to ``output.txt``; that text is read back, stripped of line
breaks and spaces, and appended to ``2.csv`` as a ``[file_name, text]`` row.
"""
import csv
import os
import subprocess

# Folder that holds the images to recognize.
path = 'competition/train'

# Open the CSV exactly once (append mode) and let the context manager close it.
# The original re-opened "2.csv" on every iteration without closing it, leaking
# one file handle per image while the writer kept using the first handle.
with open("2.csv", 'a', newline='') as csvFile:
    writer = csv.writer(csvFile)

    for file in os.listdir(path):
        # Pass an argument list instead of a concatenated shell string so file
        # names containing spaces or shell metacharacters are handled safely.
        # stdout is discarded, as the original did by reading and dropping it.
        subprocess.run(
            [
                'tesseract.exe', path + '/' + file, 'output',
                '-l', 'chi_sim+chi_sim1', '--psm', '7',
            ],
            stdout=subprocess.DEVNULL,
            check=False,  # best effort: a failed OCR run does not abort the batch
        )

        # Join all recognized lines into one string without any whitespace.
        with open('output.txt', 'r', encoding="utf8") as f:
            data = ''.join(line.strip() for line in f).replace(' ', '')

        writer.writerow([file, data])
--------------------------------------------------------------------------------
/Data-Processing/for_csv/read_csv.py:
--------------------------------------------------------------------------------
'''
Read a CSV file with pandas. The sample file looks like this:

Symbol,Price,Date,Time,Change,Volume
"AA",39.48,"6/11/2007","9:36am",-0.18,181800
"AIG",71.38,"6/11/2007","9:36am",-0.15,195500
"AXP",62.58,"6/11/2007","9:36am",-0.46,935000
"BA",98.31,"6/11/2007","9:36am",+0.12,104800
"C",53.08,"6/11/2007","9:36am",-0.25,360900
"CAT",78.29,"6/11/2007","9:36am",-0.23,225400

The loaded data is held in a DataFrame.
'''

import pandas

df = pandas.read_csv('test.csv', encoding='utf-8')

# Print the "Price" value of the row labelled 0 (the first data row).
print(df.loc[0, 'Price'])
--------------------------------------------------------------------------------
/Data-Processing/for_csv/test.csv:
--------------------------------------------------------------------------------
1 | Symbol,Price,Date,Time,Change,Volume
2 | "AA",39.48,"6/11/2007","9:36am",-0.18,181800
3 | "AIG",71.38,"6/11/2007","9:36am",-0.15,195500
4 | "AXP",62.58,"6/11/2007","9:36am",-0.46,935000
5 | "BA",98.31,"6/11/2007","9:36am",+0.12,104800
6 | "C",53.08,"6/11/2007","9:36am",-0.25,360900
7 | "CAT",78.29,"6/11/2007","9:36am",-0.23,225400
--------------------------------------------------------------------------------
/Data-Processing/for_docx/creat_docx.py:
--------------------------------------------------------------------------------
"""Build a small demo Word document with python-docx and save it as demo.docx.

Reference website
https://python-docx.readthedocs.io/en/latest/
"""

from docx import Document
from docx.shared import Inches

document = Document()

document.add_heading('Document Title', 0)

# One paragraph assembled from several runs so single words can be styled.
p = document.add_paragraph('A plain paragraph having some ')
p.add_run('bold').bold = True
p.add_run(' and some ')
p.add_run('italic.').italic = True

document.add_heading('Heading, level 1', level=1)
document.add_paragraph('Intense quote', style='Intense Quote')

document.add_paragraph('first item in unordered list', style='List Bullet')
document.add_paragraph('first item in ordered list', style='List Number')

# Optional picture example (this is what the Inches import is for):
# document.add_picture('monty-truth.png', width=Inches(1.25))

# (quantity, id, description) rows for the table below.
records = (
    (3, '101', 'Spam'),
    (7, '422', 'Eggs'),
    (4, '631', 'Spam, spam, eggs, and spam'),
)

# Start with a single header row, then add one row per record.
table = document.add_table(rows=1, cols=3)
hdr_cells = table.rows[0].cells
hdr_cells[0].text = 'Qty'
hdr_cells[1].text = 'Id'
hdr_cells[2].text = 'Desc'
# `item_id` rather than `id`, so the builtin id() is not shadowed.
for qty, item_id, desc in records:
    row_cells = table.add_row().cells
    row_cells[0].text = str(qty)
    row_cells[1].text = item_id
    row_cells[2].text = desc

document.add_page_break()

document.save('demo.docx')
--------------------------------------------------------------------------------
/Data-Processing/for_docx/demo.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Data-Processing/for_docx/demo.docx
--------------------------------------------------------------------------------
/Data-Processing/for_docx/read_docx.py:
--------------------------------------------------------------------------------
"""Print the text of every paragraph in demo.docx, one paragraph per line."""
from docx import Document

file = Document("demo.docx")

paragraph_texts = (para.text for para in file.paragraphs)
for text in paragraph_texts:
    print(text)
--------------------------------------------------------------------------------
/Data-Processing/for_excel/README.md:
--------------------------------------------------------------------------------
1 | ## 用什么库读取
2 |
3 | pandas
--------------------------------------------------------------------------------
/Data-Processing/for_excel/excel使用技巧.md:
--------------------------------------------------------------------------------
1 | =IF(ISERROR(VLOOKUP(C3,Sheet1!$A$1:$B$200,2,FALSE)),"",VALUE(VLOOKUP(C3,Sheet1!$A$1:$B$200,2,FALSE)))
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/Data-Processing/for_excel/read_excel.py:
--------------------------------------------------------------------------------
"""Load the first sheet of test.xlsx into a pandas DataFrame named ``data``."""
import pandas as pd

data = pd.read_excel(io='test.xlsx')
4 |
5 |
--------------------------------------------------------------------------------
/Data-Processing/for_excel/test.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Data-Processing/for_excel/test.xlsx
--------------------------------------------------------------------------------
/Data-Processing/for_file/change_file_name.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Data-Processing/for_file/change_file_name.exe
--------------------------------------------------------------------------------
/Data-Processing/for_file/change_file_name.py:
--------------------------------------------------------------------------------
"""Replace spaces with hyphens in the names of all entries of a directory."""
import os


def rename_spaces_to_hyphens(directory='.'):
    """Rename every entry of *directory* whose name contains a space.

    Spaces are replaced with ``-``.  Entries without spaces are left alone,
    and an entry is skipped (with a message) when the hyphenated name already
    exists, because ``os.rename`` would silently overwrite it on POSIX and
    raise on Windows.

    Returns the list of ``(old_name, new_name)`` pairs that were renamed.
    """
    renamed = []
    for file in os.listdir(directory):
        new_name = file.replace(' ', '-')
        if new_name == file:
            continue  # nothing to change
        source = os.path.join(directory, file)
        target = os.path.join(directory, new_name)
        if os.path.exists(target):
            print(f"skip {file!r}: {new_name!r} already exists")
            continue
        os.rename(source, target)
        renamed.append((file, new_name))
    return renamed


# Script behaviour is unchanged: process the current working directory.
rename_spaces_to_hyphens()
7 |
--------------------------------------------------------------------------------
/Data-Processing/for_file/get_file_name.py:
--------------------------------------------------------------------------------
"""Get a file's name without its extension."""

import os


def get_file_stem(path):
    """Return the last component of *path* with its final extension removed.

    Only the last suffix is dropped, so ``archive.tar.gz`` gives
    ``archive.tar``.
    """
    return os.path.splitext(os.path.basename(path))[0]


path = r'demo.txt'
name = get_file_stem(path)
print(name)
--------------------------------------------------------------------------------
/Data-Processing/for_img/README.md:
--------------------------------------------------------------------------------
1 | # 一文解释 图像读取
2 |
3 | > 希望通过本文帮助你了解一些基础用法,本质数据类型。常用的库包括了`Pillow`,`OpenCV`,`Matplotlib`,`torchvision`.
4 |
5 |
6 |
7 |
8 |
9 | ## Pillow
10 |
11 | https://pillow.readthedocs.io/en/stable/handbook/tutorial.html
12 |
13 | ```python
14 | from PIL import Image
15 |
16 | # read
17 | pil_img = Image.open("your_image.jpg") # RGB
18 |
19 | # grayscale
20 | pil_img = Image.open("your_image.jpg").convert("L")
21 |
22 | # save
23 | pil_img.save("new_image.jpg")
24 |
25 | # save a JPEG image with specific quality
26 | pil_img.save("new_image.jpg", quality=95)
27 | ```
28 |
29 |
30 |
31 | ```python
32 | # Pillow image to OpenCV image
33 |
34 | cv2_img = np.array(pil_img)
35 | cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_RGB2BGR)
36 | ```
37 |
38 |
39 |
40 | ## Opencv
41 |
42 | > OpenCV images are actually NumPy arrays
43 |
44 | ```python
45 | import cv2
46 |
47 | img = cv2.imread("your_image.jpg") # BGR
48 |
49 | img = cv2.imread("your_image.jpg", cv2.IMREAD_GRAYSCALE)
50 |
51 | cv2.imwrite("new_image.jpg", img)
52 |
53 | cv2.imwrite("new_image.jpg", img, [int(cv2.IMWRITE_JPEG_QUALITY), 95])
54 |
55 |
56 | cv2.imshow("image", img)
57 | cv2.waitKey(0)
58 | ```
59 |
60 | ```python
61 | # OpenCV image to Pillow image
62 |
63 | cv2_img = cv2.cvtColor(cv2_img, cv2.COLOR_BGR2RGB)
64 | pil_img = Image.fromarray(cv2_img)
65 | ```
66 |
67 |
68 |
69 | ## Matplotlib
70 |
71 | 学习资料
72 |
73 | https://github.com/rougier/matplotlib-tutorial
74 |
75 | https://github.com/matplotlib/cheatsheets
76 |
77 | https://github.com/matplotlib/cheatsheets
78 |
79 | ```python
80 |
81 | plt.show()
82 | ```
83 |
84 |
85 |
86 | ref: https://medium.com/analytics-vidhya/the-ultimate-handbook-for-opencv-pillow-72b7eff77cd7
87 |
88 |
89 |
90 |
--------------------------------------------------------------------------------
/Data-Processing/for_img/Tutorial.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "\n",
9 | "%matplotlib notebook jupyter行内形成交互式的图表\n",
10 | "%matplotlib mac内形成交互式的图表,即会弹出图像窗口\n",
11 | "%matplotlib inline 可以显示图像,但无交互功能;同时方便导出为markdown的时候有图片\n",
12 | "the backend needs to be set in the ipython_config.py, not the jupyter_notebook_config.py."
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 8,
18 | "metadata": {},
19 | "outputs": [],
20 | "source": [
21 | "%matplotlib inline\n",
22 | "import matplotlib.pyplot as plt"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Pillow\n",
30 | "\n",
31 | "`pip install Pillow`"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 10,
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "PNG (512, 512) RGB\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "from PIL import Image\n",
49 | "\n",
50 | "img = Image.open(\"lena.png\")\n",
51 | "\n",
52 | "print(img.format, img.size, img.mode)\n",
53 | "\n",
54 | "img.show()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {},
60 | "source": [
61 | "## Opencv"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "import cv2\n",
71 | "\n",
72 | "cv2_img = cv2.imread(\"lena.png\") # BGR"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": []
81 | }
82 | ],
83 | "metadata": {
84 | "kernelspec": {
85 | "display_name": "Python 3.9.12 ('base')",
86 | "language": "python",
87 | "name": "python3"
88 | },
89 | "language_info": {
90 | "codemirror_mode": {
91 | "name": "ipython",
92 | "version": 3
93 | },
94 | "file_extension": ".py",
95 | "mimetype": "text/x-python",
96 | "name": "python",
97 | "nbconvert_exporter": "python",
98 | "pygments_lexer": "ipython3",
99 | "version": "3.9.12"
100 | },
101 | "orig_nbformat": 4,
102 | "vscode": {
103 | "interpreter": {
104 | "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f"
105 | }
106 | }
107 | },
108 | "nbformat": 4,
109 | "nbformat_minor": 2
110 | }
111 |
--------------------------------------------------------------------------------
/Data-Processing/for_img/lena.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Data-Processing/for_img/lena.png
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/OCR/README.md:
--------------------------------------------------------------------------------
1 | ## PDF OCR 识别
2 |
3 | 使用 Python 的第三方库做 OCR 识别。目前主流的库有这些,收集了 GitHub 上的 Star 数以及最新的更新日期。
4 |
5 | | 名称 | 更新时间(不带年份则表今年) |
6 | | :----------------------------------------------------------: | :----------------------------------------------------------: |
7 | | [tesseract](https://github.com/tesseract-ocr/tesseract)  |  |
8 | | [EasyOcr](https://github.com/JaidedAI/EasyOCR)  |  |
9 | | [OCRmyPDF](https://github.com/ocrmypdf/OCRmyPDF)  |  |
10 | | [pytesseract](https://github.com/madmaze/pytesseract)  |  |
11 |
12 |
13 |
14 |
15 |
16 | 使用 [ocrmypdf](https://ocrmypdf.readthedocs.io/en/latest/cookbook.html) ,根据[官方教程](https://ocrmypdf.readthedocs.io/en/latest/installation.html)安装。类 Unix 系统(Linux、macOS、WSL)会简单一点,Windows 复杂一点。
17 |
18 | 本质上使用的是谷歌的tesseract工具,同时也有一个支持python的 https://github.com/madmaze/pytesseract
19 |
20 | 不过上述默认都不支持手写字,在学术上基于 Transformer 的技术也出现了,例如 [TrOCR](https://github.com/microsoft/unilm/tree/master/trocr) 和 [handwriting-ocr](https://github.com/Breta01/handwriting-ocr)。
21 |
22 | 使用方法:直接在命令行执行
23 |
24 | ```shell
25 | ocrmypdf --pages 1 --optimize 0 --output-type none --sidecar output.txt input.pdf -
26 | ```
27 |
28 | > --pages 1 是仅处理 pdf 的第一页,--optimize 0 禁用页面优化,--output-type none是不输出额外的一个pdf(需要配合最后的 -)
29 | >
30 | > 还可以加上 --quiet 不让打印过程
31 |
32 | 会在本地保存一个 output.txt 里面存有识别的文字。
33 |
34 | > 默认的是英文,可以替换为其他语言
35 |
36 |
37 |
38 | 如果想要写入python,注意如果简单加进去会报一个错误 `python stdout is connected to a terminal. Please redirect stdout to a file.`下面的程序中已经修复了。
39 |
40 | ```python
41 | import os
42 | import subprocess
43 | import shlex
44 |
45 | file = 'test.pdf'
46 | command = f"ocrmypdf --deskew --rotate-pages --rotate-pages-threshold 5 --output-type none --sidecar ocr_output.txt {file} -"
47 | command_args = shlex.split(command)
48 |
49 | with open('log', "w") as outfile:
50 | subprocess.run(command_args, stdout=outfile)
51 | os.remove('log')
52 | ```
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/OCR/easy_ocr.py:
--------------------------------------------------------------------------------
1 |
"""Run EasyOCR on test.jpg and print the recognized text."""
import easyocr

# Creating the Reader loads the model into memory; this needs to run only once.
reader = easyocr.Reader(['ch_sim', 'en'], gpu=False)

# detail=0 is passed to readtext; presumably this yields plain text results
# rather than detailed entries -- confirm against the EasyOCR documentation.
result = reader.readtext('test.jpg', detail=0)

# NOTE: the results are printed as returned; no spaces are stripped here.
print(result)
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/OCR/pdf2ocr.py:
--------------------------------------------------------------------------------
"""Run ocrmypdf on one PDF and keep only the recognized text (ocr_output.txt).

ocrmypdf is invoked with ``--output-type none`` and ``-`` as the output file,
so no PDF is produced; the text goes to the ``--sidecar`` file.  stdout is
redirected to a throw-away file named ``log`` which is deleted afterwards.
"""
import os
import subprocess
import shlex

file = '2.pdf'
# Quote the file name before it is spliced into the command string, so a name
# containing spaces or quotes still arrives as a single argument after split().
command = (
    "ocrmypdf --deskew --rotate-pages --rotate-pages-threshold 5 "
    f"--output-type none --sidecar ocr_output.txt {shlex.quote(file)} -"
)
command_args = shlex.split(command)

try:
    with open('log', "w") as outfile:
        subprocess.run(command_args, stdout=outfile)
finally:
    # Remove the scratch file even when ocrmypdf is missing or fails.
    if os.path.exists('log'):
        os.remove('log')
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/OCR/tesseract_test.py:
--------------------------------------------------------------------------------
1 |
"""OCR the first embedded image on the last page of wrong2.pdf with Tesseract."""
import pytesseract
from pikepdf import Pdf, PdfImage

with Pdf.open('wrong2.pdf') as pdf:
    page = pdf.pages[-1]
    keyimage = list(page.images.keys())
    if not keyimage:
        # The original indexed keyimage[0] unconditionally and died with an
        # IndexError on pages that carry no embedded image.
        raise SystemExit("wrong2.pdf: the last page contains no embedded image")
    rawimage = page.images[keyimage[0]]
    pdfimage = PdfImage(rawimage)
    # Convert to a PIL image while the PDF is still open.
    # NOTE(review): pikepdf objects may not be usable after the Pdf is closed.
    img = pdfimage.as_pil_image()

# To OCR only a region, crop first, e.g. img.crop((0, 0, img.width / 2, img.height / 3)).
print(pytesseract.image_to_string(img, lang='chi_sim'))
# Orientation/script detection is available via pytesseract.image_to_osd(img).
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/OCR/transformer_ocr.py:
--------------------------------------------------------------------------------
"""Recognize the text in test.png with the TrOCR handwritten model."""
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image
import requests

from pikepdf import Pdf, PdfImage

# The input could instead be taken from a PDF: open it with pikepdf, wrap the
# first image of a page in PdfImage and call .as_pil_image().convert("RGB").
img = Image.open('test.png').convert("RGB")

# The processor prepares the image and decodes token ids; the model generates the ids.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')

encoded = processor(images=img, return_tensors="pt")
pixel_values = encoded.pixel_values

generated_ids = model.generate(pixel_values)
decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
generated_text = decoded[0]

print(generated_text)
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/QR_code/README.md:
--------------------------------------------------------------------------------
1 | # 识别二维码
2 |
3 |
4 |
5 | > 写在这里,需要先将pdf转换成相应的图像格式
6 |
7 | 使用 [pyzbar](https://pypi.org/project/pyzbar/) 来帮助识别二维码,是经典的 **zbar** 在python3上的支持。
8 |
9 | 根据官方安装步骤,针对mac的错误`ImportError: Unable to find zbar shared library`需要额外的:
10 |
11 | ```shell
12 | mkdir ~/lib
13 | ln -s $(brew --prefix zbar)/lib/libzbar.dylib ~/lib/libzbar.dylib
14 | ```
15 |
16 |
17 |
18 | 基本用法(搬运自主页)
19 |
20 | ```python
21 | # 使用 PIL.Image 类型
22 | from pyzbar.pyzbar import decode
23 | from PIL import Image
24 |
25 | QR_info = decode(Image.open('name.png'))
26 | QR_data = QR_info[0].data.decode()
27 | ```
28 |
29 | ```python
30 | # 使用 cv2 numpy.ndarray类型
31 | from pyzbar.pyzbar import decode
32 | import cv2
33 |
34 | QR_info = decode(cv2.imread('name.png'))
35 | QR_data = QR_info[0].data.decode()
36 | ```
37 |
38 |
39 |
40 | 和pdf搭配起来的用法
41 |
42 | ```python
43 | from pikepdf import Pdf, PdfImage
44 | from pyzbar.pyzbar import decode
45 |
46 | with Pdf.open('name.pdf') as pdf:
47 | page = pdf.pages[0]
48 | keyimage = list(page.images.keys())
49 | rawimage = page.images[keyimage[0]]
50 | pdfimage = PdfImage(rawimage)
51 |
52 | QR_info = decode(pdfimage.as_pil_image())
53 | if QR_info:
54 |     QR_data = QR_info[0].data.decode()
55 | ```
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/QR_code/zbar_img.py:
--------------------------------------------------------------------------------
"""Decode every barcode / QR code found in 111.png and print its payload."""
from pyzbar.pyzbar import decode
from PIL import Image, ImageEnhance
from pikepdf import Pdf, PdfImage
import cv2

# cv2.imread returns None instead of raising when the file is missing or
# unreadable, so check explicitly before handing the image to pyzbar.
img = cv2.imread('111.png')
if img is None:
    raise SystemExit("cannot read image '111.png'")

decoded_data = decode(img)

# One entry per detected code; the payload is bytes and is shown as UTF-8 text.
for txt in decoded_data:
    barcodeData = txt.data.decode("utf-8")
    print(barcodeData)
27 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/QR_code/zbar_pdf.py:
--------------------------------------------------------------------------------
"""Decode the barcodes / QR codes in the first embedded image of 2.pdf's first page."""
from pyzbar.pyzbar import decode
from PIL import Image
from pikepdf import Pdf, PdfImage


# Use a context manager so the PDF is closed even if extraction fails.
with Pdf.open('2.pdf') as pdf:
    page = pdf.pages[0]
    keyimage = list(page.images.keys())
    if not keyimage:
        # The original indexed keyimage[0] unconditionally (IndexError on
        # pages without an embedded image).
        raise SystemExit("2.pdf: the first page contains no embedded image")
    rawimage = page.images[keyimage[0]]
    pdfimage = PdfImage(rawimage)
    img = pdfimage.as_pil_image()

# To scan only part of the page, crop first, e.g. img.crop((0, h / 2, w, h)).
decoded_data = decode(img)

print(decoded_data)
25 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/README.md:
--------------------------------------------------------------------------------
1 | # PDF 相关处理脚本
2 |
3 | > 这里是一些基础用法,以及其他 [PDF OCR 识别](https://github.com/yzy1996/Python-Code/tree/master/Data-Processing/for_pdf/OCR);[QR_code](https://github.com/yzy1996/Python-Code/tree/master/Data-Processing/for_pdf/QR_code)
4 |
5 |
6 |
7 | **先说有哪些热门的库**
8 |
9 | | 名称 | Stars | 最后更新时间(不带年份则表今年) | 特点 |
10 | | :------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
11 | | [PyPDF2](https://github.com/py-pdf/PyPDF2) |  |  | 基于pyPDF,纯python,支持超过10年了 |
12 | | [pdfminer.six](https://github.com/pdfminer/pdfminer.six) |  |  | 基于PDFMiner,extracting information |
13 | | [pdfplumber](https://github.com/jsvine/pdfplumber) |  |  | Built on [pdfminer.six](https://github.com/pdfminer/pdfminer.six),for detailed information about text character, rectangle, and line |
14 | | [PyMuPDF](https://github.com/pymupdf/PyMuPDF) |  |  | 基于[MuPDF](https://mupdf.com/),付费,C语言依赖 |
15 | | [pikepdf](https://github.com/pikepdf/pikepdf) |  |  | 基于[QPDF](https://github.com/qpdf/qpdf),C语言依赖 |
16 | | [pdfx](https://github.com/metachris/pdfx) |  |  | Extract references (pdf, url, doi, arxiv) and metadata from a PDF |
17 |
18 | **再说我比较推荐的库**,[PyPDF2](https://github.com/py-pdf/PyPDF2) and [pikepdf](https://github.com/pikepdf/pikepdf)
19 |
20 | **安装方式**:
21 |
22 | ```shell
23 | pip install PyPDF2
24 | pip install pikepdf
25 | ```
26 |
27 | **官网教程已经非常详细了**,在这里我只展示几个我常使用的脚本
28 |
29 | - [pdf2img](#pdf2img)
30 | - [extract_text](#extract_text)
31 | - [extract_annotation](#extract_annotation)
32 |
33 |
34 |
35 | ## pdf2img
36 |
37 | ```python
38 | from pikepdf import Pdf, PdfImage
39 |
40 | with Pdf.open('1.pdf') as pdf:
41 | page = pdf.pages[0]
42 | keyimage = list(page.images.keys())
43 | rawimage = page.images[keyimage[0]]
44 | pdfimage = PdfImage(rawimage)
45 |
46 | # 保存为图片文件
47 | pdfimage.extract_to(fileprefix='test')
48 |
49 | # 保存为PIL.image
50 | img = pdfimage.as_pil_image()
51 | img.show()
52 | ```
53 |
54 |
55 |
56 | ## extract_text
57 |
58 | ```python
59 | from PyPDF2 import PdfReader
60 |
61 | reader = PdfReader("1.pdf")
62 | page = reader.pages[0]
63 | text = page.extract_text()
64 |
65 | print(text)
66 | ```
67 |
68 |
69 |
70 | ## extract_annotation
71 |
72 | ```python
73 | from PyPDF2 import PdfReader
74 |
75 | reader = PdfReader("commented.pdf")
76 |
77 | for page in reader.pages:
78 | if "/Annots" in page:
79 | for annot in page["/Annots"]:
80 | obj = annot.get_object()
81 | annotation = {"subtype": obj["/Subtype"], "location": obj["/Rect"]}
82 | print(annotation)
83 | ```
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/Tools/README.md:
--------------------------------------------------------------------------------
1 | # 识别边框
2 |
3 | # 识别横竖线
4 |
5 | # 识别特定区域
6 |
7 |
8 |
9 |
10 |
11 | https://github.com/wxwwt/opencv-picture-to-excel
12 |
13 | https://blog.csdn.net/muxiong0308/article/details/80969355
14 |
15 | https://github.com/muxiong0308/form_pic_ocr
16 |
17 | https://juejin.cn/post/6844904078032666631
18 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/pdf2excel.py:
--------------------------------------------------------------------------------
"""Extract every table of a PDF with pdfplumber and save the rows to an .xls file.

Usage: python pdf2excel.py NAME      (reads NAME.pdf, writes NAME.xls)
"""
import pdfplumber
import xlrd
import xlwt
from xlutils.copy import copy
import sys


def write_excel_xls(path, sheet_name, value):
    """Create a new .xls workbook at *path* with one sheet filled from *value*.

    value is a sequence of rows; each row is a sequence of cell values.
    An existing file at *path* is overwritten.
    """
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet(sheet_name)
    for i, row in enumerate(value):
        for j, cell in enumerate(row):
            sheet.write(i, j, cell)
    workbook.save(path)
    print("xls格式表格写入数据成功!")


def write_excel_xls_append(path, value):
    """Append the rows in *value* below the existing rows of the first sheet.

    The workbook is read with xlrd, copied into a writable xlwt workbook via
    xlutils, extended, and saved back to *path*.
    """
    workbook = xlrd.open_workbook(path)
    sheets = workbook.sheet_names()
    worksheet = workbook.sheet_by_name(sheets[0])
    rows_old = worksheet.nrows  # number of rows already present
    new_workbook = copy(workbook)  # xlrd (read-only) -> xlwt (writable)
    new_worksheet = new_workbook.get_sheet(0)
    for i, row in enumerate(value):
        for j, cell in enumerate(row):
            # New data starts right after the existing rows.
            new_worksheet.write(i + rows_old, j, cell)
    new_workbook.save(path)
    print("xls格式表格【追加】写入数据成功!")


if len(sys.argv) < 2:
    sys.exit("usage: python pdf2excel.py NAME   (NAME.pdf -> NAME.xls)")

name = sys.argv[1]

path = name + '.pdf'
book_name_xls = name + '.xls'
sheet_name_xls = '表1'

# Collect all rows first and write the workbook once.  The original appended
# row by row, re-reading and re-saving the whole .xls file for every row.
rows = []
with pdfplumber.open(path) as pdf:
    for page in pdf.pages:
        for table in page.extract_tables():
            for row in table:
                print(row)
                rows.append(row)

write_excel_xls(book_name_xls, sheet_name_xls, rows)
54 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/pdf2img.py:
--------------------------------------------------------------------------------
"""Stack the images embedded in 1.pdf into one tall PNG (output.png).

PyPDF2 cannot rasterize pages, so only embedded raster images are exported;
text and vector graphics are not rendered.  The original script fed each
page's content stream (drawing operators, not image data) to ``Image.open``
and used the ``getNumPages`` / ``getPage`` / ``mediaBox.getWidth`` calls that
PyPDF2 3.x no longer provides.
"""
import io
from PyPDF2 import PdfReader
from PIL import Image

# Collect every embedded image, page by page, in document order.
images = []
with open('1.pdf', 'rb') as pdf_file:
    pdf_reader = PdfReader(pdf_file)
    for page in pdf_reader.pages:
        for embedded in page.images:
            # .data holds the encoded image bytes; convert() forces a full
            # decode while the bytes are at hand.
            images.append(Image.open(io.BytesIO(embedded.data)).convert('RGB'))

if not images:
    raise SystemExit("1.pdf contains no embedded images to export")

# Canvas: as wide as the widest image, as tall as all images together.
width = max(img.width for img in images)
height = sum(img.height for img in images)
image = Image.new('RGB', (width, height))

y = 0
for img in images:
    x = (width - img.width) // 2  # center narrower images horizontally
    image.paste(img, (x, y))
    y += img.height

# Convert the image to PNG format and save it
image.save('output.png', 'PNG')
30 |
--------------------------------------------------------------------------------
/Data-Processing/for_pdf/pdf_utils.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "from pathlib import Path\n",
10 | "from PyPDF2 import PdfReader\n",
11 | "from pikepdf import Pdf, PdfImage\n",
12 | "import pytesseract\n",
13 | "import re "
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {},
19 | "source": [
20 | "# 提取pdf内容"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from PyPDF2 import PdfReader\n",
30 | "\n",
31 | "reader = PdfReader(\"test.pdf\")\n",
32 | "\n",
33 | "for page in reader.pages:\n",
34 | " print(page['/Annots'])\n",
35 | " break\n",
36 | " # if \"/Annots\" in page:\n",
37 | " # for annot in page[\"/Annots\"]:\n",
38 | " # obj = annot.get_object()\n",
39 | " # # annotation = {\"subtype\": obj[\"/Subtype\"], \"location\": obj[\"/Rect\"]}\n",
40 | " # # if subtype['/A']['/S'] == '/GoTo':\n",
41 | " # # print(subtype['/A']['/D'])\n",
42 | "\n",
43 | " # print(obj)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "with Pdf.open('CR-PI-2208-3391 ok.pdf') as pdf:\n",
53 | "\n",
54 | " page = pdf.pages[-1]\n",
55 | " keyimage = list(page.images.keys())\n",
56 | " rawimage = page.images[keyimage[0]]\n",
57 | " pdfimage = PdfImage(rawimage)\n",
58 | " img = pdfimage.as_pil_image()\n",
59 | "\n",
60 | " if pdfimage.width / pdfimage.height < 0.6:\n",
61 | " rotation_degrees = pytesseract.image_to_osd(img).split('\\n')[1][-2:]\n",
62 | " if rotation_degrees != '0':\n",
63 | " img = img.rotate(int(rotation_degrees),expand=True)\n",
64 | "\n",
65 | " print(pdfimage.width / pdfimage.height)\n",
66 | "\n",
67 | " img = img.crop((0, 0, img.size[0] / 1.8, img.size[1] / 2))\n",
68 | "\n",
69 | " text = pytesseract.image_to_string(img, lang='chi_sim')\n",
70 | "\n",
71 | " print(text)"
72 | ]
73 | },
74 | {
75 | "cell_type": "markdown",
76 | "metadata": {},
77 | "source": [
78 | "# PDF to Image"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from pikepdf import Pdf, PdfImage\n",
88 | "\n",
89 | "with Pdf.open('1.pdf') as pdf:\n",
90 | " page = pdf.pages[0]\n",
91 | " keyimage = list(page.images.keys())\n",
92 | " rawimage = page.images[keyimage[0]]\n",
93 | " pdfimage = PdfImage(rawimage)\n",
94 | "\n",
95 | " # 保存为图片文件\n",
96 | " pdfimage.extract_to(fileprefix='test')\n",
97 | "\n",
98 | " # 保存为PIL.image\n",
99 | " img = pdfimage.as_pil_image()\n",
100 | " img.show()"
101 | ]
102 | }
103 | ],
104 | "metadata": {
105 | "kernelspec": {
106 | "display_name": "Python 3.9.12 ('base')",
107 | "language": "python",
108 | "name": "python3"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.9.12"
121 | },
122 | "orig_nbformat": 4,
123 | "vscode": {
124 | "interpreter": {
125 | "hash": "40d3a090f54c6569ab1632332b64b2c03c39dcf918b08424e98f38b5ae0af88f"
126 | }
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 2
131 | }
132 |
--------------------------------------------------------------------------------
/Data-Processing/for_txt/11.txt:
--------------------------------------------------------------------------------
1 | 1
2 | 2
3 | 3
4 |
--------------------------------------------------------------------------------
/Data-Processing/for_txt/README.md:
--------------------------------------------------------------------------------
1 | For txt
2 |
3 |
4 |
5 |
6 |  
7 |
8 |
9 |
10 | ## Usage
11 |
12 | ### Without Package
13 |
14 | ```python
15 | with open('example.txt') as f:
16 | lines = (line.strip() for line in f)
17 | for line in lines:
18 | print(line)
19 |
20 | >>> 37 52 2
21 | 49 49 4
22 | 52 64 4
23 | 20 26 1
24 | 40 30 3
25 | ```
26 |
27 |
28 |
29 | ### With Package
30 |
31 | `numpy.genfromtxt(fname, dtype=float, delimiter=None, encoding='bytes')`
32 |
33 | `returns out: ndarray`
34 |
35 |
36 |
37 | ### Something Useful
38 |
39 | 1. `str.split(str="", num=string.count(str)) `
40 |
41 | * str -- 分隔符,默认为所有的空字符,包括空格、换行(\n)、制表符(\t)等
42 | * num -- 分割次数。默认为 -1, 即分隔所有
43 |
44 | `returns out: list` (a list of the split substrings)
45 |
46 |
47 |
48 | 2. `map(function, iterable)` 会根据提供的函数对指定序列做映射
49 |
50 | 可以用 `int` ,来将字符串变整数
51 |
52 | ## Example
53 |
54 | You can see this example [txtfile](./example.txt)
55 |
56 | `37 52 2`
57 | `49 49 4`
58 | `52 64 4`
59 | `20 26 1`
60 | `40 30 3`
61 |
62 | ```python
63 | import numpy as np
64 |
65 | data = np.genfromtxt('example.txt')
66 | >>> [[37. 52. 2.]
67 | [49. 49. 4.]
68 | [52. 64. 4.]
69 | [20. 26. 1.]
70 | [40. 30. 3.]]
71 |
72 | print(data[0][0])
73 | >>> 37.0
74 | ```
75 |
76 | Another example [txtfile](./example1.txt), but I really recommend using `csv` to store data separated by `,`
77 |
78 | `37,52,2`
79 | `49,49,4`
80 | `52,64,4`
81 | `20,26,1`
82 | `40,30,3`
83 |
84 | ```python
85 | import numpy as np
86 |
87 | data = np.genfromtxt('example1.txt')
88 | >>> [nan nan nan nan nan]
89 |
90 | data = np.genfromtxt('example1.txt', dtype='unicode')
91 | >>> ['37,52,2' '49,49,4' '52,64,4' '20,26,1' '40,30,3']
92 |
93 |
94 | data = np.genfromtxt('example1.txt', dtype='unicode')
95 | data1 = ()
96 |
97 | for i in range(len(data)):
98 | data1 += tuple(map(int, data[i].split(',')))
99 |
100 | data = np.reshape(data1, (5,3))
101 | >>> [[37 52 2]
102 | [49 49 4]
103 | [52 64 4]
104 | [20 26 1]
105 | [40 30 3]]
106 | ```
107 |
108 |
--------------------------------------------------------------------------------
/Data-Processing/for_txt/add_content.py:
--------------------------------------------------------------------------------
# Add content to every line of a txt file.
# NOTE(review): the script currently only prints each stripped line; nothing
# is appended yet.

with open('demo.txt') as f:
    for raw_line in f:
        print(raw_line.strip())
--------------------------------------------------------------------------------
/Data-Processing/for_txt/example.txt:
--------------------------------------------------------------------------------
1 | 37 52 2
2 | 49 49 4
3 | 52 64 4
4 | 20 26 1
5 | 40 30 3
--------------------------------------------------------------------------------
/Data-Processing/for_txt/example1.txt:
--------------------------------------------------------------------------------
1 | 37,52,2
2 | 49,49,4
3 | 52,64,4
4 | 20,26,1
5 | 40,30,3
--------------------------------------------------------------------------------
/Data-Processing/for_txt/how_many_lines.py:
--------------------------------------------------------------------------------
# Print how many lines a txt file has.
# Using "with open" means the file is closed automatically when the block ends.
with open(r'demo.txt', 'rt') as f:
    count = sum(1 for _ in f)
print(count)


# Manual alternative without a with-statement (must be closed explicitly):
# f = open(r'somefile.txt', 'rt')
# data = f.read()
# f.close()
--------------------------------------------------------------------------------
/Data-Processing/for_txt/read_data.py:
--------------------------------------------------------------------------------
# Read records from a txt file into a list of integer lists.
# Each data line is a parenthesised, comma-separated tuple, after one header:
# (id,length,speed,channel,from,to,isDuplex)
# (5000, 10, 5, 1, 1, 2, 1)
# (5001, 10, 5, 1, 2, 3, 1)
# (5002, 10, 5, 1, 3, 4, 1)
# (5003, 10, 5, 1, 4, 5, 1)
# (5004, 10, 5, 1, 5, 6, 1)

with open(r'demo.txt') as f:
    next(f, None)  # skip the header line; tolerate an empty file
    lines = f.readlines()

data = []
for line in lines:
    # Drop surrounding whitespace first so "\r\n" endings do not hide the
    # closing parenthesis, then drop the parentheses themselves.
    fields = line.strip().strip('()').split(',')
    if fields == ['']:
        continue  # blank line
    data.append(fields)
# int() ignores the spaces left around each field.
data = [list(map(int, x)) for x in data]
19 |
--------------------------------------------------------------------------------
/Data-Processing/for_txt/read_numpy.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | data = np.genfromtxt('example.txt')
4 |
--------------------------------------------------------------------------------
/Data-Processing/for_txt/read_txt.py:
--------------------------------------------------------------------------------
# Print every line of example.txt with surrounding whitespace removed.
with open('example.txt') as f:
    # Iterating the file object reads it lazily, one line at a time.
    for raw_line in f:
        print(raw_line.strip())
--------------------------------------------------------------------------------
/Data-Processing/for_txt/test.py:
--------------------------------------------------------------------------------
1 | number = 7
2 | print(f"problem: {number}")
--------------------------------------------------------------------------------
/Data-Processing/for_txt/write_txt.py:
--------------------------------------------------------------------------------
1 | # 将列表写入txt
2 | import numpy as np
3 | a = [1, 2, 3]
4 |
5 | np.savetxt('11.txt', a, fmt='%i', delimiter=',')
6 |
--------------------------------------------------------------------------------
/Data-Processing/print/print_in_1line.py:
--------------------------------------------------------------------------------
1 | # 同一行输出
2 |
3 | import time
# Count down from 30 to 11, rewriting the same console line every 0.4 s.
for remaining in range(30, 10, -1):
    time.sleep(0.4)
    print('\r', str(remaining).ljust(10), end='')


# Alternative demo: a growing progress bar.
# import sys,time
# for i in range(20):
#     print('#',end='',flush=True)
#     time.sleep(0.4)

# '\r' moves the cursor back to the start of the current line.
# flush defaults to False: output appears only when the buffer is full or
# everything has been collected, and is then printed in one go.
# flush=True forces a refresh so the text shows up immediately.

# end='\n' is the default, which is why print normally ends with a newline.
# The examples set end to an empty string, so no newline is emitted.
--------------------------------------------------------------------------------
/Data-Processing/remove_string_spaces.py:
--------------------------------------------------------------------------------
# Remove every whitespace character from a string.
s = ' I love u forerver !'
result = ''.join(ch for ch in s if not ch.isspace())
print(result)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 crazyang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Messy/A.py:
--------------------------------------------------------------------------------
1 | import math
2 |
def heuristic_distace(Neighbour_node,target_node):
    """Return the Manhattan distance between two [row, column] nodes."""
    row_gap = abs(Neighbour_node[0] - target_node[0])
    column_gap = abs(Neighbour_node[1] - target_node[1])
    return row_gap + column_gap
6 |
def go_around(direction):
    """Return the cost of one step towards a neighbour of the current cell.

    Args:
        direction: Index 0-8 into the row-major 3x3 neighbourhood around the
            current cell (0, 2, 6, 8 are the corners; 4 is the cell itself).

    Returns:
        1.4 for a diagonal step, 1 for a straight step (and for index 4),
        or None for any other index.
    """
    box_length = 1
    diagonal_line = box_length * 1.4
    if direction in (0, 2, 6, 8):
        return diagonal_line
    if direction in (1, 3, 4, 5, 7):
        # Straight moves cost one box length, not the diagonal length.
        return box_length
    return None
14 |
def find_coordinate(map,symble):
    """Return the [row, column] position of every cell equal to symble.

    Positions are listed in row-major order; the list is empty when the
    symbol does not occur in the grid.
    """
    return [
        [row_index, column_index]
        for row_index, row in enumerate(map)
        if symble in row
        for column_index, cell in enumerate(row)
        if symble == cell
    ]
26 |
27 | map =[[".", ".", ".", "#", ".", "#", ".", ".", ".", "."],
28 | [".", ".", "#", ".", ".", "#", ".", "#", ".", "#"],
29 | ["s", ".", "#", ".", "#", ".", "#", ".", ".", "."],
30 | [".", "#", "#", ".", ".", ".", ".", ".", "#", "."],
31 | [".", ".", ".", ".", "#", "#", ".", ".", "#", "."],
32 | [".", "#", ".", ".", ".", ".", "#", ".", ".", "."],
33 | [".", "#", ".", ".", ".", "#", "#", ".", "#", "."],
34 | [".", ".", ".", ".", ".", ".", ".", ".", "#", "."],
35 | [".", "#", "#", ".", ".", ".", "#", ".", ".", "."],
36 | [".", ".", ".", "#", "#", "#", ".", ".", "#", "f"],
37 | ["#", "#", ".", ".", "#", "#", "#", ".", "#", "."],
38 | [".", "#", "#", ".", ".", ".", "#", ".", ".", "."],
39 | [".", ".", ".", ".", "#", "#", ".", ".", "#", "."]]
40 |
41 | #these datas are store in the form of list in a singal list
42 |
# Greedy grid walk from "s" to "f": at every step move to the free neighbour
# with the smallest (Manhattan distance to target + step cost).
obstacle = find_coordinate(map,"#")
start_node = find_coordinate(map,"s")[0]
target_node = find_coordinate(map,"f")[0]
current_node = start_node
path_vertices = [start_node]
Neighbour_vertices = []

BLOCKED = 10000          # score for cells that must never be chosen
row_count = len(map)
column_count = len(map[0])   # columns are bounded by row width, not row count

while current_node != target_node:

    x_coordinate = current_node[0]
    y_coordinate = current_node[1]
    # Row-major 3x3 window around the current cell; index 4 is the cell itself.
    Neighbour_vertices = [[x_coordinate + row_step, y_coordinate + column_step]
                          for row_step in (-1, 0, 1)
                          for column_step in (-1, 0, 1)]

    F = []
    for index, value in enumerate(Neighbour_vertices):
        inside = 0 <= value[0] < row_count and 0 <= value[1] < column_count
        if inside and value not in obstacle and value not in path_vertices:
            F.append(heuristic_distace(value, target_node) + go_around(index))
        else:
            F.append(BLOCKED)
    print(F)

    best_score = min(F)
    if best_score >= BLOCKED:
        # Every neighbour is a wall, off the grid or already visited: stop
        # instead of stepping onto a blocked cell and looping forever.
        print("there is no route between")
        break

    current_node = Neighbour_vertices[F.index(best_score)]
    print(current_node)

    path_vertices.append(current_node)

print(path_vertices)
90 |
--------------------------------------------------------------------------------
/Messy/DataTransform.py:
--------------------------------------------------------------------------------
1 | from xml.etree.ElementTree import parse
2 | import os
3 |
# Convert every annotation XML file in db_dir into a txt file with one
# "label,xmin,ymin,xmax,ymax" line per bounding box.
db_dir = './data/VMS/'  # change this to your own directory
anno_path = db_dir
# Create the directory before listing it: os.listdir raises on a missing path.
os.makedirs(anno_path, exist_ok=True)
fileList = os.listdir(anno_path)
for file_name in fileList:
    if not file_name.endswith('xml'):
        continue
    filePath = db_dir + file_name
    print(filePath)
    tree = parse(filePath)
    root = tree.getroot()
    parsed = []
    for annot in root.iter('annotation'):
        for obj in annot.findall('object'):
            label = obj.findtext('name')
            for coord in obj.findall('bndbox'):
                x_max = float(coord.findtext('xmax'))
                x_min = float(coord.findtext('xmin'))
                y_max = float(coord.findtext('ymax'))
                y_min = float(coord.findtext('ymin'))
                parsed.append(','.join(
                    str(item) for item in (label, x_min, y_min, x_max, y_max)))
    # "print >> fp" only works on Python 2; write() works everywhere, and the
    # with-statement closes the file even if a write fails.
    with open(anno_path + '/' + file_name[:-3] + 'txt', 'w') as fp:
        for elem in parsed:
            fp.write(elem + '\n')
30 |
--------------------------------------------------------------------------------
/Messy/Graph.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import networkx as nx
3 |
G = nx.Graph()
# Explicit node positions; only used when pos=pos is passed to nx.draw.
pos = {
    0: (0, 0),
    1: (1, 0),
    2: (0, 1),
    3: (1, 1),
    4: (0.5, 2.0),
}

# Finer-grained drawing calls, kept for reference:
# nx.draw_networkx_nodes(G, pos, node_color='r', node_size=20)
# nx.draw_networkx_edges(G, pos, edge_color='b', width=2)
# nx.draw_networkx_labels(G, pos, font_size=10)

# Add one edge from node 'x' to node 'y'.
G.add_edge('x', 'y')
# G.add_edges_from([(0,1),(1,3),(2,4)])

# A pos= argument selects the node layout; see help(nx.drawing.layout).
# Layouts include spring_layout (default), random_layout, circle_layout and
# shell_layout.
draw_options = {
    'with_labels': True,   # show node names
    'edge_color': 'b',     # b stands for blue
    'node_color': 'r',     # r stands for red
    'node_size': 1000,     # node size
    'width': 3,            # edge width
}
nx.draw(G, **draw_options)
plt.axis('off')
plt.show()
32 |
33 |
34 | # import matplotlib.pyplot as plt
35 | # import networkx as nx
36 |
37 | # G = nx.Graph()
38 | # 把地图绘制出来
39 | # def draw_map():
40 | # for i in range(len(road_label[0])):
41 | # G.add_edge(road_label[4][i],road_label[5][i])
42 |
43 | # nx.draw(G,
44 | # with_labels=True, #这个选项让节点有名称
45 | # edge_color='b', # b stands for blue!
46 | # # pos=pos,
47 | # node_color='r', # r = red
48 | # node_size=1000, # 节点大小
49 | # width=3, # 边的宽度
50 | # )
51 | # print(nx.shortest_path(G, 35, 35))
52 | # nx.draw_networkx_edge_labels # 可以给路加上路名
53 | # plt.axis('off')
54 | # plt.show()
55 |
--------------------------------------------------------------------------------
/Messy/GraphAlgorithms.py:
--------------------------------------------------------------------------------
1 | import classini
2 | import read
3 | # 路径规划算法集合
4 |
5 | # 生成邻接矩阵
6 | # input数据类型:列表list
7 | # output数据类型:列表List
def adjacency_matrix(list):
    """Build an adjacency matrix from a list of road objects.

    Args:
        list: Sequence of objects exposing integer ``begin_id`` and
            ``end_id`` attributes and a ``length`` attribute. (The parameter
            name shadows the builtin but is kept for existing callers.)

    Returns:
        A square list of lists. Entry [a][b] is the length of the road
        joining a and b, 0 on the diagonal and 1000 where no road exists.
        The side is the larger of the number of roads and the highest node
        id plus one, so every id indexes a valid cell.
    """
    roads = list
    Na = 1000  # "no connection" marker
    size = len(roads)
    for road in roads:
        size = max(size, road.begin_id + 1, road.end_id + 1)

    output_matrix = [[Na] * size for _ in range(size)]
    for i in range(size):
        output_matrix[i][i] = 0

    for road in roads:
        output_matrix[road.begin_id][road.end_id] = road.length
        # Mirror the entry so the road can be travelled in both directions.
        # NOTE(review): assumes every road is two-way - confirm with the data.
        output_matrix[road.end_id][road.begin_id] = road.length

    return output_matrix
19 |
20 |
21 |
if __name__ == '__main__':
    # Each input file is read as a list of columns; row i of every column
    # supplies the constructor arguments of object i.
    car_path = r'car.txt'
    carfile = read.read_input_file(car_path)
    car_list = [
        classini.Car(*[carfile[column][row] for column in range(5)])
        for row in range(len(carfile[0]))
    ]

    road_path = r'road.txt'
    roadfile = read.read_input_file(road_path)
    road_list = [
        classini.road(*[roadfile[column][row] for column in range(7)])
        for row in range(len(roadfile[0]))
    ]

    cross_path = r'cross.txt'
    crossfile = read.read_input_file(cross_path)
    cross_list = [
        classini.Cross(*[crossfile[column][row] for column in range(5)])
        for row in range(len(crossfile[0]))
    ]

    print(adjacency_matrix(road_list))
--------------------------------------------------------------------------------
/Messy/ListSearch.py:
--------------------------------------------------------------------------------
1 | # enumerate() 函数用于将一个可遍历的数据对象(如列表、元组或字符串)组合为一个索引序列,同时列出数据和数据下标
2 |
3 | map =[[".", ".", ".", "#", ".", "#", ".", ".", ".", "."],
4 | [".", ".", "#", ".", ".", "#", ".", "#", ".", "#"],
5 | ["s", ".", "#", ".", "#", ".", "#", ".", ".", "."],
6 | [".", "#", "#", ".", ".", ".", ".", ".", "#", "."],
7 | [".", ".", ".", ".", "#", "#", ".", ".", "#", "."],
8 | [".", "#", ".", ".", ".", ".", "#", ".", ".", "."],
9 | [".", "#", ".", ".", ".", "#", "#", ".", "#", "."],
10 | [".", ".", ".", ".", ".", ".", ".", ".", "#", "."],
11 | [".", "#", "#", ".", ".", ".", "#", ".", ".", "."],
12 | [".", ".", ".", "#", "#", "#", ".", ".", "#", "f"],
13 | ["#", "#", ".", ".", "#", "#", "#", ".", "#", "."],
14 | [".", "#", "#", ".", ".", ".", "#", ".", ".", "."],
15 | [".", ".", ".", ".", "#", "#", ".", ".", "#", "."]]
16 |
def find_coordinate(map,symble):
    """Return the [row, column] position of every cell equal to symble.

    Positions are listed in row-major order; the list is empty when the
    symbol does not occur in the grid.
    """
    return [
        [row_index, column_index]
        for row_index, row in enumerate(map)
        if symble in row
        for column_index, cell in enumerate(row)
        if symble == cell
    ]
27 |
28 | obstacle = find_coordinate(map,"#")
29 |
30 | print(obstacle)
--------------------------------------------------------------------------------
/Messy/labelImg.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import sys
4 | #import natsort
5 | import xml.etree.ElementTree as ET
6 | file_dir='D:\BD-label'
7 |
def file_name(file_dir):
    """Return the paths of all files ending in .xml under file_dir.

    The directory tree is walked recursively with os.walk; each result is
    the walked directory joined with the file name.
    """
    return [
        os.path.join(root, file)
        for root, dirs, files in os.walk(file_dir)
        for file in files
        if os.path.splitext(file)[1] == '.xml'
    ]
15 |
def _prompt_links(parent, tag, tail):
    """Add a <tag> child to parent and fill it with numbers typed by the user.

    Integers are read from stdin until 0 is entered; each one becomes a
    <numK> sub-element, where K counts the prompts so far.
    """
    group = ET.SubElement(parent, tag)
    group.text = "\n\t\t\t"
    group.tail = tail
    position = 0
    while True:
        position += 1
        # Python 3 input() returns a string; convert it so that the
        # comparison with 0 can actually end the loop.
        insert_num = int(input())
        if insert_num == 0:
            break
        entry = ET.SubElement(group, 'num' + str(position))
        entry.text = str(insert_num)
        entry.tail = "\n\t\t\t"


if __name__=='__main__':
    Num=file_name(file_dir)
    Num.sort()
    # NOTE(review): processing starts at index 241 and skips the last file;
    # this looks like a resumed labelling session - confirm before reuse.
    for i in range(241,len(Num)-1):  # one annotation file per iteration
        print(Num[i])
        # NOTE(review): assumes a Windows path with the file name as the third
        # backslash-separated component - confirm for nested folders.
        name=Num[i].split('\\')[2]
        print(name)
        tree = ET.parse(Num[i])
        root = tree.getroot()
        objects = root.findall("object")
        for count, obj in enumerate(objects, start=1):  # every object
            index_node = ET.SubElement(obj, 'index')
            index_node.text = str(count)
            index_node.tail = "\n\t\t"
            print("print No.%d %s 's father index.End with 0\n"%(count,obj.find('name').text))
            _prompt_links(obj, 'father', "\n\t\t")
            print("print No.%d %s's child num. End with 0\n" % (count,obj.find('name').text))
            _prompt_links(obj, 'child', "\n\t")
        # Written to the current working directory under the same file name.
        tree.write(name)
63 |
--------------------------------------------------------------------------------
/Messy/list_remove.py:
--------------------------------------------------------------------------------
1 | list = ['Google', 'Runoob', 'Taobao', 'Baidu']
2 | for i in range(len(list)):
3 | list.remove(list[0])
4 | print(list)
--------------------------------------------------------------------------------
/Python+Algorithm/Evolutionary-Algorithm/Match Phrase.py:
--------------------------------------------------------------------------------
1 | """
2 | Visualize Genetic Algorithm to match the target phrase.
3 |
4 | Visit my tutorial website for more: https://morvanzhou.github.io/tutorials/
5 | """
6 | import numpy as np
7 |
TARGET_PHRASE = 'You get it!'       # target DNA
POP_SIZE = 300                      # population size
CROSS_RATE = 0.4                    # mating probability (DNA crossover)
MUTATION_RATE = 0.01                # mutation probability
N_GENERATIONS = 1000

DNA_SIZE = len(TARGET_PHRASE)
# Character codes of the target phrase. np.fromstring on text is deprecated,
# so encode explicitly and copy the buffer to keep a writable array.
TARGET_ASCII = np.frombuffer(TARGET_PHRASE.encode('ascii'), dtype=np.uint8).copy()
ASCII_BOUND = [32, 126]             # inclusive range of character codes used as genes
17 |
18 |
class GA(object):
    """Genetic algorithm that evolves character-code strings towards TARGET_ASCII."""

    def __init__(self, DNA_size, DNA_bound, cross_rate, mutation_rate, pop_size):
        """Create a random population.

        Args:
            DNA_size: Number of genes (characters) per individual.
            DNA_bound: [low, high] inclusive range of gene values.
            cross_rate: Probability that an individual undergoes crossover.
            mutation_rate: Per-gene mutation probability.
            pop_size: Number of individuals.
        """
        self.DNA_size = DNA_size
        # Copy instead of incrementing the caller's list in place; the upper
        # limit is raised by one because np.random.randint excludes it.
        self.DNA_bound = [DNA_bound[0], DNA_bound[1] + 1]
        self.cross_rate = cross_rate
        self.mutate_rate = mutation_rate
        self.pop_size = pop_size

        # int8 so that the raw bytes of a row can be decoded as ASCII
        self.pop = np.random.randint(*self.DNA_bound, size=(pop_size, DNA_size)).astype(np.int8)

    def translateDNA(self, DNA):
        """Convert one individual's genes to a readable string."""
        # tobytes() replaces the removed ndarray.tostring().
        return DNA.tobytes().decode('ascii')

    def get_fitness(self):
        """Return, per individual, how many characters match the target."""
        match_count = (self.pop == TARGET_ASCII).sum(axis=1)
        return match_count

    def select(self):
        """Resample the population with probability proportional to fitness."""
        fitness = self.get_fitness() + 1e-4     # avoid an all-zero fitness vector
        idx = np.random.choice(np.arange(self.pop_size), size=self.pop_size, replace=True, p=fitness/fitness.sum())
        return self.pop[idx]

    def crossover(self, parent, pop):
        """Overwrite a random subset of parent's genes with a random mate's genes.

        parent is modified in place and returned.
        """
        if np.random.rand() < self.cross_rate:
            i_ = np.random.randint(0, self.pop_size, size=1)    # pick another individual from pop
            # Builtin bool replaces the removed np.bool alias.
            cross_points = np.random.randint(0, 2, self.DNA_size).astype(bool)   # crossover mask
            parent[cross_points] = pop[i_, cross_points]        # mate and produce one child
        return parent

    def mutate(self, child):
        """Replace each gene with a random value with probability mutate_rate."""
        for point in range(self.DNA_size):
            if np.random.rand() < self.mutate_rate:
                child[point] = np.random.randint(*self.DNA_bound)  # random character code
        return child

    def evolve(self):
        """Advance the population by one generation (select, cross, mutate)."""
        pop = self.select()
        pop_copy = pop.copy()   # mates are drawn from the unmodified selection
        for parent in pop:
            child = self.crossover(parent, pop_copy)
            child = self.mutate(child)
            parent[:] = child
        self.pop = pop
63 |
if __name__ == '__main__':
    ga = GA(
        DNA_size=DNA_SIZE,
        DNA_bound=ASCII_BOUND,
        cross_rate=CROSS_RATE,
        mutation_rate=MUTATION_RATE,
        pop_size=POP_SIZE,
    )

    # Print the best phrase of each generation until the target is matched
    # or the generation budget runs out.
    for generation in range(N_GENERATIONS):
        fitness = ga.get_fitness()
        best_DNA = ga.pop[np.argmax(fitness)]
        best_phrase = ga.translateDNA(best_DNA)
        print('Gen', generation, ': ', best_phrase)
        if best_phrase == TARGET_PHRASE:
            break
        ga.evolve()
76 |
--------------------------------------------------------------------------------
/Python+Algorithm/Evolutionary-Algorithm/genetic_algorithm.py:
--------------------------------------------------------------------------------
1 | """
2 | Visualize Genetic Algorithm to find a maximum point in a function.
3 |
4 | Visit my tutorial website for more: https://morvanzhou.github.io/tutorials/
5 | """
6 | import numpy as np
7 | import matplotlib.pyplot as plt
8 |
9 | DNA_SIZE = 10 # DNA length
10 | POP_SIZE = 100 # population size
11 | CROSS_RATE = 0.8 # mating probability (DNA crossover)
12 | MUTATION_RATE = 0.003 # mutation probability
13 | N_GENERATIONS = 200
14 | X_BOUND = [0, 5] # x upper and lower bounds
15 |
16 | # to find the maximum of this function
def F(x):
    """Objective function to maximise: sin(10x)*x + cos(2x)*x."""
    fast_wave = np.sin(10*x)
    slow_wave = np.cos(2*x)
    return fast_wave*x + slow_wave*x
19 |
20 |
21 | # find non-zero fitness for selection
def get_fitness(pred):
    """Shift the scores so the smallest becomes 1e-3 (strictly positive)."""
    lowest = np.min(pred)
    return pred + 1e-3 - lowest
23 |
24 |
25 | # convert binary DNA to decimal and normalize it to a range(0, 5)
def translateDNA(pop):
    """Decode binary DNA rows into x values inside X_BOUND.

    Each row of pop is read as a DNA_SIZE-bit unsigned integer (most
    significant bit first) and mapped linearly so that all zeros gives
    X_BOUND[0] and all ones gives X_BOUND[1].
    """
    bit_weights = 2 ** np.arange(DNA_SIZE)[::-1]
    fraction = pop.dot(bit_weights) / float(2**DNA_SIZE-1)
    # Honour the lower bound too; with X_BOUND[0] == 0 the result is unchanged.
    return X_BOUND[0] + fraction * (X_BOUND[1] - X_BOUND[0])
28 |
29 |
def select(pop, fitness):
    """Natural selection: resample POP_SIZE rows of pop, weighted by fitness."""
    probabilities = fitness/fitness.sum()
    candidates = np.arange(POP_SIZE)
    chosen = np.random.choice(candidates, size=POP_SIZE, replace=True,
                              p=probabilities)
    return pop[chosen]
34 |
35 |
def crossover(parent, pop):
    """Mating process: mix genes of a random individual into parent.

    With probability CROSS_RATE a random subset of parent's genes is
    overwritten with the genes of one randomly chosen row of pop. parent is
    modified in place and returned.
    """
    if np.random.rand() < CROSS_RATE:
        i_ = np.random.randint(0, POP_SIZE, size=1)     # pick another individual from pop
        # Builtin bool replaces the np.bool alias removed from recent NumPy.
        cross_points = np.random.randint(0, 2, size=DNA_SIZE).astype(bool)   # crossover mask
        parent[cross_points] = pop[i_, cross_points]    # mate and produce one child
    return parent
42 |
43 |
def mutate(child):
    """Flip each gene of child with probability MUTATION_RATE, in place."""
    for point in range(DNA_SIZE):
        if np.random.rand() >= MUTATION_RATE:
            continue
        # A 0 becomes 1; any other value becomes 0.
        if child[point] == 0:
            child[point] = 1
        else:
            child[point] = 0
    return child
49 |
50 |
# Random initial population: POP_SIZE individuals of DNA_SIZE bits each.
pop = np.random.randint(2, size=(POP_SIZE, DNA_SIZE))

# Interactive plot of the objective function over X_BOUND.
plt.ion()
x = np.linspace(*X_BOUND, 200)
plt.plot(x, F(x))

sca = None  # scatter artist of the previous generation, if any
for _ in range(N_GENERATIONS):
    # Objective value of every individual.
    F_values = F(translateDNA(pop))

    # Redraw the population on top of the curve.
    if sca is not None:
        sca.remove()
    sca = plt.scatter(translateDNA(pop), F_values, s=200, lw=0, c='red', alpha=0.5)
    plt.pause(0.05)

    # GA part (evolution)
    fitness = get_fitness(F_values)
    print("Most fitted DNA: ", pop[np.argmax(fitness), :])
    pop = select(pop, fitness)
    pop_copy = pop.copy()   # mates come from the unmodified selection
    for parent in pop:
        child = crossover(parent, pop_copy)
        child = mutate(child)
        parent[:] = child   # the parent is replaced by its child

plt.ioff()
plt.show()
76 |
77 |
78 |
--------------------------------------------------------------------------------
/Python+Algorithm/Geometric/point_oblique_straight_point.py:
--------------------------------------------------------------------------------
1 | # 解算出两线段的交点
2 | # 采用 一点和斜率,然后两直线求解
3 |
4 |
class Point(object):
    """A 2-D point with x and y coordinates."""

    def __init__(self, x=0, y=0):
        self.x = x
        self.y = y


# A straight line defined by one point and a slope
class Line(object):
    """A line through point p with slope k."""

    def __init__(self, p, k):
        self.p = p
        self.k = k


# Solve for the intersection of two lines
def get_cross_point(l1, l2):
    """Return the intersection Point of two point-slope lines.

    Each line is written as y = k*x + b with b = p.y - k*p.x; the two
    equations are then solved for x and y.

    Raises:
        ZeroDivisionError: if both lines have the same slope (parallel).
    """
    intercept1 = l1.p.y - l1.k * l1.p.x
    intercept2 = l2.p.y - l2.k * l2.p.x
    cross_point = Point()
    # x = (b2 - b1) / (k1 - k2); the earlier form added the terms of line 1
    # instead of subtracting them and was right only when b1 happened to be 0.
    cross_point.x = (intercept2 - intercept1) * 1.0 / (l1.k - l2.k)
    cross_point.y = (l1.k * intercept2 - l2.k * intercept1) * 1.0 / (l1.k - l2.k)
    return cross_point
26 |
27 |
if __name__ == '__main__':
    # Two lines through (1, 1) with slopes 1 and -1.
    line1 = Line(Point(1, 1), 1)
    line2 = Line(Point(1, 1), -1)

    cross_point = get_cross_point(line1, line2)
    print("Cross point:", cross_point.x, cross_point.y)
39 |
--------------------------------------------------------------------------------
/Python+Algorithm/Geometric/two_point_straight_point.py:
--------------------------------------------------------------------------------
1 | # 解算出两线段的交点
2 | # 采用 两点确定一条直线,然后两直线求解
3 |
4 |
class Point(object):
    """A 2-D point with x and y coordinates."""

    def __init__(self, x=0, y=0):
        self.x, self.y = x, y


# A straight line defined by two points
class Line(object):
    """A line through the points p1 and p2."""

    def __init__(self, p1, p2):
        self.p1, self.p2 = p1, p2


# Compute the coefficients of the line
def get_line_parameter(line):
    """Store on line the coefficients a, b, c of a*x + b*y + c = 0."""
    start, end = line.p1, line.p2
    line.a = start.y - end.y
    line.b = end.x - start.x
    line.c = start.x * end.y - end.x * start.y


# Solve for the intersection of two lines
def get_cross_point(l1, l2):
    """Return the intersection Point of two two-point lines.

    Both lines get their a, b, c attributes set as a side effect.
    Division by zero occurs when the determinant a1*b2 - a2*b1 is 0.
    """
    for line in (l1, l2):
        get_line_parameter(line)
    d = l1.a * l2.b - l2.a * l1.b
    numerator_x = l1.b * l2.c - l2.b * l1.c
    numerator_y = l1.c * l2.a - l2.c * l1.a
    cross_point = Point()
    cross_point.x = numerator_x * 1.0 / d
    cross_point.y = numerator_y * 1.0 / d
    return cross_point
34 |
35 |
if __name__ == '__main__':
    # A horizontal line through (0, 1)-(1, 1) and a vertical one through
    # (1, 1)-(1, 0).
    line1 = Line(Point(0, 1), Point(1, 1))
    line2 = Line(Point(1, 1), Point(1, 0))

    cross_point = get_cross_point(line1, line2)
    print("Cross point:", cross_point.x, cross_point.y)
47 |
--------------------------------------------------------------------------------
/Python+Algorithm/Least-Squares/Least squares.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
# Fit y = b*x + a to the X/Y columns of train_data.csv by least squares.
# Raw string: the separator is a regex (a comma with optional spaces around it).
sales = pd.read_csv('train_data.csv', sep=r'\s*,\s*', engine='python')  # read the CSV
X = sales['X'].values  # column "X"
Y = sales['Y'].values  # column "Y"

# Use every row instead of a hand-maintained sample count.
n = len(X)

# Accumulate the sums used by the closed-form solution.
s1 = sum(x * y for x, y in zip(X, Y))  # sum of x*y
s2 = sum(X)                            # sum of x
s3 = sum(Y)                            # sum of y
s4 = sum(x * x for x in X)             # sum of x^2

# Slope and intercept.
denominator = s2*s2 - s4*n
if denominator == 0:
    # Happens with no data or when every x is identical: slope is undefined.
    raise SystemExit('cannot fit a line: need at least two distinct X values')
b = (s2*s3 - n*s1) / denominator
a = (s3 - b*s2) / n
print("Coeff: {} Intercept: {}".format(b, a))
25 |
26 |
--------------------------------------------------------------------------------
/Python+Algorithm/Least-Squares/README.md:
--------------------------------------------------------------------------------
1 | # Least-squares
2 |
3 | 最小二乘法的python实现:仅使用 pandas 读取 CSV 数据,拟合计算本身没有调用任何第三方库
4 |
5 | 数据是train_data文件,也可以你手动输入
6 |
7 | 参考我的[CSDN说明](https://blog.csdn.net/yzy_1996/article/details/81064140)
--------------------------------------------------------------------------------
/Python+Algorithm/Least-Squares/train_data.csv:
--------------------------------------------------------------------------------
1 | X,Y
2 | 1,6
3 | 2,5
4 | 3,7
5 | 4,10
6 |
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/Kalman_2D.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 |
# Sequence of 2-D measurements fed to the filter, one row per step.
pos = np.array([
    [10, 50],
    [12, 49],
    [11, 52],
    [13, 52.2],
    [12.9, 50]], np.float32)

# cv2.KalmanFilter takes three arguments: the state dimension (2 here), the
# measurement dimension (also 2) and the control dimension, which defaults to 0
# because this model has no control input.
kalman = cv2.KalmanFilter(2, 2)

identity = np.eye(2, dtype=np.float32)
kalman.measurementMatrix = identity.copy()
kalman.transitionMatrix = identity.copy()
kalman.processNoiseCov = identity * 1e-3
kalman.measurementNoiseCov = identity * 0.01
# Author's note: measurementNoiseCov is the covariance of the measurement
# system (the smaller it is, the closer the result follows the measurements);
# processNoiseCov is the noise of the model (the larger it is, the less stable
# the result and the larger the change per step).

kalman.statePre = np.array([[6], [6]], np.float32)

for measurement_row in pos:
    # Column vector expected by correct().
    mes = measurement_row.reshape((2, 1))

    x = kalman.correct(mes)

    y = kalman.predict()
    print(kalman.statePost[0], kalman.statePost[1])
    print(kalman.statePre[0], kalman.statePre[1])
    print('measurement:\t', mes[0], mes[1])
    print('correct:\t', x[0], x[1])
    print('predict:\t', y[0], y[1])
    print('=' * 30)
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/Kalman_3D.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
# Blank 800x800, 3-channel frame that the trajectories are drawn on.
frame = np.zeros((800, 800, 3), np.uint8)
# Measurement history used by the mouse callback.
# NOTE: np.array((2, 1), np.float32) is the 1-D vector [2., 1.], not a 2x1 matrix.
last_measurement = current_measurement = np.array((2, 1), np.float32)
print(last_measurement)
# Prediction history. The first name used to be misspelled "last_predicition",
# which left the global ``last_prediction`` undefined at module level.
last_prediction = current_prediction = np.zeros((2, 1), np.float32)
9 | '''
10 | mousemove()函数在这里的作用就是传递X,Y的坐标值,便于对轨迹进行卡尔曼滤波
11 | '''
def mousemove(event, x, y, s, p):
    """Mouse callback: feed the cursor position to the Kalman filter and draw both tracks.

    ``x`` and ``y`` are the cursor coordinates; ``event``, ``s`` and ``p`` are
    accepted to match the callback signature but are not used. A segment from
    the previous to the current measurement and another from the previous to
    the current prediction are drawn onto the global ``frame``.
    """
    global frame, current_measurement, measurements, last_measurement, current_prediction, last_prediction

    def as_pixel(vec):
        # cv2.line needs integer pixel coordinates; vec may be a (2,) vector or
        # an (n, 1) column, so flatten it and keep the first two components.
        flat = np.asarray(vec).ravel()
        return int(flat[0]), int(flat[1])

    # Shift the current values into the "previous" slots.
    last_measurement = current_measurement
    last_prediction = current_prediction
    # Current measurement as a 2x1 float32 column.
    current_measurement = np.array([[np.float32(x)], [np.float32(y)]])
    # Correct the filter with the measurement, then predict the next state.
    kalman.correct(current_measurement)
    current_prediction = kalman.predict()
    # Measurement track, drawn with colour (0, 100, 0).
    cv2.line(frame, as_pixel(last_measurement), as_pixel(current_measurement), (0, 100, 0))
    # Prediction track, drawn with colour (0, 0, 200).
    cv2.line(frame, as_pixel(last_prediction), as_pixel(current_prediction), (0, 0, 200))
36 |
cv2.namedWindow("kalman_tracker")
cv2.setMouseCallback("kalman_tracker", mousemove)

# Four state components, two measured components.
kalman = cv2.KalmanFilter(4, 2)
# The measurement picks out the first two state components.
kalman.measurementMatrix = np.eye(2, 4, dtype=np.float32)
# Each step adds state[2] to state[0] and state[3] to state[1].
kalman.transitionMatrix = np.array(
    [[1, 0, 1, 0],
     [0, 1, 0, 1],
     [0, 0, 1, 0],
     [0, 0, 0, 1]], np.float32)
# Process-noise covariance: 0.03 on the diagonal.
kalman.processNoiseCov = np.eye(4, dtype=np.float32) * 0.03

# Redraw the frame until the user presses "q".
running = True
while running:
    cv2.imshow("kalman_tracker", frame)
    running = (cv2.waitKey(1) & 0xFF) != ord('q')

cv2.destroyAllWindows()
55 |
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/README.md:
--------------------------------------------------------------------------------
1 | # Kalman滤波器代码
2 |
3 | ## 1、Python
4 |
5 | You need `numpy` and `pylab` (provided by matplotlib) installed first.
6 |
7 | Example 1: see the file `my_kalman_carmove.py`.
8 |
9 |
10 |
11 | ## 2、Matlab
12 |
13 |
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/kalamn_unc.py:
--------------------------------------------------------------------------------
1 | # -*- coding=utf-8 -*-
2 | # Kalman filter example demo in Python
3 |
4 | # A Python implementation of the example given in pages 11-15 of "An
5 | # Introduction to the Kalman Filter" by Greg Welch and Gary Bishop,
6 | # University of North Carolina at Chapel Hill, Department of Computer
7 | # Science, TR 95-041,
8 | # http://www.cs.unc.edu/~welch/kalman/kalmanIntro.html
9 |
10 | # by Andrew D. Straw
11 | #coding:utf-8
12 | import numpy
13 | import pylab
14 |
# This demo assumes A = 1 and H = 1.

# Initial parameters
n_iter = 50
sz = (n_iter,)  # size of array
x = -0.37727  # truth value (typo in example at top of p. 13 calls this z)
z = numpy.random.normal(x, 0.1, size=sz)  # observations (normal about x, sigma=0.1)

Q = 1e-5  # process variance
R = 0.1 ** 2  # estimate of measurement variance, change to see effect

# Allocate space for arrays
xhat = numpy.zeros(sz)       # a posteriori estimate of x
P = numpy.zeros(sz)          # a posteriori error estimate
xhatminus = numpy.zeros(sz)  # a priori estimate of x
Pminus = numpy.zeros(sz)     # a priori error estimate
K = numpy.zeros(sz)          # gain or blending factor

# Initial guesses
xhat[0] = 0.0
P[0] = 1.0

for k in range(1, n_iter):
    # Time update: X(k|k-1) = A X(k-1|k-1) + B U(k), with A = 1 and B U(k) = 0;
    # P(k|k-1) = A P(k-1|k-1) A' + Q, with A = 1.
    prior = xhat[k - 1]
    prior_var = P[k - 1] + Q
    xhatminus[k] = prior
    Pminus[k] = prior_var

    # Measurement update, with H = 1:
    # Kg(k) = P(k|k-1) H' / (H P(k|k-1) H' + R)
    # X(k|k) = X(k|k-1) + Kg(k) (Z(k) - H X(k|k-1))
    # P(k|k) = (1 - Kg(k) H) P(k|k-1)
    gain = prior_var / (prior_var + R)
    K[k] = gain
    xhat[k] = prior + gain * (z[k] - prior)
    P[k] = (1 - gain) * prior_var

pylab.figure()
pylab.plot(z, 'k+', label='noisy measurements')  # measurements
pylab.plot(xhat, 'b-', label='a posteri estimate')  # filtered values
pylab.axhline(x, color='g', label='truth value')  # true value
pylab.legend()
pylab.xlabel('Iteration')
pylab.ylabel('Voltage')

pylab.figure()
valid_iter = range(1, n_iter)  # Pminus not valid at step 0
pylab.plot(valid_iter, Pminus[1:], label='a priori error estimate')
pylab.xlabel('Iteration')
pylab.ylabel('$(Voltage)^2$')
pylab.setp(pylab.gca(), 'ylim', [0, .01])
pylab.show()
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/kalman_1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | '''
4 | kalman_mousetracker.py - OpenCV mouse-tracking demo using 2D Kalman filter
5 | Adapted from
6 | http://www.morethantechnical.com/2011/06/17/simple-kalman-filter-for-tracking-using-opencv-2-2-w-code/
7 |
8 | Copyright (C) 2014 Simon D. Levy
9 | This code is free software: you can redistribute it and/or modify
10 | it under the terms of the GNU Lesser General Public License as
11 | published by the Free Software Foundation, either version 3 of the
12 | License, or (at your option) any later version.
13 | This code is distributed in the hope that it will be useful,
14 | but WITHOUT ANY WARRANTY without even the implied warranty of
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | GNU General Public License for more details.
17 | You should have received a copy of the GNU Lesser General Public License
18 | along with this code. If not, see <http://www.gnu.org/licenses/>.
19 | '''
20 |
21 | # This delay will affect the Kalman update rate
22 | DELAY_MSEC = 20
23 |
24 | # Arbitrary display params
25 | WINDOW_NAME = 'Kalman Mousetracker [ESC to quit]'
26 | WINDOW_SIZE = 500
27 |
28 | import cv2
29 | import numpy as np
30 | from sys import exit
31 |
32 | from kalman2d import Kalman2D
33 |
class MouseInfo(object):
    '''
    Mutable holder for an X,Y point; both coordinates start at -1.
    '''

    def __init__(self):
        self.x = -1
        self.y = -1

    def __str__(self):
        # Each coordinate is right-aligned in a 4-character integer field.
        coords = (self.x, self.y)
        return '%4d %4d' % coords
46 |
def mouseCallback(event, x, y, flags, mouse_info):
    '''
    Mouse callback: copy the reported X,Y coordinates into ``mouse_info``.

    ``event`` and ``flags`` are accepted but not used.
    '''
    mouse_info.x, mouse_info.y = x, y
54 |
55 |
def drawCross(img, center, r, g, b):
    '''
    Draws an X-shaped cross centred on the given X,Y coordinates.

    The colour is handed to OpenCV as the tuple (r, g, b), in that order.
    '''

    d = 5  # half-length of each arm, in pixels
    t = 2  # line thickness

    color = (r, g, b)
    # cv2.CV_AA only exists in OpenCV 2.x; OpenCV 3+ exposes the same
    # anti-aliased line type as cv2.LINE_AA.
    line_type = cv2.LINE_AA if hasattr(cv2, 'LINE_AA') else cv2.CV_AA

    ctrx, ctry = center[0], center[1]

    cv2.line(img, (ctrx - d, ctry - d), (ctrx + d, ctry + d), color, t, line_type)
    cv2.line(img, (ctrx + d, ctry - d), (ctrx - d, ctry + d), color, t, line_type)
71 |
72 |
def drawLines(img, points, r, g, b):
    '''
    Draws an open polyline through ``points`` with colour (r, g, b).
    '''
    vertices = np.int32(points)  # polylines is given 32-bit integer vertices
    cv2.polylines(img, [vertices], isClosed=False, color=(r, g, b))
79 |
80 |
def newImage():
    '''
    Returns a new all-zero 500x500 image with 3 channels of type uint8.
    '''
    shape = (500, 500, 3)
    return np.zeros(shape, dtype=np.uint8)
87 |
88 |
if __name__ == '__main__':

    # Show a blank image in a named window
    img = newImage()
    cv2.namedWindow(WINDOW_NAME)

    # The mouse callback writes the latest cursor position into this object
    mouse_info = MouseInfo()
    cv2.setMouseCallback(WINDOW_NAME, mouseCallback, mouse_info)

    # Wait until both mouse coordinates are positive; ESC quits
    while not (mouse_info.x > 0 and mouse_info.y > 0):
        cv2.imshow(WINDOW_NAME, img)
        if cv2.waitKey(1) == 27:
            exit(0)

    # Trajectories of the mouse location and of the Kalman estimate
    measured_points, kalman_points = [], []

    kalman2d = Kalman2D()

    # Loop till user hits escape
    while True:

        # Start every pass from a fresh image
        img = newImage()

        # Record the current mouse position
        measured = (mouse_info.x, mouse_info.y)
        measured_points.append(measured)

        # Feed the mouse point to the filter
        kalman2d.update(mouse_info.x, mouse_info.y)

        # Record the filter's current estimate as integers
        estimated = list(map(int, kalman2d.getEstimate()))
        kalman_points.append(estimated)

        # Draw both trajectories and the current points
        drawLines(img, kalman_points, 0, 255, 0)
        drawCross(img, estimated, 255, 255, 255)
        drawLines(img, measured_points, 255, 255, 0)
        drawCross(img, measured, 0, 0, 255)

        # Show the image, wait DELAY_MSEC, quit on ESC
        cv2.imshow(WINDOW_NAME, img)
        if cv2.waitKey(DELAY_MSEC) == 27:
            break
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/my_kalman_carmove.py:
--------------------------------------------------------------------------------
1 | '''本例说明
2 | 小车匀加速运动,对小车进行位移预测
3 | '''
4 |
5 | import numpy as np
6 | import pylab
7 |
# Simulation parameters
delta_t = 0.1  # sampling interval
t = np.arange(0, 5, delta_t)  # time axis
N = len(t)  # number of samples
sz = (2, N)  # one column per time step for the two-component state
a = 10  # true acceleration
x = 1 / 2 * a * t**2  # true displacement
z = x + np.random.normal(0, 10, size=N)  # observations: truth plus Gaussian white noise

Q = np.array([[0, 0], [0, 0.01]])  # process-noise covariance
R = 10  # measurement-noise variance

A = np.array([[1, delta_t], [0, 1]])  # state transition
B = np.array([1 / 2 * delta_t**2, delta_t])  # input vector applied to the acceleration
H = np.array([1, 0])  # only the first state component is observed

# Storage
x_predict = np.zeros(sz)  # a priori state estimate (prediction)
P_predict = np.zeros((2, 2))  # a priori covariance
x_update = np.zeros(sz)  # a posteriori state estimate (final estimate)
P_update = np.zeros((2, 2))  # a posteriori covariance
K = np.zeros(sz)  # Kalman gain
I = np.eye(2)

for k in range(1, N):
    # Prediction step
    x_predict[:, k] = A.dot(x_update[:, k - 1]) + a * B
    P_predict = A.dot(P_update).dot(A.T) + Q

    # Update step
    innovation = z[k] - H.dot(x_predict[:, k])
    K[:, k] = P_predict.dot(H) / (H.dot(P_predict).dot(H) + R)
    x_update[:, k] = x_predict[:, k] + K[:, k] * innovation
    # K H must be the 2x2 outer product; calling dot() on the two 1-D vectors
    # collapses to a scalar and corrupts the covariance.
    P_update = (I - np.outer(K[:, k], H)).dot(P_predict)

pylab.rcParams['font.sans-serif'] = ['FangSong']  # default font for the labels
pylab.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box

pylab.figure()
# Plot against t so that the x-axis really is time, as its label says.
pylab.plot(t, z, color='g', linestyle='--', label='观测值')  # observations
pylab.plot(t, x_update[0], color='r', label='估计值')  # estimate
pylab.plot(t, x, linestyle=':', label='真实值')  # truth
pylab.xlabel('时间/s')
pylab.ylabel('位移/m')
pylab.legend()
pylab.show()
--------------------------------------------------------------------------------
/Python+Algorithm/Math/Kalman/my_kalman_simple.py:
--------------------------------------------------------------------------------
1 | '''本例说明
2 | 是Kalman滤波器的一种特殊情况,去掉了
3 | 设定真实值,按高斯分布,针对真实值随机生成【sz】个观测数据,然后进行卡尔曼滤波估计
4 | '''
5 |
6 | import numpy as np
7 | import pylab
8 |
# Parameters
sz = 50  # number of samples

x = 0.1  # true value
z = np.random.normal(x, 0.1, size=sz)  # observations, Gaussian around the true value

Q = 1e-5  # process noise
R = 1e-2  # measurement noise

# Storage
x_predict = np.zeros(sz)  # a priori estimate of x (prediction)
P_predict = np.zeros(sz)  # a priori estimate of P
x_update = np.zeros(sz)  # a posteriori estimate of x (final estimate)
P_update = np.zeros(sz)  # a posteriori covariance
K = np.zeros(sz)  # Kalman gain

# Initial values
x_update[0] = 0.0
P_update[0] = 1.0

for k in range(1, sz):
    # Prediction step
    prior = x_update[k - 1]
    prior_var = P_update[k - 1] + Q
    x_predict[k] = prior
    P_predict[k] = prior_var

    # Update step
    gain = prior_var / (prior_var + R)
    K[k] = gain
    x_update[k] = prior + gain * (z[k] - prior)
    P_update[k] = (1 - gain) * prior_var

pylab.rcParams['font.sans-serif'] = ['FangSong']  # default font for the labels
pylab.rcParams['axes.unicode_minus'] = False  # keep the minus sign from rendering as a box

pylab.figure()
pylab.plot(z, 'k+', label='观测值')  # observations
pylab.plot(x_update, 'b-', label='估计值')  # estimate
pylab.axhline(x, color='g', label='真实值')  # truth
pylab.legend()
pylab.show()
48 |
--------------------------------------------------------------------------------
/Python+Algorithm/Math/gram_schmidt.py:
--------------------------------------------------------------------------------
1 | # 施密特正交化 Gram-Schmidt
2 | # !注意 例子中 输入向量为 (3, 1)T (2, 2)T
3 | # 输出向量为 (0.9., 0.3.)T (-0.3., 0.9.)T
4 |
5 | import numpy as np
6 |
def myGS(V):
    """Orthonormalise the columns of ``V`` with the classical Gram-Schmidt process.

    Args:
        V: 2-D array whose columns are the input vectors. They are assumed to
            be linearly independent; ``V`` itself is not modified.

    Returns:
        2-D float array whose rows are the orthonormal vectors, in input order.
    """
    # One input vector per row; a float copy keeps the in-place subtraction
    # valid for integer input and leaves V untouched.
    u = np.array(V, dtype=float).transpose()
    vectors = u.copy()
    E = []
    for i in range(len(u)):
        for j in range(i):
            # Subtract the projection of the i-th input vector onto each
            # earlier orthogonal vector.
            u[i] -= (vectors[i] @ u[j]) / (u[j] @ u[j]) * u[j]
        E.append(u[i] / np.linalg.norm(u[i]))
    return np.array(E)
15 |
if __name__ == "__main__":
    # Input matrix: each column is one input vector.
    # V = np.array([[3., 1.], [2., 2.]])
    V = np.array([[1., 0, -1, 4], [-1, 2, 2, -1], [2, 3, 1, -3]]).transpose()
    print(V)

    # Gram-Schmidt result: each row of E is one orthonormal vector.
    E = myGS(V)
    print(E)

    # Check orthonormality. The vectors are the rows of E, so the product that
    # must equal the identity is E @ E.T (E.T @ E only works for square E).
    print(E @ E.transpose())
--------------------------------------------------------------------------------
/Python+Algorithm/Math/math_base.py:
--------------------------------------------------------------------------------
1 | '''
2 | 在这里将用代码实现一些数学中常用的概念,方便理解
3 | 1、均方误差
4 | '''
5 | import numpy as np
6 | import matplotlib.pyplot as plt
7 |
# Generate some random samples
x1 = np.random.rand(10)
x2 = np.random.randn(100)  # standard normal distribution
x3 = np.random.randint(1, 10, 10)  # integers from 1 up to (not including) 10
x4 = np.random.normal(size=10)

# A set of true values and a set of predictions
target = [1, 2, 3, 4, 5, 6]
prediction = [1, 1, 2, 4, 4, 7]


## For a single data set
# Mean
# mean = np.mean(target)
# print('mean:', mean)

# # Variance
# var = np.var(target)
# print('variance:', var)

## For two data sets
# Element-wise error
error = [actual - predicted for actual, predicted in zip(target, prediction)]

# Squared error. "**" is the power operator; "^" is bitwise XOR in Python and
# produced wrong (even negative) values here.
squaredError = [val ** 2 for val in error]

print('MSE:', sum(squaredError)/len(squaredError))

# Standard deviation
std = np.std(target)
print(std)
43 | # 画出随机数的图像
44 | # plt.figure(1)
45 | # plt.plot(x4)
46 | # # plt.figure(2)
47 | # # plt.hist(x4, 100)
48 | # plt.show()
49 |
50 | # 1、均方误差(MSE)
51 | # print("MSE = ", sum(squaredError) / len(squaredError))
52 |
--------------------------------------------------------------------------------
/Python+Algorithm/Math/matrix.ipynb:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Python+Algorithm/Math/matrix.ipynb
--------------------------------------------------------------------------------
/Python+Algorithm/Math/matrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | '''
4 | list列表 [1, 2]
5 | np.array数组 np.array([1, 2])
6 | np.mat矩阵 np.mat([1, 2])
7 | 数组矩阵两者可以互相转换
8 |
9 | !!!不要用列表来进行运算,下面只说数组和矩阵
10 | '''
11 |
# Two 2x2 arrays
a = np.array([[1, 2], [3, 4]])
b = np.array([[2, 4], [1, 3]])

shape_of_a = a.shape
print(shape_of_a)  # the shape tuple of a

transposed_product = a.T @ b
print(transposed_product)  # matrix product of a-transposed and b

product = a.dot(b)
print(product)  # matrix product of a and b
19 |
20 |
--------------------------------------------------------------------------------
/Python+Algorithm/Math/pareto-front.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 |
4 |
def pareto_front(x, y):
    """Return the Pareto-optimal points of the data set ``(x[i], y[i])``.

    Both coordinates are minimised: a point is Pareto-optimal when no other
    point lies in its lower-left region, i.e. no other point is <= in both
    coordinates and strictly < in at least one. Identical points do not
    dominate each other, so duplicates are all kept.

    Args:
        x: sequence of first coordinates.
        y: sequence of second coordinates, same length as ``x``.

    Returns:
        List of ``(x, y)`` tuples, in input order.

    Raises:
        ValueError: if ``x`` and ``y`` have different lengths.
    """
    if len(x) != len(y):
        raise ValueError("x and y must have the same length")

    points = list(zip(x, y))
    front = []
    for xi, yi in points:
        # A point never strictly dominates itself, so it needs no special case.
        dominated = any(
            xj <= xi and yj <= yi and (xj < xi or yj < yi)
            for xj, yj in points
        )
        if not dominated:
            front.append((xi, yi))
    return front
40 |
41 |
42 |
def plot_pareto(p1=(), p2=()):
    """Plot the points ``(p1[i], p2[i])`` as red dots on the current axes.

    ``p1`` and ``p2`` used to be read from undefined names, so every call
    failed with ``NameError``. They are now parameters that default to empty
    sequences, which keeps a zero-argument call valid (it plots nothing).
    """
    plt.plot(p1, p2, 'ro')
45 |
46 |
if __name__ == '__main__':

    # Sample data set: point i is (x[i], y[i]).
    x = np.array([2, 5, 1, 3, 2, 7])
    y = np.array([9, 8, 12, 11, 16, 10])

    # Show the raw points as red dots.
    plt.plot(x, y, 'ro')
    plt.show()


    # TODO: add visualisation
56 |
--------------------------------------------------------------------------------
/Python+Algorithm/Optimization-Algorithm/Adam.py:
--------------------------------------------------------------------------------
1 | # ADAM
2 | # 以 y=x1+2*x2为例
3 | import math
4 | import numpy as np
5 |
6 |
def adam():
    """Fit ``y = x1 + 2*x2`` with the Adam optimiser, one training sample per step.

    Prints the number of iterations, the fitted parameters and the last
    evaluated error.

    Returns:
        The fitted parameter vector ``theta`` as a NumPy array.
    """
    # Training set: every sample has two components.
    x = np.array([(1, 1), (1, 2), (2, 2), (3, 1), (1, 3), (2, 4), (2, 3), (3, 3)])
    y = np.array([3, 5, 6, 5, 7, 10, 8, 9])

    # Initialisation
    m, dim = x.shape
    theta = np.zeros(dim)  # parameters
    alpha = 0.01  # learning rate
    threshold = 0.0001  # stop once the error drops to this value
    iterations = 3000  # maximum number of iterations
    error = 0  # last evaluated error

    b1 = 0.9  # default suggested by the algorithm's authors
    b2 = 0.999  # default suggested by the algorithm's authors
    e = 0.00000001  # default suggested by the algorithm's authors
    mt = np.zeros(dim)  # running mean of the gradient
    vt = np.zeros(dim)  # running mean of the squared gradient

    for i in range(iterations):
        j = i % m  # cycle through the samples in order
        residual = np.dot(x, theta) - y
        error = 1 / (2 * m) * np.dot(residual.T, residual)
        if abs(error) <= threshold:
            break

        gradient = x[j] * (np.dot(x[j], theta) - y[j])
        mt = b1 * mt + (1 - b1) * gradient
        vt = b2 * vt + (1 - b2) * (gradient**2)
        # Bias-corrected moment estimates.
        mtt = mt / (1 - (b1**(i + 1)))
        vtt = vt / (1 - (b2**(i + 1)))
        # np.sqrt works element-wise for any number of parameters.
        theta = theta - alpha * mtt / (np.sqrt(vtt) + e)

    print('迭代次数:%d' % (i + 1), 'theta:', theta, 'error:%f' % error)
    return theta
45 |
46 |
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script.
    adam()
--------------------------------------------------------------------------------
/Python+Algorithm/Optimization-Algorithm/BGD.py:
--------------------------------------------------------------------------------
1 | # 批量梯度下降BGD
2 | # 拟合函数为:y = theta * x
3 | # 代价函数为:J = 1 / (2 * m) * ((theta * x) - y) * ((theta * x) - y).T;
4 | # 梯度迭代为: theta = theta - alpha / m * (x * (theta * x - y).T);
5 | import numpy as np
6 |
7 |
8 | # 1、单元数据程序
9 | # 以 y=x为例,所以正确的结果应该趋近于theta = 1
def bgd_single():
    """Batch gradient descent for the one-parameter model ``y = theta * x``.

    The data follow ``y = x``. Prints the iteration count, theta and the last
    evaluated error; returns nothing.
    """
    # Training set: x and y are both 1..10.
    x = np.arange(1, 11)
    y = np.arange(1, 11)

    # Initialisation
    m = len(y)
    theta = 0  # parameter
    alpha = 0.01  # learning rate
    threshold = 0.0001  # stop once the error drops to this value
    iterations = 1500  # maximum number of iterations
    error = 0  # last evaluated error

    for i in range(iterations):
        residual = (theta * x) - y
        error = 1 / (2 * m) * np.dot(residual.T, residual)
        # Stop once the error is small enough.
        if abs(error) <= threshold:
            break

        theta -= alpha / m * (np.dot(x.T, residual))

    print('单变量:', '迭代次数: %d' % (i + 1), 'theta: %f' % theta,
          'error1: %f' % error)
35 |
36 |
37 | # 2、多元数据程序
38 | # 以 y=x1+2*x2为例,所以正确的结果应该趋近于theta = [1,2]
39 |
40 |
def bgd_multi():
    """Batch gradient descent for the two-parameter model ``y = theta . x``.

    The data follow ``y = x1 + 2*x2``. Prints the iteration count, theta and
    the last evaluated error; returns nothing.
    """
    # Training set: every sample has two components.
    samples = [(1, 1), (1, 2), (2, 2), (3, 1), (1, 3), (2, 4), (2, 3), (3, 3)]
    x = np.array(samples)
    y = np.array([3, 5, 6, 5, 7, 10, 8, 9])

    # Initialisation
    m, dim = x.shape
    theta = np.zeros(dim)  # parameters
    alpha = 0.01  # learning rate
    threshold = 0.0001  # stop once the error drops to this value
    iterations = 1500  # maximum number of iterations
    error = 0  # last evaluated error

    for i in range(iterations):
        residual = np.dot(x, theta) - y
        error = 1 / (2 * m) * np.dot(residual.T, residual)
        # Stop once the error is small enough.
        if abs(error) <= threshold:
            break

        theta -= alpha / m * (np.dot(x.T, residual))

    print('多元变量:', '迭代次数:%d' % (i + 1), 'theta:', theta, 'error:%f' % error)
66 |
67 |
if __name__ == "__main__":
    # Run both demos only when this file is executed as a script.
    bgd_single()
    bgd_multi()
--------------------------------------------------------------------------------
/Python+Algorithm/Optimization-Algorithm/README.md:
--------------------------------------------------------------------------------
1 | # Optimization-Algorithm
2 |
3 | 参考CSDN[链接](https://blog.csdn.net/yzy_1996/article/details/84618536)
--------------------------------------------------------------------------------
/Python+Algorithm/Optimization-Algorithm/SGD.py:
--------------------------------------------------------------------------------
1 | # 随机梯度下降SGD
2 | # 以 y=x1+2*x2为例
3 |
4 | import numpy as np
5 |
6 |
7 | # 多元数据
def sgd():
    """Stochastic gradient descent on data that follow ``y = x1 + 2*x2``.

    Each step updates theta from one randomly chosen sample. Prints the
    iteration count, theta and the last evaluated error; returns nothing.
    """
    # Training set: every sample has two components.
    samples = [(1, 1), (1, 2), (2, 2), (3, 1), (1, 3), (2, 4), (2, 3), (3, 3)]
    x = np.array(samples)
    y = np.array([3, 5, 6, 5, 7, 10, 8, 9])

    # Initialisation
    m, dim = x.shape
    theta = np.zeros(dim)  # parameters
    alpha = 0.01  # learning rate
    threshold = 0.0001  # stop once the error drops to this value
    iterations = 1500  # maximum number of iterations
    error = 0  # last evaluated error

    for i in range(iterations):
        residual = np.dot(x, theta) - y
        error = 1 / (2 * m) * np.dot(residual.T, residual)
        # Stop once the error is small enough.
        if abs(error) <= threshold:
            break

        # Pick one sample at random and step along its gradient.
        j = np.random.randint(0, m)
        sample, label = x[j], y[j]
        theta -= alpha * (sample * (np.dot(sample, theta) - label))

    print('迭代次数:%d' % (i + 1), 'theta:', theta, 'error:%f' % error)
34 |
35 |
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script.
    sgd()
--------------------------------------------------------------------------------
/Python+Algorithm/Optimization-Algorithm/SGD_momentum.py:
--------------------------------------------------------------------------------
1 | # 带冲量的随机梯度下降SGD
2 | # 以 y=x1+2*x2为例
3 |
4 | import numpy as np
5 |
6 |
7 | # 多元数据
def sgd():
    """Gradient descent with momentum on data that follow ``y = x1 + 2*x2``.

    Samples are visited in cyclic order, one per step. Prints the iteration
    count, theta and the last evaluated error; returns nothing.
    """
    # Training set: every sample has two components.
    samples = [(1, 1), (1, 2), (2, 2), (3, 1), (1, 3), (2, 4), (2, 3), (3, 3)]
    x = np.array(samples)
    y = np.array([3, 5, 6, 5, 7, 10, 8, 9])

    # Initialisation
    m, dim = x.shape
    theta = np.zeros(dim)  # parameters
    alpha = 0.01  # learning rate
    momentum = 0.1  # momentum coefficient
    threshold = 0.0001  # stop once the error drops to this value
    iterations = 1500  # maximum number of iterations
    error = 0  # last evaluated error
    velocity = 0  # accumulated update, starts at 0

    for i in range(iterations):
        j = i % m
        residual = np.dot(x, theta) - y
        error = 1 / (2 * m) * np.dot(residual.T, residual)
        # Stop once the error is small enough.
        if abs(error) <= threshold:
            break

        sample_gradient = x[j] * (np.dot(x[j], theta) - y[j])
        velocity = momentum * velocity + alpha * sample_gradient
        theta -= velocity

    print('迭代次数:%d' % (i + 1), 'theta:', theta, 'error:%f' % error)
38 |
39 |
if __name__ == "__main__":
    # Run the demo only when this file is executed as a script.
    sgd()
--------------------------------------------------------------------------------
/Python+Algorithm/Optimization-Algorithm/test.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
# Eight samples with two components each.
samples = [(1, 1), (1, 2), (2, 2), (3, 1), (1, 3), (2, 4), (2, 3), (3, 3)]
x = np.array(samples)

# Unpack the number of rows and columns.
m, dim = x.shape


print(x)
--------------------------------------------------------------------------------
/Python+Algorithm/Search-Algorithm/BFS.py:
--------------------------------------------------------------------------------
1 | # 广度优先算法
2 | # 集合(set)是一个无序的不重复元素序列。
3 |
4 |
def walk(G, s, S=None):
    """Traverse ``G`` breadth-first from ``s`` and return a predecessor map.

    Args:
        G: graph indexed by node, where ``G[u]`` is the set of neighbours of ``u``.
        s: start node.
        S: optional collection of nodes that must not be visited.

    Returns:
        Dict mapping every reached node to the node it was discovered from;
        the start node maps to ``None``.
    """
    from collections import deque

    # A fresh set per call avoids sharing one mutable default between calls.
    forbidden = set() if S is None else S
    P = {s: None}  # the start node has no predecessor
    Q = deque([s])  # FIFO queue, so nodes are expanded in breadth-first order
    while Q:
        u = Q.popleft()
        # Neighbours that are neither already discovered nor forbidden.
        for v in G[u].difference(P, forbidden):
            Q.append(v)
            P[v] = u  # remember where v was discovered from
    return P
15 |
16 |
def components(G):
    """Return the connected components of ``G`` as a list of predecessor maps.

    Every node index ``0 .. len(G) - 1`` is used as a start node unless an
    earlier traversal already reached it, so the graph may have any number of
    nodes rather than exactly nine.
    """
    comp = []
    seen = set()
    for u in range(len(G)):
        if u in seen:
            continue
        C = walk(G, u)
        seen.update(C)  # record every node of this component
        comp.append(C)
    return comp
26 |
27 |
if __name__ == "__main__":
    # Name the nine nodes a..i with the integers 0..8.
    a, b, c, d, e, f, g, h, i = range(9)
    # Adjacency sets, one per node, in node order.
    N = [
        {b, c, d},  # a
        {a, d},  # b
        {a, d},  # c
        {a, c, d},  # d
        {g, f},  # e
        {e, g},  # f
        {e, f},  # g
        {i},  # h
        {h},  # i
    ]
    comp = components(N)
    print(comp)
--------------------------------------------------------------------------------
/Python+Algorithm/Search-Algorithm/DFS.py:
--------------------------------------------------------------------------------
1 | '''深度优先算法
2 | 算法逻辑:
3 | 从初始点开始,向子节点搜索,
4 | '''
5 |
6 |
def iter_dfs(G, s):
    """Yield the nodes of ``G`` reachable from ``s`` in depth-first order.

    ``G[u]`` must be an iterable of the neighbours of ``u``. Each node is
    yielded once, the first time it is popped from the stack.
    """
    visited = set()  # nodes already yielded
    stack = [s]  # nodes still waiting to be explored
    while stack:
        node = stack.pop()  # take the most recently added node
        if node not in visited:
            visited.add(node)
            stack.extend(G[node])  # push its neighbours
            yield node
17 |
18 |
if __name__ == "__main__":
    # Name the nodes with integers; only a..h have an entry in G.
    a, b, c, d, e, f, g, h, i = range(9)
    G = [
        {b, c, d, e, f},  # a
        {c, e},  # b
        {d},  # c
        {e},  # d
        {f},  # e
        {c, g, h},  # f
        {f, h},  # g
        {f, g},  # h
    ]
    # The author recorded [0, 5, 7, 6, 2, 3, 4, 1] as the output.
    order = list(iter_dfs(G, a))
    print(order)
32 |
--------------------------------------------------------------------------------
/Python+Algorithm/Search-Algorithm/README.md:
--------------------------------------------------------------------------------
1 | # Search Algorithm
2 |
3 | problem 1:
4 |
5 | 
6 |
7 | problem 2: fig 2
8 |
9 |
10 |
11 | ## depth-first search
12 |
13 | [python code](DFS.py)
14 |
15 | [matlab code]()
16 |
17 | ## breadth-first search
18 |
19 | [python code](BFS.py)
20 |
21 | [matlab code]()
22 |
23 |
24 |
25 |
26 |
27 | Travelling Salesman Problem (TSP),它寻求的是旅行者由起点出发,通过所有给定的需求点后,再次返回起点所花费的最小路径成本
28 |
29 |
30 |
31 | 动态规划算法(Dynamic Programming,简称DP)通常用于求解具有某种最优性质的问题,其基本思想是将待求解问题分解成若干个子问题,先求解子问题,然后由这些子问题的解再得到原问题的解。
32 |
33 |
34 |
35 | 代码
36 |
37 |
38 |
39 | BFS DFS
40 |
41 | 广度优先和深度优先
42 |
43 |
44 |
45 | 
46 |
47 |
48 |
49 | 深度优先遍历顺序为:1->2->4->8->5->3->6->7
50 |
51 | 广度优先遍历顺序为:1->2->3->4->5->6->7->8
52 |
53 |
54 |
55 | 代码
56 |
57 | python 和 matlab
58 |
59 |
60 |
61 | 我们会用到python中的集合(set),它是一个无序的不重复元素序列
62 |
63 |
--------------------------------------------------------------------------------
/Python+Algorithm/Search-Algorithm/fig1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Python+Algorithm/Search-Algorithm/fig1.png
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/README.md:
--------------------------------------------------------------------------------
1 | # Sorting-Algorithm
2 |
3 | 参考CSDN[链接](https://blog.csdn.net/yzy_1996/article/details/85318705)
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/bubble_sort.py:
--------------------------------------------------------------------------------
1 | # Bubble Sort 冒泡排序
2 | # 冒泡排序只会操作相邻的两个数据。每次冒泡操作都会对相邻的两个元素进行比较,交换为正确的顺序,重复N次,时间复杂度为O(n^2)
3 | # 优化的地方在于:当某次排序已经没有数据可以交换,就可以停止了
4 | import time
5 |
6 |
def bubble_sort(array):
    """Sort ``array`` in place in ascending order with bubble sort.

    Each pass compares adjacent elements and swaps those out of order. The
    sort stops early after a pass that makes no swap. Returns ``None``.
    """
    unsorted_end = len(array) - 1  # last index of the still-unsorted prefix
    while unsorted_end > 0:
        swapped = False
        for k in range(unsorted_end):
            if array[k] > array[k + 1]:
                array[k], array[k + 1] = array[k + 1], array[k]
                swapped = True
        if not swapped:
            break
        unsorted_end -= 1
20 |
21 |
if __name__ == '__main__':
    array = [5, 6, -1, 4, 2, 8, 10, 7, 6]
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start = time.perf_counter()
    bubble_sort(array)
    print(array)
    end = time.perf_counter()
    print('用时:', str(end - start))
29 |
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/counting_sort.py:
--------------------------------------------------------------------------------
1 | # counting_sort 计数排序
2 |
3 | import time
4 |
5 |
def counting_sort(array):
    """Sort a list of integers in place in ascending order with counting sort.

    Counts are indexed by ``value - min(array)``, so negative values are
    handled and the table spans only the range actually present in the input.
    Returns ``None``.
    """
    if len(array) <= 1:
        return

    lowest, highest = min(array), max(array)
    counts = [0] * (highest - lowest + 1)
    for num in array:
        counts[num - lowest] += 1

    # Rebuild the values in order from their counts.
    array_sorted = []
    for offset, count in enumerate(counts):
        array_sorted += [offset + lowest] * count
    array[:] = array_sorted
19 |
20 |
if __name__ == '__main__':
    array = [5, 6, 1, 4, 2, 8, 10, 7, 6]
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start = time.perf_counter()
    counting_sort(array)
    print(array)
    end = time.perf_counter()
    print('用时:', str(end - start))
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/insertion_sort.py:
--------------------------------------------------------------------------------
1 | # insertion_sort 插入排序
2 | # 取未排序区间中的元素,在已排序区间中找到合适的插入位置将其插入,并保证已排序区间数据一直有序。重复这个过程,直到未排序区间中元素为空,算法结束。
3 |
4 | import time
5 |
6 |
def insertion_sort(array):
    """Sort ``array`` in place in ascending order with insertion sort.

    Each element of the unsorted part is taken in turn, larger elements of the
    sorted prefix are shifted one slot right, and the element is dropped into
    the gap. Returns ``None``.
    """
    for i in range(1, len(array)):
        value = array[i]
        pos = i  # slot where value will finally be written
        while pos > 0 and array[pos - 1] > value:
            array[pos] = array[pos - 1]
            pos -= 1
        array[pos] = value
26 |
27 |
if __name__ == '__main__':
    array = [5, 6, -1, 4, 2, 8, 10, 7, 6]
    # time.clock() was removed in Python 3.8; perf_counter() is its replacement.
    start = time.perf_counter()
    insertion_sort(array)
    print(array)
    end = time.perf_counter()
    print('用时:', str(end - start))
35 |
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/merge_sort.py:
--------------------------------------------------------------------------------
1 | # merge_sort 归并排序
2 |
3 | import time
4 |
5 |
def merge_sort(array):
    """Sort ``array`` in place by delegating the whole index range to ``merge_out``."""
    first, last = 0, len(array) - 1
    merge_out(array, first, last)
8 |
9 |
def merge_out(array, low, high):
    """Recursively sort ``array[low..high]`` (inclusive bounds) in place.

    The range is split at its midpoint, both halves are sorted, and the halves
    are then combined by ``merge_in``.
    """
    if low >= high:
        return  # zero or one element: nothing to do
    mid = low + (high - low) // 2
    merge_out(array, low, mid)
    merge_out(array, mid + 1, high)
    merge_in(array, low, mid, high)
16 |
17 |
def merge_in(array, low, mid, high):
    """Merge two adjacent sorted runs of ``array`` in place.

    The runs are ``array[low..mid]`` and ``array[mid+1..high]`` (inclusive
    bounds); both must already be sorted. Equal elements from the left
    run are placed first.
    """
    left = array[low:mid + 1]
    right = array[mid + 1:high + 1]
    merged = []
    li = ri = 0
    while li < len(left) and ri < len(right):
        if left[li] <= right[ri]:
            merged.append(left[li])
            li += 1
        else:
            merged.append(right[ri])
            ri += 1
    # Exactly one run may still hold elements; append whatever is left.
    merged.extend(left[li:])
    merged.extend(right[ri:])
    array[low:high + 1] = merged
36 |
37 |
if __name__ == '__main__':
    # Demo: sort a small sample and report the elapsed time.
    array = [5, 6, -1, 4, 2, 8, 10, 7, 6]
    # time.clock() was removed in Python 3.8; use perf_counter() instead.
    start = time.perf_counter()
    merge_sort(array)
    print(array)
    end = time.perf_counter()
    print('用时:', str(end - start))
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/quick_sort.py:
--------------------------------------------------------------------------------
1 | # quick_sort 快速排序
2 |
3 | import time
4 | import random
5 |
def quick_sort(array):
    """Sort ``array`` in place using randomized quicksort."""
    first, last = 0, len(array) - 1
    quick_out(array, first, last)
8 |
9 |
def quick_out(array, low, high):
    """Recursively quicksort ``array[low..high]`` (inclusive bounds)."""
    if low >= high:
        return
    chosen = random.randint(low, high)
    # Move the randomly chosen pivot to the front so that partition()
    # does not need the pivot index as an extra argument.
    array[low], array[chosen] = array[chosen], array[low]
    split = partition(array, low, high)
    quick_out(array, low, split - 1)
    quick_out(array, split + 1, high)
17 |
# Return the final index of the pivot: everything to its left is <= the
# pivot and everything to its right is greater.
def partition(array, low, high):
    """Partition ``array[low..high]`` around the pivot ``array[low]``.

    Returns:
        The index at which the pivot ends up.
    """
    pivot = array[low]
    boundary = low  # last index of the "<= pivot" region
    scan = low + 1  # the pivot sits at ``low``, so scanning starts after it
    while scan <= high:
        if array[scan] <= pivot:
            boundary += 1
            array[boundary], array[scan] = array[scan], array[boundary]
        scan += 1
    # Drop the pivot into its final position.
    array[low], array[boundary] = array[boundary], array[low]
    return boundary
27 |
28 |
29 |
if __name__ == '__main__':
    # Demo: sort a small sample and report the elapsed time.
    array = [5, 6, -1, 4, 2, 8, 10, 7, 6]
    # time.clock() was removed in Python 3.8; use perf_counter() instead.
    start = time.perf_counter()
    quick_sort(array)
    print(array)
    end = time.perf_counter()
    print('用时:', str(end - start))
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/selection_sort.py:
--------------------------------------------------------------------------------
1 | # selection_sort 选择排序
2 | # 选择排序算法的实现思路有点类似插入排序,也分已排序区间和未排序区间。但是选择排序每次会从未排序区间中找到最小的元素,将其放到已排序区间的末尾。
3 |
4 | import time
5 |
6 |
def selection_sort(array):
    """Sort ``array`` in place in ascending order using selection sort.

    On every pass the smallest element of the unsorted suffix is located
    and swapped to the end of the sorted prefix.

    Args:
        array: Mutable sequence of comparable items.

    Returns:
        None. Sequences of length 0 or 1 are left untouched.
    """
    length = len(array)
    if length <= 1:
        return

    # The last element is already in place once the others are sorted.
    for i in range(length - 1):
        min_index = i
        for j in range(i + 1, length):
            if array[j] < array[min_index]:
                min_index = j
        if min_index != i:
            array[i], array[min_index] = array[min_index], array[i]


# Backward-compatible alias: this routine was originally published under
# the (misleading) name ``insertion_sort``.
insertion_sort = selection_sort
20 |
21 |
if __name__ == '__main__':
    # Demo: sort a small sample and report the elapsed time.
    array = [5, 6, -1, 4, 2, 8, 10, 7, 6]
    # time.clock() was removed in Python 3.8; use perf_counter() instead.
    start = time.perf_counter()
    insertion_sort(array)
    print(array)
    end = time.perf_counter()
    print('用时:', str(end - start))
--------------------------------------------------------------------------------
/Python+Algorithm/Sorting-Algorithm/sleep_sort.py:
--------------------------------------------------------------------------------
1 | import time
2 | import threading
3 |
# The sequence to sort (negative numbers are allowed).
num = [-5, 3, 9, 11, -1, 3, 12, 0, 8, -3, 23, 5, 19]


def doSleep(func):
    """Sleep for a duration that grows with the value, then print it.

    Args:
        func: The value to "sort", as a number or a numeric string.
    """
    co = 0.02  # scale factor that keeps the sleeps short
    # An exponential keeps the delay positive, so negatives sort correctly.
    time.sleep(co * pow(1.1, float(func)))
    print(func)


# One thread per value; the threads are started in the __main__ section.
thread_list = [
    threading.Thread(target=doSleep, args=(str(value), ))
    for value in num
]

if __name__ == '__main__':
    # time.clock() was removed in Python 3.8; perf_counter() measures the
    # wall-clock time the sleeping threads actually take.
    start = time.perf_counter()
    for t in thread_list:
        t.start()  # launch the thread
    for t in thread_list:
        t.join()  # wait for every worker before the main thread exits
    end = time.perf_counter()
    print('用时:', str(end - start))
29 |
--------------------------------------------------------------------------------
/Python+Algorithm/Uncategorized/pyramid.py:
--------------------------------------------------------------------------------
'''
Draw a pyramid whose number of levels is read from standard input.
'''

a = input("Enter N = ")
b = int(a)

# Apex: (b - 1) leading spaces followed by a single star.
print(" " * (b - 1) + "*", end='')
print("\n")

# Middle rows: star, filler of '#', star.
for i in range(1, b - 1):
    print(" " * (b - i - 1) + "*" + "#" * (2 * i - 1) + "*", end='')
    print("\n")

# Base: a solid row of stars (only when there is more than one level).
if b > 1:
    print("*" * (2 * b - 1), end='')
--------------------------------------------------------------------------------
/Python+Algorithm/Uncategorized/xingxingdiandeng.py:
--------------------------------------------------------------------------------
1 |
2 | #coding: utf-8
3 | def xingxing(N):
4 | for i in range(N):
5 | j=i+1
6 | if j==1:
7 | print(' '*(N-1)+'*'+' '*(N-1))
8 | elif j阅读\((.*?)\)', page, re.S)
16 | #将结果输出
17 | print('访问量:%s' % (view.zfill(4)))
18 |
19 |
20 | # 阅读数 22828
--------------------------------------------------------------------------------
/Python+Crawler/Web/刷网页.py:
--------------------------------------------------------------------------------
1 | #coding: utf-8
2 |
3 | from urllib import request
4 | from urllib import parse
5 | import urllib.error
6 | import time
7 | from http import cookiejar
8 | import threading
9 | import linecache
10 |
11 | #cj = http.cookiejar.CookieJar()
12 | #opener = request.build_opener(request.HTTPCookieProcessor(cj), request.HTTPHandler)
13 | #request.install_opener(opener)
14 |
# Number of worker threads and number of proxy lines to consume.
THREAD_NUMBER = 2
IP_NUMBER = 20

# Pages that every worker requests.
url = ['http://yun.zjer.cn/index.php?r=space/person/show&sid=NID555912']

# Request headers sent with every request.
head = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}

# Shared counters: attempts made and failures seen.
count = 0
count1 = 0

# Serialises console output between workers.
lock = threading.Lock()
30 |
31 |
def brash(proxy_dict):
    """Request every entry of ``url`` through the given HTTP proxy.

    Updates the module-level counters ``count`` (attempts) and ``count1``
    (failures). Once ``count1`` reaches 100 the call only prints a notice.

    Args:
        proxy_dict: Proxy address string passed to ``ProxyHandler`` for
            the ``http`` scheme.
    """
    global count, count1

    if count1 >= 100:
        print('much error')
        return

    try:
        count += 1
        print(count, 'times')  # progress indicator: number of attempts so far
        handler = request.ProxyHandler({'http': proxy_dict})
        request.install_opener(request.build_opener(handler))
        for target in url:
            req = request.Request(target, headers=head, method='POST')
            try:
                page = request.urlopen(req).read().decode('utf-8')
                print(page)
            except urllib.error.URLError as err:
                print(err.reason)
                print("EEEEEE")
    except Exception:
        # Any other failure counts against the error budget.
        print('Retry')
        count1 += 1
        time.sleep(1)  # back off before the next attempt
63 |
64 |
def ReadSpecialLine(ipfilename, linenumber):
    """Return line ``linenumber`` (1-based) of ``ipfilename``.

    The trailing newline is stripped, and the line number is echoed to the
    console while holding the shared print lock.
    """
    raw_line = linecache.getline(ipfilename, linenumber)
    proxy_dict = raw_line.strip('\n')
    with lock:
        print(linenumber)
    return proxy_dict
72 |
73 |
# Worker body; the ``while True`` wrapper exits after a single sweep.
def For_EveryThread(Thread_i):
    """Run one sweep over this worker's share of the proxy list.

    Worker ``Thread_i`` handles lines ``Thread_i``,
    ``Thread_i + THREAD_NUMBER``, ... of ``ip.txt``.
    """
    # NOTE(review): the original indentation was lost; the ``return`` is
    # assumed to sit inside the ``while`` body - confirm against the source.
    while True:
        sweeps = int(IP_NUMBER / THREAD_NUMBER)
        for step in range(sweeps):
            line_no = THREAD_NUMBER * step + Thread_i
            brash(ReadSpecialLine('ip.txt', line_no))
        return
82 |
83 |
if __name__ == '__main__':
    count = 0
    count1 = 0
    thread_list = []
    # time.clock() was removed in Python 3.8; use perf_counter() instead.
    start = time.perf_counter()
    for Thread_i in range(THREAD_NUMBER):
        # Workers are numbered from 1 because file lines are 1-based.
        temp = threading.Thread(target=For_EveryThread, args=(Thread_i + 1, ))
        thread_list.append(temp)
    for t in thread_list:
        t.start()  # launch the thread
    for t in thread_list:
        t.join()  # wait for every worker before the main thread exits
    end = time.perf_counter()
    print('用时:', str(end - start))
98 |
--------------------------------------------------------------------------------
/Python+Crawler/caixukun.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | from selenium.common.exceptions import TimeoutException
3 | from selenium.webdriver.common.by import By
4 | from selenium.webdriver.support.ui import WebDriverWait
5 | from selenium.webdriver.support import expected_conditions as EC
6 | from bs4 import BeautifulSoup
7 | import xlwt
8 |
9 |
# Headless browser plus a shared 10-second explicit wait.
browser = webdriver.PhantomJS()
WAIT = WebDriverWait(browser, 10)
browser.set_window_size(1400, 900)

# Output workbook with a single sheet and its header row.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)
for _column, _header in enumerate(('名称', '地址', '描述', '观看次数', '弹幕数', '发布时间')):
    sheet.write(0, _column, _header)

# Next free row in the sheet (row 0 holds the headers).
n = 1
26 |
def search():
    """Run the search on bilibili and scrape the first result page.

    Retries from the start whenever a wait times out. The retry is a loop
    rather than self-recursion, so repeated timeouts cannot exhaust the
    call stack.

    Returns:
        The total number of result pages, as an int.
    """
    while True:
        try:
            print('开始访问b站....')
            browser.get("https://www.bilibili.com/")

            # The login overlay covers the page; clicking "home" dismisses it.
            index = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#primary_menu > ul > li.home > a")))
            index.click()

            search_box = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#banner_link > div > div > form > input")))
            submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="banner_link"]/div/div/form/button')))

            search_box.send_keys('蔡徐坤 篮球')
            submit.click()

            # The results open in a new window; switch to it.
            print('跳转到新窗口')
            all_h = browser.window_handles
            browser.switch_to.window(all_h[1])

            get_source()
            total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.last > button")))
            return int(total.text)
        except TimeoutException:
            # Start over from the home page.
            continue
53 |
54 |
def next_page(page_num):
    """Advance to result page ``page_num`` and scrape it.

    On a timeout the page is refreshed and the step is retried. The retry
    is a loop rather than self-recursion, so repeated timeouts cannot
    exhaust the call stack.

    Args:
        page_num: The page number expected to become active after the click.
    """
    while True:
        try:
            print('获取下一页数据')
            next_btn = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.next > button')))
            next_btn.click()
            # Wait until the pager shows the requested page as active.
            WAIT.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#server-search-app > div.contain > div.body-contain > div > div.page-wrap > div > ul > li.page-item.active > button'), str(page_num)))
            get_source()
            return
        except TimeoutException:
            browser.refresh()
65 |
66 |
def save_to_excel(soup):
    """Append every search result found in ``soup`` to the worksheet.

    One row is written per result, starting at the module-level row
    counter ``n``, which is advanced for each row.

    Args:
        soup: Parsed result page (a BeautifulSoup document).
    """
    global n

    entries = soup.find(class_='all-contain').find_all(class_='info')

    for entry in entries:
        row = (
            entry.find('a').get('title'),
            entry.find('a').get('href'),
            entry.find(class_='des hide').text,
            entry.find(class_='so-icon watch-num').text,
            entry.find(class_='so-icon hide').text,
            entry.find(class_='so-icon time').text,
        )

        print('爬取:' + row[0])

        for column, value in enumerate(row):
            sheet.write(n, column, value)

        n += 1
90 |
91 |
def get_source():
    """Wait for the result list to render, then parse and store the page."""
    result_locator = (By.CSS_SELECTOR, '#server-search-app > div.contain > div.body-contain > div > div.result-wrap.clearfix')
    WAIT.until(EC.presence_of_element_located(result_locator))
    save_to_excel(BeautifulSoup(browser.page_source, 'lxml'))
97 |
def main():
    """Scrape every result page, then shut the browser down."""
    try:
        total = search()
        print(total)

        # Page 1 was scraped by search(); walk the remaining pages.
        for i in range(2, total + 1):
            next_page(i)

    finally:
        # quit() ends the whole driver session. close() would only close
        # the current window and leave the driver process running, because
        # search() opens a second window.
        browser.quit()
109 |
110 |
if __name__ == '__main__':
    main()
    # xlwt writes the legacy binary .xls format, so the file must carry
    # that extension; Excel rejects the same bytes under an .xlsx name.
    book.save(u'蔡徐坤篮球.xls')
--------------------------------------------------------------------------------
/Python+Crawler/crawler1.py:
--------------------------------------------------------------------------------
1 | # 1、爬虫起步
2 |
3 | import urllib.request
4 | import re
5 |
# Fetch the Baidu home page and dump its HTML.
response = urllib.request.urlopen('http://www.baidu.com')
raw_body = response.read()
content = raw_body.decode('utf-8')

print(content)
11 |
12 |
--------------------------------------------------------------------------------
/Python+Crawler/crawler2.py:
--------------------------------------------------------------------------------
1 | # 2、爬虫进阶
2 |
3 | from urllib import request,parse
4 | import ssl
# SECURITY: certificate verification is disabled here. That is acceptable
# only for a throw-away demo; do not reuse this context elsewhere.
context = ssl._create_unverified_context()
url = 'https://biihu.cc//account/ajax/login_process/'
headers = {'User-Agent':' Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}

# Login form fields. NOTE: sample credentials are hard-coded; never commit
# real ones. (Named ``form_fields`` so the builtin ``dict`` is not shadowed.)
form_fields = {
    'return_url':'https://biihu.cc/',
    'user_name':'xiaoshuaib@gmail.com',
    'password':'123456789',
    '_post_type':'ajax',
}
data = parse.urlencode(form_fields).encode('utf-8')

req = request.Request(url, data=data, headers=headers, method='POST')
# The context manager closes the HTTP response once it has been read.
with request.urlopen(req, context=context) as response:
    print(response.read().decode('utf-8'))
--------------------------------------------------------------------------------
/Python+Crawler/crawler3.py:
--------------------------------------------------------------------------------
1 | # 使用requests库
2 |
3 | import requests
4 |
# Target page and the browser-like header sent with the request.
url = 'http://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
}

response = requests.get(url, headers=headers)

# Decode the raw bytes explicitly as UTF-8.
raw_body = response.content
content = raw_body.decode('utf-8')

print(content)
14 |
15 |
--------------------------------------------------------------------------------
/Python+Crawler/debug.log:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Python+Crawler/debug.log
--------------------------------------------------------------------------------
/Python+Crawler/form_test.py:
--------------------------------------------------------------------------------
1 | import random
2 |
# Load the proxy list and the user-agent list, one entry per line.
with open('ip.txt') as f:
    ip = [line.strip() for line in f]

with open('user_agent.txt') as f:
    user_agent = [line.strip() for line in f]

# Expected shape of the two option strings:
# user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36
# proxy-server=http://117.191.11.105:8080
chosen_agent = random.choice(user_agent)  # random browser identity
headers = 'User-Agent=' + chosen_agent
chosen_proxy = random.choice(ip)  # random proxy address
proxies = 'proxy-server=http://' + chosen_proxy
print(headers)
print(proxies)
--------------------------------------------------------------------------------
/Python+Crawler/ip.txt:
--------------------------------------------------------------------------------
1 | 120.234.138.102:53779
2 | 120.198.230.15:8080
3 | 117.135.77.30:8060
4 | 111.26.9.26:80
5 | 120.234.138.99:53779
6 | 117.191.11.77:8080
7 | 117.191.11.102:8080
8 | 39.137.168.230:80
9 | 117.191.11.103:80
10 | 117.191.11.104:8080
11 | 117.191.11.113:80
12 | 117.191.11.73:8080
13 | 117.131.119.98:80
14 | 117.191.11.75:80
15 | 117.191.11.108:8080
16 | 211.136.127.125:80
17 | 223.82.247.122:80
18 | 223.82.247.121:80
19 | 183.245.98.6:8118
20 | 117.158.189.238:9999
21 | 223.114.75.157:9999
22 | 120.213.176.255:9999
23 | 39.150.84.98:9999
24 | 39.163.49.244:9999
25 | 117.163.247.119:9999
26 | 120.214.30.168:9999
27 | 120.219.249.86:9999
28 | 120.215.204.168:9999
29 | 183.216.175.237:9999
30 | 39.164.222.116:9999
31 | 39.163.47.161:9999
32 | 223.96.95.229:3128
33 | 223.85.196.75:9797
34 | 39.137.77.66:8080
35 | 39.137.77.67:8080
36 | 223.93.145.186:8060
37 | 39.137.77.68:8080
38 | 39.137.77.68:80
39 | 39.137.77.66:80
40 | 39.137.77.67:80
41 | 120.234.138.99:53779
42 | 117.191.11.77:8080
43 | 117.191.11.102:8080
44 | 39.137.168.230:80
45 | 117.191.11.103:80
46 | 117.191.11.104:8080
47 | 117.191.11.113:80
48 | 117.191.11.73:8080
49 | 117.131.119.98:80
50 | 117.191.11.75:80
51 | 117.191.11.108:8080
52 | 211.136.127.125:80
53 | 223.82.247.122:80
54 | 223.82.247.121:80
55 | 183.245.98.6:8118
56 | 117.158.189.238:9999
57 | 112.35.56.134:80
58 | 183.230.157.236:8088
59 | 223.68.190.130:8181
60 | 112.12.37.196:53281
61 | 117.131.75.134:80
62 | 39.137.107.98:8080
63 | 183.230.179.157:8060
64 | 183.230.179.164:8060
65 | 223.85.196.75:9999
66 | 183.215.206.39:53281
67 | 117.191.11.106:8080
68 | 117.191.11.74:8080
69 | 117.191.11.111:8080
70 | 117.191.11.105:8080
71 | 117.191.11.71:80
72 | 117.186.214.74:9999
73 | 117.191.11.107:8080
74 | 117.191.11.101:8080
75 | 117.191.11.80:8080
76 | 117.191.11.106:80
77 | 117.191.11.105:80
78 | 117.191.11.71:8080
79 | 117.191.11.107:80
80 | 117.191.11.74:80
--------------------------------------------------------------------------------
/Python+Crawler/ip1.txt:
--------------------------------------------------------------------------------
1 | 5.202.74.51:80
2 | 5.202.101.141:80
3 | 93.126.32.92:80
4 | 5.202.148.98:80
5 | 109.122.242.198:80
6 | 5.202.47.43:80
7 | 5.202.44.228:80
8 | 5.202.148.185:80
9 | 5.202.159.47:80
10 | 5.202.101.226:80
11 | 47.107.163.15:3128
12 | 110.52.235.30:9999
13 | 109.60.140.89:59901
14 | 110.52.235.204:9999
15 | 5.202.77.91:80
16 | 5.202.101.146:80
17 | 93.126.37.59:80
18 | 5.202.147.38:80
19 | 5.202.68.221:80
20 | 5.202.101.175:80
21 | 5.202.151.43:80
22 | 5.202.220.64:80
23 | 5.202.77.112:80
24 | 5.202.78.123:80
25 | 5.202.151.102:80
26 | 5.202.44.165:80
27 | 5.202.127.46:80
28 | 5.202.74.106:80
29 | 5.202.94.6:80
30 | 31.184.143.206:80
31 | 5.202.157.166:80
32 | 5.202.151.92:80
33 | 91.237.254.19:80
34 | 5.202.69.51:80
35 | 5.202.159.79:80
36 | 5.202.94.192:80
37 | 5.202.151.59:80
38 | 93.126.59.74:80
39 | 5.202.77.201:80
40 | 5.202.157.245:80
41 | 5.202.157.203:80
42 | 5.202.93.60:80
43 | 5.202.148.105:80
44 | 5.202.148.124:80
45 | 5.202.151.69:80
46 | 5.202.101.208:80
47 | 5.202.149.171:80
48 | 5.202.47.68:80
49 | 5.202.101.222:80
50 | 5.202.44.74:80
51 | 50.112.52.37:80
52 | 95.38.64.3:8080
53 | 36.67.8.27:53281
54 | 85.192.166.189:8080
55 | 137.74.254.242:3128
56 | 218.214.29.92:8080
57 | 94.23.159.76:9999
58 | 218.214.29.92:80
59 | 35.204.39.83:3128
60 | 185.238.239.40:8090
61 | 43.229.85.226:8080
62 | 195.122.185.95:3128
63 | 94.74.191.251:8088
64 | 94.74.154.190:80
65 | 187.28.39.155:8080
66 | 41.242.166.233:53112
67 | 185.212.127.32:41452
68 | 89.22.255.82:36693
69 | 178.134.71.138:47621
70 | 88.148.183.139:3128
71 | 187.28.39.156:8080
72 | 125.162.136.91:80
73 | 190.14.252.107:8080
74 | 212.233.114.46:3128
75 | 170.79.88.116:8080
76 | 52.14.34.225:80
77 | 103.224.101.155:46759
78 | 202.153.231.147:3128
79 | 202.153.231.147:80
80 | 185.15.108.152:8080
81 | 185.186.81.50:8080
82 | 181.188.187.141:46664
83 | 210.11.189.43:58993
84 | 187.87.76.251:3128
85 | 104.236.248.219:3128
86 | 61.8.66.178:80
87 | 202.146.2.131:47217
88 | 180.250.219.58:53281
89 | 182.253.60.114:53586
90 | 202.146.0.219:47217
91 | 88.255.101.247:8080
92 | 187.102.48.202:8080
93 | 187.102.48.193:8080
94 | 111.198.154.116:8888
95 | 180.180.123.68:8080
96 | 123.200.20.6:8080
97 | 187.45.54.81:8080
98 | 177.75.4.34:80
99 | 203.189.159.181:8080
100 | 202.169.238.82:8080
101 | 175.30.124.96:80
102 | 117.163.247.119:9999
103 | 35.185.22.15:80
104 | 112.245.171.227:9999
105 | 120.214.30.168:9999
106 | 177.54.144.160:3128
107 | 1.70.44.237:9999
108 | 223.245.168.45:9999
109 | 112.233.19.219:9999
110 | 115.229.131.80:8998
111 | 195.98.191.102:8081
112 | 119.184.142.202:9999
113 | 124.88.67.34:83
114 | 88.147.189.62:8081
115 | 42.230.149.23:9999
116 | 203.90.144.145:82
117 | 110.157.171.119:9000
118 | 112.233.249.106:9999
119 | 183.153.115.19:9999
120 | 115.46.199.2:9999
121 | 139.59.104.254:80
122 | 46.44.60.71:8081
123 | 5.2.75.15:1080
124 | 61.234.76.28:9999
125 | 120.219.249.86:9999
126 | 104.196.114.98:80
127 | 124.88.67.18:80
128 | 120.215.204.168:9999
129 | 116.116.112.139:9999
130 | 27.206.37.23:9999
131 | 92.208.138.179:80
132 | 115.216.168.55:9999
133 | 154.46.204.36:80
134 | 110.155.142.174:9999
135 | 183.216.175.237:9999
136 | 39.164.222.116:9999
137 | 60.162.51.77:8888
138 | 60.208.182.59:9999
139 | 115.55.152.132:9999
140 | 39.163.47.161:9999
141 | 185.148.218.246:8081
142 | 59.127.38.117:8080
143 | 212.22.86.114:3130
144 | 101.4.136.34:81
145 | 101.4.136.34:80
146 | 101.4.136.34:8080
147 | 178.169.64.76:8081
148 | 5.167.96.238:3128
149 | 47.94.230.42:9999
150 | 211.159.171.58:80
151 | 5.202.47.152:80
152 | 5.202.149.252:80
153 | 5.202.44.103:80
154 | 5.202.101.217:80
155 | 121.17.174.121:9797
156 | 5.202.148.113:80
157 | 5.202.220.21:80
158 | 5.202.151.58:80
159 | 5.202.151.77:80
160 | 5.202.109.18:80
161 | 5.202.68.82:80
162 | 5.202.76.226:80
163 | 103.111.56.69:50148
164 | 5.202.46.130:80
165 | 103.248.219.172:8080
166 | 206.81.11.75:80
167 | 5.202.44.170:80
168 | 140.227.64.94:3128
169 | 5.202.149.30:80
170 | 5.202.192.147:80
171 | 5.202.44.38:80
172 | 5.202.38.168:80
173 | 217.69.10.147:80
174 | 5.202.68.87:80
175 | 5.202.138.76:80
176 | 5.202.149.43:80
177 | 5.202.47.155:80
178 | 5.202.67.247:80
179 | 5.202.158.151:80
180 | 5.202.218.73:80
181 | 5.202.38.4:80
182 | 187.4.249.6:80
183 | 5.190.166.242:80
184 | 51.77.223.95:80
185 | 5.202.74.23:80
186 | 5.202.67.121:80
187 | 5.202.101.179:80
188 | 5.202.151.70:80
189 | 5.202.94.68:80
190 | 36.91.203.207:8080
191 | 189.204.158.161:8080
192 | 5.202.215.13:80
193 | 5.202.78.116:80
194 | 212.33.202.98:80
195 | 5.202.242.42:80
196 | 92.126.203.59:8080
197 | 111.177.183.168:9999
198 | 1.20.96.24:57132
199 | 93.171.156.209:51016
200 | 5.128.73.158:8081
--------------------------------------------------------------------------------
/Python+Crawler/sample1.py:
--------------------------------------------------------------------------------
1 | # 爬虫1,抓取网页内容
2 |
3 | '''
4 | 第一个示例:简单的网页爬虫
5 | 爬取豆瓣首页
6 | '''
7 |
8 | import urllib.request
9 |
# Target address.
url = "http://www.douban.com/"

# Build the request and fetch the page.
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)

# Read the body and decode it as UTF-8 in one step.
data = response.read().decode('utf-8')

# Print the page source.
print(data)

# Print metadata about the response: its type, final URL, headers and
# HTTP status code.
print(type(response))
print(response.geturl())
print(response.info())
print(response.getcode())
33 |
--------------------------------------------------------------------------------
/Python+Crawler/selenium_first.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import time
3 |
# Test endpoint that echoes the request: http://httpbin.org/get
with open('ip.txt') as f:
    ip = [line.strip() for line in f]

with open('user_agent.txt') as f:
    user_agent = [line.strip() for line in f]

# Browser configuration: fixed user agent and proxy.
options = webdriver.ChromeOptions()
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36")
options.add_argument("proxy-server=http://117.191.11.105:8080")

# Create the browser driver. ``options=`` replaces the deprecated
# ``chrome_options=`` keyword, which newer Selenium releases reject.
driver = webdriver.Chrome(options=options)
try:
    driver.get("http://httpbin.org/get")

    print(driver.page_source)
    time.sleep(10)
finally:
    # Always shut the browser down, even if the page load fails.
    driver.quit()
--------------------------------------------------------------------------------
/Python+Crawler/spider-google.py:
--------------------------------------------------------------------------------
1 | # 爬谷歌的图片
2 | # keyword是关键词
3 |
4 | from icrawler.builtin import GoogleImageCrawler
5 |
# Downloaded images are stored under this directory.
storage_config = {'root_dir': '111'}
google_crawler = GoogleImageCrawler(storage=storage_config)
google_crawler.crawl(keyword='cat', max_num=10)
--------------------------------------------------------------------------------
/Python+Crawler/test.py:
--------------------------------------------------------------------------------
1 | from selenium import webdriver
2 | import time
3 | import random
4 |
with open('ip.txt') as f:
    ip = [line.strip() for line in f]

with open('user_agent.txt') as f:
    user_agent = [line.strip() for line in f]

# Test endpoint for checking the proxy: http://httpbin.org/get
count = 0
for proxy in ip:
    # Reset per iteration so the cleanup below never touches a driver
    # that failed to start or that belongs to a previous iteration.
    driver = None
    try:
        # Browser configuration.
        options = webdriver.ChromeOptions()
        options.add_argument('User-Agent=' + random.choice(user_agent))
        options.add_argument('proxy-server=http://' + proxy)

        # Create the browser driver. ``options=`` replaces the deprecated
        # ``chrome_options=`` keyword.
        driver = webdriver.Chrome(options=options)
        driver.get("http://www.baidu.com")

        search_box = driver.find_element_by_css_selector('#kw')
        search_box.send_keys("如吉生物")

        button = driver.find_element_by_css_selector('#su')
        button.click()

        time.sleep(10)
        print(str(count) + 'times')
        count = count + 1

    except Exception:  # any failure: move on to the next proxy
        print('Retry')
    finally:
        if driver is not None:
            driver.quit()
43 |
--------------------------------------------------------------------------------
/Python+Crawler/豆瓣最受欢迎的250部电影.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yzy1996/Python-Code/2abcaa6fbfa4a84aaffdf10d7bcc6b12649dd221/Python+Crawler/豆瓣最受欢迎的250部电影.xlsx
--------------------------------------------------------------------------------
/Python+HTML/README.md:
--------------------------------------------------------------------------------
1 | # Python+HTML
2 |
3 | 使用方法参见:[CSDN博客](https://blog.csdn.net/yzy_1996/article/details/80223053)
4 |
--------------------------------------------------------------------------------
/Python+HTML/test1.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
12 |
13 |
14 |
15 |
18 |
19 |
20 | 打开python命令行(方式1)
21 |
22 |
23 |
24 |
25 | 打开python命令行(方式2)
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/Python+HTML/test2.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
12 |
13 |
14 |
15 |
18 |
19 |
20 | 执行test2程序(方式1)
21 |
22 |
23 |
24 |
25 | 执行test2程序(方式2)
26 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/Python+HTML/test2.py:
--------------------------------------------------------------------------------
# Create (or truncate) an empty text file in the working directory.
with open('new_file' + '.txt', 'w') as file:
    pass
--------------------------------------------------------------------------------
/Python+HTML/test3.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
11 |
12 |
13 |
14 |
17 |
18 |
19 |
20 |
21 | 执行test3程序(方式1)
22 |
23 |
24 |
25 |
26 | 执行test3程序(方式2)
27 |
28 |
29 |
30 |
31 |
32 |
36 |
37 |
38 |
39 |