├── README.md ├── Web ├── class │ └── search_class.php ├── index.html ├── css │ ├── search.css │ └── index.css └── search.php ├── LICENSE ├── .gitignore ├── sql.py ├── movie.py └── Proxy-IP_1.0.0b3.py /README.md: -------------------------------------------------------------------------------- 1 | # douban-movie-crawler 2 | 豆瓣电影爬虫Demo 3 | -------------------------------------------------------------------------------- /Web/class/search_class.php: -------------------------------------------------------------------------------- 1 | actor_list = implode('/ ',$temp[0]); 11 | return $this->actor_list; 12 | } 13 | } 14 | ?> -------------------------------------------------------------------------------- /Web/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 |没有找到相关结果
"; 33 | } 34 | //释放结果集 35 | mysqli_free_result($result); 36 | 37 | //断开服务器连接 38 | mysqli_close($con); 39 | ?> 40 | 41 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /sql.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import mysql.connector 5 | 6 | host = "localhost" # 数据库主机地址 7 | user = "root" # 数据库用户名 8 | passwd = "root" # 数据库密码 9 | database = "movie" # 数据库名 10 | 11 | 12 | # 创建数据库 13 | def create_db(): 14 | db = mysql.connector.connect(host=host, user=user, passwd=passwd) # 打开数据库连接 15 | cursor = db.cursor() # 创建游标对象 16 | try: 17 | cursor.execute("CREATE DATABASE `movie`") # 执行MYSQL语句 18 | print("数据库创建成功!") 19 | except Exception: 20 | print("【movie】数据库已存在,跳过创建!") 21 | db.close() # 关闭数据库连接 22 | 23 | 24 | # 创建数据表 25 | def create_table(): 26 | db = mysql.connector.connect(host=host, user=user, passwd=passwd, database=database) 27 | cursor = db.cursor() 28 | try: 29 | cursor.execute("CREATE TABLE `movie` ( `id` INT NOT NULL AUTO_INCREMENT,\ 30 | `db_id` INT(20) NOT NULL, `title` VARCHAR(100) NOT NULL , `rate` FLOAT NOT NULL , `img` VARCHAR(100) NOT NULL ,\ 31 | `data` TINYTEXT NOT NULL , `description` TEXT NOT NULL , `director` TEXT NOT NULL ,\ 32 | `author` TEXT NOT NULL , `actor` TEXT NOT NULL , `genre` VARCHAR(255) NOT NULL ,\ 33 | PRIMARY KEY (`id`)) DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci") 34 | print("表创建成功!") 35 | except Exception: 36 | print("表已存在,跳过创建!") 37 | db.close() # 关闭数据库连接 38 | 39 | 40 | # 插入电影数据 ID、标题、评分、图链 41 | def insert_data(id, db_id, title, rate, img): 42 | db = mysql.connector.connect(host=host, user=user, passwd=passwd, database=database) 43 | cursor = db.cursor() 44 | insert = "INSERT INTO `movie` (`id`, `db_id`, `title`, `rate`, `img`, `data`, `description`,\ 45 | `director`, `author`, `actor`, `genre`) VALUES ('" + id + "', '" + db_id + "', '" + title + "', '" + rate +\ 46 | "', '" + img + "', '', '', '', '', '', '')" 47 | # print(insert) # 查看数据库插入语句 48 | cursor.execute(insert) 49 | db.commit() # 提交任务,数据才会写入数据库 50 | db.close() # 关闭数据库连接 51 | 52 | 53 | # 插入电影详情 日期、简介、导演、作者、演员、类型 54 | def insert_info(id, data, description, director, author, actor, genre): 55 | db = mysql.connector.connect(host=host, user=user, passwd=passwd, database=database) 56 | cursor = db.cursor() 57 | insert = "UPDATE `movie` SET `data` = '" + data + "', `description` = '" + description + "', `director` = '"\ 58 | + director + "', `author` = '" + author + "', `actor` = '" + actor + "', `genre` = '"\ 59 | + genre + "' WHERE `movie`.`id` = " + id + ";" 60 | print(insert) # 查看数据库插入语句 61 | cursor.execute(insert) 62 | db.commit() # 提交任务,数据才会写入数据库 63 | db.close() # 关闭数据库连接 64 | 65 | 66 | if __name__ == '__main__': 67 | create_db() 68 | create_table() -------------------------------------------------------------------------------- /movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding:utf-8 -*- 3 | 4 | import re 5 | import json 6 | import time 7 | import requests 8 | from fake_useragent import UserAgent 9 | import sql 10 | 11 | # User-Agent池 12 | ua = UserAgent() 13 | headers = {'User-Agent': ua.chrome} 14 | 15 | # 代理IP池(有时效性) 16 | proxies = ['49.86.75.105:39187', '110.86.176.186:44587', '27.152.195.115:29933', '1.194.148.210:30077', 17 | '123.54.231.164:48890', '106.122.169.251:48471', '100.86.35.31:47038', '49.85.111.243:23859', 18 | '183.166.125.46:35662', '49.85.105.104:25124', '122.241.218.254:20164', '220.185.209.203:45233', 19 | '125.109.195.79:22384', '125.123.64.194:21925', '117.93.181.43:44142', '180.125.70.163:42062'] 20 | 21 | # 全局变量 22 | times = 0 23 | 24 | 25 | # 豆瓣电影分类函数(tag) 26 | def douban_tags(): 27 | response = requests.get("https://movie.douban.com/j/search_tags", headers=headers) 28 | tags = json.loads(response.text) 29 | return tags["tags"] # 返回豆瓣电影分类列表 30 | 31 | 32 | # 豆瓣电影信息函数(ID、标题、评分、图链) 33 | def douban_movie(mv_tag, mv_num): 34 | temp = [] 35 | url = "https://movie.douban.com/j/search_subjects?type=movie&tag=" + str(mv_tag) + "&sort=recommend&page_limit=" + \ 36 | str(mv_num) + "&page_start=0" 37 | response = requests.get(url, headers=headers, timeout=10) 38 | movie_json = json.loads(response.text) 39 | for info in movie_json["subjects"]: # 遍历豆瓣JSON信息得到字典数据 40 | temp.append([info["id"], info["title"], info["rate"], info["cover"]]) 41 | return temp # 返回各分类所有电影的信息数组 42 | 43 | 44 | # 豆瓣电影详情函数(日期、简介、导演、作者、演员、类型) 45 | def douban_info(id, mv_id, you_ip): 46 | url = "https://movie.douban.com/subject/" + str(mv_id) 47 | proxy_host = you_ip # 代理IP 48 | response = requests.get(url, headers=headers, proxies={"http": proxy_host, "https": proxy_host}, timeout=10) 49 | movie_json = re.search(r'application/ld\+json">([\s\S]*?)', '').replace('标签内):网页链接、备注名称、爬取页数、爬取速度 48 | def ip_html1(link, text, page, speed): 49 | global proxy_ip 50 | ip_num = 0 51 | try: 52 | for a in range(1, page): # 爬取多少页 53 | url = str(link) + str(a) 54 | response = requests.get(url, headers=headers, timeout=8) 55 | # print(response.text) 56 | soupIP = BeautifulSoup(response.text, 'html5lib') 57 | trs = soupIP.find_all('tr') 58 | for tr in trs[1:]: 59 | ip_num += 1 # 自增计算IP数 60 | tds = tr.find_all('td') 61 | # print(tds) 62 | ip = tds[0].text.strip() 63 | proxy_ip.append(ip) 64 | time.sleep(speed) # 控制访问速度(很重要,如果访问太快被封IP就不能继续爬了) 65 | except Exception as e: 66 | print(e) 67 | print(str(text) + "获取到" + str(ip_num) + "个代理IP") 68 | 69 | 70 | # 使用HTML解析获取代理IP函数【二】(源码IP和端口不在一起,IP在第1个