├── README.pdf
├── .gitattributes
├── result
│   ├── db.png
│   ├── 预测1.png
│   ├── 预测2.png
│   ├── output.png
│   ├── p (1).png
│   ├── p (10).png
│   ├── p (11).png
│   ├── p (12).png
│   ├── p (13).png
│   ├── p (14).png
│   ├── p (15).png
│   ├── p (16).png
│   ├── p (17).png
│   ├── p (18).png
│   ├── p (19).png
│   ├── p (2).png
│   ├── p (3).png
│   ├── p (4).png
│   ├── p (5).png
│   ├── p (6).png
│   ├── p (7).png
│   ├── p (8).png
│   ├── p (9).png
│   └── db_struct.png
├── .gitignore
├── src
│   ├── movie_basic.py
│   ├── movie_detail.py
│   ├── unit.py
│   ├── attachfile.py
│   ├── main.py
│   ├── database.py
│   └── predict.ipynb
└── README.md

/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/README.pdf
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
# Auto detect text files and perform LF normalization
* text=auto
--------------------------------------------------------------------------------
/result/db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/db.png
--------------------------------------------------------------------------------
/result/预测1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/预测1.png
--------------------------------------------------------------------------------
/result/预测2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/预测2.png
--------------------------------------------------------------------------------
/result/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/output.png
--------------------------------------------------------------------------------
/result/p (1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (1).png
--------------------------------------------------------------------------------
/result/p (10).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (10).png
--------------------------------------------------------------------------------
/result/p (11).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (11).png
--------------------------------------------------------------------------------
/result/p (12).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (12).png
--------------------------------------------------------------------------------
/result/p (13).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (13).png
--------------------------------------------------------------------------------
/result/p (14).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (14).png
--------------------------------------------------------------------------------
/result/p (15).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (15).png
--------------------------------------------------------------------------------
/result/p (16).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (16).png
--------------------------------------------------------------------------------
/result/p (17).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (17).png
--------------------------------------------------------------------------------
/result/p (18).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (18).png
--------------------------------------------------------------------------------
/result/p (19).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (19).png
--------------------------------------------------------------------------------
/result/p (2).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (2).png
--------------------------------------------------------------------------------
/result/p (3).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (3).png
--------------------------------------------------------------------------------
/result/p (4).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (4).png
--------------------------------------------------------------------------------
/result/p (5).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (5).png
--------------------------------------------------------------------------------
/result/p (6).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (6).png
--------------------------------------------------------------------------------
/result/p (7).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (7).png
--------------------------------------------------------------------------------
/result/p (8).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (8).png
--------------------------------------------------------------------------------
/result/p (9).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (9).png
--------------------------------------------------------------------------------
/result/db_struct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/db_struct.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# python cache
*/__pycache__/
*.py[cod]

# html
html/
*.html
*.htm

# csv
csv/

# pycharm
.idea/
--------------------------------------------------------------------------------
/src/movie_basic.py:
--------------------------------------------------------------------------------
from lxml.html import tostring
import re


def get_basic_data(movie):
    # the info div of one Top 250 list entry
    movie_info = movie.xpath('div[@class="info"]')[0]
    movie_bd = movie_info.xpath('div[@class="bd"]')[0]
    movie_credits = movie_bd.xpath('p')[0]
    movie_star_info = movie_bd.xpath('div[@class="star"]')[0]
    movie_quote = movie_bd.xpath('p[@class="quote"]')

    movie_div = tostring(movie_info, encoding="utf-8").decode("utf-8")

    # titles: the first <span class="title"> holds the Chinese title; an
    # optional second one holds the foreign title prefixed with " / "
    movie_title = re.findall(r'<span class="title">(.*?)</span>', movie_div)
    name1 = movie_title[0].strip()
    if len(movie_title) > 1:
        name2 = "".join(movie_title[1].strip()[1:].split())
    else:
        name2 = '-'

    # rating
    score = float(movie_star_info.xpath(
        'span[@class="rating_num"]/text()')[0])

    # number of ratings; the span text reads like "...人评价",
    # so the last three characters are stripped before int()
    comment = int(movie_star_info.xpath(
        'span')[-1].xpath('text()')[0][:-3])

    # quote
    if len(movie_quote) > 0:
        quote_str = movie_quote[0].xpath('span/text()')[0]
    else:
        quote_str = ''

    # detail page URL: the first link inside the item div
    page_url = re.findall(r'<a href="(.*?)"', movie_div)[0]

    return name1, name2, score, comment, quote_str, page_url
--------------------------------------------------------------------------------
/src/movie_detail.py:
--------------------------------------------------------------------------------
import lxml.html
import re


def get_detail_data(movie_html: str):
    # print(movie_html)

    _selector = lxml.html.fromstring(movie_html)
    movie_page_divs = _selector.xpath('//div[@id="info"]')[0]
    # print(movie_page_divs)

    '''
    Example of the info block text:

    类型: 剧情 / 犯罪
    制片国家/地区: 美国
    语言: 英语
    上映日期: 1994-09-10(多伦多电影节) / 1994-10-14(美国)
    片长: 142分钟
    又名: 月黑高飞(港) / 刺激1995(台) / 地狱诺言 / 铁窗岁月 / 消香克的救赎
    IMDb: tt0111161
    '''

    info_str = _selector.xpath('//div[@id="info"]')[0].xpath('string(.)').strip()

    type = re.findall(r'类型: (.*)', info_str)[0].split(' / ')
    place = re.findall(r'制片国家/地区: (.*)', info_str)[0].split(' / ')
    lang = re.findall(r'语言: (.*)', info_str)[0].split(' / ')
    year = int(re.findall(r'上映日期: (....)', info_str)[0])  # first four chars are the year
    length = int(re.findall(r'片长: (.*?)分钟', info_str)[0])

    # print(type, place, lang, year, length)

    # credits: span.attrs holds director / scriptwriter / cast link lists
    movie_attrs = movie_page_divs.xpath('//span[@class="attrs"]')
    director = movie_attrs[0].xpath("a/text()")
    if len(movie_attrs) < 3:
        actor = ''
    else:
        actor = movie_attrs[2].xpath("a/text()")
    # print(director)
    # print(actor)

    return director, actor, type, place, lang, year, length
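
# A quick self-contained sanity check (a sketch, not part of the original
# crawl). The markup below mimics only the pieces of a Douban detail page
# that get_detail_data actually reads: the #info text plus span.attrs links.
if __name__ == '__main__':
    sample = '''
    <div id="info">
      <span class="attrs"><a>弗兰克·德拉邦特</a></span>
      <span class="attrs"><a>弗兰克·德拉邦特</a></span>
      <span class="attrs"><a>蒂姆·罗宾斯</a><a>摩根·弗里曼</a></span>
      类型: 剧情 / 犯罪
      制片国家/地区: 美国
      语言: 英语
      上映日期: 1994-09-10(多伦多电影节)
      片长: 142分钟
    </div>
    '''
    # -> (['弗兰克·德拉邦特'], ['蒂姆·罗宾斯', '摩根·弗里曼'],
    #     ['剧情', '犯罪'], ['美国'], ['英语'], 1994, 142)
    print(get_detail_data(sample))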
--------------------------------------------------------------------------------
/src/unit.py:
--------------------------------------------------------------------------------
import requests

from main import get_data_douban
from movie_detail import get_detail_data


# with open('./html/7号房的礼物.html', 'r') as f:
#     print(get_detail_data(f.read()))


# from database import *

# with open('quotes.txt', 'w+') as f:
#     sql = 'select quote from movie'
#     quotes = execute_sql(sql, 1)
#     # print(quotes[0])
#     for quote in quotes[0]:
#         f.write(quote)


headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Sec-Fetch-User": "?1"
}

def get_html(url: str) -> str:
    return requests.get(url, headers=headers).content.decode(encoding='utf-8')

url = 'https://www.douban.com/search?q=满江红'

html = get_html(url)

with open('manjianghong.html', 'w+', encoding='utf-8') as f:
    f.write(html)

# with open('manjianghong.html', 'r', encoding='utf-8') as f:
#     html = f.read()

# print(html)

# pass the fetched page, not the URL: get_data_douban expects HTML
get_data_douban('满江红', html)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# douban-master

## Features

Data collection: scrape movie data such as ratings and box-office grosses from the Douban Top 250 list and the Maoyan yearly box-office rankings.

Persistence: store the data in two ways, as CSV files via pandas DataFrames and in a MySQL relational database.

Visualization: pick relationships of interest from the persisted data and analyze them visually.

Box-office prediction: based on conclusions drawn from the visual analysis, select factors likely to influence box office, build a prediction model, and run predictions.

---
## File structure

| File | Description |
| -------------------------- | ------------------------------------ |
| main.py | Entry point for scraping and persistence |
| movie_basic.py | Scrapes the Douban Top 250 list pages |
| movie_detail.py | Scrapes each movie's detail page |
| database.py | Database connection and query helpers |
| attachfile.py | Static content such as request headers |
| visualization_sql.ipynb | Visualization, data pulled via SQL queries |
| visualization_pandas.ipynb | Visualization, data aggregated in pandas |
| predict.ipynb | Box-office prediction model and worked examples |
| /html | Scraped HTML pages |
| /csv | DataFrame data processed with pandas |
| /result | Visualization output, database screenshots, etc. |



---

## Tech stack

Scraping and data handling: requests, lxml, re, pandas

Persistence: pymysql, pandas, MySQL

Data cleaning: pandas, MySQL (not actually done)

Visualization: pyecharts, matplotlib, SQL, pandas

Prediction: sklearn, numpy, matplotlib

---
## Visualization examples
![p1](./result/p%20(1).png)
![p2](./result/p%20(2).png)
![p3](./result/p%20(3).png)
![p4](./result/p%20(4).png)
![p5](./result/p%20(5).png)
![p6](./result/p%20(6).png)
![p7](./result/p%20(7).png)
![p8](./result/p%20(8).png)
![p9](./result/p%20(9).png)
![p10](./result/p%20(10).png)
![p11](./result/p%20(11).png)


---
## Box-office prediction examples
Unit: 万元 (10,000 CNY)

![p1](./result/预测1.png)

![p2](./result/预测2.png)
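The model in predict.ipynb is nearest-neighbour style rather than a regressor fitted on raw features: score each historical movie by feature overlap with the target, keep the closest few, rescale their grosses to the target year with a linear trend fitted on yearly averages, and average the results. A self-contained sketch of that idea (the trend numbers, threshold, and helper names below are illustrative stand-ins, not the notebook's exact values):

```python
from sklearn.linear_model import LinearRegression
import numpy as np

# Linear trend over yearly average grosses (illustrative numbers, unit: 万元);
# the notebook fits the same kind of model on data queried from MySQL.
years = np.array([2013, 2014, 2015, 2016, 2017]).reshape(-1, 1)
avg_gross = np.array([50000.0, 60000.0, 72000.0, 80000.0, 95000.0])
reg = LinearRegression().fit(years, avg_gross)

def similarity(a, b):
    # Overlap of genres / actors / directors, plus a small bonus
    # when the release months are close.
    shared = sum(len(set(x) & set(y)) for x, y in zip(a[:3], b[:3]))
    return shared + (1.0 if abs(a[3] - b[3]) <= 1 else 0.0)

def predict(movie, history, target_year=2023, k=5, threshold=2.0):
    # Rank history by similarity, keep up to k entries above the threshold,
    # rescale each gross to the target year via the trend, then average.
    ranked = sorted(history, key=lambda h: -similarity(movie, h[0]))
    scale = reg.predict([[target_year]])[0]
    picked = [gross * scale / reg.predict([[year]])[0]
              for feats, gross, year in ranked[:k]
              if similarity(movie, feats) > threshold]
    return sum(picked) / len(picked) if picked else None

# Target: a war/history film planned for February; one historical neighbour.
movie = [['剧情', '历史'], ['吴京', '易烊千玺'], ['徐克'], 2]
history = [([['剧情', '历史'], ['吴京', '易烊千玺'], ['徐克'], 2], 406733, 2022)]
print(predict(movie, history))
```

The real `predict` in predict.ipynb additionally prints the matched neighbours, which is what the screenshots above show.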
--------------------------------------------------------------------------------
/src/attachfile.py:
--------------------------------------------------------------------------------
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
    "Upgrade-Insecure-Requests": "1",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "cross-site",
    "Sec-Fetch-User": "?1"
}

# English-to-Chinese country name mapping, used for map visualizations
name_map = {
    'Singapore Rep.': '新加坡',
    'Dominican Rep.': '多米尼加',
    'Palestine': '巴勒斯坦',
    'Bahamas': '巴哈马',
    'Timor-Leste': '东帝汶',
    'Afghanistan': '阿富汗',
    'Guinea-Bissau': '几内亚比绍',
    "Côte d'Ivoire": '科特迪瓦',
    'Siachen Glacier': '锡亚琴冰川',
    "Br. Indian Ocean Ter.": '英属印度洋领土',
    'Angola': '安哥拉',
    'Albania': '阿尔巴尼亚',
    'United Arab Emirates': '阿联酋',
    'Argentina': '阿根廷',
    'Armenia': '亚美尼亚',
    'French Southern and Antarctic Lands': '法属南半球和南极领地',
    'Australia': '澳大利亚',
    'Austria': '奥地利',
    'Azerbaijan': '阿塞拜疆',
    'Burundi': '布隆迪',
    'Belgium': '比利时',
    'Benin': '贝宁',
    'Burkina Faso': '布基纳法索',
    'Bangladesh': '孟加拉国',
    'Bulgaria': '保加利亚',
    'The Bahamas': '巴哈马',
    'Bosnia and Herz.': '波斯尼亚和黑塞哥维那',
    'Belarus': '白俄罗斯',
    'Belize': '伯利兹',
    'Bermuda': '百慕大',
    'Bolivia': '玻利维亚',
    'Brazil': '巴西',
    'Brunei': '文莱',
    'Bhutan': '不丹',
    'Botswana': '博茨瓦纳',
    'Central African Rep.': '中非',
    'Canada': '加拿大',
    'Switzerland': '瑞士',
    'Chile': '智利',
    'China': '中国',
    'Ivory Coast': '象牙海岸',
    'Cameroon': '喀麦隆',
    'Dem. Rep. Congo': '刚果民主共和国',
    'Congo': '刚果',
    'Colombia': '哥伦比亚',
    'Costa Rica': '哥斯达黎加',
    'Cuba': '古巴',
    'N. Cyprus': '北塞浦路斯',
    'Cyprus': '塞浦路斯',
    'Czech Rep.': '捷克',
    'Germany': '德国',
    'Djibouti': '吉布提',
    'Denmark': '丹麦',
    'Algeria': '阿尔及利亚',
    'Ecuador': '厄瓜多尔',
    'Egypt': '埃及',
    'Eritrea': '厄立特里亚',
    'Spain': '西班牙',
    'Estonia': '爱沙尼亚',
    'Ethiopia': '埃塞俄比亚',
    'Finland': '芬兰',
    'Fiji': '斐济',
    'Falkland Islands': '福克兰群岛',
    'France': '法国',
    'Gabon': '加蓬',
    'United Kingdom': '英国',
    'Georgia': '格鲁吉亚',
    'Ghana': '加纳',
    'Guinea': '几内亚',
    'Gambia': '冈比亚',
    'Guinea Bissau': '几内亚比绍',
    'Eq. Guinea': '赤道几内亚',
    'Greece': '希腊',
    'Greenland': '格陵兰',
    'Guatemala': '危地马拉',
    'French Guiana': '法属圭亚那',
    'Guyana': '圭亚那',
    'Honduras': '洪都拉斯',
    'Croatia': '克罗地亚',
    'Haiti': '海地',
    'Hungary': '匈牙利',
    'Indonesia': '印度尼西亚',
    'India': '印度',
    'Ireland': '爱尔兰',
    'Iran': '伊朗',
    'Iraq': '伊拉克',
    'Iceland': '冰岛',
    'Israel': '以色列',
    'Italy': '意大利',
    'Jamaica': '牙买加',
    'Jordan': '约旦',
    'Japan': '日本',
    'Kazakhstan': '哈萨克斯坦',
    'Kenya': '肯尼亚',
    'Kyrgyzstan': '吉尔吉斯斯坦',
    'Cambodia': '柬埔寨',
    'Korea': '韩国',
    'Kosovo': '科索沃',
    'Kuwait': '科威特',
    'Lao PDR': '老挝',
    'Lebanon': '黎巴嫩',
    'Liberia': '利比里亚',
    'Libya': '利比亚',
    'Sri Lanka': '斯里兰卡',
    'Lesotho': '莱索托',
    'Lithuania': '立陶宛',
    'Luxembourg': '卢森堡',
    'Latvia': '拉脱维亚',
    'Morocco': '摩洛哥',
    'Moldova': '摩尔多瓦',
    'Madagascar': '马达加斯加',
    'Mexico': '墨西哥',
    'Macedonia': '马其顿',
    'Mali': '马里',
    'Myanmar': '缅甸',
    'Montenegro': '黑山',
    'Mongolia': '蒙古',
    'Mozambique': '莫桑比克',
    'Mauritania': '毛里塔尼亚',
    'Malawi': '马拉维',
    'Malaysia': '马来西亚',
    'Namibia': '纳米比亚',
    'New Caledonia': '新喀里多尼亚',
    'Niger': '尼日尔',
    'Nigeria': '尼日利亚',
    'Nicaragua': '尼加拉瓜',
    'Netherlands': '荷兰',
    'Norway': '挪威',
    'Nepal': '尼泊尔',
    'New Zealand': '新西兰',
    'Oman': '阿曼',
    'Pakistan': '巴基斯坦',
    'Panama': '巴拿马',
    'Peru': '秘鲁',
    'Philippines': '菲律宾',
    'Papua New Guinea': '巴布亚新几内亚',
    'Poland': '波兰',
    'Puerto Rico': '波多黎各',
    'Dem. Rep. Korea': '朝鲜',
    'Portugal': '葡萄牙',
    'Paraguay': '巴拉圭',
    'Qatar': '卡塔尔',
    'Romania': '罗马尼亚',
    'Russia': '俄罗斯',
    'Rwanda': '卢旺达',
    'W. Sahara': '西撒哈拉',
    'Saudi Arabia': '沙特阿拉伯',
    'Sudan': '苏丹',
    'S. Sudan': '南苏丹',
    'Senegal': '塞内加尔',
    'Solomon Is.': '所罗门群岛',
    'Sierra Leone': '塞拉利昂',
    'El Salvador': '萨尔瓦多',
    'Somaliland': '索马里兰',
    'Somalia': '索马里',
    'Serbia': '塞尔维亚',
    'Suriname': '苏里南',
    'Slovakia': '斯洛伐克',
    'Slovenia': '斯洛文尼亚',
    'Sweden': '瑞典',
    'Swaziland': '斯威士兰',
    'Syria': '叙利亚',
    'Chad': '乍得',
    'Togo': '多哥',
    'Thailand': '泰国',
    'Tajikistan': '塔吉克斯坦',
    'Turkmenistan': '土库曼斯坦',
    'East Timor': '东帝汶',
    'Trinidad and Tobago': '特里尼达和多巴哥',
    'Tunisia': '突尼斯',
    'Turkey': '土耳其',
    'Tanzania': '坦桑尼亚',
    'Uganda': '乌干达',
    'Ukraine': '乌克兰',
    'Uruguay': '乌拉圭',
    'United States': '美国',
    'Uzbekistan': '乌兹别克斯坦',
    'Venezuela': '委内瑞拉',
    'Vietnam': '越南',
    'Vanuatu': '瓦努阿图',
    'West Bank': '西岸',
    'Yemen': '也门',
    'South Africa': '南非',
    'Zambia': '赞比亚',
    'Zimbabwe': '津巴布韦',
    'Comoros': '科摩罗'
}
--------------------------------------------------------------------------------
/src/main.py:
--------------------------------------------------------------------------------
import requests
import lxml.html
import pandas as pd
import time
import os


from movie_detail import get_detail_data
from movie_basic import get_basic_data
from database import db_store, db_store_2, csv_store
from lxml.html import tostring
import re
from attachfile import headers


# whether to replay locally saved files instead of making live requests
local_test = True


def get_url(url: str, start: int) -> str:
    return url.replace('start=0', 'start=' + str(start))


def get_html(url: str) -> str:
    print('sleeping...')
    time.sleep(3)
    return requests.get(url, headers=headers).content.decode(encoding='utf-8')


def get_data(url: str):
    movies_data = []
    for i in range(10):
        print(i)
        start = i * 25

        _url = get_url(url, start=start)

        if local_test:
            # read the locally saved list page
            html = open("./html/from_" + str(start + 1) + ".html", "r", encoding='utf-8').read()
        else:
            # fetch over the network and keep a local copy
            html = get_html(_url)
            time.sleep(0.5)
            with open("./html/from_" + str(start + 1) + ".html", 'w+', encoding='utf-8') as f:
                f.write(html)

        selector = lxml.html.fromstring(html)
        movie_divs = selector.xpath('//div[@class="item"]')

        for movie in movie_divs:
            name1, name2, score, comment, quote_str, page_url = get_basic_data(
                movie)

            # detail page
            if local_test:
                # read the locally saved detail page
                movie_html = open("./html/" + name1 + ".html", "r", encoding='utf-8').read()
            else:
                # fetch over the network and keep a local copy
                movie_html = get_html(page_url)
                time.sleep(0.5)
                with open("./html/" + name1 + ".html", 'w+', encoding='utf-8') as f:
                    f.write(movie_html)

            # detail fields
            director, actor, type, place, lang, year, length = get_detail_data(
                movie_html)

            movie_data = [name1, name2, score, comment, quote_str,
                          page_url, director, actor, type, place, lang, year, length]

            # store into the database
            db_store(movie_data)

            # print(movie_data)

            movies_data.append(movie_data)

    csv_store()
    movies_data = pd.DataFrame(movies_data, columns=[
        '中文名', '外文名', '评分', '评价人数', '电影语录', '详情URL', '导演', '主演', '类型', '地区', '语言', '上映年份', '时长'])

    return movies_data


url = "https://movie.douban.com/top250?start=0&filter="

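# Top-level run: crawl (or replay) the Top 250 pages, persisting each movie
# via db_store along the way, then dump the combined DataFrame to CSV.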
get_data(url).to_csv("./csv/豆瓣数据.csv")

url_maoyan = 'https://piaofang.maoyan.com/rankings/year'
url_prefix = 'https://www.douban.com/search?cat=1002&q='

def get_search_url(text):
    return url_prefix + text

def get_data_douban(name : str, html : str):

    movies = lxml.html.fromstring(html)
    score = movies.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div[2]/div/div/span[2]/text()')[0]
    comment = movies.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div[2]/div/div/span[3]/text()')[0]
    comment = int(comment[1:-4])
    h3 = movies.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div[2]/div/h3')
    h3 = tostring(h3[0], encoding="utf-8").decode("utf-8")
    page_url = re.findall(r'
--------------------------------------------------------------------------------
/src/predict.ipynb:
--------------------------------------------------------------------------------
     ]
    },
    "metadata": {},
    "output_type": "display_data"
   }
  ],
  "source": [
   "from sklearn.linear_model import LinearRegression \n",
   "\n",
   "\n",
   "sql = 'select substr(date,1,4) year, avg(money) count \\\n",
   "      from type t left outer join maoyan_movie_type mt on t.id = mt.type_id join maoyan_movie m on m.id = mt.maoyan_id\\\n",
   "      group by year\\\n",
   "      order by year\\\n",
   "      limit 1, 14'\n",
   "x, y = execute_sql(sql, 2)\n",
   "y = [float(i) for i in y]\n",
   "x = np.array(x,dtype=int)\n",
   "y = np.array(y,dtype=float)\n",
   "\n",
   "reg = LinearRegression().fit(x.reshape(-1, 1), y.reshape(-1, 1))\n",
   "\n",
   "\n",
   "plt.figure(figsize=(12, 3.5))\n",
   "plt.scatter(x, y, color='black')\n",
   "plt.plot(x, reg.predict(x.reshape(-1, 1)), color='red', linewidth=1)\n",
   "plt.show()\n",
   "\n",
   "\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 7,
  "metadata": {},
  "outputs": [],
  "source": [
   "def predict(movie):\n",
   "    movie = binary_rep(movie)\n",
   "    data = pd.read_csv('../csv/binary_data.csv')\n",
   "    \n",
   "    \n",
   "    res = []\n",
   "    for item in data.values:\n",
   "        type, actor, director = list(map(lambda x : eval(x), [item[1], item[2], item[3]]))\n",
   "        month, money, year, name = item[4], int(item[5]), int(item[6]), item[7]\n",
   "        res.append((similarity(movie, [type, actor, director, month]), money, year, name))\n",
   "    \n",
   "    res.sort(key=lambda x : -x[0])\n",
   "    \n",
   "    predict_val = 0\n",
   "    cnt = 0\n",
   "    for i in range(5):\n",
   "        if res[i][0] <= 2:\n",
   "            break\n",
   "        cnt += 1\n",
   "        predict_val += res[i][1] * reg.predict([[2023]]) / reg.predict([[res[i][2]]])\n",
   "        \n",
   "        print(res[i])\n",
   "    \n",
   "    return predict_val[0][0] / cnt\n"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 8,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "(4.6808694950739635, 406733, 2022, '长津湖之水门桥')\n",
     "(3.846339605633867, 577534, 2021, '长津湖')\n",
     "(3.616970526244077, 165207, 2017, '西游伏妖篇')\n",
     "(3.254676920420864, 54469, 2015, '战狼')\n",
     "(3.2450937914128564, 88348, 2014, '智取威虎山')\n"
    ]
   },
   {
    "data": {
     "text/plain": [
      "306848.3916073195"
     ]
    },
    "execution_count": 8,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "movie = [\n",
   "    ['剧情', '历史'],\n",
   "    ['吴京', '包贝尔', '易烊千玺', '邓超', '欧豪', '雷佳音', '郭京飞'],\n",
   "    ['徐克', '吴京'],\n",
   "    2\n",
   "]\n",
   "\n",
   "predict(movie)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 18,
  "metadata": {},
  "outputs": [
   {
    "name": "stdout",
    "output_type": "stream",
    "text": [
     "(8.941589026166287, 194190, 2017, '前任3:再见前任')\n",
     "(3.0211400439542415, 58861, 2014, '匆匆那年')\n",
     "(2.668806447200494, 71902, 2013, '致我们终将逝去的青春')\n"
    ]
   },
   {
    "data": {
     "text/plain": [
      "169230.68515821127"
     ]
    },
    "execution_count": 18,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "movie = [\n",
   "    ['爱情', '喜剧'],\n",
   "    ['韩庚', '郑恺', '于文文', '刘雅瑟', '张天爱'],\n",
   "    ['田羽生'],\n",
   "    10\n",
   "]\n",
   "\n",
   "predict(movie)"
  ]
 }
],
"metadata": {
 "kernelspec": {
  "display_name": "base",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "codemirror_mode": {
   "name": "ipython",
   "version": 3
  },
  "file_extension": ".py",
  "mimetype": "text/x-python",
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
  "version": "3.9.13"
 },
 "orig_nbformat": 4,
 "vscode": {
  "interpreter": {
   "hash": "6a018d3a992d78c184ccc94ab54168c7b26325ed3c2283926339c3edbf5487e7"
  }
 }
},
"nbformat": 4,
"nbformat_minor": 2
}
--------------------------------------------------------------------------------