├── README.pdf
├── .gitattributes
├── result
│   ├── db.png
│   ├── 预测1.png
│   ├── 预测2.png
│   ├── output.png
│   ├── p (1).png
│   ├── p (10).png
│   ├── p (11).png
│   ├── p (12).png
│   ├── p (13).png
│   ├── p (14).png
│   ├── p (15).png
│   ├── p (16).png
│   ├── p (17).png
│   ├── p (18).png
│   ├── p (19).png
│   ├── p (2).png
│   ├── p (3).png
│   ├── p (4).png
│   ├── p (5).png
│   ├── p (6).png
│   ├── p (7).png
│   ├── p (8).png
│   ├── p (9).png
│   └── db_struct.png
├── .gitignore
├── src
│   ├── movie_basic.py
│   ├── movie_detail.py
│   ├── unit.py
│   ├── attachfile.py
│   ├── main.py
│   ├── database.py
│   └── predict.ipynb
└── README.md
/README.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/README.pdf
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/result/db.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/db.png
--------------------------------------------------------------------------------
/result/预测1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/预测1.png
--------------------------------------------------------------------------------
/result/预测2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/预测2.png
--------------------------------------------------------------------------------
/result/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/output.png
--------------------------------------------------------------------------------
/result/p (1).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (1).png
--------------------------------------------------------------------------------
/result/p (10).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (10).png
--------------------------------------------------------------------------------
/result/p (11).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (11).png
--------------------------------------------------------------------------------
/result/p (12).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (12).png
--------------------------------------------------------------------------------
/result/p (13).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (13).png
--------------------------------------------------------------------------------
/result/p (14).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (14).png
--------------------------------------------------------------------------------
/result/p (15).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (15).png
--------------------------------------------------------------------------------
/result/p (16).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (16).png
--------------------------------------------------------------------------------
/result/p (17).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (17).png
--------------------------------------------------------------------------------
/result/p (18).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (18).png
--------------------------------------------------------------------------------
/result/p (19).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (19).png
--------------------------------------------------------------------------------
/result/p (2).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (2).png
--------------------------------------------------------------------------------
/result/p (3).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (3).png
--------------------------------------------------------------------------------
/result/p (4).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (4).png
--------------------------------------------------------------------------------
/result/p (5).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (5).png
--------------------------------------------------------------------------------
/result/p (6).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (6).png
--------------------------------------------------------------------------------
/result/p (7).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (7).png
--------------------------------------------------------------------------------
/result/p (8).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (8).png
--------------------------------------------------------------------------------
/result/p (9).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/p (9).png
--------------------------------------------------------------------------------
/result/db_struct.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/fengxin-zhxx/douban-master/HEAD/result/db_struct.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # python cache
2 | */__pycache__/
3 | *.py[cod]
4 |
5 | #html
6 | html/
7 | *.html
8 | *.htm
9 |
10 | #csv
11 | csv/
12 |
13 | #pycharm
14 | .idea/
15 |
--------------------------------------------------------------------------------
/src/movie_basic.py:
--------------------------------------------------------------------------------
1 | from lxml.html import tostring
2 | import re
3 |
4 |
5 | def get_basic_data(movie):
6 |     # movie info block
7 | movie_info = movie.xpath('div[@class="info"]')[0]
8 | movie_bd = movie_info.xpath('div[@class="bd"]')[0]
9 | movie_credits = movie_bd.xpath('p')[0]
10 | movie_star_info = movie_bd.xpath('div[@class="star"]')[0]
11 | movie_quote = movie_bd.xpath('p[@class="quote"]')
12 |
13 | movie_div = tostring(movie_info, encoding="utf-8").decode("utf-8")
14 |
15 |     # movie titles: primary (Chinese) and alternate
16 |     movie_title = re.findall(
17 |         r'<span class="title">(.*?)</span>', movie_div)
18 | name1 = movie_title[0].strip()
19 | if len(movie_title) > 1:
20 | name2 = "".join(movie_title[1].strip()[1:].split())
21 | else:
22 | name2 = '-'
23 |
24 |     # rating
25 | score = float(movie_star_info.xpath(
26 | 'span[@class="rating_num"]/text()')[0])
27 |
28 |     # number of ratings
29 |     comment = int(movie_star_info.xpath(
30 |         'span')[-1].xpath('text()')[0][:-3])  # strip the trailing "人评价"
31 |
32 |     # one-line quote
33 | if len(movie_quote) > 0:
34 | quote_str = movie_quote[0].xpath('span/text()')[0]
35 | else:
36 | quote_str = ''
37 |
38 |     # detail-page URL
39 |     page_url = re.findall(
40 |         r'<a href="(.*?)"', movie_div)[0]
41 |
42 | return name1, name2, score, comment, quote_str, page_url
43 |
--------------------------------------------------------------------------------
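A quick way to sanity-check `get_basic_data` offline is to feed it a synthetic list-page item. The fragment below is an assumption reconstructed from the xpaths and regexes in the file above (including the reconstructed title/href patterns), not a verbatim douban page, and it assumes the script runs from `src/`:

```python
# Minimal harness for get_basic_data; the HTML mirrors the structure the
# xpaths above expect (div.item > div.info > div.bd > div.star ...).
import lxml.html
from movie_basic import get_basic_data

fragment = '''
<div class="item">
  <div class="info">
    <div class="hd">
      <a href="https://movie.douban.com/subject/1292052/">
        <span class="title">肖申克的救赎</span>
        <span class="title"> / The Shawshank Redemption</span>
      </a>
    </div>
    <div class="bd">
      <p>导演: 弗兰克·德拉邦特</p>
      <div class="star">
        <span class="rating_num">9.7</span>
        <span>2861161人评价</span>
      </div>
      <p class="quote"><span>希望让人自由。</span></p>
    </div>
  </div>
</div>'''

movie = lxml.html.fromstring(fragment)
# -> ('肖申克的救赎', 'TheShawshankRedemption', 9.7, 2861161,
#     '希望让人自由。', 'https://movie.douban.com/subject/1292052/')
print(get_basic_data(movie))
```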
/src/movie_detail.py:
--------------------------------------------------------------------------------
1 | import lxml.html
2 | import re
3 |
4 |
5 | def get_detail_data(movie_html: str):
6 | # print(movie_html)
7 |
8 | _selector = lxml.html.fromstring(movie_html)
9 | movie_page_divs = _selector.xpath('//div[@id="info"]')[0]
10 | # print(movie_page_divs)
11 |
12 | '''
13 |     Example of the text in the #info block:
14 |
15 | 类型: 剧情 / 犯罪
16 | 制片国家/地区: 美国
17 | 语言: 英语
18 | 上映日期: 1994-09-10(多伦多电影节) / 1994-10-14(美国)
19 | 片长: 142分钟
20 | 又名: 月黑高飞(港) / 刺激1995(台) / 地狱诺言 / 铁窗岁月 / 消香克的救赎
21 | IMDb: tt0111161
22 | '''
23 |
24 |     info_str = movie_page_divs.xpath('string(.)').strip()
25 |
26 | type = re.findall(r'类型: (.*)', info_str)[0].split(' / ')
27 | place = re.findall(r'制片国家/地区: (.*)', info_str)[0].split(' / ')
28 | lang = re.findall(r'语言: (.*)', info_str)[0].split(' / ')
29 | year = int(re.findall(r'上映日期: (....)', info_str)[0])
30 | length = int(re.findall(r'片长: (.*?)分钟', info_str)[0])
31 |
32 | # print(type, place, lang, year, length)
33 |
34 | movie_attrs = movie_page_divs.xpath('//span[@class="attrs"]')
35 | director = movie_attrs[0].xpath("a/text()")
36 |     if len(movie_attrs) < 3:
37 |         actor = []  # no cast list on the page
38 |     else:
39 |         actor = movie_attrs[2].xpath("a/text()")
40 | # print(director)
41 | # print(actor)
42 |
43 | return director, actor, type, place, lang, year, length
44 |
--------------------------------------------------------------------------------
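The detail parser is driven entirely by the labelled lines of the `#info` block, so the regexes can be exercised directly against the sample text quoted in the docstring (a synthetic string here, not a live page):

```python
# Each field is pulled out by anchoring on its Chinese label, exactly as in
# get_detail_data above.
import re

info_str = ('类型: 剧情 / 犯罪\n'
            '制片国家/地区: 美国\n'
            '语言: 英语\n'
            '上映日期: 1994-09-10(多伦多电影节) / 1994-10-14(美国)\n'
            '片长: 142分钟')

genres = re.findall(r'类型: (.*)', info_str)[0].split(' / ')  # ['剧情', '犯罪']
year = int(re.findall(r'上映日期: (....)', info_str)[0])      # 1994: first 4 chars of the date
length = int(re.findall(r'片长: (.*?)分钟', info_str)[0])      # 142
print(genres, year, length)
```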
/src/unit.py:
--------------------------------------------------------------------------------
1 | # Scratch script for ad hoc testing of the crawler pieces.
2 | import requests
3 | from main import get_data_douban
4 | from movie_detail import get_detail_data
5 | from attachfile import headers
6 |
7 |
8 | # with open('./html/7号房的礼物.html', 'r') as f:
9 | #     print(get_detail_data(f.read()))
10 |
11 |
12 | # from database import *
13 |
14 | # with open('quotes.txt', 'w+') as f:
15 | #     sql = 'select quote from movie'
16 | #     quotes = execute_sql(sql, 1)
17 | #     # print(quotes[0])
18 | #     for quote in quotes[0]:
19 | #         f.write(quote)
20 |
21 |
22 | def get_html(url: str) -> str:
23 |     return requests.get(url, headers=headers).content.decode(encoding='utf-8')
24 |
25 |
26 | url = 'https://www.douban.com/search?q=满江红'
27 |
28 | html = get_html(url)
29 |
30 | with open('manjianghong.html', 'w+', encoding='utf-8') as f:
31 |     f.write(html)
32 |
33 | # with open('manjianghong.html', 'r', encoding='utf-8') as f:
34 | #     html = f.read()
35 |
36 | # print(html)
37 |
38 | # get_data_douban expects the page HTML, not the URL
39 | get_data_douban('满江红', html)
40 |
--------------------------------------------------------------------------------
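`get_html` here (and in main.py) hard-codes utf-8 and ignores HTTP errors; a slightly hardened variant might look like the sketch below. This is an assumption about how one could firm it up, not code from the repo:

```python
# Hardened fetch: fail on 4xx/5xx, honor the server's declared encoding,
# and bound the wait with a timeout.
import requests
from attachfile import headers

def get_html_checked(url: str) -> str:
    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()                       # surface HTTP errors early
    resp.encoding = resp.apparent_encoding or 'utf-8'
    return resp.text
```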
/README.md:
--------------------------------------------------------------------------------
1 | # douban-master
2 |
3 | ## Features
4 |
5 | Data collection: a crawler scrapes movie data such as ratings and box office from the Douban Top 250 list and the Maoyan yearly box-office ranking.
6 |
7 | Data persistence: implemented two ways, as CSV files built from pandas DataFrames and as tables in a MySQL relational database.
8 |
9 | Visual analysis: relationships selected from the persisted data are explored with charts.
10 |
11 | Box-office prediction: based on conclusions from the visual analysis, factors likely to influence box office are selected and a prediction model is built and demonstrated.
12 |
13 | ---
14 | ## File structure
15 |
16 | | File | Description |
17 | | -------------------------- | ------------------------------------------------ |
18 | | main.py | entry point for crawling and persistence |
19 | | movie_basic.py | scrapes the Douban Top 250 list pages |
20 | | movie_detail.py | scrapes each movie's Douban detail page |
21 | | database.py | database connection, storage, and query helpers |
22 | | attachfile.py | static data, e.g. request headers |
23 | | visualization_sql.ipynb | visualization, data fetched via SQL queries |
24 | | visualization_pandas.ipynb | visualization, data aggregated with pandas |
25 | | predict.ipynb | box-office prediction model and example runs |
26 | | /html | scraped HTML files |
27 | | /csv | DataFrame data processed with pandas |
28 | | /result | visualization output, database screenshots, etc. |
29 |
30 | ---
31 |
32 | ## Tech stack
33 |
34 | Crawling and data processing: requests, lxml, re, pandas
35 |
36 | Persistence: pymysql, pandas, MySQL
37 |
38 | Data cleaning: pandas, MySQL (not actually done)
39 |
40 | Visual analysis: pyecharts, matplotlib, SQL, pandas
41 |
42 | Prediction: sklearn, numpy, matplotlib
43 |
44 | ---
45 | ## Visualization examples
46 | (chart images: result/p (1).png through result/p (19).png)
47 |
48 | ---
49 | ## Box-office prediction examples
50 | Unit: 10,000 CNY (万元)
51 |
52 | ![预测1](result/预测1.png)
53 |
54 | ![预测2](result/预测2.png)
55 |
--------------------------------------------------------------------------------
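database.py appears in the tree but its contents are not included in this dump, so the `db_store`/`csv_store` helpers imported by main.py are unknown. A minimal sketch of the dual persistence path the README describes (MySQL via pymysql plus CSV via pandas), with the connection settings and table layout assumed:

```python
# Hypothetical persistence helpers; credentials, table and column names are
# placeholders, not taken from the repo.
import pandas as pd
import pymysql

def store_movie_row(row: list) -> None:
    conn = pymysql.connect(host='localhost', user='root', password='***',
                           database='douban', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            cur.execute('insert into movie (name, score) values (%s, %s)',
                        (row[0], row[2]))
        conn.commit()
    finally:
        conn.close()

# CSV side: one DataFrame dumped per run, as main.py does with 豆瓣数据.csv.
df = pd.DataFrame([['肖申克的救赎', '-', 9.7]], columns=['中文名', '外文名', '评分'])
df.to_csv('./csv/豆瓣数据.csv')
```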
/src/attachfile.py:
--------------------------------------------------------------------------------
1 | headers = {
2 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:108.0) Gecko/20100101 Firefox/108.0",
3 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
4 | "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
5 | "Upgrade-Insecure-Requests": "1",
6 | "Sec-Fetch-Dest": "document",
7 | "Sec-Fetch-Mode": "navigate",
8 | "Sec-Fetch-Site": "cross-site",
9 | "Sec-Fetch-User": "?1"
10 | }
11 |
12 | name_map = {
13 | 'Singapore Rep.': '新加坡',
14 | 'Dominican Rep.': '多米尼加',
15 | 'Palestine': '巴勒斯坦',
16 | 'Bahamas': '巴哈马',
17 | 'Timor-Leste': '东帝汶',
18 | 'Afghanistan': '阿富汗',
19 | 'Guinea-Bissau': '几内亚比绍',
20 | "Côte d'Ivoire": '科特迪瓦',
21 | 'Siachen Glacier': '锡亚琴冰川',
22 | "Br. Indian Ocean Ter.": '英属印度洋领土',
23 | 'Angola': '安哥拉',
24 | 'Albania': '阿尔巴尼亚',
25 | 'United Arab Emirates': '阿联酋',
26 | 'Argentina': '阿根廷',
27 | 'Armenia': '亚美尼亚',
28 | 'French Southern and Antarctic Lands': '法属南半球和南极领地',
29 | 'Australia': '澳大利亚',
30 | 'Austria': '奥地利',
31 | 'Azerbaijan': '阿塞拜疆',
32 | 'Burundi': '布隆迪',
33 | 'Belgium': '比利时',
34 | 'Benin': '贝宁',
35 | 'Burkina Faso': '布基纳法索',
36 | 'Bangladesh': '孟加拉国',
37 | 'Bulgaria': '保加利亚',
38 | 'The Bahamas': '巴哈马',
39 | 'Bosnia and Herz.': '波斯尼亚和黑塞哥维那',
40 | 'Belarus': '白俄罗斯',
41 | 'Belize': '伯利兹',
42 | 'Bermuda': '百慕大',
43 | 'Bolivia': '玻利维亚',
44 | 'Brazil': '巴西',
45 | 'Brunei': '文莱',
46 | 'Bhutan': '不丹',
47 | 'Botswana': '博茨瓦纳',
48 | 'Central African Rep.': '中非',
49 | 'Canada': '加拿大',
50 | 'Switzerland': '瑞士',
51 | 'Chile': '智利',
52 | 'China': '中国',
53 | 'Ivory Coast': '象牙海岸',
54 | 'Cameroon': '喀麦隆',
55 | 'Dem. Rep. Congo': '刚果民主共和国',
56 | 'Congo': '刚果',
57 | 'Colombia': '哥伦比亚',
58 | 'Costa Rica': '哥斯达黎加',
59 | 'Cuba': '古巴',
60 | 'N. Cyprus': '北塞浦路斯',
61 | 'Cyprus': '塞浦路斯',
62 | 'Czech Rep.': '捷克',
63 | 'Germany': '德国',
64 | 'Djibouti': '吉布提',
65 | 'Denmark': '丹麦',
66 | 'Algeria': '阿尔及利亚',
67 | 'Ecuador': '厄瓜多尔',
68 | 'Egypt': '埃及',
69 | 'Eritrea': '厄立特里亚',
70 | 'Spain': '西班牙',
71 | 'Estonia': '爱沙尼亚',
72 | 'Ethiopia': '埃塞俄比亚',
73 | 'Finland': '芬兰',
74 |     'Fiji': '斐济',
75 | 'Falkland Islands': '福克兰群岛',
76 | 'France': '法国',
77 | 'Gabon': '加蓬',
78 | 'United Kingdom': '英国',
79 | 'Georgia': '格鲁吉亚',
80 | 'Ghana': '加纳',
81 | 'Guinea': '几内亚',
82 | 'Gambia': '冈比亚',
83 | 'Guinea Bissau': '几内亚比绍',
84 | 'Eq. Guinea': '赤道几内亚',
85 | 'Greece': '希腊',
86 | 'Greenland': '格陵兰',
87 | 'Guatemala': '危地马拉',
88 | 'French Guiana': '法属圭亚那',
89 | 'Guyana': '圭亚那',
90 | 'Honduras': '洪都拉斯',
91 | 'Croatia': '克罗地亚',
92 | 'Haiti': '海地',
93 | 'Hungary': '匈牙利',
94 | 'Indonesia': '印度尼西亚',
95 | 'India': '印度',
96 | 'Ireland': '爱尔兰',
97 | 'Iran': '伊朗',
98 | 'Iraq': '伊拉克',
99 | 'Iceland': '冰岛',
100 | 'Israel': '以色列',
101 | 'Italy': '意大利',
102 | 'Jamaica': '牙买加',
103 | 'Jordan': '约旦',
104 | 'Japan': '日本',
105 | 'Kazakhstan': '哈萨克斯坦',
106 | 'Kenya': '肯尼亚',
107 | 'Kyrgyzstan': '吉尔吉斯斯坦',
108 | 'Cambodia': '柬埔寨',
109 | 'Korea': '韩国',
110 | 'Kosovo': '科索沃',
111 | 'Kuwait': '科威特',
112 | 'Lao PDR': '老挝',
113 | 'Lebanon': '黎巴嫩',
114 | 'Liberia': '利比里亚',
115 | 'Libya': '利比亚',
116 | 'Sri Lanka': '斯里兰卡',
117 | 'Lesotho': '莱索托',
118 | 'Lithuania': '立陶宛',
119 | 'Luxembourg': '卢森堡',
120 | 'Latvia': '拉脱维亚',
121 | 'Morocco': '摩洛哥',
122 | 'Moldova': '摩尔多瓦',
123 | 'Madagascar': '马达加斯加',
124 | 'Mexico': '墨西哥',
125 | 'Macedonia': '马其顿',
126 | 'Mali': '马里',
127 | 'Myanmar': '缅甸',
128 | 'Montenegro': '黑山',
129 | 'Mongolia': '蒙古',
130 | 'Mozambique': '莫桑比克',
131 | 'Mauritania': '毛里塔尼亚',
132 | 'Malawi': '马拉维',
133 | 'Malaysia': '马来西亚',
134 | 'Namibia': '纳米比亚',
135 | 'New Caledonia': '新喀里多尼亚',
136 | 'Niger': '尼日尔',
137 | 'Nigeria': '尼日利亚',
138 | 'Nicaragua': '尼加拉瓜',
139 | 'Netherlands': '荷兰',
140 | 'Norway': '挪威',
141 | 'Nepal': '尼泊尔',
142 | 'New Zealand': '新西兰',
143 | 'Oman': '阿曼',
144 | 'Pakistan': '巴基斯坦',
145 | 'Panama': '巴拿马',
146 | 'Peru': '秘鲁',
147 | 'Philippines': '菲律宾',
148 | 'Papua New Guinea': '巴布亚新几内亚',
149 | 'Poland': '波兰',
150 | 'Puerto Rico': '波多黎各',
151 | 'Dem. Rep. Korea': '朝鲜',
152 | 'Portugal': '葡萄牙',
153 | 'Paraguay': '巴拉圭',
154 | 'Qatar': '卡塔尔',
155 | 'Romania': '罗马尼亚',
156 | 'Russia': '俄罗斯',
157 | 'Rwanda': '卢旺达',
158 | 'W. Sahara': '西撒哈拉',
159 | 'Saudi Arabia': '沙特阿拉伯',
160 | 'Sudan': '苏丹',
161 | 'S. Sudan': '南苏丹',
162 | 'Senegal': '塞内加尔',
163 | 'Solomon Is.': '所罗门群岛',
164 | 'Sierra Leone': '塞拉利昂',
165 | 'El Salvador': '萨尔瓦多',
166 | 'Somaliland': '索马里兰',
167 | 'Somalia': '索马里',
168 | 'Serbia': '塞尔维亚',
169 | 'Suriname': '苏里南',
170 | 'Slovakia': '斯洛伐克',
171 | 'Slovenia': '斯洛文尼亚',
172 | 'Sweden': '瑞典',
173 | 'Swaziland': '斯威士兰',
174 | 'Syria': '叙利亚',
175 | 'Chad': '乍得',
176 | 'Togo': '多哥',
177 | 'Thailand': '泰国',
178 | 'Tajikistan': '塔吉克斯坦',
179 | 'Turkmenistan': '土库曼斯坦',
180 | 'East Timor': '东帝汶',
181 | 'Trinidad and Tobago': '特里尼达和多巴哥',
182 | 'Tunisia': '突尼斯',
183 | 'Turkey': '土耳其',
184 | 'Tanzania': '坦桑尼亚',
185 | 'Uganda': '乌干达',
186 | 'Ukraine': '乌克兰',
187 | 'Uruguay': '乌拉圭',
188 | 'United States': '美国',
189 | 'Uzbekistan': '乌兹别克斯坦',
190 | 'Venezuela': '委内瑞拉',
191 | 'Vietnam': '越南',
192 | 'Vanuatu': '瓦努阿图',
193 | 'West Bank': '西岸',
194 | 'Yemen': '也门',
195 | 'South Africa': '南非',
196 | 'Zambia': '赞比亚',
197 | 'Zimbabwe': '津巴布韦',
198 | 'Comoros': '科摩罗'
199 | }
--------------------------------------------------------------------------------
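`name_map` translates the English country names used by the world-map chart into Chinese labels. The visualization notebooks that consume it are not in this dump; a minimal sketch of the presumed lookup:

```python
# Translate English region names to Chinese for display, passing unknown
# names through unchanged ('Atlantis' is a made-up placeholder).
from attachfile import name_map

places = ['United States', 'China', 'Korea', 'Atlantis']
labels = [name_map.get(p, p) for p in places]
print(labels)  # ['美国', '中国', '韩国', 'Atlantis']
```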
/src/main.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import lxml.html
3 | import pandas as pd
4 | import time
5 | import os
6 |
7 |
8 | from movie_detail import get_detail_data
9 | from movie_basic import get_basic_data
10 | from database import db_store, db_store_2, csv_store
11 | from lxml.html import tostring
12 | import re
13 | from attachfile import headers
14 |
15 |
16 | # whether to test against locally cached HTML files
17 | local_test = True
18 |
19 |
20 | def get_url(url: str, start: int) -> str:
21 | return url.replace('start=0', 'start=' + str(start))
22 |
23 |
24 | def get_html(url: str) -> str:
25 |     print('sleeping...')
26 |     time.sleep(3)
27 | return requests.get(url, headers=headers).content.decode(encoding='utf-8')
28 |
29 |
30 | def get_data(url: str):
31 | movies_data = []
32 | for i in range(10):
33 | print(i)
34 | start = i * 25
35 |
36 | _url = get_url(url, start=start)
37 |
38 |         if local_test:
39 |             # read the cached local copy
40 |             html = open("./html/from_" + str(start + 1) + ".html", "r", encoding='utf-8').read()
41 |         else:
42 |             # fetch over the network and cache it
43 |             html = get_html(_url)
44 |             time.sleep(0.5)
45 |             with open("./html/from_" + str(start + 1) + ".html", 'w+', encoding='utf-8') as f:
46 |                 f.write(html)
47 |
48 | selector = lxml.html.fromstring(html)
49 | movie_divs = selector.xpath('//div[@class="item"]')
50 |
51 | for movie in movie_divs:
52 | name1, name2, score, comment, quote_str, page_url = get_basic_data(
53 | movie)
54 |
55 |             # scrape the detail page
56 |             if local_test:
57 |                 # read the cached local copy
58 |                 movie_html = open("./html/" + name1 + ".html", "r", encoding='utf-8').read()
59 |             else:
60 |                 # fetch over the network and cache it
61 |                 movie_html = get_html(page_url)
62 |                 time.sleep(0.5)
63 |                 with open("./html/" + name1 + ".html", 'w+', encoding='utf-8') as f:
64 |                     f.write(movie_html)
65 |
66 |             # parse the detail fields
67 | director, actor, type, place, lang, year, length = get_detail_data(
68 | movie_html)
69 |
70 | movie_data = [name1, name2, score, comment, quote_str,
71 | page_url, director, actor, type, place, lang, year, length]
72 |
73 |             # store into the database
74 | db_store(movie_data)
75 |
76 | # print(movie_data)
77 |
78 | movies_data.append(movie_data)
79 |
80 | csv_store()
81 | movies_data = pd.DataFrame(movies_data, columns=[
82 | '中文名', '外文名', '评分', '评价人数', '电影语录', '详情URL', '导演', '主演', '类型', '地区', '语言', '上映年份', '时长'])
83 |
84 | return movies_data
85 |
86 |
87 | url = "https://movie.douban.com/top250?start=0&filter="
88 |
89 |
90 | get_data(url).to_csv("./csv/豆瓣数据.csv")
91 |
92 | url_maoyan = 'https://piaofang.maoyan.com/rankings/year'
93 | url_prefix = 'https://www.douban.com/search?cat=1002&q='
94 |
95 | def get_search_url(text):
96 | return url_prefix + text
97 |
98 | def get_data_douban(name : str, html : str):
99 |
100 | movies = lxml.html.fromstring(html)
101 | score = movies.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div[2]/div/div/span[2]/text()')[0]
102 | comment = movies.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div[2]/div/div/span[3]/text()')[0]
103 | comment = int(comment[1:-4])
104 | h3 = movies.xpath('/html/body/div[3]/div[1]/div/div[1]/div[3]/div[2]/div[1]/div[2]/div/h3')
105 | h3 = tostring(h3[0], encoding="utf-8").decode("utf-8")
106 |     page_url = re.findall(r'<a href="(.*?)"', h3)[0]
--------------------------------------------------------------------------------
/src/predict.ipynb:
--------------------------------------------------------------------------------
131 | ]
132 | },
133 | "metadata": {},
134 | "output_type": "display_data"
135 | }
136 | ],
137 | "source": [
138 | "from sklearn.linear_model import LinearRegression \n",
139 | "\n",
140 | "\n",
141 | "sql = 'select substr(date,1,4) year, avg(money) count \\\n",
142 | " from type t left outer join maoyan_movie_type mt on t.id = mt.type_id join maoyan_movie m on m.id = mt.maoyan_id\\\n",
143 | " group by year\\\n",
144 | " order by year\\\n",
145 | " limit 1, 14'\n",
146 | "x, y = execute_sql(sql, 2)\n",
147 | "y = [float(i) for i in y]\n",
148 | "x = np.array(x,dtype=int)\n",
149 | "y = np.array(y,dtype=float)\n",
150 | "\n",
151 | "reg = LinearRegression().fit(x.reshape(-1, 1), y.reshape(-1, 1))\n",
152 | "\n",
153 | "\n",
154 | "plt.figure(figsize=(12, 3.5))\n",
155 | "plt.scatter(x, y, color='black')\n",
156 | "plt.plot(x, reg.predict(x.reshape(-1, 1)), color='red', linewidth=1)\n",
157 | "plt.show()\n",
158 | "\n",
159 | "\n"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 7,
165 | "metadata": {},
166 | "outputs": [],
167 | "source": [
168 | "def predict(movie):\n",
169 | " movie = binary_rep(movie)\n",
170 | " data = pd.read_csv('../csv/binary_data.csv')\n",
171 | " \n",
172 | " \n",
173 | " res = []\n",
174 | " for item in data.values:\n",
175 | " type, actor, director = list(map(lambda x : eval(x), [item[1], item[2], item[3]]))\n",
176 | " month, money, year, name = item[4], int(item[5]), int(item[6]), item[7]\n",
177 | " res.append((similarity(movie, [type, actor, director, month]), money, year, name))\n",
178 | " \n",
179 | " res.sort(key=lambda x : -x[0])\n",
180 | " \n",
181 | " predict_val = 0\n",
182 | " cnt = 0\n",
183 | " for i in range(5):\n",
184 | " if res[i][0] <= 2:\n",
185 | " break\n",
186 | " cnt += 1\n",
187 | " predict_val += res[i][1] * reg.predict([[2023]]) / reg.predict([[res[i][2]]])\n",
188 | " \n",
189 | " print(res[i])\n",
190 | " \n",
191 | " return predict_val[0][0] / cnt\n"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 8,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "name": "stdout",
201 | "output_type": "stream",
202 | "text": [
203 | "(4.6808694950739635, 406733, 2022, '长津湖之水门桥')\n",
204 | "(3.846339605633867, 577534, 2021, '长津湖')\n",
205 | "(3.616970526244077, 165207, 2017, '西游伏妖篇')\n",
206 | "(3.254676920420864, 54469, 2015, '战狼')\n",
207 | "(3.2450937914128564, 88348, 2014, '智取威虎山')\n"
208 | ]
209 | },
210 | {
211 | "data": {
212 | "text/plain": [
213 | "306848.3916073195"
214 | ]
215 | },
216 | "execution_count": 8,
217 | "metadata": {},
218 | "output_type": "execute_result"
219 | }
220 | ],
221 | "source": [
222 | "movie = [\n",
223 | " ['剧情', '历史'],\n",
224 | " ['吴京', '包贝尔', '易烊千玺', '邓超', '欧豪', '雷佳音', '郭京飞'],\n",
225 | " ['徐克', '吴京'],\n",
226 | " 2\n",
227 | "]\n",
228 | "\n",
229 | "predict(movie)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 18,
235 | "metadata": {},
236 | "outputs": [
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | "(8.941589026166287, 194190, 2017, '前任3:再见前任')\n",
242 | "(3.0211400439542415, 58861, 2014, '匆匆那年')\n",
243 | "(2.668806447200494, 71902, 2013, '致我们终将逝去的青春')\n"
244 | ]
245 | },
246 | {
247 | "data": {
248 | "text/plain": [
249 | "169230.68515821127"
250 | ]
251 | },
252 | "execution_count": 18,
253 | "metadata": {},
254 | "output_type": "execute_result"
255 | }
256 | ],
257 | "source": [
258 | "movie = [\n",
259 | " ['爱情', '喜剧'],\n",
260 | " ['韩庚', '郑恺', '于文文', '刘雅瑟', '张天爱'],\n",
261 | " ['田羽生'],\n",
262 | " 10\n",
263 | "]\n",
264 | "\n",
265 | "predict(movie)"
266 | ]
267 | }
268 | ],
269 | "metadata": {
270 | "kernelspec": {
271 | "display_name": "base",
272 | "language": "python",
273 | "name": "python3"
274 | },
275 | "language_info": {
276 | "codemirror_mode": {
277 | "name": "ipython",
278 | "version": 3
279 | },
280 | "file_extension": ".py",
281 | "mimetype": "text/x-python",
282 | "name": "python",
283 | "nbconvert_exporter": "python",
284 | "pygments_lexer": "ipython3",
285 | "version": "3.9.13"
286 | },
287 | "orig_nbformat": 4,
288 | "vscode": {
289 | "interpreter": {
290 | "hash": "6a018d3a992d78c184ccc94ab54168c7b26325ed3c2283926339c3edbf5487e7"
291 | }
292 | }
293 | },
294 | "nbformat": 4,
295 | "nbformat_minor": 2
296 | }
297 |
--------------------------------------------------------------------------------
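The notebook's `predict()` combines two pieces: a similarity ranking over historical movies (via `binary_rep`/`similarity`, defined in cells missing from this dump) and the yearly linear trend `reg`, which is used to inflate each similar movie's gross into target-year terms before averaging. The scaling step in isolation, with made-up numbers:

```python
# Trend-adjusted averaging as in predict(): scale each historical gross by
# reg.predict(target_year) / reg.predict(its_year), then average. All figures
# below are invented for illustration (unit: 万元).
import numpy as np
from sklearn.linear_model import LinearRegression

years = np.array([2010, 2013, 2016, 2019, 2022]).reshape(-1, 1)
avg_gross = np.array([50_000, 80_000, 120_000, 160_000, 150_000])
reg = LinearRegression().fit(years, avg_gross)

similar = [(406733, 2022), (577534, 2021)]   # (gross, year) of similar movies
target = 2023
scaled = [g * reg.predict([[target]])[0] / reg.predict([[y]])[0]
          for g, y in similar]
print(sum(scaled) / len(scaled))             # the forecast
```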