├── .idea
│   ├── .gitignore
│   ├── dewuSpider.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── app
│   ├── actual_query.py
│   ├── configUtil.py
│   ├── data_analysis
│   │   ├── analysis.py
│   │   ├── analysis_executor.py
│   │   └── generate_reports.py
│   ├── db
│   │   └── my_sql_db.py
│   ├── de_wu_spider.py
│   ├── decorator
│   │   └── decorator.py
│   ├── log.py
│   └── util
│       ├── pyplot_util.py
│       └── zhi_ma_ip.py
├── config.yaml
├── developer.py
├── main.py
├── readme.md
└── requirements.txt
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /workspace.xml
--------------------------------------------------------------------------------
/app/actual_query.py:
--------------------------------------------------------------------------------
1 | from flask import Flask
2 |
3 | app = Flask(__name__)
4 |
5 |
6 | @app.route('/hello')
7 | def hello():
8 |     return 'Hello world!'
9 |
10 |
11 | if __name__ == '__main__':
12 | app.run()
13 |
--------------------------------------------------------------------------------
/app/configUtil.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import random
3 | import os
4 |
5 | from app.log import Logger
6 |
7 | log = Logger().log()
8 |
9 |
10 | class ConfigUtil:
11 | def __init__(self):
12 | cur_path = os.path.dirname(__file__)
13 | self.configPath = cur_path + "/config.yaml"
14 |
15 |     def readYaml(self):
16 |         # read config from the yaml document
17 |         # use a local variable instead of a module-level global, and close the file via `with`
18 |         configData = None
19 |         try:
20 |             with open(self.configPath, 'r', encoding='UTF-8') as f:
21 |                 configData = yaml.load(f, Loader=yaml.FullLoader)
22 |         except IOError:
23 |             log.error('open config failed')
24 |         return configData
25 |
26 | def getValue(self, key):
27 | return self.readYaml()[key]
28 |
29 | def randomGetUserAgent(self):
30 | return random.choice(self.readYaml()['User_Agents'])
31 |
32 |
33 | if __name__ == '__main__':
34 | curPath = os.path.dirname(__file__)
35 | print(curPath)
36 |
--------------------------------------------------------------------------------
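
A minimal usage sketch of ConfigUtil (illustration only, not a file in the repository). It reads config.yaml next to the app module; note that randomGetUserAgent expects a User_Agents list, which the config.yaml shown below does not define and would have to be added first:

    from app.configUtil import ConfigUtil

    config = ConfigUtil()
    db_settings = config.getValue('db')    # dict with host, username, password, port, dbName, charset
    img_dir = config.getValue('img_path')  # folder where the spider saves product images
    # only works after a User_Agents list has been added to config.yaml
    user_agent = config.randomGetUserAgent()
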
/app/data_analysis/analysis.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import matplotlib.pyplot as plt
3 | import os
4 | import time
5 |
6 | from app.configUtil import ConfigUtil
7 | from decimal import Decimal
8 | from app.log import Logger
9 |
10 |
11 | class Analysis:
12 | def __init__(self, article_number, db, _type="one_month"):
13 | self.db = db
14 | self.engine = self.db.getEngine()
15 | self.log = Logger().logger
16 | self.article_number = article_number
17 | self.type = _type
18 | # 保存图片文件夹
19 | self.save_img_path = ConfigUtil().getValue("analysis_img_path") + self.article_number
20 | if not os.path.exists(self.save_img_path):
21 | os.makedirs(self.save_img_path)
22 | # 查询数据
23 | sql = f"SELECT * FROM org_purchase_record WHERE article_number = '{article_number}'"
24 | if _type == "one_month":
25 |             sql += " AND format_time >= DATE_SUB(DATE_FORMAT(NOW(), '%Y-%m-%d'), INTERVAL 30 DAY)"
26 |         elif _type == "three_month":
27 |             sql += " AND format_time >= DATE_SUB(DATE_FORMAT(NOW(), '%Y-%m-%d'), INTERVAL 91 DAY)"
28 | self.all_data = pd.read_sql_query(sql, self.engine)
29 | # 删除求购
30 | self.data = self.all_data.drop(self.all_data[self.all_data.order_sub_type_name == "求购"].index)
31 | # 获取求购数据
32 | self.ask_to_buy = self.all_data[self.all_data["order_sub_type_name"] == "求购"]
33 | # 获取时间
34 | self.date = self.data.format_time.drop_duplicates().sort_values(ascending=False).values
35 | # 获取尺码
36 | self.size = self.data.properties_values.drop_duplicates().sort_values(ascending=False).values
37 |
38 | # 图片属性
39 | self.image_wide = 15
40 | self.image_high = 10
41 | self.title_fontsize = 16
42 | self.label_fontsize = 14
43 |
44 | def get_price_volume(self, chart_type="日期"):
45 | """
46 | 统计价格-销量
47 | :return:
48 | """
49 |         # 根据时间分组的价格
50 | price = self.data.groupby('format_time' if chart_type == "日期" else 'properties_values')['price'].mean()
51 | counts = self.data.groupby('format_time' if chart_type == "日期" else 'properties_values')['price'].count().values
52 |
53 | # 绘图
54 | fig = plt.figure(figsize=(self.image_wide, self.image_high))
55 | p = fig.add_subplot(111)
56 | p.set_title(f"{chart_type}-价格-销量趋势图", fontsize=self.label_fontsize)
57 | p.set_ylabel("价格", fontsize=self.label_fontsize)
58 | p.set_xlabel(chart_type, fontsize=self.label_fontsize)
59 | p.plot(self.date if chart_type == "日期" else self.size, price)
60 | c = p.twinx()
61 | c.set_ylabel("交易量", fontsize=self.label_fontsize)
62 | ca = c.bar(self.date if chart_type == "日期" else self.size, counts, alpha=0.3)
63 | fig.legend(['平均价格', '交易量'])
64 | # self.__auto_text(ca)
65 |
66 | # 修改保存路径
67 | img_path = f"/date_price_volume_{self.type}.jpg" if chart_type == "日期" \
68 | else f"/size_price_volume_{self.type}.jpg"
69 | fig.savefig(self.save_img_path + img_path)
70 | plt.close(fig)
71 |
72 | def get_user_repeat(self):
73 | """
74 | 统计用户重复率
75 | :return:
76 | """
77 | # 根据用户名分组的重复率
78 | users = self.data.groupby('user_name')['price'].count().reset_index(name='count')\
79 | .sort_values('count', ascending=False).head(10)
80 | user_list = users.user_name.tolist()
81 | count_list = users['count'].tolist()
82 |
83 | # 绘图
84 | fig = plt.figure(figsize=(self.image_wide, self.image_high))
85 | user_repeat_plt = fig.add_subplot(111)
86 | user_repeat_plt.set_title("用户重复购买数量", fontsize=self.title_fontsize)
87 | user_repeat_plt.set_xlabel("用户名称", fontsize=self.label_fontsize)
88 | user_repeat_plt.set_ylabel("数量", fontsize=self.label_fontsize)
89 | user_repeat_plt.bar(user_list, count_list)
90 | fig.savefig(self.save_img_path + f"/user_repeat_{self.type}.jpg")
91 | plt.close(fig)
92 |
93 | def get_repeat_num(self):
94 | """
95 | 生成交易量重复图
96 | :return:
97 | """
98 | # 计算交易数量
99 | counts = self.data.user_name.value_counts()
100 | two_count = 0
101 | three_count = 0
102 | four_count = 0
103 | for count in counts:
104 | if count > 1:
105 | two_count += count
106 | if count > 2:
107 | three_count += count
108 | if count > 3:
109 | four_count += count
110 | # 绘图
111 | index_list = ["大于两次", "大于三次", "大于四次"]
112 | data = [two_count, three_count, four_count]
113 | fig = plt.figure(figsize=(self.image_wide, self.image_high))
114 | repeat_plt = fig.add_subplot(111)
115 |         repeat_plt.set_title("重复交易量视图", fontsize=self.title_fontsize)
116 | repeat_plt.set_xlabel("重复频率", fontsize=self.label_fontsize)
117 | repeat_plt.set_ylabel("重复交易数量", fontsize=self.label_fontsize)
118 | res = repeat_plt.bar(index_list, data)
119 | # self.__auto_text(res)
120 | fig.savefig(self.save_img_path + f"/repeat_num_{self.type}.jpg")
121 | plt.close(fig)
122 |
123 | def analysis_info(self):
124 | """
125 | 获取基础分析数据
126 | :return:
127 | """
128 | if len(self.data) == 0:
129 | return {
130 | "r_size": "",
131 | "max_price": 0,
132 | "min_price": 0,
133 | "avg_price": 0,
134 | "all_volume": 0,
135 | "premium": 0
136 | }
137 |
138 | # 获取数据
139 | recommended_size = self.data.groupby("properties_values")["id"].count().reset_index(name="count") \
140 | .sort_values("count", ascending=False).head(3).properties_values.values
141 | r_size = ",".join(recommended_size)
142 |
143 | # 基础属性
144 | max_price = self.data.price.max()
145 | min_price = self.data.price.min()
146 | avg_price = round(self.data.price.mean(), 2)
147 | all_volume = self.data.shape[0]
148 |
149 | # 计算溢价
150 | auth_price = self.db.getOne(f"SELECT auth_price FROM org_detail WHERE article_number = '{self.article_number}'")[0]
151 | premium = round((Decimal(avg_price) - auth_price) / auth_price * 100, 2)
152 | return {
153 | "r_size": r_size,
154 | "max_price": max_price,
155 | "min_price": min_price,
156 | "avg_price": avg_price,
157 | "all_volume": all_volume,
158 | "premium": premium
159 | }
160 |
161 | def update_info(self):
162 | """
163 | 更新基础属性
164 | :return:
165 | """
166 | an_info = self.analysis_info()
167 | # 持久化
168 | query_info_sql = f"SELECT * FROM org_data_analysis_info WHERE article_number = '{self.article_number}'"
169 | info = self.db.getOne(query_info_sql)
170 | if info:
171 | # 更新
172 | update_sql = f"UPDATE org_data_analysis_info " \
173 | f"SET max_price = {an_info['max_price']}, avg_price = {an_info['avg_price']}," \
174 | f"min_price = {an_info['min_price']}, premium = {an_info['premium']}," \
175 | f"all_volume = {an_info['all_volume']},recommended_size = '{an_info['r_size']}'," \
176 | f"update_time = '{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}' " \
177 | f"WHERE article_number = '{self.article_number}'"
178 | self.db.executeSql(update_sql)
179 | else:
180 | # 插入
181 | info = (
182 | None,
183 | self.article_number,
184 | 0,
185 | an_info['max_price'],
186 | an_info['avg_price'],
187 | an_info['min_price'],
188 | an_info['premium'],
189 | an_info['all_volume'],
190 | an_info['r_size'],
191 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
192 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
193 | )
194 | insert_sql = "INSERT INTO org_data_analysis_info VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
195 | self.db.insertData(sql=insert_sql, data=info)
196 |
197 | def get_ask_to_buy(self):
198 | """
199 | 绘制求购数据
200 | :return:
201 | """
202 | # 获取数据
203 | size = self.ask_to_buy.groupby("properties_values")["id"].count()
204 | date = self.ask_to_buy.groupby("format_time")["id"].count()
205 |
206 | size_index = size.index.values
207 | date_index = pd.to_datetime(date.index.values, format="%Y-%m-%d")
208 |
209 | size_data = size.values
210 | date_data = date.values
211 |
212 | # 绘图
213 | fig = plt.figure(figsize=(self.image_wide, self.image_high))
214 | size_plt = fig.add_subplot(211)
215 | size_plt.set_title("尺码-求购量图", fontsize=self.title_fontsize)
216 | size_plt.set_ylabel("求购量", fontsize=self.label_fontsize)
217 | size_plt.set_xlabel("尺码", fontsize=self.label_fontsize)
218 | size_plt.bar(size_index, size_data)
219 |
220 | date_plt = fig.add_subplot(212)
221 | date_plt.set_title("日期-求购量图", fontsize=self.title_fontsize)
222 | date_plt.set_ylabel("求购量", fontsize=self.label_fontsize)
223 | date_plt.set_xlabel("日期", fontsize=self.label_fontsize)
224 | date_plt.bar(date_index, date_data)
225 | fig.savefig(self.save_img_path + f"/ask_to_buy_{self.type}.jpg")
226 | plt.close(fig)
227 |
228 | def run_analysis(self):
229 | """
230 | 数据分析
231 | :return:
232 | """
233 | if self.type == "one_month":
234 | # 修改信息
235 | self.log.info(f"正在更新【{self.article_number}】交易信息")
236 | self.update_info()
237 | self.log.info(f"【{self.article_number}】交易信息更新完成")
238 |
239 | count = len(self.data)
240 | if count > 0:
241 | # 生成日期价格图
242 | self.log.info(f"正在生成【{self.article_number}】日期价格图")
243 | self.get_price_volume()
244 | self.log.info(f"【{self.article_number}】日期价格图生成完毕")
245 |
246 | # 生成尺码价格图
247 | self.log.info(f"正在生成【{self.article_number}】尺码价格图")
248 | self.get_price_volume(chart_type="尺码")
249 | self.log.info(f"【{self.article_number}】尺码价格图生成完毕")
250 |
251 | # 生成求购图
252 | self.log.info(f"正在生成【{self.article_number}】求购图")
253 | self.get_ask_to_buy()
254 | self.log.info(f"【{self.article_number}】求购图生成完毕")
255 |
256 | # 生成推荐尺码移动平均线图
257 | self.log.info(f"正在生成【{self.article_number}】SMV图")
258 | self.get_ma()
259 | self.log.info(f"【{self.article_number}】SMV图生成完毕")
260 |
261 | # 生成交易量重复图
262 | self.log.info(f"正在生成【{self.article_number}】交易量重复图")
263 | self.get_repeat_num()
264 | self.log.info(f"【{self.article_number}】交易量重复图生成完毕")
265 |
266 | # 生成用户交易量重复图
267 | self.log.info(f"正在生成【{self.article_number}】用户交易量重复图")
268 | self.get_user_repeat()
269 | self.log.info(f"【{self.article_number}】用户交易量重复图生成完毕")
270 |
271 | def __get_recommended_data(self):
272 | """
273 | 获取推荐尺码所有的数据
274 | :return:
275 | """
276 | recommended_size = self.data.groupby("properties_values")["id"].count().reset_index(name="count") \
277 | .sort_values("count", ascending=False).head(3).properties_values.values.tolist()
278 | # 删除非推荐尺码数据
279 | recommended_data = self.data.drop(self.data[(self.data.properties_values != recommended_size[0])
280 | & (self.data.properties_values != recommended_size[1])
281 | & (self.data.properties_values != recommended_size[2])].index)
282 | return recommended_data
283 |
284 | def get_ma(self):
285 | """
286 | 绘制移动平均线
287 | :return:
288 | """
289 | # 获取推荐尺码数据
290 | recommended_data = self.__get_recommended_data()
291 | data = recommended_data.groupby("format_time")
292 |
293 | # 处理数据
294 | date_list = []
295 | avg_list = []
296 | price_list = []
297 | for index, d in data:
298 | one_day = self.__analysis_one_day(d)
299 |             date_list.append(pd.to_datetime(index, format="%Y-%m-%d"))
300 | avg_list.append(one_day['avg_price'])
301 | price_list.append(one_day['close_price'])
302 |
303 | # 绘图
304 | fig = plt.figure(figsize=(self.image_wide, self.image_high))
305 | plot = fig.add_subplot(111)
306 | plot.set_title("推荐尺码SMA(Simple Moving Average)", fontsize=self.title_fontsize)
307 | plot.set_xlabel("日期", fontsize=self.label_fontsize)
308 | plot.set_ylabel("价格", fontsize=self.label_fontsize)
309 | plot.plot(date_list, avg_list, label="平均线", color="#F08080")
310 | plot.plot(date_list, price_list, label="价格线", color="#DB7093", linestyle="--")
311 | plot.legend()
312 | plot.grid(alpha=0.4, linestyle=':')
313 | fig.savefig(self.save_img_path + f"/ma_{self.type}.jpg")
314 | plt.close(fig)
315 |
316 | def get_k_line(self):
317 | """
318 | 绘制k线图
319 | :return:
320 | """
321 | pass
322 |
323 | @staticmethod
324 | def __auto_text(rects):
325 | for rect in rects:
326 | plt.text(rect.get_x(), rect.get_height(), rect.get_height(), va='bottom')
327 |
328 | @staticmethod
329 | def __analysis_one_day(data):
330 | """
331 | 分析一天的数据
332 | :param data:
333 | :return:
334 | """
335 | max_price = data.price.max()
336 | min_price = data.price.min()
337 | avg_price = round(data.price.mean(), 2)
338 | open_price = data.price.iloc[0]
339 | close_price = data.price.iloc[data.shape[0] - 1]
340 | return {
341 | "max_price": max_price,
342 | "min_price": min_price,
343 | "avg_price": avg_price,
344 | "open_price": open_price,
345 | "close_price": close_price
346 | }
--------------------------------------------------------------------------------
/app/data_analysis/analysis_executor.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from app.data_analysis.analysis import Analysis
4 | from app.data_analysis.generate_reports import GenerateReports
5 | from app.log import Logger
6 | from app.db.my_sql_db import MySqlDb
7 | from app.decorator.decorator import error_repeat
8 |
9 |
10 | class AnalysisExecutor:
11 | def __init__(self):
12 | self.db = MySqlDb()
13 | self.log = Logger().logger
14 | self.thread_count = 4
15 |
16 | def update_all_data(self):
17 | """
18 | 更新所有商品信息数据
19 | :return:
20 | """
21 | self.log.info("正在启动单线程数据分析程序...")
22 | # 查询所有已有记录的商品列表
23 | commodity_sql = 'SELECT * FROM org_detail'
24 | commodity_list = self.db.query(commodity_sql)
25 | article_number_list = [com[7] for com in commodity_list]
26 |
27 | # with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
28 | # executor.map(self.update_one_date, article_number_list)
29 |
30 | for commodity in article_number_list:
31 | # 一个月数据
32 | self.update_one_month(commodity)
33 | self.reports_one_month(commodity)
34 |
35 | # 三个月数据
36 | self.update_three_month(commodity)
37 | self.reports_three_month(commodity)
38 |
39 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
40 | self.log.info(f"{now}程序结束")
41 |
42 | def update_one_month(self, article_number):
43 | """
44 | 更新一个月数据
45 | :param article_number:
46 | :return:
47 | """
48 | self.log.info(f"正在对【{article_number}】进行一个月数据分析")
49 | an = Analysis(article_number, self.db)
50 | an.run_analysis()
51 | self.log.info(f"【{article_number}】数据分析完成")
52 |
53 | @error_repeat
54 | def update_three_month(self, article_number):
55 | """
56 | 更新三个月数据
57 | :param article_number:
58 | :return:
59 | """
60 | self.log.info(f"正在对【{article_number}】进行三个月数据分析")
61 | an = Analysis(article_number, self.db, _type='three_month')
62 | an.run_analysis()
63 | self.log.info(f"【{article_number}】数据分析完成")
64 |
65 | @error_repeat
66 | def reports_one_month(self, article_number):
67 | """
68 | 生成一个月的数据报告
69 | :param article_number:
70 | :return:
71 | """
72 | self.log.info(f"正在生成【{article_number}】一个月数据分析报告")
73 | gen = GenerateReports(article_number, self.db)
74 | gen.generate()
75 | self.log.info(f"【{article_number}】一个月数据分析报告生成成功")
76 |
77 | @error_repeat
78 | def reports_three_month(self, article_number):
79 | """
80 | 生成三个月的数据报告
81 | :param article_number:
82 | :return:
83 | """
84 | self.log.info(f"正在生成【{article_number}】三个月数据分析报告")
85 | gen = GenerateReports(article_number, self.db, reports_type="three_month")
86 | gen.generate()
87 | self.log.info(f"【{article_number}】三个月数据分析报告生成成功")
--------------------------------------------------------------------------------
/app/data_analysis/generate_reports.py:
--------------------------------------------------------------------------------
1 | import docxtpl
2 | import os
3 | import time
4 |
5 | from docx.shared import Mm
6 | from app.configUtil import ConfigUtil
7 | from app.log import Logger
8 | from app.data_analysis.analysis import Analysis
9 |
10 |
11 | class GenerateReports:
12 | """
13 | 生成数据分析报告
14 | """
15 |
16 | def __init__(self, article_number, db, reports_type="one_month"):
17 | self.article_number = article_number
18 | self.temp_path = os.path.dirname(os.path.dirname(__file__)) + "/static/data_analysis_tpl.docx"
19 | self.reports_type = reports_type
20 | self.db = db
21 | self.log = Logger().logger
22 | conf = ConfigUtil()
23 | self.img_path = conf.getValue("img_path") + self.article_number + ".jpg"
24 | self.analysis_img_path = conf.getValue("analysis_img_path") + self.article_number
25 |
26 | def generate(self):
27 | # 基础数据
28 | detail_sql = f"SELECT title,auth_price,sell_date,brand FROM org_detail " \
29 | f"WHERE article_number ='{self.article_number}'"
30 | detail = self.db.getOne(detail_sql)
31 | daily_docx = docxtpl.DocxTemplate(self.temp_path)
32 | # logo_img = docxtpl.InlineImage(daily_docx, self.img_path, width=Mm(140))
33 |
34 | # 基础分析数据
35 | an = Analysis(self.article_number, self.db, self.reports_type)
36 | info = an.analysis_info()
37 |
38 | # 分析图
39 | size_price_volume = docxtpl.InlineImage(daily_docx, self.analysis_img_path
40 | + f'/size_price_volume_{self.reports_type}.jpg', width=Mm(140))
41 | date_price_volume = docxtpl.InlineImage(daily_docx, self.analysis_img_path
42 | + f'/date_price_volume_{self.reports_type}.jpg', width=Mm(140))
43 | ask_to_buy = docxtpl.InlineImage(daily_docx, self.analysis_img_path
44 | + f'/ask_to_buy_{self.reports_type}.jpg', width=Mm(140))
45 | ma = docxtpl.InlineImage(daily_docx, self.analysis_img_path
46 | + f'/ma_{self.reports_type}.jpg', width=Mm(140))
47 | user_repeat = docxtpl.InlineImage(daily_docx, self.analysis_img_path
48 | + f'/user_repeat_{self.reports_type}.jpg', width=Mm(140))
49 | repeat_num = docxtpl.InlineImage(daily_docx, self.analysis_img_path
50 | + f'/repeat_num_{self.reports_type}.jpg', width=Mm(140))
51 |
52 | # 渲染内容
53 | context = {
54 | "name": detail[0],
55 | "brand": detail[3],
56 | "auth_price": detail[1],
57 | "sell_date": detail[2],
58 | "recommended_size": info['r_size'],
59 | "max_price": info['max_price'],
60 | "min_price": info['min_price'],
61 | "all_volume": info['all_volume'],
62 | "premium": info['premium'],
63 | "avg_price": info['avg_price'],
64 | "size_price_volume": size_price_volume,
65 | "date_price_volume": date_price_volume,
66 | "ask_to_buy": ask_to_buy,
67 | "ma": ma,
68 | "user_repeat": user_repeat,
69 | "repeat_num": repeat_num,
70 | "create_time": time.strftime('%Y-%m-%d', time.localtime())
71 | }
72 | # 渲染docx
73 | daily_docx.render(context)
74 | # 保存docx
75 | daily_docx.save(self.analysis_img_path + '/' + self.article_number + '_' + self.reports_type + ".docx")
76 |
--------------------------------------------------------------------------------
/app/db/my_sql_db.py:
--------------------------------------------------------------------------------
1 | import pymysql
2 | from app.log import Logger
3 | from app.configUtil import ConfigUtil
4 | from sqlalchemy import create_engine
5 |
6 | log = Logger().log()
7 | config = ConfigUtil()
8 |
9 |
10 | class MySqlDb:
11 | def __init__(self):
12 | self.dbConfig = config.getValue('db')
13 | self.host = self.dbConfig['host']
14 | self.username = self.dbConfig['username']
15 | self.password = self.dbConfig['password']
16 | self.port = self.dbConfig['port']
17 | self.db = self.dbConfig['dbName']
18 | self.charset = self.dbConfig['charset']
19 |
20 | def getConnect(self):
21 | """
22 | 获取mysql数据库连接
23 | :return:
24 | """
25 | con = None
26 | try:
27 | con = pymysql.connect(
28 | host=self.host, user=self.username, passwd=self.password, port=self.port, db=self.db, charset=self.charset
29 | )
30 | except Exception as e:
31 | log.error(e)
32 | log.info("获取数据库连接失败,正在尝试重新连接...")
33 | for _ in range(5):
34 | try:
35 | con = pymysql.connect(
36 | host=self.host, user=self.username, passwd=self.password, port=self.port, db=self.db,
37 | charset=self.charset
38 | )
39 | log.info("连接成功!")
40 | break
41 | except Exception as e:
42 | log.error(e)
43 | return con
44 |
45 | @staticmethod
46 | def getCursor(connect):
47 | """
48 | 获取数据库游标
49 | :param connect:
50 | :return:
51 | """
52 | return connect.cursor()
53 |
54 | def getEngine(self):
55 | """
56 | 获取pandas需要的Engine
57 | :return:
58 | """
59 | return create_engine(f'mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{self.db}')
60 |
61 | def insertDataList(self, sql, data):
62 | """
63 | 插入一个集合数据
64 | :param sql:
65 | :param data:
66 | :return:
67 | """
68 | for entity in data:
69 | self.insertData(sql, entity)
70 |
71 | def insertData(self, sql, data):
72 | """
73 | 执行插入语句
74 | :param sql:执行的sql
75 | :param data:需要插入的数据
76 | :return:
77 | """
78 | if sql is not None and sql != ' ':
79 | if data is not None:
80 | con = self.getConnect()
81 | cur = self.getCursor(con)
82 | cur.execute(sql, data)
83 | con.commit()
84 | # 关闭连接,关闭游标
85 | cur.close()
86 | con.close()
87 | log.info("数据插入成功")
88 | else:
89 | log.info("待插入数据不能为空")
90 | else:
91 | log.info("执行sql不能为空")
92 |
93 | def query(self, sql):
94 | """
95 | 查询数据
96 | :param sql:
97 | :return:
98 | """
99 | if sql is not None and sql != " ":
100 | con = self.getConnect()
101 | cur = self.getCursor(con)
102 | cur.execute(sql)
103 | data = cur.fetchall()
104 | cur.close()
105 | con.close()
106 | return data
107 | else:
108 | log.info("执行sql不能为空")
109 |
110 | def getOne(self, sql):
111 | """
112 | 查询数据
113 | :param sql:
114 | :return:
115 | """
116 | if sql is not None and sql != " ":
117 | con = self.getConnect()
118 | cur = self.getCursor(con)
119 | cur.execute(sql)
120 | data = cur.fetchone()
121 | cur.close()
122 | con.close()
123 | return data
124 | else:
125 | log.info("执行sql不能为空")
126 |
127 | def executeSql(self, sql):
128 | """
129 | 执行sql语句
130 | :param sql:
131 | :return:
132 | """
133 | con = self.getConnect()
134 | cur = self.getCursor(con)
135 | try:
136 | if sql is not None and sql != " ":
137 | cur.execute(sql)
138 | else:
139 | log.info("执行sql不能为空")
140 | except Exception as e:
141 | log.error(e)
142 | con.rollback()
143 | else:
144 | con.commit()
145 | finally:
146 | # 关闭连接,关闭游标
147 | cur.close()
148 | con.close()
149 |
150 |
--------------------------------------------------------------------------------
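
A short usage sketch of MySqlDb (illustration only; the table org_example and its columns are made up). insertData passes the data tuple to cursor.execute, so the %s placeholders are escaped by pymysql rather than formatted into the SQL string:

    from app.db.my_sql_db import MySqlDb

    db = MySqlDb()  # reads the db block from config.yaml
    db.insertData("INSERT INTO org_example VALUES (%s, %s)", (None, "327624-001"))
    row = db.getOne("SELECT COUNT(*) FROM org_example")    # single row as a tuple
    rows = db.query("SELECT * FROM org_example")           # all rows
    db.executeSql("DELETE FROM org_example WHERE id = 1")  # commit / rollback handled inside
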
/app/de_wu_spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import time
3 | import os
4 | import datetime
5 | import random
6 |
7 | from app.util import request_util
8 | from app.db.my_sql_db import MySqlDb
9 | from app.configUtil import ConfigUtil
10 | from app.util.zhi_ma_ip import ZhiMaIp
11 | from app.log import Logger
12 | from app.data_analysis.analysis_executor import AnalysisExecutor
13 | from concurrent.futures import ThreadPoolExecutor
14 |
15 |
16 | class DeWuSpider:
17 | def __init__(self):
18 | self.db = MySqlDb()
19 | self.config = ConfigUtil()
20 | self.log = Logger().logger
21 | self.zm = ZhiMaIp()
22 | self.proxies = self.zm.getOneProxies()
23 | self.thread_count = 4
24 |
25 | # 移除跳过认证警告
26 | requests.packages.urllib3.disable_warnings()
27 |
28 | def get_info(self, spuId):
29 | """
30 | 根据spuId获取商品信息
31 | :param spuId:得物唯一标识
32 | :return:
33 | """
34 | self.log.info(f"开始获取详情-->spuId:{spuId}...")
35 | data = {
36 | "spuId": spuId,
37 | "productSourceName": "",
38 | "propertyValueId": "0"
39 | }
40 | data = request_util.add_sign(data)
41 | url = 'https://app.dewu.com/api/v1/h5/index/fire/flow/product/detail'
42 | # 发送请求
43 | res = self.try_err_send_request('info', data, url)
44 | if res.status_code == 200:
45 | self.log.info(f"spuId:{spuId},发送请求成功,正在解析数据...")
46 | data = res.json().get('data')
47 | # 详情
48 | pageDetail = data.get('detail')
49 | articleNumber = pageDetail.get('articleNumber')
50 | detail = self.db.getOne("SELECT * FROM org_detail WHERE article_number = '{}'".format(articleNumber))
51 | if not detail:
52 | # 参数
53 | baseProperties = data.get('baseProperties')
54 | brandList = baseProperties["brandList"]
55 | parameterList = baseProperties["list"]
56 | parameters = self.get_parameter(parameterList)
57 |
58 | # 图片
59 | # image_and_txt = data.get("imageAndText")
60 | # imgList = self.get_img_url(image_and_txt, articleNumber)
61 |
62 | # 下载logo
63 | logoUrl = self.downloadImg(pageDetail["logoUrl"], articleNumber)
64 | detail = (
65 | None,
66 | pageDetail.get('title'),
67 | spuId,
68 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
69 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
70 | pageDetail.get('authPrice'),
71 | pageDetail.get('sellDate'),
72 | articleNumber,
73 | logoUrl,
74 | brandList[0].get("brandName"),
75 | parameters["functionality"],
76 | parameters["blendent"],
77 | parameters["upperLevel"],
78 | parameters["topShoeStyles"],
79 | parameters["heelType"],
80 | None
81 | )
82 | self.log.info(f"spuId:{spuId},开始入库...")
83 | # 持久化到数据库
84 | # 插入详情
85 | detail_sql = 'INSERT INTO org_detail VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
86 | self.db.insertData(detail_sql, detail)
87 | # 插入详情图片
88 | # insert_img_sql = 'INSERT INTO org_detail_img VALUES(%s, %s, %s, %s, %s, %s)'
89 | # self.db.insertDataList(insert_img_sql, imgList)
90 | self.log.info(f"spuId:{spuId},入库结束...")
91 |
92 | @staticmethod
93 | def get_parameter(parameterList):
94 | """
95 | 根据参数列表返回参数字典
96 | :param parameterList: 参数列表
97 | :return:参数字典
98 | """
99 | parameter = {'functionality': None, 'blendent': None, 'upperLevel': None, 'topShoeStyles': None, 'heelType': None}
100 | for p in parameterList:
101 | key = p['key']
102 | value = p['value']
103 | if key == '功能性':
104 | parameter['functionality'] = value
105 | elif key == '配色':
106 | parameter['blendent'] = value
107 | elif key == '鞋帮高度':
108 | parameter['upperLevel'] = value
109 | elif key == '鞋头款式':
110 | parameter['topShoeStyles'] = value
111 | elif key == '鞋跟类型':
112 | parameter['heelType'] = value
113 | return parameter
114 |
115 | def get_img_url(self, images, article_number):
116 | """
117 | 获取图片URL
118 | :param article_number:
119 | :param images:
120 | :return:
121 | """
122 | imgList = []
123 | count = 1
124 | for g in images:
125 | if len(g) > 2:
126 | # 判断是否为尺码对照表
127 | contentType = g['contentType']
128 | if contentType != 'STRUCTURE_SIZE' and contentType != 'SIZETEMPLATE' and contentType != 'ATTENTION':
129 | g = g['images'][0]
130 | else:
131 | continue
132 | u = self.downloadImg(g.get('url'), article_number + str(count))
133 | height = g["height"]
134 | if height > 100:
135 | img = (
136 | None,
137 | article_number,
138 | u,
139 | count,
140 | g["width"],
141 | g["height"]
142 | )
143 | imgList.append(img)
144 | count += 1
145 | return imgList
146 |
147 | def get_record(self, spuId):
148 | """
149 | 获取当天的交易记录
150 | :param spuId: 得物唯一标识
151 | :return:
152 | """
153 | self.log.info(f"开始获取交易记录-->spuId:{spuId}...")
154 | # 获取当前商品的货号
155 | get_article_number_sql = f"SELECT article_number FROM org_detail WHERE spu_id = '{spuId}'"
156 | _article_number = self.db.getOne(get_article_number_sql)[0]
157 |
158 | # 获取最新一条交易记录
159 | get_newest_sql = f"select * from org_last_record r WHERE article_number='{_article_number}'"
160 | newest = self.db.getOne(get_newest_sql)
161 | lastId = ""
162 | count = 1
163 | while True:
164 | self.log.info(f"正在请求{spuId}---第{count}页---交易记录...")
165 | record, lastId, flag_stop = self.get_trading_record(spuId, lastId, _article_number, newest)
166 | # 插入数据库
167 | insert_sql = 'INSERT INTO org_purchase_record VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)'
168 | self.log.info(f"spuId:{spuId}---第{count}页---交易记录解析完成")
169 | self.db.insertDataList(insert_sql, record)
170 | # 判断当前是否为第一次获取
171 | if count == 1 and len(record) > 0:
172 | # 删除上次获取的记录
173 | del_sql = f"DELETE FROM org_last_record WHERE article_number = '{_article_number}'"
174 | self.db.executeSql(del_sql)
175 | # 把当前最新的记录更新到列表
176 | insert_sql = 'INSERT INTO org_last_record VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)'
177 | self.db.insertData(insert_sql, record[-1])
178 |
179 | # 判断是否停止
180 | if flag_stop:
181 | self.log.info(f"spuId:{spuId}---今日交易记录获取完成")
182 | break
183 |
184 | # 随机随眠一到三秒
185 | time.sleep(random.randint(1, 3))
186 | count += 1
187 |
188 | def get_trading_record(self, spuId, lastId, _article_number, newest):
189 | """
190 | 根据spuId获取交易记录
191 | :param newest: 最新交易记录
192 | :param _article_number:货号
193 | :param spuId: 得物唯一标识
194 | :param lastId: 下一页标识
195 | :return:返回交易记录和下一页标识
196 | """
197 | recordList = []
198 | # 标识是否还要继续爬取
199 | flag_stop = False
200 | data = {
201 | "spuId": spuId,
202 | "limit": '20',
203 | "lastId": lastId,
204 | "sourceApp": "app"
205 | }
206 | data = request_util.add_sign(data)
207 | url = 'https://app.dewu.com/api/v1/h5/commodity/fire/last-sold-list'
208 | # 发送请求
209 | res = self.try_err_send_request('record', data, url)
210 | if res.status_code == 200:
211 | self.log.info("交易记录请求成功,正在解析交易数据。。。")
212 | all_data = res.json()
213 | lastId = all_data.get('data').get('lastId')
214 | # 判断下一次请求是否已经无数据,如果是返回空集合与停止循环标识
215 | if lastId == "":
216 | flag_stop = True
217 | return recordList, lastId, flag_stop
218 |
219 | data_list = all_data.get('data').get('list')
220 | for d in data_list:
221 | formatTime = d['formatTime']
222 | formatTime = self.refactorFormatTime(formatTime)
223 | record = (
224 | None,
225 | spuId,
226 | _article_number,
227 | d['userName'],
228 | formatTime,
229 | d['price'] / 100,
230 | d['orderSubTypeName'],
231 | d['propertiesValues'],
232 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
233 | )
234 |
235 | # 判断是否是当天的数据 如果是则放入集合,如果不是则跳出循环标识不再获取交易记录
236 | if newest:
237 | if self.compareRecord(newest, record):
238 | flag_stop = True
239 | break
240 |
241 | recordList.append(record)
242 | return recordList[::-1], lastId, flag_stop
243 |
244 | @staticmethod
245 | def compareRecord(dbData, record):
246 | """
247 | 比较两条记录是否相等
248 | :param dbData:
249 | :param record:
250 | :return:
251 | """
252 | for i in range(len(record)):
253 | if i == 0 or i == 4 or i == 8:
254 | continue
255 | if i == 5:
256 | price = dbData[i]
257 | if not int(price) == record[i]:
258 | return False
259 | else:
260 | if not dbData[i] == record[i]:
261 | return False
262 | return True
263 |
264 | @staticmethod
265 | def refactorFormatTime(formatTime):
266 | """
267 | 重构交易时间返回数据类型yyyy-MM-dd
268 | :param formatTime:
269 | :return:
270 | """
271 | if '前' in formatTime or '刚刚' == formatTime:
272 | if '小时' in formatTime:
273 | h = formatTime[0:formatTime.find('小')]
274 | newTime = (datetime.datetime.now() + datetime.timedelta(hours=-int(h))).strftime("%Y-%m-%d")
275 | else:
276 | newTime = datetime.datetime.now().strftime("%Y-%m-%d")
277 | if '天' in formatTime:
278 | dd = formatTime[0:formatTime.find('天')]
279 | newTime = (datetime.datetime.now() + datetime.timedelta(days=-int(dd))).strftime("%Y-%m-%d")
280 | else:
281 | if '月' in formatTime:
282 | newTime = time.strftime("%Y", time.localtime()) + '-' + formatTime.replace('月', '-').replace('日', '')
283 | else:
284 | newTime = formatTime.replace('.', '-')
285 | return newTime
286 |
287 | def downloadImg(self, imgUrl, fileName):
288 | """
289 | 下载图片
290 | :param imgUrl: 图片路径
291 | :param fileName: 图片名称
292 | :return:
293 | """
294 | img_path = self.config.getValue('img_path')
295 | # 判断文件夹是否存在
296 | if not os.path.exists(img_path):
297 | os.makedirs(img_path)
298 | # 发请求并保存图片
299 | suffix = os.path.splitext(imgUrl)[1]
300 | if suffix == '':
301 | suffix = '.jpg'
302 | r = None
303 | try:
304 | r = requests.get(url=imgUrl, stream=True, timeout=(10, 10), proxies=self.proxies)
305 | except Exception as e:
306 | self.log.error(e)
307 | self.log.info(f"下载图片{imgUrl}请求失败,开始重新发起请求")
308 | for i in range(5):
309 | try:
310 | self.proxies = self.zm.getOneProxies()
311 | self.log.info(f"当前代理为{self.proxies}")
312 | r = requests.get(url=imgUrl, stream=True, timeout=(10, 10), proxies=self.proxies)
313 | self.log.info(f"第{i + 1}次尝试成功")
314 | break
315 | except Exception as e:
316 | self.log.error(e)
317 | self.log.info(f"第{i + 1}次尝试失败")
318 |         if r is not None and r.status_code == 200:
319 |             all_name = fileName + suffix
320 |             with open(img_path + all_name, 'wb') as img_file:
321 |                 img_file.write(r.content)
322 |             return all_name
323 |         return None
324 |
325 | def query_by_key(self, key):
326 | """
327 | 根据key查询数据
328 | :param key: 参数
329 | :return:
330 | """
331 | self.log.info(f"正在根据关键词【{key}】查看结果")
332 | query_res = []
333 | pram = {
334 | 'title': key,
335 | 'page': '0',
336 | 'sortType': '0',
337 | 'sortMode': '1',
338 | 'limit': '20',
339 | 'showHot': '1',
340 | 'isAggr': '1'
341 | }
342 | data = request_util.add_sign(pram)
343 | url = "https://app.dewu.com/api/v1/h5/search/fire/search/list"
344 | res = self.try_err_send_request('search', data, url)
345 | if res.status_code == 200:
346 | self.log.info(f"根据关键词【{key}】查询请求成功!")
347 | all_data = res.json()
348 | for entity in all_data['data']['productList']:
349 | query_res.append(entity)
350 | return query_res
351 |
352 | def try_err_send_request(self, send_type, data, url):
353 | """
354 | 发送请求
355 | :param send_type:发送类型
356 | :param data:参数
357 | :param url:请求路径
358 | :return:
359 | """
360 | res = None
361 | try:
362 | res = self.send_request(send_type, data, url)
363 | except Exception as e:
364 | self.log.error(e)
365 |             self.log.info("发送请求失败,正在尝试重新请求...")
366 | for i in range(5):
367 | self.log.info(f"正在尝试第{i + 1}次请求...")
368 | self.proxies = self.zm.getOneProxies()
369 | self.log.info(f"当前代理为{self.proxies}")
370 | try:
371 | res = self.send_request(send_type, data, url)
372 | self.log.info(f"第{i + 1}次请求成功...")
373 | break
374 | except Exception as e:
375 | self.log.error(e)
376 | self.log.info(f"第{i + 1}次请求失败...")
377 | return res
378 |
379 | def send_request(self, send_type, data, url):
380 | """
381 | 单独发送请求
382 | :param send_type: 请求类型
383 | :param url: 请求路径
384 | :param data: 请求参数
385 | :return:
386 | """
387 | if 'search' == send_type:
388 | res = requests.get(url=url, params=data, headers=request_util.get_header(send_type),
389 | verify=False, timeout=(10, 10), proxies=self.proxies)
390 | else:
391 | res = requests.post(url=url, json=data, headers=request_util.get_header(send_type),
392 | verify=False, timeout=(10, 10), proxies=self.proxies)
393 | return res
394 |
395 | def run(self):
396 | """
397 | 启动爬虫
398 | :return:
399 | """
400 | self.log.info("正在启动得物爬虫程序...")
401 | # 获取代理
402 | self.log.info(f"当前代理:{self.proxies}")
403 | # 查询所有待查询的商品列表
404 | commodity_sql = 'SELECT * FROM org_all_commodity'
405 | commodity_list = self.db.query(commodity_sql)
406 | for commodity in commodity_list:
407 | self.do_one_commodity_data(commodity)
408 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
409 | self.log.info(f"{now}程序结束")
410 |
411 | def do_one_commodity_data(self, commodity):
412 | """
413 | 获取单个商品的数据
414 | :param commodity:
415 | :return:
416 | """
417 | self.log.info(f"开始执行【{commodity[2]}】商品")
418 | spu_id = commodity[1]
419 | if commodity[3] == 1:
420 | data = self.query_by_key(commodity[2])[0]
421 | spu_id = str(data['spuId'])
422 | self.get_info(spu_id)
423 | # 修改数据库状态
424 | update_sql = f'UPDATE org_all_commodity SET is_new = 0,spu_id = {spu_id} WHERE id = {commodity[0]}'
425 | self.db.executeSql(update_sql)
426 | self.get_record(spu_id)
427 |
428 | # 进行第一轮数据分析
429 | if commodity[3] == 1:
430 | self.log.info(f"开始对【{commodity[2]}】商品进行第一轮数据分析")
431 | an = AnalysisExecutor()
432 | # 一月数据
433 | an.update_one_month(commodity[2])
434 | an.reports_one_month(commodity[2])
435 |
436 | # 三月数据
437 | an.update_three_month(commodity[2])
438 | an.reports_three_month(commodity[2])
439 | self.log.info(f"对【{commodity[2]}】商品第一轮数据分析完毕")
440 |         self.log.info(f"商品【{commodity[2]}】执行完毕!")
441 |
442 | def thread_run(self):
443 | """
444 | 多线程爬虫启动
445 | :return:
446 | """
447 | self.log.info("正在启动多线程得物爬虫程序...")
448 | # 获取代理
449 | self.log.info(f"当前代理:{self.proxies}")
450 | # 查询所有待查询的商品列表
451 | commodity_sql = 'SELECT * FROM org_all_commodity'
452 | commodity_list = self.db.query(commodity_sql)
453 | with ThreadPoolExecutor(max_workers=self.thread_count) as executor:
454 | executor.map(self.do_one_commodity_data, commodity_list)
455 |
456 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
457 | self.log.info(f"{now}程序结束")
--------------------------------------------------------------------------------
/app/decorator/decorator.py:
--------------------------------------------------------------------------------
1 | from app.log import Logger
2 |
3 | log = Logger().log()
4 |
5 |
6 | def error_repeat(func):
7 | """
8 | 添加注解的方法在报错之后还会重新执行
9 | :param func:
10 | :return:
11 | """
12 |
13 | def warp(*args, **kwargs):
14 | try:
15 | temp = func(*args, **kwargs)
16 | return temp
17 | except Exception as e:
18 | log.info(f"方法{func.__name__}执行报错,报错信息:" + str(e))
19 | while True:
20 | try:
21 | log.info(f"方法{func.__name__}正在尝试重新执行")
22 | temp = func(*args, **kwargs)
23 | log.info(f"方法{func.__name__}尝试重新执行成功!")
24 | break
25 | except Exception as e:
26 | log.info(f"方法{func.__name__}执行报错,报错信息:" + str(e))
27 | return temp
28 |
29 | return warp
30 |
--------------------------------------------------------------------------------
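
A small usage sketch of the error_repeat decorator (illustration only; flaky_task is made up). The decorated function is retried in a loop until a call finally succeeds, which is how AnalysisExecutor uses it for its analysis and report methods:

    import random

    from app.decorator.decorator import error_repeat

    @error_repeat
    def flaky_task(article_number):
        # made-up task that fails roughly half the time; error_repeat logs the
        # error and keeps retrying until the call succeeds
        if random.random() < 0.5:
            raise RuntimeError("temporary failure")
        return f"done: {article_number}"

    print(flaky_task("327624-001"))
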
/app/log.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | import logging
3 | import datetime
4 | import os
5 |
6 |
7 | class Logger:
8 | """自定义封装logging模块"""
9 |
10 | def __init__(self, default_level=logging.INFO):
11 | # 加载配置文件地址
12 | cur_path = os.path.dirname(__file__)
13 | self.config_path = cur_path + "/config.yaml"
14 |
15 | # 初始化一个logger
16 |         self.logger = logging.getLogger(__name__)
17 | self.default_level = default_level
18 | logger_main_level, logger_file_level, logger_console_level = self.config()
19 | self.logger.setLevel(logger_main_level)
20 | fomatter = logging.Formatter(
21 | '[%(asctime)s] %(filename)s line:%(lineno)d [%(levelname)s]%(message)s')
22 | # 初始化输出到日志文件的handle
23 | file_name = self.getLogFilePath() + '/{}log.txt'.format(datetime.datetime.now().strftime('%Y-%m-%d'))
24 | file_log = logging.FileHandler(filename=file_name, encoding='utf-8')
25 | file_log.setLevel(logger_file_level)
26 | file_log.setFormatter(fomatter)
27 | # 初始化增加输出到控制台的handle
28 | console_log = logging.StreamHandler()
29 | console_log.setLevel(logger_console_level)
30 | console_log.setFormatter(fomatter)
31 |
32 |         if self.logger.hasHandlers() is False:
33 |             self.logger.addHandler(file_log)
34 |             self.logger.addHandler(console_log)
35 |         else:
36 |             # 已经有 handler 时关闭新建的,避免句柄泄漏和重复输出
37 |             file_log.close()
38 |             console_log.close()
39 |
40 | def config(self):
41 | """
42 | :return: 返回配置中读取的level
43 | """
44 | try:
45 |
46 | with open(self.config_path, 'r', encoding='utf-8') as f:
47 | global config_data
48 | config_data = yaml.load(f, Loader=yaml.FullLoader)
49 | except IOError as e:
50 | print(e)
51 | self.logger.error('open config file failed')
52 | case1 = config_data['logConfig']['testLogLevel']['mainLogLevel']
53 | case2 = config_data['logConfig']['testLogLevel']['fileLogLevel']
54 | case3 = config_data['logConfig']['testLogLevel']['consoleLogLevel']
55 | logger_main_level = self.switch(case=case1)
56 | logger_file_level = self.switch(case=case2)
57 | logger_console_level = self.switch(case=case3)
58 | log_file_path = config_data['logConfig']['logFilePath']
59 | return logger_main_level, logger_file_level, logger_console_level
60 |
61 | def getLogFilePath(self):
62 | """
63 | 获取配置文件中的看日志存放路径
64 | :return:
65 | """
66 | try:
67 | with open(self.config_path, 'r', encoding='utf-8') as f:
68 | global config_data
69 | config_data = yaml.load(f, Loader=yaml.FullLoader)
70 | except IOError:
71 | self.logger.error('open config file failed')
72 | return config_data['logConfig']['logFilePath']
73 |
74 | def switch(self, case):
75 | """
76 | :param case: 传入需要做判断的level
77 | :return: 返回最终的level
78 | """
79 |         if case == 'DEBUG':
80 |             result = logging.DEBUG
81 |         elif case == 'INFO':
82 |             result = logging.INFO
83 |         elif case == 'ERROR':
84 |             result = logging.ERROR
85 |         elif case == 'CRITICAL':
86 |             result = logging.CRITICAL
87 |         else:
88 |             result = self.default_level
89 |         return result
90 |
91 | def log(self):
92 | return self.logger
93 |
94 |
95 | if __name__ == '__main__':
96 | # log = Logger()
97 | # print(log.getLogFilePath())
98 |
99 | with open('config.yaml', 'r') as f:
100 | print(f.readline())
101 |
--------------------------------------------------------------------------------
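
A minimal usage sketch of the Logger wrapper (illustration only). It assumes app/config.yaml is readable and that the configured logFilePath directory (app/logs/ in the config below) already exists; .logger and .log() return the same underlying logging.Logger:

    from app.log import Logger

    log = Logger().logger   # attribute access, as in the spider and analysis modules
    # log = Logger().log()  # equivalent accessor used by configUtil and my_sql_db
    log.info("spider started")
    log.error("something went wrong")
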
/app/util/pyplot_util.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 |
3 |
4 | def make_fig():
5 | pass
6 |
--------------------------------------------------------------------------------
/app/util/zhi_ma_ip.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import requests
3 | import time
4 | from app.configUtil import ConfigUtil
5 | from app.log import Logger
6 |
7 | config = ConfigUtil()
8 | log = Logger().logger
9 |
10 |
11 | class ZhiMaIp:
12 | def __init__(self):
13 | self.url = config.getValue("zhi_ma_ip_url")
14 | self.addWhiteListIpUrl = config.getValue("white_list_ip_url")
15 |
16 | def send_request(self):
17 | """
18 | 发送请求获取一个代理
19 | :return:
20 | """
21 | res = None
22 | try:
23 | res = requests.get(self.url, timeout=3)
24 | except Exception as e:
25 | log.error(e)
26 | log.info("代理请求失败,尝试重新申请...")
27 |             for i in range(5):
28 |                 try:
29 |                     res = requests.get(self.url, timeout=3)
30 |                     log.info(f"第{i + 1}次代理请求成功!")
31 |                     break
32 |                 except Exception as e:
33 |                     log.error(e)
34 |                     log.info(f"第{i + 1}次代理请求失败!")
35 | hostAndPort = res.text
36 | hostAndPort = hostAndPort.replace('\n', '').replace('\r', '')
37 | if len(hostAndPort) > 30:
38 | self.addWhiteList()
39 | res = requests.get(self.url)
40 | hostAndPort = res.text
41 | hostAndPort = hostAndPort.replace('\n', '').replace('\r', '')
42 | proxyMeta = "http://%(proxies)s" % {
43 | "proxies": hostAndPort
44 | }
45 |
46 | proxies = {
47 | "https": proxyMeta,
48 | "http": proxyMeta
49 | }
50 | return proxies
51 |
52 | def getOneProxies(self):
53 | """
54 | 获取一个代理服务器
55 | :return:
56 | """
57 | # while True:
58 | # proxies = self.send_request()
59 | # res = self.check_proxies(proxies)
60 | # if res:
61 | # break
62 | return self.send_request()
63 |
64 | @staticmethod
65 | def check_proxies(proxies):
66 | """
67 | 检测代理活性和状态
68 | :param proxies:
69 | :return:
70 | """
71 | # 检测代理活性和状态
72 |         start = time.perf_counter()
73 | res = requests.get(url="http://icanhazip.com/", timeout=8, proxies=proxies).text.replace('\n', '')
74 |         end = time.perf_counter()
75 | return (end - start) < 2 and res in str(proxies)
76 |
77 | def addWhiteList(self):
78 | """
79 | 添加一个IP至白名单
80 | :return:
81 | """
82 | ip = requests.get(url="http://ip.42.pl/raw").text
83 | url = self.addWhiteListIpUrl + ip
84 | requests.get(url)
85 |
86 |
87 | if __name__ == '__main__':
88 | z = ZhiMaIp()
89 | print(z.getOneProxies())
90 |
91 |
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | monitortime: 60
2 |
3 | maxtimeout: 30
4 | logConfig:
5 | logFilePath: 'app/logs/'
6 | testLogLevel:
7 | mainLogLevel: INFO
8 | fileLogLevel: INFO
9 | consoleLogLevel: INFO
10 | productLogLevel:
11 | mainLogLevel: INFO
12 | fileLogLevel: INFO
13 | consoleLogLevel: ERROR
14 |
15 | db:
16 |   host: 'database host'
17 |   username: 'database username'
18 |   password: 'database password'
19 |   port: database port
20 |   dbName: 'database name'
21 | charset: 'utf8'
22 |
23 |
24 | img_path: "directory where downloaded product images are saved"
25 |
26 | analysis_img_path: "directory where analysis charts are saved"
27 |
28 | zhi_ma_ip_url: "Zhima proxy (芝麻代理) API endpoint"
29 |
30 | white_list_ip_url: "Zhima proxy whitelist API endpoint"
31 |
32 |
33 |
--------------------------------------------------------------------------------
/developer.py:
--------------------------------------------------------------------------------
1 | from app.data_analysis.analysis_executor import AnalysisExecutor
2 | from app.data_analysis.analysis import Analysis
3 | from app.data_analysis.generate_reports import GenerateReports
4 | from app.decorator.decorator import error_repeat
5 |
6 |
7 | an = AnalysisExecutor()
8 | an.update_one_month("327624-001")
9 | # an.update_three_month("327624-001")
10 | # an.reports_one_month("327624-001")
11 | # an.reports_three_month("327624-001")
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from apscheduler.schedulers.blocking import BlockingScheduler
2 | from app.data_analysis.analysis_executor import AnalysisExecutor
3 | from app.de_wu_spider import DeWuSpider
4 | from app.log import Logger
5 |
6 | log = Logger().logger
7 |
8 |
9 | def run():
10 | # 添加定时器
11 | log.info("得物数据分析程序定时器已启动")
12 | scheduler = BlockingScheduler()
13 | # 定时爬虫程序
14 | # spider = DeWuSpider()
15 | # scheduler.add_job(spider.thread_run, 'cron', hour=00)
16 | # scheduler.add_job(spider.thread_run, 'cron', hour=12)
17 |
18 | # 数据分析程序
19 | analysis = AnalysisExecutor()
20 | # scheduler.add_job(analysis.update_all_data, 'cron', day_of_week=0, hour=3)
21 | scheduler.add_job(analysis.update_all_data, 'cron', hour=14, minute=31)
22 | scheduler.start()
23 |
24 |
25 | if __name__ == '__main__':
26 | run()
27 |
--------------------------------------------------------------------------------
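
For reference, a variant of run() that also schedules the spider, along the lines of the commented-out jobs above (the hours are examples, not project defaults):

    def run_with_spider():
        scheduler = BlockingScheduler()
        spider = DeWuSpider()
        analysis = AnalysisExecutor()
        # crawl twice a day, matching the commented-out jobs in run()
        scheduler.add_job(spider.thread_run, 'cron', hour=0)
        scheduler.add_job(spider.thread_run, 'cron', hour=12)
        # re-analyse the accumulated records once a week
        scheduler.add_job(analysis.update_all_data, 'cron', day_of_week=0, hour=3)
        scheduler.start()
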
/readme.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Dewu (得物) Data Crawling & Analysis Platform
4 |
5 | ## Introduction - 介绍
6 | This project crawls trading data from the Dewu platform and analyses it. The locally stored trading records are processed into a set of analysis charts, giving a fairly reliable view of how prices are trending.
7 |
8 | ### Summary - 概要
9 | #### 1. Trade record spider
10 | Dewu trading records are crawled on a daily schedule. Each run remembers where the previous one stopped, so the same records are never crawled twice.
11 | #### 2. Data analysis
12 | The historical data is re-analysed on a weekly schedule, regenerating the analysis charts and data analysis reports.
13 |
14 | ### Features - 特性
15 | #### 1. Timeliness
16 | #### 2. Accuracy
17 | #### 3. Customisation
18 |
19 | ## Configuration - 配置
20 | Install the dependencies: pip3 install -i http://pypi.douban.com/simple --trusted-host pypi.douban.com -r requirements.txt
21 |
22 | ## FAQ - 常见问题
23 | #### 1. Proxies: a proxy service must be purchased separately.
24 | #### 2. Database: the database schema is not included; contact the author if you need it.
25 | #### 3. Running the project: python main.py starts the scheduled crawling and analysis tasks.
26 | #### 4. Code improvements and suggestions: contact the author.
27 | #### 5. Other questions: contact the author.
28 |
29 | ## Contact - 联系
30 | Email: 3392903032@qq.com
31 |
32 | ## License - 版权信息
33 | #### This project is for learning and exchange only; commercial use is prohibited, and the author accepts no legal liability if it occurs.
34 | #### The project may not be used for other purposes without the author's permission; violations will be pursued legally.
35 | #### If the project affects the interests of the Dewu platform, please contact the author at the address above.
36 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests==2.25.1
2 | pymysql==1.0.2
3 | pyyaml==5.4.1
4 | apscheduler==3.7.0
5 | flask==1.1.2
6 | SQLAlchemy==1.4.9
7 | pandas~=1.1.5
8 | matplotlib~=3.3.3
9 | docxtpl==0.11.4
--------------------------------------------------------------------------------