├── .idea
│   ├── .gitignore
│   ├── dewuSpider.iml
│   ├── inspectionProfiles
│   │   └── profiles_settings.xml
│   ├── misc.xml
│   ├── modules.xml
│   └── vcs.xml
├── app
│   ├── actual_query.py
│   ├── configUtil.py
│   ├── data_analysis
│   │   ├── analysis.py
│   │   ├── analysis_executor.py
│   │   └── generate_reports.py
│   ├── db
│   │   └── my_sql_db.py
│   ├── de_wu_spider.py
│   ├── decorator
│   │   └── decorator.py
│   ├── log.py
│   └── util
│       ├── pyplot_util.py
│       └── zhi_ma_ip.py
├── config.yaml
├── developer.py
├── main.py
├── readme.md
└── requirements.txt
/.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /workspace.xml --------------------------------------------------------------------------------
/.idea/dewuSpider.iml: --------------------------------------------------------------------------------
/.idea/inspectionProfiles/profiles_settings.xml: --------------------------------------------------------------------------------
/.idea/misc.xml: --------------------------------------------------------------------------------
/.idea/modules.xml: --------------------------------------------------------------------------------
/.idea/vcs.xml: --------------------------------------------------------------------------------
/app/actual_query.py: -------------------------------------------------------------------------------- 1 | from flask import Flask 2 | 3 | app = Flask(__name__) 4 | 5 | 6 | @app.route('/hello') 7 | def hello(): 8 | return 'Hello world!' 
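# --- Illustrative sketch, not part of the original actual_query.py ---
# The Flask app above only exposes /hello. One way it could surface the metrics that
# Analysis.update_info() persists to org_data_analysis_info is a read-only route like
# the one below. The route path and the response fields ("found", etc.) are assumptions
# for illustration, not the project's actual API; the string-built SQL simply mirrors
# the style used elsewhere in this project and is not injection-safe.
from decimal import Decimal

from app.db.my_sql_db import MySqlDb


@app.route('/analysis/<article_number>')
def analysis_info(article_number):
    db = MySqlDb()
    row = db.getOne(
        "SELECT max_price, avg_price, min_price, premium, all_volume, recommended_size "
        f"FROM org_data_analysis_info WHERE article_number = '{article_number}'"
    )
    if not row:
        return {"article_number": article_number, "found": False}
    keys = ("max_price", "avg_price", "min_price", "premium", "all_volume", "recommended_size")
    # pymysql returns DECIMAL columns as Decimal; convert so Flask can serialise the dict.
    values = [float(v) if isinstance(v, Decimal) else v for v in row]
    return {"article_number": article_number, "found": True, **dict(zip(keys, values))}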
9 | 10 | 11 | if __name__ == '__main__': 12 | app.run() 13 | -------------------------------------------------------------------------------- /app/configUtil.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import random 3 | import os 4 | 5 | from app.log import Logger 6 | 7 | log = Logger().log() 8 | 9 | 10 | class ConfigUtil: 11 | def __init__(self): 12 | cur_path = os.path.dirname(__file__) 13 | self.configPath = cur_path + "/config.yaml" 14 | 15 | def readYaml(self): 16 | # read config from yaml document 17 | file = self.configPath 18 | try: 19 | f = open(file, 'r', encoding='UTF-8') 20 | global configData 21 | configData = yaml.load(f, Loader=yaml.FullLoader) 22 | except IOError: 23 | log.info('open config failed') 24 | return configData 25 | 26 | def getValue(self, key): 27 | return self.readYaml()[key] 28 | 29 | def randomGetUserAgent(self): 30 | return random.choice(self.readYaml()['User_Agents']) 31 | 32 | 33 | if __name__ == '__main__': 34 | curPath = os.path.dirname(__file__) 35 | print(curPath) 36 | -------------------------------------------------------------------------------- /app/data_analysis/analysis.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import matplotlib.pyplot as plt 3 | import os 4 | import time 5 | 6 | from app.configUtil import ConfigUtil 7 | from decimal import Decimal 8 | from app.log import Logger 9 | 10 | 11 | class Analysis: 12 | def __init__(self, article_number, db, _type="one_month"): 13 | self.db = db 14 | self.engine = self.db.getEngine() 15 | self.log = Logger().logger 16 | self.article_number = article_number 17 | self.type = _type 18 | # 保存图片文件夹 19 | self.save_img_path = ConfigUtil().getValue("analysis_img_path") + self.article_number 20 | if not os.path.exists(self.save_img_path): 21 | os.makedirs(self.save_img_path) 22 | # 查询数据 23 | sql = f"SELECT * FROM org_purchase_record WHERE article_number = '{article_number}'" 24 | if _type == "one_month": 25 | sql += "and format_time >= DATE_SUB(DATE_FORMAT(NOW(), '%Y-%m-%d'), INTERVAL 30 DAY)" 26 | elif _type == "three_month": 27 | sql += "and format_time >= DATE_SUB(DATE_FORMAT(NOW(), '%Y-%m-%d'), INTERVAL 91 DAY)" 28 | self.all_data = pd.read_sql_query(sql, self.engine) 29 | # 删除求购 30 | self.data = self.all_data.drop(self.all_data[self.all_data.order_sub_type_name == "求购"].index) 31 | # 获取求购数据 32 | self.ask_to_buy = self.all_data[self.all_data["order_sub_type_name"] == "求购"] 33 | # 获取时间 34 | self.date = self.data.format_time.drop_duplicates().sort_values(ascending=False).values 35 | # 获取尺码 36 | self.size = self.data.properties_values.drop_duplicates().sort_values(ascending=False).values 37 | 38 | # 图片属性 39 | self.image_wide = 15 40 | self.image_high = 10 41 | self.title_fontsize = 16 42 | self.label_fontsize = 14 43 | 44 | def get_price_volume(self, chart_type="日期"): 45 | """ 46 | 统计价格-销量 47 | :return: 48 | """ 49 | # 根绝时间分组的价格 50 | price = self.data.groupby('format_time' if chart_type == "日期" else 'properties_values')['price'].mean() 51 | counts = self.data.groupby('format_time' if chart_type == "日期" else 'properties_values')['price'].count().values 52 | 53 | # 绘图 54 | fig = plt.figure(figsize=(self.image_wide, self.image_high)) 55 | p = fig.add_subplot(111) 56 | p.set_title(f"{chart_type}-价格-销量趋势图", fontsize=self.label_fontsize) 57 | p.set_ylabel("价格", fontsize=self.label_fontsize) 58 | p.set_xlabel(chart_type, fontsize=self.label_fontsize) 59 | p.plot(self.date if chart_type == 
"日期" else self.size, price) 60 | c = p.twinx() 61 | c.set_ylabel("交易量", fontsize=self.label_fontsize) 62 | ca = c.bar(self.date if chart_type == "日期" else self.size, counts, alpha=0.3) 63 | fig.legend(['平均价格', '交易量']) 64 | # self.__auto_text(ca) 65 | 66 | # 修改保存路径 67 | img_path = f"/date_price_volume_{self.type}.jpg" if chart_type == "日期" \ 68 | else f"/size_price_volume_{self.type}.jpg" 69 | fig.savefig(self.save_img_path + img_path) 70 | plt.close(fig) 71 | 72 | def get_user_repeat(self): 73 | """ 74 | 统计用户重复率 75 | :return: 76 | """ 77 | # 根据用户名分组的重复率 78 | users = self.data.groupby('user_name')['price'].count().reset_index(name='count')\ 79 | .sort_values('count', ascending=False).head(10) 80 | user_list = users.user_name.tolist() 81 | count_list = users['count'].tolist() 82 | 83 | # 绘图 84 | fig = plt.figure(figsize=(self.image_wide, self.image_high)) 85 | user_repeat_plt = fig.add_subplot(111) 86 | user_repeat_plt.set_title("用户重复购买数量", fontsize=self.title_fontsize) 87 | user_repeat_plt.set_xlabel("用户名称", fontsize=self.label_fontsize) 88 | user_repeat_plt.set_ylabel("数量", fontsize=self.label_fontsize) 89 | user_repeat_plt.bar(user_list, count_list) 90 | fig.savefig(self.save_img_path + f"/user_repeat_{self.type}.jpg") 91 | plt.close(fig) 92 | 93 | def get_repeat_num(self): 94 | """ 95 | 生成交易量重复图 96 | :return: 97 | """ 98 | # 计算交易数量 99 | counts = self.data.user_name.value_counts() 100 | two_count = 0 101 | three_count = 0 102 | four_count = 0 103 | for count in counts: 104 | if count > 1: 105 | two_count += count 106 | if count > 2: 107 | three_count += count 108 | if count > 3: 109 | four_count += count 110 | # 绘图 111 | index_list = ["大于两次", "大于三次", "大于四次"] 112 | data = [two_count, three_count, four_count] 113 | fig = plt.figure(figsize=(self.image_wide, self.image_high)) 114 | repeat_plt = fig.add_subplot(111) 115 | repeat_plt.set_title("重复交易量试图", fontsize=self.title_fontsize) 116 | repeat_plt.set_xlabel("重复频率", fontsize=self.label_fontsize) 117 | repeat_plt.set_ylabel("重复交易数量", fontsize=self.label_fontsize) 118 | res = repeat_plt.bar(index_list, data) 119 | # self.__auto_text(res) 120 | fig.savefig(self.save_img_path + f"/repeat_num_{self.type}.jpg") 121 | plt.close(fig) 122 | 123 | def analysis_info(self): 124 | """ 125 | 获取基础分析数据 126 | :return: 127 | """ 128 | if len(self.data) == 0: 129 | return { 130 | "r_size": "", 131 | "max_price": 0, 132 | "min_price": 0, 133 | "avg_price": 0, 134 | "all_volume": 0, 135 | "premium": 0 136 | } 137 | 138 | # 获取数据 139 | recommended_size = self.data.groupby("properties_values")["id"].count().reset_index(name="count") \ 140 | .sort_values("count", ascending=False).head(3).properties_values.values 141 | r_size = ",".join(recommended_size) 142 | 143 | # 基础属性 144 | max_price = self.data.price.max() 145 | min_price = self.data.price.min() 146 | avg_price = round(self.data.price.mean(), 2) 147 | all_volume = self.data.shape[0] 148 | 149 | # 计算溢价 150 | auth_price = self.db.getOne(f"SELECT auth_price FROM org_detail WHERE article_number = '{self.article_number}'")[0] 151 | premium = round((Decimal(avg_price) - auth_price) / auth_price * 100, 2) 152 | return { 153 | "r_size": r_size, 154 | "max_price": max_price, 155 | "min_price": min_price, 156 | "avg_price": avg_price, 157 | "all_volume": all_volume, 158 | "premium": premium 159 | } 160 | 161 | def update_info(self): 162 | """ 163 | 更新基础属性 164 | :return: 165 | """ 166 | an_info = self.analysis_info() 167 | # 持久化 168 | query_info_sql = f"SELECT * FROM org_data_analysis_info WHERE article_number = 
'{self.article_number}'" 169 | info = self.db.getOne(query_info_sql) 170 | if info: 171 | # 更新 172 | update_sql = f"UPDATE org_data_analysis_info " \ 173 | f"SET max_price = {an_info['max_price']}, avg_price = {an_info['avg_price']}," \ 174 | f"min_price = {an_info['min_price']}, premium = {an_info['premium']}," \ 175 | f"all_volume = {an_info['all_volume']},recommended_size = '{an_info['r_size']}'," \ 176 | f"update_time = '{time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())}' " \ 177 | f"WHERE article_number = '{self.article_number}'" 178 | self.db.executeSql(update_sql) 179 | else: 180 | # 插入 181 | info = ( 182 | None, 183 | self.article_number, 184 | 0, 185 | an_info['max_price'], 186 | an_info['avg_price'], 187 | an_info['min_price'], 188 | an_info['premium'], 189 | an_info['all_volume'], 190 | an_info['r_size'], 191 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 192 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 193 | ) 194 | insert_sql = "INSERT INTO org_data_analysis_info VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" 195 | self.db.insertData(sql=insert_sql, data=info) 196 | 197 | def get_ask_to_buy(self): 198 | """ 199 | 绘制求购数据 200 | :return: 201 | """ 202 | # 获取数据 203 | size = self.ask_to_buy.groupby("properties_values")["id"].count() 204 | date = self.ask_to_buy.groupby("format_time")["id"].count() 205 | 206 | size_index = size.index.values 207 | date_index = pd.to_datetime(date.index.values, format="%Y-%m-%d") 208 | 209 | size_data = size.values 210 | date_data = date.values 211 | 212 | # 绘图 213 | fig = plt.figure(figsize=(self.image_wide, self.image_high)) 214 | size_plt = fig.add_subplot(211) 215 | size_plt.set_title("尺码-求购量图", fontsize=self.title_fontsize) 216 | size_plt.set_ylabel("求购量", fontsize=self.label_fontsize) 217 | size_plt.set_xlabel("尺码", fontsize=self.label_fontsize) 218 | size_plt.bar(size_index, size_data) 219 | 220 | date_plt = fig.add_subplot(212) 221 | date_plt.set_title("日期-求购量图", fontsize=self.title_fontsize) 222 | date_plt.set_ylabel("求购量", fontsize=self.label_fontsize) 223 | date_plt.set_xlabel("日期", fontsize=self.label_fontsize) 224 | date_plt.bar(date_index, date_data) 225 | fig.savefig(self.save_img_path + f"/ask_to_buy_{self.type}.jpg") 226 | plt.close(fig) 227 | 228 | def run_analysis(self): 229 | """ 230 | 数据分析 231 | :return: 232 | """ 233 | if self.type == "one_month": 234 | # 修改信息 235 | self.log.info(f"正在更新【{self.article_number}】交易信息") 236 | self.update_info() 237 | self.log.info(f"【{self.article_number}】交易信息更新完成") 238 | 239 | count = len(self.data) 240 | if count > 0: 241 | # 生成日期价格图 242 | self.log.info(f"正在生成【{self.article_number}】日期价格图") 243 | self.get_price_volume() 244 | self.log.info(f"【{self.article_number}】日期价格图生成完毕") 245 | 246 | # 生成尺码价格图 247 | self.log.info(f"正在生成【{self.article_number}】尺码价格图") 248 | self.get_price_volume(chart_type="尺码") 249 | self.log.info(f"【{self.article_number}】尺码价格图生成完毕") 250 | 251 | # 生成求购图 252 | self.log.info(f"正在生成【{self.article_number}】求购图") 253 | self.get_ask_to_buy() 254 | self.log.info(f"【{self.article_number}】求购图生成完毕") 255 | 256 | # 生成推荐尺码移动平均线图 257 | self.log.info(f"正在生成【{self.article_number}】SMV图") 258 | self.get_ma() 259 | self.log.info(f"【{self.article_number}】SMV图生成完毕") 260 | 261 | # 生成交易量重复图 262 | self.log.info(f"正在生成【{self.article_number}】交易量重复图") 263 | self.get_repeat_num() 264 | self.log.info(f"【{self.article_number}】交易量重复图生成完毕") 265 | 266 | # 生成用户交易量重复图 267 | self.log.info(f"正在生成【{self.article_number}】用户交易量重复图") 268 | self.get_user_repeat() 269 | 
self.log.info(f"【{self.article_number}】用户交易量重复图生成完毕") 270 | 271 | def __get_recommended_data(self): 272 | """ 273 | 获取推荐尺码所有的数据 274 | :return: 275 | """ 276 | recommended_size = self.data.groupby("properties_values")["id"].count().reset_index(name="count") \ 277 | .sort_values("count", ascending=False).head(3).properties_values.values.tolist() 278 | # 删除非推荐尺码数据 279 | recommended_data = self.data.drop(self.data[(self.data.properties_values != recommended_size[0]) 280 | & (self.data.properties_values != recommended_size[1]) 281 | & (self.data.properties_values != recommended_size[2])].index) 282 | return recommended_data 283 | 284 | def get_ma(self): 285 | """ 286 | 绘制移动平均线 287 | :return: 288 | """ 289 | # 获取推荐尺码数据 290 | recommended_data = self.__get_recommended_data() 291 | data = recommended_data.groupby("format_time") 292 | 293 | # 处理数据 294 | date_list = [] 295 | avg_list = [] 296 | price_list = [] 297 | for index, d in data: 298 | one_day = self.__analysis_one_day(d) 299 | date_list.append(pd.to_datetime(index.value, format="%Y-%m-%d")) 300 | avg_list.append(one_day['avg_price']) 301 | price_list.append(one_day['close_price']) 302 | 303 | # 绘图 304 | fig = plt.figure(figsize=(self.image_wide, self.image_high)) 305 | plot = fig.add_subplot(111) 306 | plot.set_title("推荐尺码SMA(Simple Moving Average)", fontsize=self.title_fontsize) 307 | plot.set_xlabel("日期", fontsize=self.label_fontsize) 308 | plot.set_ylabel("价格", fontsize=self.label_fontsize) 309 | plot.plot(date_list, avg_list, label="平均线", color="#F08080") 310 | plot.plot(date_list, price_list, label="价格线", color="#DB7093", linestyle="--") 311 | plot.legend() 312 | plot.grid(alpha=0.4, linestyle=':') 313 | fig.savefig(self.save_img_path + f"/ma_{self.type}.jpg") 314 | plt.close(fig) 315 | 316 | def get_k_line(self): 317 | """ 318 | 绘制k线图 319 | :return: 320 | """ 321 | pass 322 | 323 | @staticmethod 324 | def __auto_text(rects): 325 | for rect in rects: 326 | plt.text(rect.get_x(), rect.get_height(), rect.get_height(), va='bottom') 327 | 328 | @staticmethod 329 | def __analysis_one_day(data): 330 | """ 331 | 分析一天的数据 332 | :param data: 333 | :return: 334 | """ 335 | max_price = data.price.max() 336 | min_price = data.price.min() 337 | avg_price = round(data.price.mean(), 2) 338 | open_price = data.price.iloc[0] 339 | close_price = data.price.iloc[data.shape[0] - 1] 340 | return { 341 | "max_price": max_price, 342 | "min_price": min_price, 343 | "avg_price": avg_price, 344 | "open_price": open_price, 345 | "close_price": close_price 346 | } -------------------------------------------------------------------------------- /app/data_analysis/analysis_executor.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from app.data_analysis.analysis import Analysis 4 | from app.data_analysis.generate_reports import GenerateReports 5 | from app.log import Logger 6 | from app.db.my_sql_db import MySqlDb 7 | from app.decorator.decorator import error_repeat 8 | 9 | 10 | class AnalysisExecutor: 11 | def __init__(self): 12 | self.db = MySqlDb() 13 | self.log = Logger().logger 14 | self.thread_count = 4 15 | 16 | def update_all_data(self): 17 | """ 18 | 更新所有商品信息数据 19 | :return: 20 | """ 21 | self.log.info("正在启动单线程数据分析程序...") 22 | # 查询所有已有记录的商品列表 23 | commodity_sql = 'SELECT * FROM org_detail' 24 | commodity_list = self.db.query(commodity_sql) 25 | article_number_list = [com[7] for com in commodity_list] 26 | 27 | # with ThreadPoolExecutor(max_workers=self.thread_count) as executor: 28 | # 
executor.map(self.update_one_date, article_number_list) 29 | 30 | for commodity in article_number_list: 31 | # 一个月数据 32 | self.update_one_month(commodity) 33 | self.reports_one_month(commodity) 34 | 35 | # 三个月数据 36 | self.update_three_month(commodity) 37 | self.reports_three_month(commodity) 38 | 39 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 40 | self.log.info(f"{now}程序结束") 41 | 42 | def update_one_month(self, article_number): 43 | """ 44 | 更新一个月数据 45 | :param article_number: 46 | :return: 47 | """ 48 | self.log.info(f"正在对【{article_number}】进行一个月数据分析") 49 | an = Analysis(article_number, self.db) 50 | an.run_analysis() 51 | self.log.info(f"【{article_number}】数据分析完成") 52 | 53 | @error_repeat 54 | def update_three_month(self, article_number): 55 | """ 56 | 更新三个月数据 57 | :param article_number: 58 | :return: 59 | """ 60 | self.log.info(f"正在对【{article_number}】进行三个月数据分析") 61 | an = Analysis(article_number, self.db, _type='three_month') 62 | an.run_analysis() 63 | self.log.info(f"【{article_number}】数据分析完成") 64 | 65 | @error_repeat 66 | def reports_one_month(self, article_number): 67 | """ 68 | 生成一个月的数据报告 69 | :param article_number: 70 | :return: 71 | """ 72 | self.log.info(f"正在生成【{article_number}】一个月数据分析报告") 73 | gen = GenerateReports(article_number, self.db) 74 | gen.generate() 75 | self.log.info(f"【{article_number}】一个月数据分析报告生成成功") 76 | 77 | @error_repeat 78 | def reports_three_month(self, article_number): 79 | """ 80 | 生成三个月的数据报告 81 | :param article_number: 82 | :return: 83 | """ 84 | self.log.info(f"正在生成【{article_number}】三个月数据分析报告") 85 | gen = GenerateReports(article_number, self.db, reports_type="three_month") 86 | gen.generate() 87 | self.log.info(f"【{article_number}】三个月数据分析报告生成成功") -------------------------------------------------------------------------------- /app/data_analysis/generate_reports.py: -------------------------------------------------------------------------------- 1 | import docxtpl 2 | import os 3 | import time 4 | 5 | from docx.shared import Mm 6 | from app.configUtil import ConfigUtil 7 | from app.log import Logger 8 | from app.data_analysis.analysis import Analysis 9 | 10 | 11 | class GenerateReports: 12 | """ 13 | 生成数据分析报告 14 | """ 15 | 16 | def __init__(self, article_number, db, reports_type="one_month"): 17 | self.article_number = article_number 18 | self.temp_path = os.path.dirname(os.path.dirname(__file__)) + "/static/data_analysis_tpl.docx" 19 | self.reports_type = reports_type 20 | self.db = db 21 | self.log = Logger().logger 22 | conf = ConfigUtil() 23 | self.img_path = conf.getValue("img_path") + self.article_number + ".jpg" 24 | self.analysis_img_path = conf.getValue("analysis_img_path") + self.article_number 25 | 26 | def generate(self): 27 | # 基础数据 28 | detail_sql = f"SELECT title,auth_price,sell_date,brand FROM org_detail " \ 29 | f"WHERE article_number ='{self.article_number}'" 30 | detail = self.db.getOne(detail_sql) 31 | daily_docx = docxtpl.DocxTemplate(self.temp_path) 32 | # logo_img = docxtpl.InlineImage(daily_docx, self.img_path, width=Mm(140)) 33 | 34 | # 基础分析数据 35 | an = Analysis(self.article_number, self.db, self.reports_type) 36 | info = an.analysis_info() 37 | 38 | # 分析图 39 | size_price_volume = docxtpl.InlineImage(daily_docx, self.analysis_img_path 40 | + f'/size_price_volume_{self.reports_type}.jpg', width=Mm(140)) 41 | date_price_volume = docxtpl.InlineImage(daily_docx, self.analysis_img_path 42 | + f'/date_price_volume_{self.reports_type}.jpg', width=Mm(140)) 43 | ask_to_buy = docxtpl.InlineImage(daily_docx, 
self.analysis_img_path 44 | + f'/ask_to_buy_{self.reports_type}.jpg', width=Mm(140)) 45 | ma = docxtpl.InlineImage(daily_docx, self.analysis_img_path 46 | + f'/ma_{self.reports_type}.jpg', width=Mm(140)) 47 | user_repeat = docxtpl.InlineImage(daily_docx, self.analysis_img_path 48 | + f'/user_repeat_{self.reports_type}.jpg', width=Mm(140)) 49 | repeat_num = docxtpl.InlineImage(daily_docx, self.analysis_img_path 50 | + f'/repeat_num_{self.reports_type}.jpg', width=Mm(140)) 51 | 52 | # 渲染内容 53 | context = { 54 | "name": detail[0], 55 | "brand": detail[3], 56 | "auth_price": detail[1], 57 | "sell_date": detail[2], 58 | "recommended_size": info['r_size'], 59 | "max_price": info['max_price'], 60 | "min_price": info['min_price'], 61 | "all_volume": info['all_volume'], 62 | "premium": info['premium'], 63 | "avg_price": info['avg_price'], 64 | "size_price_volume": size_price_volume, 65 | "date_price_volume": date_price_volume, 66 | "ask_to_buy": ask_to_buy, 67 | "ma": ma, 68 | "user_repeat": user_repeat, 69 | "repeat_num": repeat_num, 70 | "create_time": time.strftime('%Y-%m-%d', time.localtime()) 71 | } 72 | # 渲染docx 73 | daily_docx.render(context) 74 | # 保存docx 75 | daily_docx.save(self.analysis_img_path + '/' + self.article_number + '_' + self.reports_type + ".docx") 76 | -------------------------------------------------------------------------------- /app/db/my_sql_db.py: -------------------------------------------------------------------------------- 1 | import pymysql 2 | from app.log import Logger 3 | from app.configUtil import ConfigUtil 4 | from sqlalchemy import create_engine 5 | 6 | log = Logger().log() 7 | config = ConfigUtil() 8 | 9 | 10 | class MySqlDb: 11 | def __init__(self): 12 | self.dbConfig = config.getValue('db') 13 | self.host = self.dbConfig['host'] 14 | self.username = self.dbConfig['username'] 15 | self.password = self.dbConfig['password'] 16 | self.port = self.dbConfig['port'] 17 | self.db = self.dbConfig['dbName'] 18 | self.charset = self.dbConfig['charset'] 19 | 20 | def getConnect(self): 21 | """ 22 | 获取mysql数据库连接 23 | :return: 24 | """ 25 | con = None 26 | try: 27 | con = pymysql.connect( 28 | host=self.host, user=self.username, passwd=self.password, port=self.port, db=self.db, charset=self.charset 29 | ) 30 | except Exception as e: 31 | log.error(e) 32 | log.info("获取数据库连接失败,正在尝试重新连接...") 33 | for _ in range(5): 34 | try: 35 | con = pymysql.connect( 36 | host=self.host, user=self.username, passwd=self.password, port=self.port, db=self.db, 37 | charset=self.charset 38 | ) 39 | log.info("连接成功!") 40 | break 41 | except Exception as e: 42 | log.error(e) 43 | return con 44 | 45 | @staticmethod 46 | def getCursor(connect): 47 | """ 48 | 获取数据库游标 49 | :param connect: 50 | :return: 51 | """ 52 | return connect.cursor() 53 | 54 | def getEngine(self): 55 | """ 56 | 获取pandas需要的Engine 57 | :return: 58 | """ 59 | return create_engine(f'mysql+pymysql://{self.username}:{self.password}@{self.host}:{self.port}/{self.db}') 60 | 61 | def insertDataList(self, sql, data): 62 | """ 63 | 插入一个集合数据 64 | :param sql: 65 | :param data: 66 | :return: 67 | """ 68 | for entity in data: 69 | self.insertData(sql, entity) 70 | 71 | def insertData(self, sql, data): 72 | """ 73 | 执行插入语句 74 | :param sql:执行的sql 75 | :param data:需要插入的数据 76 | :return: 77 | """ 78 | if sql is not None and sql != ' ': 79 | if data is not None: 80 | con = self.getConnect() 81 | cur = self.getCursor(con) 82 | cur.execute(sql, data) 83 | con.commit() 84 | # 关闭连接,关闭游标 85 | cur.close() 86 | con.close() 87 | log.info("数据插入成功") 88 | 
else: 89 | log.info("待插入数据不能为空") 90 | else: 91 | log.info("执行sql不能为空") 92 | 93 | def query(self, sql): 94 | """ 95 | 查询数据 96 | :param sql: 97 | :return: 98 | """ 99 | if sql is not None and sql != " ": 100 | con = self.getConnect() 101 | cur = self.getCursor(con) 102 | cur.execute(sql) 103 | data = cur.fetchall() 104 | cur.close() 105 | con.close() 106 | return data 107 | else: 108 | log.info("执行sql不能为空") 109 | 110 | def getOne(self, sql): 111 | """ 112 | 查询数据 113 | :param sql: 114 | :return: 115 | """ 116 | if sql is not None and sql != " ": 117 | con = self.getConnect() 118 | cur = self.getCursor(con) 119 | cur.execute(sql) 120 | data = cur.fetchone() 121 | cur.close() 122 | con.close() 123 | return data 124 | else: 125 | log.info("执行sql不能为空") 126 | 127 | def executeSql(self, sql): 128 | """ 129 | 执行sql语句 130 | :param sql: 131 | :return: 132 | """ 133 | con = self.getConnect() 134 | cur = self.getCursor(con) 135 | try: 136 | if sql is not None and sql != " ": 137 | cur.execute(sql) 138 | else: 139 | log.info("执行sql不能为空") 140 | except Exception as e: 141 | log.error(e) 142 | con.rollback() 143 | else: 144 | con.commit() 145 | finally: 146 | # 关闭连接,关闭游标 147 | cur.close() 148 | con.close() 149 | 150 | -------------------------------------------------------------------------------- /app/de_wu_spider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | import os 4 | import datetime 5 | import random 6 | 7 | from app.util import request_util 8 | from app.db.my_sql_db import MySqlDb 9 | from app.configUtil import ConfigUtil 10 | from app.util.zhi_ma_ip import ZhiMaIp 11 | from app.log import Logger 12 | from app.data_analysis.analysis_executor import AnalysisExecutor 13 | from concurrent.futures import ThreadPoolExecutor 14 | 15 | 16 | class DeWuSpider: 17 | def __init__(self): 18 | self.db = MySqlDb() 19 | self.config = ConfigUtil() 20 | self.log = Logger().logger 21 | self.zm = ZhiMaIp() 22 | self.proxies = self.zm.getOneProxies() 23 | self.thread_count = 4 24 | 25 | # 移除跳过认证警告 26 | requests.packages.urllib3.disable_warnings() 27 | 28 | def get_info(self, spuId): 29 | """ 30 | 根据spuId获取商品信息 31 | :param spuId:得物唯一标识 32 | :return: 33 | """ 34 | self.log.info(f"开始获取详情-->spuId:{spuId}...") 35 | data = { 36 | "spuId": spuId, 37 | "productSourceName": "", 38 | "propertyValueId": "0" 39 | } 40 | data = request_util.add_sign(data) 41 | url = 'https://app.dewu.com/api/v1/h5/index/fire/flow/product/detail' 42 | # 发送请求 43 | res = self.try_err_send_request('info', data, url) 44 | if res.status_code == 200: 45 | self.log.info(f"spuId:{spuId},发送请求成功,正在解析数据...") 46 | data = res.json().get('data') 47 | # 详情 48 | pageDetail = data.get('detail') 49 | articleNumber = pageDetail.get('articleNumber') 50 | detail = self.db.getOne("SELECT * FROM org_detail WHERE article_number = '{}'".format(articleNumber)) 51 | if not detail: 52 | # 参数 53 | baseProperties = data.get('baseProperties') 54 | brandList = baseProperties["brandList"] 55 | parameterList = baseProperties["list"] 56 | parameters = self.get_parameter(parameterList) 57 | 58 | # 图片 59 | # image_and_txt = data.get("imageAndText") 60 | # imgList = self.get_img_url(image_and_txt, articleNumber) 61 | 62 | # 下载logo 63 | logoUrl = self.downloadImg(pageDetail["logoUrl"], articleNumber) 64 | detail = ( 65 | None, 66 | pageDetail.get('title'), 67 | spuId, 68 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 69 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()), 70 | 
pageDetail.get('authPrice'), 71 | pageDetail.get('sellDate'), 72 | articleNumber, 73 | logoUrl, 74 | brandList[0].get("brandName"), 75 | parameters["functionality"], 76 | parameters["blendent"], 77 | parameters["upperLevel"], 78 | parameters["topShoeStyles"], 79 | parameters["heelType"], 80 | None 81 | ) 82 | self.log.info(f"spuId:{spuId},开始入库...") 83 | # 持久化到数据库 84 | # 插入详情 85 | detail_sql = 'INSERT INTO org_detail VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)' 86 | self.db.insertData(detail_sql, detail) 87 | # 插入详情图片 88 | # insert_img_sql = 'INSERT INTO org_detail_img VALUES(%s, %s, %s, %s, %s, %s)' 89 | # self.db.insertDataList(insert_img_sql, imgList) 90 | self.log.info(f"spuId:{spuId},入库结束...") 91 | 92 | @staticmethod 93 | def get_parameter(parameterList): 94 | """ 95 | 根据参数列表返回参数字典 96 | :param parameterList: 参数列表 97 | :return:参数字典 98 | """ 99 | parameter = {'functionality': None, 'blendent': None, 'upperLevel': None, 'topShoeStyles': None, 'heelType': None} 100 | for p in parameterList: 101 | key = p['key'] 102 | value = p['value'] 103 | if key == '功能性': 104 | parameter['functionality'] = value 105 | elif key == '配色': 106 | parameter['blendent'] = value 107 | elif key == '鞋帮高度': 108 | parameter['upperLevel'] = value 109 | elif key == '鞋头款式': 110 | parameter['topShoeStyles'] = value 111 | elif key == '鞋跟类型': 112 | parameter['heelType'] = value 113 | return parameter 114 | 115 | def get_img_url(self, images, article_number): 116 | """ 117 | 获取图片URL 118 | :param article_number: 119 | :param images: 120 | :return: 121 | """ 122 | imgList = [] 123 | count = 1 124 | for g in images: 125 | if len(g) > 2: 126 | # 判断是否为尺码对照表 127 | contentType = g['contentType'] 128 | if contentType != 'STRUCTURE_SIZE' and contentType != 'SIZETEMPLATE' and contentType != 'ATTENTION': 129 | g = g['images'][0] 130 | else: 131 | continue 132 | u = self.downloadImg(g.get('url'), article_number + str(count)) 133 | height = g["height"] 134 | if height > 100: 135 | img = ( 136 | None, 137 | article_number, 138 | u, 139 | count, 140 | g["width"], 141 | g["height"] 142 | ) 143 | imgList.append(img) 144 | count += 1 145 | return imgList 146 | 147 | def get_record(self, spuId): 148 | """ 149 | 获取当天的交易记录 150 | :param spuId: 得物唯一标识 151 | :return: 152 | """ 153 | self.log.info(f"开始获取交易记录-->spuId:{spuId}...") 154 | # 获取当前商品的货号 155 | get_article_number_sql = f"SELECT article_number FROM org_detail WHERE spu_id = '{spuId}'" 156 | _article_number = self.db.getOne(get_article_number_sql)[0] 157 | 158 | # 获取最新一条交易记录 159 | get_newest_sql = f"select * from org_last_record r WHERE article_number='{_article_number}'" 160 | newest = self.db.getOne(get_newest_sql) 161 | lastId = "" 162 | count = 1 163 | while True: 164 | self.log.info(f"正在请求{spuId}---第{count}页---交易记录...") 165 | record, lastId, flag_stop = self.get_trading_record(spuId, lastId, _article_number, newest) 166 | # 插入数据库 167 | insert_sql = 'INSERT INTO org_purchase_record VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)' 168 | self.log.info(f"spuId:{spuId}---第{count}页---交易记录解析完成") 169 | self.db.insertDataList(insert_sql, record) 170 | # 判断当前是否为第一次获取 171 | if count == 1 and len(record) > 0: 172 | # 删除上次获取的记录 173 | del_sql = f"DELETE FROM org_last_record WHERE article_number = '{_article_number}'" 174 | self.db.executeSql(del_sql) 175 | # 把当前最新的记录更新到列表 176 | insert_sql = 'INSERT INTO org_last_record VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s)' 177 | self.db.insertData(insert_sql, record[-1]) 178 | 179 | # 判断是否停止 180 | if flag_stop: 181 | 
self.log.info(f"spuId:{spuId}---今日交易记录获取完成") 182 | break 183 | 184 | # 随机随眠一到三秒 185 | time.sleep(random.randint(1, 3)) 186 | count += 1 187 | 188 | def get_trading_record(self, spuId, lastId, _article_number, newest): 189 | """ 190 | 根据spuId获取交易记录 191 | :param newest: 最新交易记录 192 | :param _article_number:货号 193 | :param spuId: 得物唯一标识 194 | :param lastId: 下一页标识 195 | :return:返回交易记录和下一页标识 196 | """ 197 | recordList = [] 198 | # 标识是否还要继续爬取 199 | flag_stop = False 200 | data = { 201 | "spuId": spuId, 202 | "limit": '20', 203 | "lastId": lastId, 204 | "sourceApp": "app" 205 | } 206 | data = request_util.add_sign(data) 207 | url = 'https://app.dewu.com/api/v1/h5/commodity/fire/last-sold-list' 208 | # 发送请求 209 | res = self.try_err_send_request('record', data, url) 210 | if res.status_code == 200: 211 | self.log.info("交易记录请求成功,正在解析交易数据。。。") 212 | all_data = res.json() 213 | lastId = all_data.get('data').get('lastId') 214 | # 判断下一次请求是否已经无数据,如果是返回空集合与停止循环标识 215 | if lastId == "": 216 | flag_stop = True 217 | return recordList, lastId, flag_stop 218 | 219 | data_list = all_data.get('data').get('list') 220 | for d in data_list: 221 | formatTime = d['formatTime'] 222 | formatTime = self.refactorFormatTime(formatTime) 223 | record = ( 224 | None, 225 | spuId, 226 | _article_number, 227 | d['userName'], 228 | formatTime, 229 | d['price'] / 100, 230 | d['orderSubTypeName'], 231 | d['propertiesValues'], 232 | time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 233 | ) 234 | 235 | # 判断是否是当天的数据 如果是则放入集合,如果不是则跳出循环标识不再获取交易记录 236 | if newest: 237 | if self.compareRecord(newest, record): 238 | flag_stop = True 239 | break 240 | 241 | recordList.append(record) 242 | return recordList[::-1], lastId, flag_stop 243 | 244 | @staticmethod 245 | def compareRecord(dbData, record): 246 | """ 247 | 比较两条记录是否相等 248 | :param dbData: 249 | :param record: 250 | :return: 251 | """ 252 | for i in range(len(record)): 253 | if i == 0 or i == 4 or i == 8: 254 | continue 255 | if i == 5: 256 | price = dbData[i] 257 | if not int(price) == record[i]: 258 | return False 259 | else: 260 | if not dbData[i] == record[i]: 261 | return False 262 | return True 263 | 264 | @staticmethod 265 | def refactorFormatTime(formatTime): 266 | """ 267 | 重构交易时间返回数据类型yyyy-MM-dd 268 | :param formatTime: 269 | :return: 270 | """ 271 | if '前' in formatTime or '刚刚' == formatTime: 272 | if '小时' in formatTime: 273 | h = formatTime[0:formatTime.find('小')] 274 | newTime = (datetime.datetime.now() + datetime.timedelta(hours=-int(h))).strftime("%Y-%m-%d") 275 | else: 276 | newTime = datetime.datetime.now().strftime("%Y-%m-%d") 277 | if '天' in formatTime: 278 | dd = formatTime[0:formatTime.find('天')] 279 | newTime = (datetime.datetime.now() + datetime.timedelta(days=-int(dd))).strftime("%Y-%m-%d") 280 | else: 281 | if '月' in formatTime: 282 | newTime = time.strftime("%Y", time.localtime()) + '-' + formatTime.replace('月', '-').replace('日', '') 283 | else: 284 | newTime = formatTime.replace('.', '-') 285 | return newTime 286 | 287 | def downloadImg(self, imgUrl, fileName): 288 | """ 289 | 下载图片 290 | :param imgUrl: 图片路径 291 | :param fileName: 图片名称 292 | :return: 293 | """ 294 | img_path = self.config.getValue('img_path') 295 | # 判断文件夹是否存在 296 | if not os.path.exists(img_path): 297 | os.makedirs(img_path) 298 | # 发请求并保存图片 299 | suffix = os.path.splitext(imgUrl)[1] 300 | if suffix == '': 301 | suffix = '.jpg' 302 | r = None 303 | try: 304 | r = requests.get(url=imgUrl, stream=True, timeout=(10, 10), proxies=self.proxies) 305 | except Exception as e: 306 | 
self.log.error(e) 307 | self.log.info(f"下载图片{imgUrl}请求失败,开始重新发起请求") 308 | for i in range(5): 309 | try: 310 | self.proxies = self.zm.getOneProxies() 311 | self.log.info(f"当前代理为{self.proxies}") 312 | r = requests.get(url=imgUrl, stream=True, timeout=(10, 10), proxies=self.proxies) 313 | self.log.info(f"第{i + 1}次尝试成功") 314 | break 315 | except Exception as e: 316 | self.log.error(e) 317 | self.log.info(f"第{i + 1}次尝试失败") 318 | if r.status_code == 200: 319 | all_name = fileName + suffix 320 | open(img_path + all_name, 'wb').write(r.content) 321 | return all_name 322 | else: 323 | return None 324 | 325 | def query_by_key(self, key): 326 | """ 327 | 根据key查询数据 328 | :param key: 参数 329 | :return: 330 | """ 331 | self.log.info(f"正在根据关键词【{key}】查看结果") 332 | query_res = [] 333 | pram = { 334 | 'title': key, 335 | 'page': '0', 336 | 'sortType': '0', 337 | 'sortMode': '1', 338 | 'limit': '20', 339 | 'showHot': '1', 340 | 'isAggr': '1' 341 | } 342 | data = request_util.add_sign(pram) 343 | url = "https://app.dewu.com/api/v1/h5/search/fire/search/list" 344 | res = self.try_err_send_request('search', data, url) 345 | if res.status_code == 200: 346 | self.log.info(f"根据关键词【{key}】查询请求成功!") 347 | all_data = res.json() 348 | for entity in all_data['data']['productList']: 349 | query_res.append(entity) 350 | return query_res 351 | 352 | def try_err_send_request(self, send_type, data, url): 353 | """ 354 | 发送请求 355 | :param send_type:发送类型 356 | :param data:参数 357 | :param url:请求路径 358 | :return: 359 | """ 360 | res = None 361 | try: 362 | res = self.send_request(send_type, data, url) 363 | except Exception as e: 364 | self.log.error(e) 365 | self.log.info(f",发送请求失败,正在尝试重新请求...") 366 | for i in range(5): 367 | self.log.info(f"正在尝试第{i + 1}次请求...") 368 | self.proxies = self.zm.getOneProxies() 369 | self.log.info(f"当前代理为{self.proxies}") 370 | try: 371 | res = self.send_request(send_type, data, url) 372 | self.log.info(f"第{i + 1}次请求成功...") 373 | break 374 | except Exception as e: 375 | self.log.error(e) 376 | self.log.info(f"第{i + 1}次请求失败...") 377 | return res 378 | 379 | def send_request(self, send_type, data, url): 380 | """ 381 | 单独发送请求 382 | :param send_type: 请求类型 383 | :param url: 请求路径 384 | :param data: 请求参数 385 | :return: 386 | """ 387 | if 'search' == send_type: 388 | res = requests.get(url=url, params=data, headers=request_util.get_header(send_type), 389 | verify=False, timeout=(10, 10), proxies=self.proxies) 390 | else: 391 | res = requests.post(url=url, json=data, headers=request_util.get_header(send_type), 392 | verify=False, timeout=(10, 10), proxies=self.proxies) 393 | return res 394 | 395 | def run(self): 396 | """ 397 | 启动爬虫 398 | :return: 399 | """ 400 | self.log.info("正在启动得物爬虫程序...") 401 | # 获取代理 402 | self.log.info(f"当前代理:{self.proxies}") 403 | # 查询所有待查询的商品列表 404 | commodity_sql = 'SELECT * FROM org_all_commodity' 405 | commodity_list = self.db.query(commodity_sql) 406 | for commodity in commodity_list: 407 | self.do_one_commodity_data(commodity) 408 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 409 | self.log.info(f"{now}程序结束") 410 | 411 | def do_one_commodity_data(self, commodity): 412 | """ 413 | 获取单个商品的数据 414 | :param commodity: 415 | :return: 416 | """ 417 | self.log.info(f"开始执行【{commodity[2]}】商品") 418 | spu_id = commodity[1] 419 | if commodity[3] == 1: 420 | data = self.query_by_key(commodity[2])[0] 421 | spu_id = str(data['spuId']) 422 | self.get_info(spu_id) 423 | # 修改数据库状态 424 | update_sql = f'UPDATE org_all_commodity SET is_new = 0,spu_id = {spu_id} WHERE id = 
{commodity[0]}' 425 | self.db.executeSql(update_sql) 426 | self.get_record(spu_id) 427 | 428 | # 进行第一轮数据分析 429 | if commodity[3] == 1: 430 | self.log.info(f"开始对【{commodity[2]}】商品进行第一轮数据分析") 431 | an = AnalysisExecutor() 432 | # 一月数据 433 | an.update_one_month(commodity[2]) 434 | an.reports_one_month(commodity[2]) 435 | 436 | # 三月数据 437 | an.update_three_month(commodity[2]) 438 | an.reports_three_month(commodity[2]) 439 | self.log.info(f"对【{commodity[2]}】商品第一轮数据分析完毕") 440 | self.log.info(f"商品【{commodity[2]}】执行执行完毕!") 441 | 442 | def thread_run(self): 443 | """ 444 | 多线程爬虫启动 445 | :return: 446 | """ 447 | self.log.info("正在启动多线程得物爬虫程序...") 448 | # 获取代理 449 | self.log.info(f"当前代理:{self.proxies}") 450 | # 查询所有待查询的商品列表 451 | commodity_sql = 'SELECT * FROM org_all_commodity' 452 | commodity_list = self.db.query(commodity_sql) 453 | with ThreadPoolExecutor(max_workers=self.thread_count) as executor: 454 | executor.map(self.do_one_commodity_data, commodity_list) 455 | 456 | now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) 457 | self.log.info(f"{now}程序结束") -------------------------------------------------------------------------------- /app/decorator/decorator.py: -------------------------------------------------------------------------------- 1 | from app.log import Logger 2 | 3 | log = Logger().log() 4 | 5 | 6 | def error_repeat(func): 7 | """ 8 | 添加注解的方法在报错之后还会重新执行 9 | :param func: 10 | :return: 11 | """ 12 | 13 | def warp(*args, **kwargs): 14 | try: 15 | temp = func(*args, **kwargs) 16 | return temp 17 | except Exception as e: 18 | log.info(f"方法{func.__name__}执行报错,报错信息:" + str(e)) 19 | while True: 20 | try: 21 | log.info(f"方法{func.__name__}正在尝试重新执行") 22 | temp = func(*args, **kwargs) 23 | log.info(f"方法{func.__name__}尝试重新执行成功!") 24 | break 25 | except Exception as e: 26 | log.info(f"方法{func.__name__}执行报错,报错信息:" + str(e)) 27 | return temp 28 | 29 | return warp 30 | -------------------------------------------------------------------------------- /app/log.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import logging 3 | import datetime 4 | import os 5 | 6 | 7 | class Logger: 8 | """自定义封装logging模块""" 9 | 10 | def __init__(self, default_level=logging.INFO): 11 | # 加载配置文件地址 12 | cur_path = os.path.dirname(__file__) 13 | self.config_path = cur_path + "/config.yaml" 14 | 15 | # 初始化一个logger 16 | self.logger = logging.getLogger('__name__') 17 | self.default_level = default_level 18 | logger_main_level, logger_file_level, logger_console_level = self.config() 19 | self.logger.setLevel(logger_main_level) 20 | fomatter = logging.Formatter( 21 | '[%(asctime)s] %(filename)s line:%(lineno)d [%(levelname)s]%(message)s') 22 | # 初始化输出到日志文件的handle 23 | file_name = self.getLogFilePath() + '/{}log.txt'.format(datetime.datetime.now().strftime('%Y-%m-%d')) 24 | file_log = logging.FileHandler(filename=file_name, encoding='utf-8') 25 | file_log.setLevel(logger_file_level) 26 | file_log.setFormatter(fomatter) 27 | # 初始化增加输出到控制台的handle 28 | console_log = logging.StreamHandler() 29 | console_log.setLevel(logger_console_level) 30 | console_log.setFormatter(fomatter) 31 | 32 | if self.logger.hasHandlers() is False: 33 | self.logger.addHandler(file_log) 34 | self.logger.addHandler(console_log) 35 | # self.logger.removeHandler(file_log) 36 | # self.logger.removeHandler(console_log) 37 | file_log.close() 38 | console_log.close() 39 | 40 | def config(self): 41 | """ 42 | :return: 返回配置中读取的level 43 | """ 44 | try: 45 | 46 | with open(self.config_path, 'r', 
encoding='utf-8') as f: 47 | global config_data 48 | config_data = yaml.load(f, Loader=yaml.FullLoader) 49 | except IOError as e: 50 | print(e) 51 | self.logger.error('open config file failed') 52 | case1 = config_data['logConfig']['testLogLevel']['mainLogLevel'] 53 | case2 = config_data['logConfig']['testLogLevel']['fileLogLevel'] 54 | case3 = config_data['logConfig']['testLogLevel']['consoleLogLevel'] 55 | logger_main_level = self.switch(case=case1) 56 | logger_file_level = self.switch(case=case2) 57 | logger_console_level = self.switch(case=case3) 58 | log_file_path = config_data['logConfig']['logFilePath'] 59 | return logger_main_level, logger_file_level, logger_console_level 60 | 61 | def getLogFilePath(self): 62 | """ 63 | 获取配置文件中的看日志存放路径 64 | :return: 65 | """ 66 | try: 67 | with open(self.config_path, 'r', encoding='utf-8') as f: 68 | global config_data 69 | config_data = yaml.load(f, Loader=yaml.FullLoader) 70 | except IOError: 71 | self.logger.error('open config file failed') 72 | return config_data['logConfig']['logFilePath'] 73 | 74 | def switch(self, case): 75 | """ 76 | :param case: 传入需要做判断的level 77 | :return: 返回最终的level 78 | """ 79 | if case == 'DEBUG': 80 | result = logging.DEBUG 81 | elif case == 'INFO': 82 | result = logging.DEBUG 83 | elif case == 'ERROR': 84 | result = logging.ERROR 85 | elif case == 'CRITICAL': 86 | result = logging.CRITICAL 87 | else: 88 | result = self.logger.setLevel(self.default_level) 89 | return result 90 | 91 | def log(self): 92 | return self.logger 93 | 94 | 95 | if __name__ == '__main__': 96 | # log = Logger() 97 | # print(log.getLogFilePath()) 98 | 99 | with open('config.yaml', 'r') as f: 100 | print(f.readline()) 101 | -------------------------------------------------------------------------------- /app/util/pyplot_util.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | 3 | 4 | def make_fig(): 5 | pass 6 | -------------------------------------------------------------------------------- /app/util/zhi_ma_ip.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import requests 3 | import time 4 | from app.configUtil import ConfigUtil 5 | from app.log import Logger 6 | 7 | config = ConfigUtil() 8 | log = Logger().logger 9 | 10 | 11 | class ZhiMaIp: 12 | def __init__(self): 13 | self.url = config.getValue("zhi_ma_ip_url") 14 | self.addWhiteListIpUrl = config.getValue("white_list_ip_url") 15 | 16 | def send_request(self): 17 | """ 18 | 发送请求获取一个代理 19 | :return: 20 | """ 21 | res = None 22 | try: 23 | res = requests.get(self.url, timeout=3) 24 | except Exception as e: 25 | log.error(e) 26 | log.info("代理请求失败,尝试重新申请...") 27 | for i in range(5): 28 | try: 29 | res = requests.get(self.url, timeout=3) 30 | log.info(f"第{i + 1}次代理请求失败!") 31 | break 32 | except Exception as e: 33 | log.error(e) 34 | log.info(f"第{i + 1}次代理请求成功!") 35 | hostAndPort = res.text 36 | hostAndPort = hostAndPort.replace('\n', '').replace('\r', '') 37 | if len(hostAndPort) > 30: 38 | self.addWhiteList() 39 | res = requests.get(self.url) 40 | hostAndPort = res.text 41 | hostAndPort = hostAndPort.replace('\n', '').replace('\r', '') 42 | proxyMeta = "http://%(proxies)s" % { 43 | "proxies": hostAndPort 44 | } 45 | 46 | proxies = { 47 | "https": proxyMeta, 48 | "http": proxyMeta 49 | } 50 | return proxies 51 | 52 | def getOneProxies(self): 53 | """ 54 | 获取一个代理服务器 55 | :return: 56 | """ 57 | # while True: 58 | # proxies = self.send_request() 59 | # res = 
self.check_proxies(proxies) 60 | # if res: 61 | # break 62 | return self.send_request() 63 | 64 | @staticmethod 65 | def check_proxies(proxies): 66 | """ 67 | 检测代理活性和状态 68 | :param proxies: 69 | :return: 70 | """ 71 | # 检测代理活性和状态 72 | start = time.clock() 73 | res = requests.get(url="http://icanhazip.com/", timeout=8, proxies=proxies).text.replace('\n', '') 74 | end = time.clock() 75 | return (end - start) < 2 and res in str(proxies) 76 | 77 | def addWhiteList(self): 78 | """ 79 | 添加一个IP至白名单 80 | :return: 81 | """ 82 | ip = requests.get(url="http://ip.42.pl/raw").text 83 | url = self.addWhiteListIpUrl + ip 84 | requests.get(url) 85 | 86 | 87 | if __name__ == '__main__': 88 | z = ZhiMaIp() 89 | print(z.getOneProxies()) 90 | 91 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | monitortime: 60 2 | 3 | maxtimeout: 30 4 | logConfig: 5 | logFilePath: 'app/logs/' 6 | testLogLevel: 7 | mainLogLevel: INFO 8 | fileLogLevel: INFO 9 | consoleLogLevel: INFO 10 | productLogLevel: 11 | mainLogLevel: INFO 12 | fileLogLevel: INFO 13 | consoleLogLevel: ERROR 14 | 15 | db: 16 | host: '数据库地址' 17 | username: '用户名' 18 | password: '密码' 19 | port: 端口 20 | dbName: '库名' 21 | charset: 'utf8' 22 | 23 | 24 | img_path: "商品图片保存地址" 25 | 26 | analysis_img_path: "分析图片保存地址" 27 | 28 | zhi_ma_ip_url: "芝麻代理接口" 29 | 30 | white_list_ip_url: "芝麻代理加白名单接口" 31 | 32 | 33 | -------------------------------------------------------------------------------- /developer.py: -------------------------------------------------------------------------------- 1 | from app.data_analysis.analysis_executor import AnalysisExecutor 2 | from app.data_analysis.analysis import Analysis 3 | from app.data_analysis.generate_reports import GenerateReports 4 | from app.decorator.decorator import error_repeat 5 | 6 | 7 | an = AnalysisExecutor() 8 | an.update_one_month("327624-001") 9 | # an.update_three_month("327624-001") 10 | # an.reports_one_month("327624-001") 11 | # an.reports_three_month("327624-001") 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from apscheduler.schedulers.blocking import BlockingScheduler 2 | from app.data_analysis.analysis_executor import AnalysisExecutor 3 | from app.de_wu_spider import DeWuSpider 4 | from app.log import Logger 5 | 6 | log = Logger().logger 7 | 8 | 9 | def run(): 10 | # 添加定时器 11 | log.info("得物数据分析程序定时器已启动") 12 | scheduler = BlockingScheduler() 13 | # 定时爬虫程序 14 | # spider = DeWuSpider() 15 | # scheduler.add_job(spider.thread_run, 'cron', hour=00) 16 | # scheduler.add_job(spider.thread_run, 'cron', hour=12) 17 | 18 | # 数据分析程序 19 | analysis = AnalysisExecutor() 20 | # scheduler.add_job(analysis.update_all_data, 'cron', day_of_week=0, hour=3) 21 | scheduler.add_job(analysis.update_all_data, 'cron', hour=14, minute=31) 22 | scheduler.start() 23 | 24 | 25 | if __name__ == '__main__': 26 | run() 27 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # 得物数据爬取、分析平台 4 | 5 | ## Introduction - 介绍 6 | 该项目主要完成对得物交易数据的爬取与分析,对本地的交易数据进行分析生成一些数据分析图,能够准确的预估价格的走势。 7 | 8 | ### Summary - 概要 9 | #### 1、交易记录爬虫 10 | 每天定时爬取得物交易记录,每次爬取存在记忆功能,不会存在重复爬取的现象。 11 | #### 2、数据分析 12 | 每周定时对历史数据进行新的一轮分析,并生成数据分析图、数据分析报告 13 | 14 | ### 
Features - 特性 15 | #### 1、实时性 16 | #### 2、准确性 17 | #### 3、个性化 18 | 19 | ## Configuration - 配置 20 | 导入模块: pip3 install -i http://pypi.douban.com/simple --trusted-host pypi.douban.com -r requirements.txt 21 | 22 | ## FAQ - 常见问题 23 | #### 1、代理问题:代理需要自行购买 24 | #### 2、数据库问题:项目没有提供数据库文件,如需要自行联系作者 25 | #### 3、启动(运行)问题:python main.py 及启动定时爬取与定时分析任务 26 | #### 4、代码优化与意见:联系作者 27 | #### 5、其他问题:如有其他问题及时联系作者 28 | 29 | ## Contact - 联系 30 | 联系方式:3392903032@qq.com 31 | 32 | ## License - 版权信息 33 | #### 该项目仅供学习交流使用,禁止进行商业用途,如有出现,作者不承担任何法律责任。 34 | #### 项目未经作者允许不得他用,一旦发现,追究其法律责任。 35 | #### 如影响得物平台利益,请通过上方联系方式联系作者。 36 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.25.1 2 | pymysql==1.0.2 3 | pyyaml==5.4.1 4 | apscheduler==3.7.0 5 | flask==1.1.2 6 | SQLAlchemy==1.4.9 7 | pandas~=1.1.5 8 | matplotlib~=3.3.3 9 | docxtpl==0.11.4 --------------------------------------------------------------------------------
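For a one-off run outside the APScheduler jobs wired up in main.py, the same pieces can be driven directly, in the spirit of developer.py. A minimal sketch (assuming config.yaml has been filled in with real database, image-path and Zhima-proxy settings, and that org_all_commodity already lists the products to crawl):

from app.de_wu_spider import DeWuSpider
from app.data_analysis.analysis_executor import AnalysisExecutor

# Crawl details and purchase records for every product listed in org_all_commodity;
# thread_run() is the thread-pool variant of run().
spider = DeWuSpider()
spider.run()

# Re-analyse the stored records: refresh org_data_analysis_info and regenerate the
# chart images plus the one-month / three-month .docx reports.
executor = AnalysisExecutor()
executor.update_all_data()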