├── .gitignore ├── Answers ├── week1-challenge-01 │ └── read_challenge.py ├── week1-challenge-02 │ └── sql_challenge.py ├── week1-challenge-03 │ └── github_data.py ├── week1-challenge-04 │ └── shiyanlou_user.py ├── week1-challenge-05 │ ├── scrapy.cfg │ └── shiyanlou │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ ├── __init__.py │ │ ├── github.py │ │ └── github_next_page.py ├── week2-challenge-01 │ └── titanic.py ├── week2-challenge-02 │ └── earthquake.py ├── week2-challenge-03 │ └── earthquake.py ├── week2-challenge-04 │ ├── carbon_dioxide.py │ └── carbon_dioxide_2.py ├── week2-challenge-05 │ └── carbon_gdp.py ├── week3-challenge-01 │ └── ols_matrix.py ├── week3-challenge-02 │ └── houseprice.py ├── week3-challenge-03 │ └── linear_regression.py ├── week3-challenge-04 │ └── 手写字符分类预测.ipynb ├── week3-challenge-05 │ └── 使用聚类压缩图像.ipynb ├── week4-challenge-01 │ └── banknote.py ├── week4-challenge-02 │ └── association.py ├── week4-challenge-03 │ └── google_stock.py ├── week4-challenge-04 │ └── production_index.py ├── week4-challenge-05 │ └── chengdu_pm25.py ├── week5-spiders-01 │ └── lianjia_spider.py └── week5-spiders-02 │ ├── create_sqlite_database.py │ ├── insert_database.py │ └── xiecheng_spider.py ├── Assignments ├── README.md ├── 🏅️dm01-stenphen-中国保险业过去五年基础数据分析.ipynb ├── 🏅️dm02-米竹314159-杭州互联网寒冬背景下的数据分析岗现状分析.ipynb ├── 🏅️dm04-Luo2019-链家成都市区挂牌二手房分析.ipynb ├── 🥈dm01-BellaG-上海历史天气数据分析预测.ipynb ├── 🥈dm02-linnecn-医学专业论坛的数据爬取和分析.ipynb ├── 🥈dm04-Yueyec-B-站番剧数据简单分析.ipynb └── 🥉dm01-hcccom-双色球历史数据统计预测.ipynb ├── LICENSE ├── Mindmaps ├── README.md ├── louplus-dm-week1.png ├── louplus-dm-week2.png ├── louplus-dm-week3.png ├── louplus-dm-week4.png └── louplus-dm-week5.png └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .DS_Store 106 | -------------------------------------------------------------------------------- /Answers/week1-challenge-01/read_challenge.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | def convert(file): 4 | df = pd.read_json(file) 5 | df1000 = df[:1000] 6 | df1000.to_hdf('user_study.h5', key='data') -------------------------------------------------------------------------------- /Answers/week1-challenge-02/sql_challenge.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | import pandas as pd 3 | 4 | def count(file, user_id): 5 | 6 | sql_con = sqlite3.connect(file) 7 | sql_query = "SELECT * FROM data WHERE user_id == {}".format(user_id) 8 | df = pd.read_sql(sql_query, sql_con) 9 | 10 | if len(df)==0: 11 | return 0 12 | else: 13 | sum_minutes = df.minutes.sum() 14 | return sum_minutes -------------------------------------------------------------------------------- /Answers/week1-challenge-03/github_data.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import pandas as pd 3 | 4 | def issues(repo): 5 | url = "https://api.github.com/repos/{}/issues".format(repo) 6 | issues = requests.get(url) 7 | 8 | issues_list = [] 9 | for issue in issues.json(): 10 | issues_dict = {'number':issue['number'], 11 | 'title':issue['title'], 12 | 'user_name':issue['user']['login']} 13 | issues_list.append(issues_dict) 14 | 15 | issues_df = pd.DataFrame(issues_list) 16 | 17 | return issues_df 18 | 19 | issues("numpy/numpy") 20 | 21 | 22 | -------------------------------------------------------------------------------- /Answers/week1-challenge-04/shiyanlou_user.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from lxml import html 3 | 4 | 5 | def user_info(user_id): 6 | 7 | url = "https://www.lanqiao.cn/users/{}/".format(user_id) 8 | content = requests.get(url) 9 | 10 | if content.status_code == 200: 11 | tree = html.fromstring(content.text) 12 | # 首先选取所以 div 元素,要求其 class 属性中包含 name 字段 13 | # 再取 div 下的 span 14 | user_name = tree.xpath("//div[contains(@class, 'name')]/span/text()")[0].strip() 15 | user_level = tree.xpath("//div[contains(@class, 'name')]/span/text()")[1].strip()[1:] 16 | return user_name, int(user_level) 17 | else: 18 | user_name, user_level = (None, None) 19 | return user_name, user_level -------------------------------------------------------------------------------- /Answers/week1-challenge-05/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = shiyanlou.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = shiyanlou 12 | -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Answers/week1-challenge-05/shiyanlou/__init__.py -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class ShiyanlouItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | repo_name = scrapy.Field() # repo 名称 15 | update_time = scrapy.Field() # 更新时间 16 | -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ShiyanlouSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class ShiyanlouDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | import pandas as pd 8 | 9 | class ShiyanlouPipeline(object): 10 | 11 | def process_item(self, item, spider): 12 | # 读取 item 数据 13 | repo_name = item['repo_name'] 14 | update_time = item['update_time'] 15 | # 每条数据组成临时 df_temp 16 | df_temp = pd.DataFrame([[repo_name, update_time]], columns=['repo_name', 'update_time']) 17 | # 将 df_temp 合并到 df 18 | self.df = self.df.append(df_temp, ignore_index=True).sort_values(by=['update_time'], ascending=False) 19 | 20 | return item 21 | 22 | #当爬虫启动时 23 | def open_spider(self, spider): 24 | # 新建一个带列名的空白 df 25 | self.df = pd.DataFrame(columns=['repo_name', 'update_time']) 26 | 27 | # 当爬虫关闭时 28 | def close_spider(self, spider): 29 | # 将 df 存储为 csv 文件 30 | pd.DataFrame.to_csv(self.df, "../shiyanlou_repo.csv") 31 | -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for shiyanlou project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'shiyanlou' 13 | 14 | SPIDER_MODULES = ['shiyanlou.spiders'] 15 | NEWSPIDER_MODULE = 'shiyanlou.spiders' 16 | 17 | 18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 19 | #USER_AGENT = 'shiyanlou (+http://www.yourdomain.com)' 20 | 21 | # Obey robots.txt rules 22 | ROBOTSTXT_OBEY = False 23 | 24 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 25 | #CONCURRENT_REQUESTS = 32 26 | 27 | # Configure a delay for requests for the same website (default: 0) 28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 29 | # See also autothrottle settings and docs 30 | # DOWNLOAD_DELAY = 3 31 | # The download delay setting will honor only one of: 32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 33 | #CONCURRENT_REQUESTS_PER_IP = 16 34 | 35 | # Disable cookies (enabled by default) 36 | #COOKIES_ENABLED = False 37 | 38 | # Disable Telnet Console (enabled by default) 39 | #TELNETCONSOLE_ENABLED = False 40 | 41 | # Override the default request headers: 42 | #DEFAULT_REQUEST_HEADERS = { 43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 44 | # 'Accept-Language': 'en', 45 | #} 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | #SPIDER_MIDDLEWARES = { 50 | # 'shiyanlou.middlewares.ShiyanlouSpiderMiddleware': 543, 51 | #} 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | #DOWNLOADER_MIDDLEWARES = { 56 | # 'shiyanlou.middlewares.ShiyanlouDownloaderMiddleware': 543, 57 | #} 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | #EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | #} 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'shiyanlou.pipelines.ShiyanlouPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | #AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | #AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | #AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | #AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | #HTTPCACHE_ENABLED = True 87 | #HTTPCACHE_EXPIRATION_SECS = 0 88 | #HTTPCACHE_DIR = 'httpcache' 89 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/spiders/github.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from shiyanlou.items import ShiyanlouItem 4 | 5 | 6 | class GithubSpider(scrapy.Spider): 7 | name = 'github' 8 | allowed_domains = ['github.com'] 9 | 10 | @property 11 | def start_urls(self): 12 | url_temp = 'https://github.com/shiyanlou?after={}&tab=repositories' 13 | # 此参考会失效,请自行重新手动复制 after 参数 14 | after = [ 15 | '', 16 | 'Y3Vyc29yOnYyOpK5MjAxNy0wNi0wN1QwNjoxOTo1NyswODowMM4FkpYw', 17 | 'Y3Vyc29yOnYyOpK5MjAxNS0wMS0yNVQxMTozMTowNyswODowMM4Bxrsx', 18 | 'Y3Vyc29yOnYyOpK5MjAxNC0xMS0yMFQxMzowMzo1MiswODowMM4BjkvL', 19 | ] 20 | return (url_temp.format(i) for i in after) # 1-4 页 21 | 22 | def parse(self, response): 23 | repos = response.xpath('//li[@itemprop="owns"]') 24 | for repo in repos: 25 | item = ShiyanlouItem() 26 | item['repo_name'] = repo.xpath(".//a[@itemprop='name codeRepository']/text()").extract_first().strip() 27 | item['update_time'] = repo.xpath(".//relative-time/@datetime").extract_first() 28 | 29 | yield item -------------------------------------------------------------------------------- /Answers/week1-challenge-05/shiyanlou/spiders/github_next_page.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from shiyanlou.items import ShiyanlouItem 4 | 5 | 6 | class GithubSpider(scrapy.Spider): 7 | name = 'github_next_page' 8 | allowed_domains = ['github.com'] 9 | 10 | @property 11 | def start_urls(self): 12 | return ('https://github.com/shiyanlou?tab=repositories', ) 13 | 14 | def parse(self, response): 15 | repos = response.xpath('//li[@itemprop="owns"]') 16 | for repo in repos: 17 | item = ShiyanlouItem() 18 | item['repo_name'] = repo.xpath(".//a[@itemprop='name codeRepository']/text()").extract_first().strip() 19 | item['update_time'] = repo.xpath(".//relative-time/@datetime").extract_first() 20 | 21 | yield item 22 | 23 | # 如果 Next 按钮没被禁用,那么表示有下一页 24 | spans = response.css('div.pagination span.disabled::text') 25 | if len(spans) == 0 or spans[-1].extract() != 'Next': 26 | next_url = response.css('div.paginate-container a:last-child::attr(href)').extract_first() 27 | yield response.follow(next_url, callback=self.parse) 28 | -------------------------------------------------------------------------------- /Answers/week2-challenge-01/titanic.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | import seaborn as sns 3 | 4 | def plot(): 5 | df = sns.load_dataset("titanic") 6 | 7 | fig, axes = plt.subplots(ncols=3, nrows=1, figsize=(15,4)) 8 | 9 | sns.distplot(df.age.dropna(), ax=axes[0]) 10 | sns.countplot(x='sex', hue="alive", data=df, ax=axes[1]) 11 | sns.countplot(x="class", hue="alive", data=df, ax=axes[2]) 12 | 13 | return axes -------------------------------------------------------------------------------- /Answers/week2-challenge-02/earthquake.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def clean(): 5 | # 读取据 6 | df = pd.read_csv("earthquake.csv") 7 | # 选择需保留列 8 | df1 = df[['time', 'latitude', 'longitude', 'depth', 'mag']] 9 | # 对 place 列使用分割,得到需要的 region 数据 10 | place = df.place.str.split(', ').tolist() 11 | region = [] 12 | for row in place: 13 | region.append(row[-1]) 14 | df2 = pd.DataFrame(region, columns=['region']) 15 | # 拼接数据 16 | df = pd.concat([df1, df2], axis=1) 17 | # 去除重复值 18 | df_clean = df.drop_duplicates().dropna() 19 | 20 | return df_clean -------------------------------------------------------------------------------- /Answers/week2-challenge-03/earthquake.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def clean(): 5 | # 读取据 6 | df = pd.read_csv("earthquake.csv") 7 | # 选择需保留列 8 | df1 = df[['time', 'latitude', 'longitude', 'depth', 'mag']] 9 | # 对 place 列使用分割,得到需要的 region 数据 10 | place = df.place.str.split(', ').tolist() 11 | region = [] 12 | for row in place: 13 | region.append(row[-1]) 14 | df2 = pd.DataFrame(region, columns=['region']) 15 | # 拼接数据 16 | df = pd.concat([df1, df2], axis=1) 17 | # 去除重复值 18 | df_clean = df.drop_duplicates().dropna() 19 | 20 | return df_clean 21 | 22 | 23 | def mag_region(): 24 | # 加载清洁后数据 25 | df_clean = clean() 26 | # 数据离散化,注意开闭区间 27 | df_clean['mag'] = pd.cut(df_clean.mag, bins=[0, 2, 5, 7, 9, 15], right=False, labels=[ 28 | 'micro', 'light', 'strong', 'major', 'great']) 29 | 30 | print(df_clean) 31 | # 多索引分组聚合并计数 32 | df_group = df_clean.groupby(by=['mag', 'region']).count() 33 | # 重置索引并去除缺失值 34 | df_reindex = df_group.reset_index().dropna() 35 | # 按计数从大到小排序,并使用去除重复值的方法巧妙地保留下各地区最大值 36 | df_sort = df_reindex.sort_values( 37 | by='time', ascending=False).drop_duplicates(['mag']) 38 | # 按题目要求整理并重命名 39 | df_final = df_sort.set_index('mag')[['region', 'time']].rename( 40 | columns={"time": "times"}) 41 | # 按题目要求将计数处理成 int 类型 42 | df_final['times'] = df_final.times.astype('int') 43 | 44 | return df_final 45 | -------------------------------------------------------------------------------- /Answers/week2-challenge-04/carbon_dioxide.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def data_clean(): 5 | '''data_clean() 函数用于数据清洁,大致步骤如下: 6 | 1. 统一设置国家代码为新索引 7 | 2. 去掉多余的数据列 8 | 3. 将不规范空值替换为 NaN,并进行填充 9 | ''' 10 | # 读取数据文件 11 | df_data = pd.read_excel("ClimateChange.xlsx", sheetname='Data') 12 | df_country = pd.read_excel("ClimateChange.xlsx", sheetname='Country') 13 | 14 | # 处理 data 数据表 15 | # 选取 EN.ATM.CO2E.KT 数据,并将国家代码设置为索引 16 | df_data_reindex = df_data[df_data['Series code']== 'EN.ATM.CO2E.KT'].set_index('Country code') 17 | # 剔除不必要的数据列 18 | df_data_drop = df_data_reindex.drop(labels=['Country name', 'Series code', 'Series name', 'SCALE', 'Decimals'], axis=1) 19 | # 将原数据集中不规范的空值替换为 NaN 方便填充 20 | df_data_nan = df_data_drop.replace({'..': pd.np.NaN}) 21 | # 对 NaN 空值进行向前和向后填充 22 | df_data_fill = df_data_nan.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1) 23 | # 对填充后依旧全部为空值的数据行进行剔除 24 | df_data_dropna = df_data_fill.dropna(how='all') 25 | 26 | # 处理 Country 数据表 27 | # 将国家代码设置为索引 28 | df_country_reindex = pd.DataFrame(df_country).set_index('Country code') 29 | # 剔除不必要的数据列 30 | df_country_drop = df_country_reindex.drop(labels=['Capital city', 'Region', 'Lending category'], axis=1) 31 | 32 | # 合并数据表 33 | # 对 Data 和 Country 表按照索引进行合并 34 | df_combine = pd.concat([df_data_dropna, df_country_drop], axis=1) 35 | # 对合并后数据集进行求和得到各国排放总量 36 | df_combine['Sum emissions'] = df_combine[list(df_combine)[:-2]].sum(axis=1) 37 | # 对合并后存在空值的数据行进行剔除,得到清洁后的数据集 38 | df_clean = df_combine.dropna(thresh=10) 39 | 40 | return df_clean 41 | 42 | def co2(): 43 | '''co2() 函数用于数据统计,大致步骤如下: 44 | 1. 使用 groupby 按题目规则求和 45 | 2. 对数据进行排序并得到目标 DataFrame 46 | ''' 47 | # 读取清洁后数据 48 | df_clean = data_clean() 49 | 50 | # 按收入群体对数据进行求和 51 | sum_by_groups = df_clean.groupby('Income group')['Sum emissions'].sum() 52 | 53 | # 按要求整理 DataFrame 54 | item_high_list = [] 55 | item_low_list = [] 56 | 57 | for group_name in list(sum_by_groups.index): 58 | # 得到各收入群体最高排放量数据 59 | item_high = df_clean[df_clean['Income group'] == group_name].sort_values(by='Sum emissions', ascending=False).iloc[0] 60 | # 将最高排放量数据存入相应列表方便生成最终 DataFrame 61 | item_high_list.append((item_high['Income group'], item_high['Country name'], item_high['Sum emissions'])) 62 | # 得到各收入群体最低排放量数据 63 | item_low = df_clean[df_clean['Income group'] == group_name].sort_values(by='Sum emissions').iloc[0] 64 | # 将最低排放量数据存入相应列表方便生成最终 DataFrame 65 | item_low_list.append((item_low['Income group'], item_low['Country name'], item_low['Sum emissions'])) 66 | 67 | # 设置 DataFrame 标签 68 | high_labels = ['Income group', 'Highest emission country', 'Highest emissions'] 69 | low_labels = ['Income group', 'Lowest emission country', 'Lowest emissions'] 70 | 71 | # 生成并合并目标 DataFrame 72 | highest_df = pd.DataFrame.from_records(item_high_list, columns=high_labels).set_index('Income group') 73 | lowest_df = pd.DataFrame.from_records(item_low_list, columns=low_labels).set_index('Income group') 74 | 75 | results = pd.concat([sum_by_groups, highest_df, lowest_df], axis=1) 76 | 77 | return results -------------------------------------------------------------------------------- /Answers/week2-challenge-04/carbon_dioxide_2.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def data_clean(): 5 | data = pd.read_excel("ClimateChange.xlsx", sheetname='Data') 6 | 7 | # 处理 data 数据表 # 选取 EN.ATM.CO2E.KT 数据,并将国家代码设置为索引 8 | data = data[data['Series code'] == 9 | 'EN.ATM.CO2E.KT'].set_index('Country code') 10 | # 剔除不必要的数据列 11 | data.drop(labels=['Country name', 'Series code', 12 | 'Series name', 'SCALE', 'Decimals'], axis=1, inplace=True) 13 | # 将原数据集中不规范的空值替换为 NaN 方便填充 14 | data.replace({'..': pd.np.NaN}, inplace=True) 15 | # 对 NaN 空值进行向前和向后填充 16 | data = data.fillna(method='ffill', axis=1).fillna(method='bfill', axis=1) 17 | # 对填充后依旧全部为空值的数据行进行剔除 18 | data.dropna(how='all', inplace=True) 19 | data['Sum emissions'] = data.sum(axis=1) 20 | data = data['Sum emissions'] 21 | 22 | # 处理 Country 数据表 23 | # 将国家代码设置为索引 24 | countries = pd.read_excel("ClimateChange.xlsx", sheetname='Country') 25 | countries.set_index('Country code', inplace=True) 26 | # 剔除不必要的数据列 27 | countries.drop(labels=['Capital city', 'Region', 28 | 'Lending category'], axis=1, inplace=True) 29 | 30 | # 合并数据表 31 | # 对 Data 和 Country 表按照索引进行合并 32 | return pd.concat([data, countries], axis=1) 33 | 34 | 35 | def co2(): 36 | '''co2() 函数用于数据统计,大致步骤如下: 37 | 1. 使用 grouby 按题目规则求和 38 | 2. 对数据进行排序并得到目标 DataFrame 39 | ''' 40 | # 读取清洁后数据 41 | df = data_clean() 42 | 43 | # 按收入群体对数据进行求和 44 | df_sum = df.groupby('Income group').sum() 45 | 46 | df_max = df.sort_values(by='Sum emissions', ascending=False).groupby( 47 | 'Income group').head(1).set_index('Income group') 48 | df_max.columns = ['Highest emissions', 'Highest emission country'] 49 | df_max = df_max.reindex( 50 | columns=['Highest emission country', 'Highest emissions']) 51 | 52 | df_min = df.sort_values(by='Sum emissions').groupby( 53 | 'Income group').head(1).set_index('Income group') 54 | df_min.columns = ['Lowest emissions', 'Lowest emission country'] 55 | df_min = df_min.reindex( 56 | columns=['Lowest emission country', 'Lowest emissions']) 57 | 58 | result = pd.concat([df_sum, df_max, df_min], axis=1) 59 | 60 | return result -------------------------------------------------------------------------------- /Answers/week2-challenge-05/carbon_gdp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | from matplotlib import pyplot as plt 4 | 5 | 6 | def data_clean(): 7 | '''data_clean() 函数用于数据清洁,大致步骤如下: 8 | 1. 统一设置国家代码为新索引 9 | 2. 去掉多余的数据列 10 | 3. 将不规范空值替换为 NaN,并进行填充 11 | ''' 12 | # 读取数据 13 | df_data = pd.read_excel("ClimateChange.xlsx", sheetname='Data') 14 | 15 | # 选择数据 16 | df_co2 = df_data[df_data['Series code'] == 17 | 'EN.ATM.CO2E.KT'].set_index('Country code') 18 | df_gdp = df_data[df_data['Series code'] == 19 | 'NY.GDP.MKTP.CD'].set_index('Country code') 20 | 21 | # 缺失值替换 22 | df_co2_nan = df_co2.replace({'..': pd.np.NaN}) 23 | df_gdp_nan = df_gdp.replace({'..': pd.np.NaN}) 24 | 25 | # 缺失值填充 26 | df_co2_fill = df_co2_nan.iloc[:, 5:].fillna( 27 | method='ffill', axis=1).fillna(method='bfill', axis=1) 28 | df_gdp_fill = df_gdp_nan.iloc[:, 5:].fillna( 29 | method='ffill', axis=1).fillna(method='bfill', axis=1) 30 | 31 | # 数据合并 32 | df_co2_fill['CO2-SUM'] = df_co2_fill.sum(axis=1) 33 | df_gdp_fill['GDP-SUM'] = df_gdp_fill.sum(axis=1) 34 | df_merge = pd.concat([df_co2_fill['CO2-SUM'], df_gdp_fill['GDP-SUM']], axis=1) 35 | 36 | # 缺失数据填充为 0 37 | df_merge_fill = df_merge.fillna(value=0) 38 | 39 | return df_merge_fill 40 | 41 | 42 | def co2_gdp_plot(): 43 | '''co2_gdp_plot() 函数用于数据整理和绘图,大致步骤如下: 44 | 1. 数据归一化 45 | 2. 得到需要返回的数据 46 | 3. 绘图 47 | ''' 48 | # 读取数据 49 | df_clean = data_clean() 50 | 51 | # 数据归一化处理 52 | df_max_min = (df_clean - df_clean.min()) / (df_clean.max() - df_clean.min()) 53 | 54 | # 获取中国归一化后的 CO2 和 GDP 数据 55 | china = [] 56 | for i in df_max_min[df_max_min.index == 'CHN'].values: 57 | china.extend(np.round(i, 3).tolist()) 58 | 59 | # 获取 5 个常任理事国标签及对应的坐标刻度 60 | countries_labels = ['USA', 'CHN', 'FRA', 'RUS', 'GBR'] 61 | # 获取国家标签作为刻度标签 62 | sticks_labels = [] 63 | # 获取相应国家序号对应着刻度坐标 64 | labels_position = [] 65 | 66 | for i in range(len(df_max_min)): 67 | if df_max_min.index[i] in countries_labels: 68 | sticks_labels.append(df_max_min.index[i]) 69 | labels_position.append(i) 70 | 71 | # 对数据进行绘图 72 | fig, axes = plt.subplots() 73 | df_max_min.plot( 74 | kind='line', 75 | title='GDP-CO2', 76 | ax=axes 77 | ) 78 | plt.xlabel("Countries") 79 | plt.ylabel("Values") 80 | # 绘制 5 大常任理事国坐标刻度标签 81 | plt.xticks(labels_position, sticks_labels, rotation='vertical') 82 | plt.show() 83 | 84 | return axes, china -------------------------------------------------------------------------------- /Answers/week3-challenge-01/ols_matrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def caculate_w(): 6 | 7 | # 读取数据集 8 | df = pd.read_csv("nyc-east-river-bicycle-counts.csv", index_col=0) 9 | 10 | # 处理自变量 11 | x = df['Brooklyn Bridge'].values 12 | x = x.reshape(len(x), 1) # 添加截距项系数 13 | x = np.matrix(np.concatenate((np.ones_like(x), x), axis=1)) 14 | 15 | # 处理因变量 16 | y = df['Manhattan Bridge'].values 17 | y = np.matrix(y.reshape(len(y), 1)) 18 | 19 | # 使用矩阵方法计算 20 | W = (x.T * x).I * x.T * y 21 | b = round(float(W[0]), 2) 22 | w = round(float(W[1]), 2) 23 | 24 | return w, b 25 | -------------------------------------------------------------------------------- /Answers/week3-challenge-02/houseprice.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.preprocessing import PolynomialFeatures 4 | from sklearn.linear_model import LinearRegression 5 | from sklearn.metrics import mean_absolute_error 6 | from sklearn.model_selection import train_test_split 7 | 8 | 9 | def beijing(n): 10 | 11 | # 读取数据,去除重复值,无空值 12 | df = pd.read_csv("beijing_house_price.csv") 13 | df = df.drop_duplicates() 14 | # df = df[['公交', '写字楼', '医院', '商场', '地铁', '学校', '建造时间', '楼层', '面积', '每平米价格']] 15 | df = df.iloc[:, [0, 1, 2, 3, 4, 5, 7, 9, 11, 10]] # 线上环境中文输入不方便 16 | 17 | # 计算特征与目标值相关性系数,并保留前 3 个特征 18 | pearson = np.abs(df.corr(method='pearson').iloc[-1]) 19 | pearson_max = pearson.sort_values(ascending=False)[1:4] 20 | features_names = pearson_max.index.values 21 | features = df[features_names] 22 | # target = df['每平米价格'] 23 | target = df.iloc[:, [9]] 24 | 25 | # 切分训练和测试数据 26 | X_train, X_test, y_train, y_test = train_test_split( 27 | features, target, test_size=0.3, random_state=10) 28 | 29 | # 多项式特征处理 30 | poly_features = PolynomialFeatures(degree=n) 31 | X_train_features = poly_features.fit_transform(X_train) 32 | X_test_features = poly_features.fit_transform(X_test) 33 | 34 | # 建立线性回归模型 35 | model = LinearRegression() 36 | model.fit(X_train_features, y_train) 37 | y_pred = model.predict(X_test_features) 38 | 39 | # 计算平均绝对误差 40 | mae = mean_absolute_error(y_test, y_pred) 41 | 42 | return mae 43 | -------------------------------------------------------------------------------- /Answers/week3-challenge-03/linear_regression.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | def gradient_descent(): 6 | # 读取数据集 7 | df = pd.read_csv("nyc-east-river-bicycle-counts.csv", index_col=0) 8 | # 读取自变量 9 | x = df['Brooklyn Bridge'].values 10 | # 读取因变量 11 | y = df['Manhattan Bridge'].values 12 | 13 | w = 0 # 初始参数为 0 14 | b = 0 # 初始参数为 0 15 | lr = 0.000000001 # 学习率 16 | num_iter = 1000 # 迭代次数 17 | for i in range(num_iter): # 梯度下降迭代 18 | # 计算近似值 19 | y_hat = (w * x) + b 20 | # 计算参数对应梯度 21 | w_gradient = -(2/len(x)) * sum(x * (y - y_hat)) 22 | b_gradient = -(2/len(x)) * sum(y - y_hat) 23 | # 根据梯度更新参数 24 | w -= lr * w_gradient 25 | b -= lr * b_gradient 26 | 27 | return w, b 28 | -------------------------------------------------------------------------------- /Answers/week3-challenge-04/手写字符分类预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "\n", 9 | "# 手写字符分类预测" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "---" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "**以下内容仅保留挑战代码部分,挑战全文请到原课程查看。**" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "---" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "**挑战:使用 `1x5` 的子图样式绘制 Digits 数据集前 `5` 个手写字符的图像。**" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "### 代码开始 ### (3~5 行代码)\n", 47 | "fig, axes = plt.subplots(1, 5, figsize=(12,4))\n", 48 | "for i, image in enumerate(digits.images[:5]):\n", 49 | " axes[i].imshow(image, cmap=plt.cm.gray_r)\n", 50 | "### 代码结束 ###" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "---" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "**挑战:使用 `train_test_split()` 将数据集切分为 80%(训练集) 和 20%(测试集) 两部分。**" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "### 代码开始 ### (≈ 2 行代码)\n", 74 | "from sklearn.model_selection import train_test_split\n", 75 | "\n", 76 | "train_x, test_x, train_y, test_y = train_test_split(digits.data, digits.target, test_size=0.2, random_state=30)\n", 77 | "### 代码结束 ###" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "---" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "**挑战:使用 `MLPClassifier()` 搭建神经网络结构,并训练手写字符识别模型,最后得到在测试集上的预测准确率。**" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "from sklearn.neural_network import MLPClassifier\n", 101 | "from sklearn.metrics import accuracy_score\n", 102 | "\n", 103 | "def mpl():\n", 104 | " \"\"\"\n", 105 | " 参数:无\n", 106 | "\n", 107 | " 返回:\n", 108 | " model -- 人工神经网络模型\n", 109 | " score -- 测试集上的预测准确率\n", 110 | " \"\"\"\n", 111 | " ### 代码开始 ### (≈ 2 行代码)\n", 112 | " model = MLPClassifier(\n", 113 | " hidden_layer_sizes=(100, 50),\n", 114 | " activation='relu',\n", 115 | " solver='sgd',\n", 116 | " learning_rate_init=0.02,\n", 117 | " learning_rate='constant',\n", 118 | " max_iter=100,\n", 119 | " random_state=1\n", 120 | " )\n", 121 | "\n", 122 | " model.fit(train_x, train_y)\n", 123 | " score = accuracy_score(test_y, model.predict(test_x))\n", 124 | " ### 代码结束 ###\n", 125 | " return model, score" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "---" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.7.0" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 2 164 | } 165 | -------------------------------------------------------------------------------- /Answers/week3-challenge-05/使用聚类压缩图像.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "\n", 9 | "# 使用聚类压缩图像" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "---" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "**以下内容仅保留挑战代码部分,挑战全文请到原课程查看。**" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "---" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "**挑战:将形状为 $(516, 819, 3)$ 的数据转换为 $(422604, 3)$ 形状的数据。**" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "\"\"\"数据格式变换\n", 47 | "\"\"\"\n", 48 | "### 代码开始 ###(≈ 1 行代码)\n", 49 | "data = chengdu.reshape(516 * 819, 3)\n", 50 | "### 代码结束 ###" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "---" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "**挑战:计算 `422604` 个像素点中种类的个数。**" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "\"\"\"计算像素点种类个数\n", 74 | "\"\"\"\n", 75 | "def get_variety(data):\n", 76 | " \"\"\"\n", 77 | " 参数:\n", 78 | " 预处理后像素点集合\n", 79 | "\n", 80 | " 返回:\n", 81 | " num_variety -- 像素点种类个数\n", 82 | " \"\"\"\n", 83 | "\n", 84 | " ### 代码开始 ### (≈ 3 行代码)\n", 85 | " temp=data.tolist()\n", 86 | " num_variety=len(set([tuple(t) for t in temp]))\n", 87 | " ### 代码结束 ###\n", 88 | " \n", 89 | " return num_variety" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "---" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "**挑战:使用 Mini Batch K-Means 聚类方法对像素点进行聚类,并用每一个中心的像素点代替属于该类别的像素点。**" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from sklearn.cluster import MiniBatchKMeans\n", 113 | "\n", 114 | "### 代码开始 ###(≈ 4 行代码)\n", 115 | "model = MiniBatchKMeans(10)\n", 116 | "model.fit(data)\n", 117 | "predict=model.predict(data)\n", 118 | "### 代码结束 ###\n", 119 | "\n", 120 | "new_colors = model.cluster_centers_[predict]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "---" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "**挑战:将聚类后并替换为类别中心点值的像素点,变换为数据处理前的格式,并绘制出图片进行对比展示。**" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "fig, ax = plt.subplots(1, 2, figsize=(16, 6))\n", 144 | "\n", 145 | "### 代码开始 ###(≈ 3 行代码)\n", 146 | "new_chengdu = new_colors.reshape(chengdu.shape)\n", 147 | "ax[0].imshow(chengdu)\n", 148 | "ax[1].imshow(new_chengdu)\n", 149 | "### 代码结束 ###" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "---" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "" 164 | ] 165 | } 166 | ], 167 | "metadata": { 168 | "kernelspec": { 169 | "display_name": "Python 3", 170 | "language": "python", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "codemirror_mode": { 175 | "name": "ipython", 176 | "version": 3 177 | }, 178 | "file_extension": ".py", 179 | "mimetype": "text/x-python", 180 | "name": "python", 181 | "nbconvert_exporter": "python", 182 | "pygments_lexer": "ipython3", 183 | "version": "3.7.0" 184 | } 185 | }, 186 | "nbformat": 4, 187 | "nbformat_minor": 2 188 | } 189 | -------------------------------------------------------------------------------- /Answers/week4-challenge-01/banknote.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from sklearn.svm import SVC 4 | 5 | def identify(): 6 | 7 | df_train = pd.read_csv("banknote_train.csv") 8 | df_test = pd.read_csv("banknote_test.csv") 9 | 10 | model = SVC(gamma='auto') 11 | model.fit(df_train.iloc[:, :-1], df_train['class']) 12 | df_test['class'] = model.predict(df_test) 13 | 14 | return df_test -------------------------------------------------------------------------------- /Answers/week4-challenge-02/association.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from mlxtend.preprocessing import TransactionEncoder 3 | from mlxtend.frequent_patterns import apriori 4 | from mlxtend.frequent_patterns import association_rules as rules 5 | 6 | def rule(): 7 | 8 | df = pd.read_csv("shopping_data.csv", header=None) 9 | dataset = df.stack().groupby(level=0).apply(list).tolist() 10 | 11 | te = TransactionEncoder() # 定义模型 12 | te_ary = te.fit_transform(dataset) # 转换数据集 13 | df = pd.DataFrame(te_ary, columns=te.columns_) # 将数组处理为 DataFrame 14 | 15 | frequent_itemsets = apriori(df, min_support=0.05, use_colnames=True) 16 | association_rules = rules(frequent_itemsets, metric="confidence", min_threshold=0.2) # 置信度阈值为 0.1 17 | 18 | return frequent_itemsets, association_rules -------------------------------------------------------------------------------- /Answers/week4-challenge-03/google_stock.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def quarter_volume(): 5 | df = pd.read_csv("GOOGL.csv", index_col=0) 6 | df.index = pd.to_datetime(df.index) 7 | df = df.resample('Q').agg({"Open": 'mean', "High": 'mean', "Low": 'mean', 8 | "Close": 'mean', "Adj Close": 'mean', "Volume": 'sum'}) 9 | df = df.sort_values(by='Volume', ascending=False) 10 | return df -------------------------------------------------------------------------------- /Answers/week4-challenge-04/production_index.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from statsmodels.tsa.stattools import arma_order_select_ic 3 | 4 | 5 | def arima(): 6 | df = pd.read_csv("agriculture.csv", index_col=0) 7 | diff = df.diff().dropna() 8 | p, q = arma_order_select_ic(diff, ic='aic')['aic_min_order'] # AIC 9 | d = 1 10 | return p, d, q 11 | -------------------------------------------------------------------------------- /Answers/week4-challenge-05/chengdu_pm25.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from fbprophet import Prophet 3 | 4 | 5 | def additive(): 6 | df = pd.read_csv("Chengdu_HourlyPM25.csv") 7 | df_nan = df.replace({-999: pd.np.NaN}) 8 | df = df_nan.fillna(method='ffill').fillna(method='bfill') 9 | 10 | df.index = pd.to_datetime(df['Date (LST)']) 11 | df = df.resample('D').mean() 12 | df = df.reset_index() 13 | df.rename(columns={'Date (LST)': 'ds', 'Value': 'y'}, inplace=True) 14 | 15 | m = Prophet() # 创建加法模型 16 | m.fit(df) 17 | 18 | future = m.make_future_dataframe(periods=365, freq='D') # 生成预测序列 19 | forecast = m.predict(future) # 预测 20 | # 仅保留预测值和相应的置信区间 21 | forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']] 22 | forecast = forecast.set_index('ds')['2017-01-01':] 23 | 24 | forecast.to_csv("forecast.csv") # 存为数据文件 25 | 26 | return forecast 27 | 28 | additive() -------------------------------------------------------------------------------- /Answers/week5-spiders-01/lianjia_spider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import time 4 | from tqdm import tqdm 5 | import re 6 | import sqlite3 7 | import pandas as pd 8 | 9 | ''' 10 | 爬虫代码分为三步: 11 | 1. 爬取房屋 id 12 | 2. 根据房屋 id 组合 url,然后依次爬取房屋的具体界面获取信息 13 | 3. 保存到本地 14 | ''' 15 | 16 | base_url = 'https://sh.lianjia.com/zufang/' 17 | test_url = 'https://sh.lianjia.com/zufang/pg1/' 18 | headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'} 19 | 20 | # 获取一页的房屋列表,具体为房屋的 id,例如 107100610451 21 | def getHouseURLList(page_url): 22 | try: 23 | r = requests.get(page_url, timeout=5, headers=headers) 24 | if r.status_code==403: 25 | print('访问被拒,请稍后再试') 26 | except requests.exceptions.Timeout: 27 | # 请求超时,返回无效数据 28 | return None 29 | content = r.content 30 | soup = BeautifulSoup(content) 31 | result_list = list(soup.select('#house-lst')[0].children) 32 | return_list = [] 33 | for result in result_list: 34 | return_list.append(result['data-id']) 35 | return return_list 36 | 37 | # 示例 38 | # return_list = getHouseURLList(test_url) 39 | 40 | # 房屋 id 41 | data_id_list = [] 42 | 43 | # 多走几轮,以获得更全的数据 44 | for _ in range(1): 45 | # for i in tqdm(range(1, 101)): 46 | for i in tqdm(range(1, 2)): 47 | page_url = base_url+'pg{}/'.format(i) 48 | return_list = getHouseURLList(page_url) 49 | if not return_list: 50 | time.sleep(10) 51 | continue 52 | data_id_list.extend(return_list) 53 | 54 | # 去除重复数据 55 | data_id_list = list(set(data_id_list)) 56 | 57 | # 写入本地文件,保存房屋 id 58 | with open('house_id_list.txt', 'w') as f: 59 | f.write('\n'.join(data_id_list)) 60 | 61 | 62 | # 清理面积 63 | def clean_str(s): 64 | # 去除中文 65 | re.sub(r'[^\x00-\x7f]', '', s) 66 | new_s = [] 67 | for c in s: 68 | # 遇到非数字则舍去 69 | if c.isdigit(): new_s.append(c) 70 | else: break 71 | return ''.join(new_s) 72 | 73 | # 定义一个类保存数据 74 | class Room(object): 75 | def __init__(self, url): 76 | self.done = False 77 | self.area = 0 78 | self.url = url 79 | self.price = '' 80 | self.isRemoved = '' 81 | self.special_label = '' 82 | self.title = '' 83 | self.floor = '' 84 | self.is_near_subway = '' 85 | self.publish_time = '' 86 | self.rooms = '' 87 | self.toilet = '' 88 | self.halls = '' 89 | self.rent_way = '' 90 | self.location = '' 91 | 92 | # 房屋面积 93 | def setArea(self, area): 94 | self.area = float(clean_str(area)) 95 | 96 | # 价格 97 | def setPrice(self, price): 98 | self.price = price 99 | 100 | # 是否下架 101 | def setIsRemoved(self, isRemoved): 102 | self.isRemoved = isRemoved 103 | 104 | # 是否精装修 105 | def setSpecialLabel(self, special_label): 106 | self.special_label = special_label 107 | 108 | # 户型: 房间数量,房间,大厅,卫生间; 出租方式: 整租、合租 109 | def setType(self, type): 110 | tmp = type.split() 111 | if len(tmp) == 1: 112 | room_count = tmp[0] 113 | rent_way = '暂无信息' 114 | else: 115 | room_count, rent_way = tmp 116 | room_count = re.sub(r'[^\x00-\x7f]', ' ', room_count).strip().split() 117 | room_count = list(map(int, room_count)) 118 | 119 | # 部分房屋无卫生间或客厅 120 | if len(room_count) < 3: 121 | for i in range(3 - len(room_count)): 122 | room_count.append(0) 123 | 124 | self.rooms = room_count[0] 125 | self.halls = room_count[1] 126 | self.toilet = room_count[2] 127 | self.rent_way = rent_way 128 | 129 | # 位置 130 | def setLocation(self, location): 131 | self.location = location 132 | 133 | # 是否靠近地铁 134 | def setSubway(self, is_near_subway): 135 | self.is_near_subway = is_near_subway 136 | 137 | # 朝向 138 | def setDirection(self, direction): 139 | self.direction = direction 140 | 141 | # 楼层 142 | def setFloor(self, floor): 143 | self.floor = floor 144 | 145 | # 发布时间 146 | def setPublishTime(self, publish_time): 147 | self.publish_time = publish_time 148 | 149 | # 房屋标题 150 | def setTitle(self, title): 151 | self.title = title 152 | 153 | # 房屋链接 154 | def setURL(self, URL): 155 | self.url = URL 156 | 157 | # 是否爬取成功 158 | def setDone(self, done): 159 | self.done = done 160 | 161 | def __repr__(self): 162 | return str(self.__dict__) 163 | 164 | # 给定 url 获取房屋信息 165 | def getRoom(url): 166 | room = Room(url) 167 | try: 168 | r = requests.get(url, timeout=5, headers=headers) 169 | if r.status_code==403: 170 | print('访问被拒,请稍后再试') 171 | except requests.exceptions.Timeout: 172 | time.sleep(2) 173 | print('timeout') 174 | return Room('invalid') 175 | content = r.content.decode() 176 | 177 | soup = BeautifulSoup(content, features='lxml') 178 | 179 | title = soup.find('h1', class_='main').text 180 | room.setTitle(title) 181 | 182 | price_div = soup.find('div', class_='price') 183 | price_list = list(price_div.stripped_strings) # ['9000', '元/月', '精装修'] 184 | price = ''.join(price_list[:2]) 185 | room.setPrice(price) 186 | 187 | special_label = ' '.join(price_list[2:]) if len(price_list)>2 else '无' 188 | room.setSpecialLabel(special_label) 189 | isRemoved = '已下架' if price_div['class'][1] == 'isRemove' else '正在出租' 190 | room.setIsRemoved(isRemoved) 191 | 192 | room_info = soup.find('div', class_='zf-room') 193 | room_info_list = list(room_info.stripped_strings) 194 | 195 | location = "{} {} {}".format(room_info_list[15], room_info_list[16], room_info_list[11]) 196 | room.setLocation(location) 197 | room.setPublishTime(room_info_list[-1]) 198 | room.setArea(room_info_list[1]) 199 | room.setType(room_info_list[3]) # 4室2厅3卫; 200 | room.setFloor(room_info_list[5]) 201 | room.setDirection(room_info_list[7]) 202 | room.setSubway(room_info_list[9]) 203 | 204 | room.setDone(True) 205 | 206 | return room 207 | 208 | # 连接数据库 209 | conn = sqlite3.connect('lianjia.db') 210 | 211 | cursor = conn.cursor() 212 | # 创建表,如果已创建,则删除下面这行 213 | cursor.execute(''' 214 | CREATE TABLE ROOM( 215 | url VARCHAR(1000) PRIMARY KEY, 216 | price Double, 217 | area Double, 218 | isRemoved VARCHAR(1000), 219 | special_label VARCHAR(1000), 220 | rooms INT, 221 | halls INT, 222 | toilet INT, 223 | rent_way INT, 224 | location VARCHAR(1000), 225 | is_near_subway VARCHAR(1000), 226 | direction VARCHAR(1000), 227 | floor VARCHAR(1000), 228 | publish_time VARCHAR(1000), 229 | title VARCHAR(1000) 230 | ) 231 | ''') 232 | 233 | SELECT_COMMAND = "select * from ROOM where url='{}';" 234 | INSERT_COMMAND = "insert into ROOM(url, price, area, isRemoved, \ 235 | special_label, rooms, halls, toilet, rent_way, \ 236 | location, is_near_subway, direction, floor, publish_time, title) \ 237 | values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);" 238 | 239 | # 根据上面爬取到的房屋 ID 组合生成 url,然后依次爬取 240 | def getAllHouseInfo(file_path): 241 | base_url = 'https://sh.lianjia.com/zufang/{}.html' 242 | with open(file_path) as f: 243 | lines = f.readlines() 244 | 245 | urls = [base_url.format(line.strip('\n')) for line in lines] 246 | for url in tqdm(urls): 247 | cursor = conn.cursor() 248 | cursor.execute(SELECT_COMMAND.format(url)) 249 | if len(cursor.fetchall()) != 0: 250 | continue 251 | cursor.close() 252 | # 如果失败,最多尝试 5 次 253 | count = 0 254 | while count < 5: 255 | room = getRoom(url) 256 | if room.done: 257 | break 258 | count += 1 259 | if count == 5: 260 | continue 261 | 262 | # 插入数据库 263 | cursor = conn.cursor() 264 | cursor.execute(INSERT_COMMAND, 265 | (room.url, room.price, room.area, room.isRemoved, room.special_label, 266 | room.rooms, room.halls, room.toilet, room.rent_way, room.location, room.is_near_subway, 267 | room.direction, room.floor, room.publish_time, room.title)) 268 | cursor.close() 269 | conn.commit() 270 | if cursor.rowcount != 1: 271 | print('插入错误') 272 | 273 | 274 | getAllHouseInfo('house_id_list.txt') 275 | 276 | csv_path = 'lianjia.csv' 277 | 278 | cursor = conn.cursor() 279 | 280 | # 保存到本地 csv 文件 281 | cursor.execute('SELECT * FROM ROOM') 282 | 283 | data = cursor.fetchall() 284 | data = list(map(list, data)) 285 | name_attribute = ['url', 'price', 'area', 'state', 'label', 'rooms', 'halls', 'toilets', 'rentway', 'location', 286 | 'subway', 'direction', 'floor', 'publishtime', 'title'] 287 | data_frame =pd.DataFrame(columns=name_attribute,data=data) 288 | data_frame.to_csv(csv_path, encoding='utf_8_sig') 289 | conn.close() -------------------------------------------------------------------------------- /Answers/week5-spiders-02/create_sqlite_database.py: -------------------------------------------------------------------------------- 1 | import sqlite3 2 | 3 | 4 | # 创建携程数据库 5 | conn = sqlite3.connect('xiecheng.db') 6 | 7 | # 创建一个包含机票信息的表 8 | # 分别是 公司名、出发时间、到达时间、出发机场、到达机场、飞机类型、准点率、飞机编号、价格、日期 9 | CREATE_COMMAND1 = ''' 10 | CREATE table AIRPLANE ( 11 | company_name varchar(1000), 12 | start_time varchar(1000), 13 | arrival_time varchar(1000), 14 | start_airport varchar(1000), 15 | arrival_airport varchar(1000), 16 | airpane_type varchar(1000), 17 | ontime_rate float, 18 | airpane_number varchar(1000), 19 | price float, 20 | date varchar(1000) 21 | ); 22 | ''' 23 | 24 | # 选择预计,根据飞机编号、日期、出发时间选出需要的机票信息 25 | SELECT_COMMAND = ''' 26 | select * from AIRPLANE where airpane_number=? and date=? and start_time=?; 27 | ''' 28 | 29 | # 插入新的数据 30 | INSERT_COMMAND1 = ''' 31 | insert into AIRPLANE values(?,?,?,?,?,?,?,?,?,?); 32 | ''' 33 | 34 | # 创建一个最低价格的表,包含出发城市、到达城市、日期、最低价格 35 | CREATE_COMMAND2 = ''' 36 | CREATE table LOWEST_PRICE ( 37 | start_city varchar(1000), 38 | arrival_city varchar(1000), 39 | date varchar(1000), 40 | price float 41 | ); 42 | ''' 43 | 44 | # 插入数据 45 | INSERT_COMMAND2 = ''' 46 | insert into LOWEST_PRICE values(?,?,?,?); 47 | ''' 48 | 49 | # 创建表 50 | cursor = conn.cursor() 51 | cursor.execute(CREATE_COMMAND1) 52 | cursor.close() 53 | cursor = conn.cursor() 54 | cursor.execute(CREATE_COMMAND2) 55 | cursor.close() 56 | -------------------------------------------------------------------------------- /Answers/week5-spiders-02/insert_database.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import pandas as pd 3 | 4 | 5 | base = datetime.date(2018, 10, 30) 6 | numdays = 80 7 | 8 | # 所有的十月三十号以后的八十天的 list 9 | date_list = [base + datetime.timedelta(days=x) for x in range(0, numdays)] 10 | 11 | # 获取从 start 到 dest 的数据并插入数据库中 12 | def getTickets(start, dest, driver, date_list, conn): 13 | cursor = conn.cursor() 14 | name_attribute = [] 15 | for one_day in tqdm_notebook(date_list): 16 | # 获取数据 17 | tmp = get_ticket_info(start, dest, str(one_day), driver) 18 | for x in tmp: 19 | result = cursor.execute( 20 | SELECT_COMMAND, (x[-2], str(one_day), x[1])).fetchall() 21 | x.append(str(one_day)) 22 | if len(result) == 0: 23 | # 如果没有爬取则插入数据库 24 | cursor.execute(INSERT_COMMAND1, x) 25 | conn.commit() 26 | cursor.close() 27 | 28 | 29 | # 成都到上海和上海到成都 30 | getTickets('CTU', 'SHA', driver, date_list, conn) 31 | getTickets('SHA', 'CTU', driver, date_list, conn) 32 | -------------------------------------------------------------------------------- /Answers/week5-spiders-02/xiecheng_spider.py: -------------------------------------------------------------------------------- 1 | # 使用 BeautifulSoup 进行解析 2 | from bs4 import BeautifulSoup 3 | from selenium import webdriver 4 | from selenium.common.exceptions import TimeoutException 5 | import time 6 | from selenium.webdriver.chrome.options import Options 7 | import re 8 | from selenium.webdriver.common.proxy import Proxy, ProxyType 9 | 10 | 11 | ''' 12 | dstation: 出发城市代码 13 | astation: 到达城市代码 14 | date: 出发日期,形如 2018-10-30 15 | driver: 创建的 webdriver 16 | ''' 17 | def get_ticket_info(dstation, astation, date, driver): 18 | url = "http://flights.ctrip.com/booking/%s-%s-day-1.html?DDate1=%s" % ( 19 | dstation, astation, date) 20 | # 一直尝试到成功 21 | while True: 22 | try: 23 | driver.get(url) 24 | break 25 | except TimeoutException as e: 26 | pass 27 | # 等待页面加载出来 28 | time.sleep(2) 29 | 30 | # webdriver 执行 js 语句滑动窗口,一直滑动到底部 31 | initial_pagesource = driver.page_source 32 | while True: 33 | # 滑到页面底部,暂停 0.1 秒是为了等待页面刷新出结果 34 | driver.execute_script( 35 | "window.scrollTo(0, document.body.scrollHeight);") 36 | # 等待数据加载 37 | time.sleep(1) 38 | # 如果当前页面和上一个页面的 html 内容不同,则表明滑动到底部了 39 | if initial_pagesource == driver.page_source: 40 | break 41 | initial_pagesource = driver.page_source 42 | 43 | # 使用 BeautifulSoup 解析 html 内容 44 | soup = BeautifulSoup(initial_pagesource) 45 | # 获取搜索结果的每一个项 46 | result = soup.find_all("div", class_=["search_table_header", ]) 47 | result_list = [] 48 | for ticket_info in result: 49 | try: 50 | # 航空公司名、出发时间、到达时间 51 | company_name, start_time, arrival_time = [ 52 | x.text for x in ticket_info.find_all('strong')] 53 | # 出发机场、到达机场 54 | start_airport, arrival_airport = [ 55 | x.text for x in ticket_info.find_all("div", class_=["airport", ])] 56 | tmp = [x.text for x in ticket_info.find_all( 57 | "span", class_=["direction_black_border", ])] 58 | # 飞机类型,准点率(可能没有) 59 | if len(tmp) == 2: 60 | airpane_type, ontime_rate = tmp 61 | ontime_rate = float(''.join(filter(str.isdigit, ontime_rate)))/100 62 | else: 63 | airpane_type = tmp[0] 64 | ontime_rate = 0 65 | # 航班编号 66 | airpane_number = [x.text for x in ticket_info.find_all("span")][2] 67 | # 价格(经济舱) 68 | price = int([''.join(list(filter(str.isdigit, x.text))) 69 | for x in ticket_info.find_all("span", class_=["base_price02", ])][0]) 70 | result_list.append([company_name, start_time, arrival_time, start_airport, 71 | arrival_airport, airpane_type, ontime_rate, airpane_number, price]) 72 | 73 | except Exception as E: 74 | print(E) 75 | 76 | # 按机票价格排序后返回 77 | return sorted(result_list, key=lambda x: x[-1]) 78 | 79 | if __name__ == "__main__": 80 | # driver = webdriver.PhantomJS(executable_path="./chromedriver", service_args=['--load-images=no']) 81 | options = Options() 82 | # options = webdriver.ChromeOptions() 83 | # options.add_argument("--headless") # Runs Chrome in headless mode. 84 | options.add_argument('--no-sandbox') # # Bypass OS security model 85 | options.add_argument('start-maximized') 86 | options.add_argument('disable-infobars') 87 | options.add_argument("--disable-extensions") 88 | driver = webdriver.Chrome(options=options, executable_path='./chromedriver') 89 | # driver = webdriver.Chrome("./chromedriver") 90 | result_list = get_ticket_info('CTU', 'SHA', '2018-10-30', driver) 91 | print(result_list) -------------------------------------------------------------------------------- /Assignments/README.md: -------------------------------------------------------------------------------- 1 |
2 |

3 | 实验楼《楼+ 数据分析与挖掘实战》优秀项目挑战报告|课程报名 4 |
5 | 6 |
7 | 8 | 如果 Github 加载缓慢,可以点击下方链接快速浏览。👇 9 | 10 | ### 第 1 期课程 11 | 12 | - 报告题目:[中国保险业过去五年基础数据分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm01-stenphen-%E4%B8%AD%E5%9B%BD%E4%BF%9D%E9%99%A9%E4%B8%9A%E8%BF%87%E5%8E%BB%E4%BA%94%E5%B9%B4%E5%9F%BA%E7%A1%80%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90.ipynb)|学员昵称:stenphen 🌟 13 | - 报告题目:[上海历史天气数据分析预测](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm01-stenphen-%E4%B8%AD%E5%9B%BD%E4%BF%9D%E9%99%A9%E4%B8%9A%E8%BF%87%E5%8E%BB%E4%BA%94%E5%B9%B4%E5%9F%BA%E7%A1%80%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90.ipynb)|学员昵称:BellaG 14 | - 报告题目:[双色球历史数据统计预测](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%89dm01-hcccom-%E5%8F%8C%E8%89%B2%E7%90%83%E5%8E%86%E5%8F%B2%E6%95%B0%E6%8D%AE%E7%BB%9F%E8%AE%A1%E9%A2%84%E6%B5%8B.ipynb)|学员昵称:hcccom 15 | 16 | ### 第 2 期课程 17 | 18 | - 报告题目:[杭州互联网寒冬背景下的数据分析岗现状分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm02-%E7%B1%B3%E7%AB%B9314159-%E6%9D%AD%E5%B7%9E%E4%BA%92%E8%81%94%E7%BD%91%E5%AF%92%E5%86%AC%E8%83%8C%E6%99%AF%E4%B8%8B%E7%9A%84%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B2%97%E7%8E%B0%E7%8A%B6%E5%88%86%E6%9E%90.ipynb)|学员昵称:米竹314159 🌟 19 | - 报告题目:[医学专业论坛的数据爬取和分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%88dm02-linnecn-%E5%8C%BB%E5%AD%A6%E4%B8%93%E4%B8%9A%E8%AE%BA%E5%9D%9B%E7%9A%84%E6%95%B0%E6%8D%AE%E7%88%AC%E5%8F%96%E5%92%8C%E5%88%86%E6%9E%90.ipynb)|学员昵称:linnecn 20 | 21 | ### 第 3 期课程 22 | 23 | - 虚位以待 24 | 25 | ### 第 4 期课程 26 | 27 | - 报告题目:[链家成都市区挂牌二手房分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm04-Luo2019-%E9%93%BE%E5%AE%B6%E6%88%90%E9%83%BD%E5%B8%82%E5%8C%BA%E6%8C%82%E7%89%8C%E4%BA%8C%E6%89%8B%E6%88%BF%E5%88%86%E6%9E%90.ipynb)|学员昵称:Luo2019 🌟 28 | - 报告题目:[B 站番剧数据简单分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%88dm04-Yueyec-B-%E7%AB%99%E7%95%AA%E5%89%A7%E6%95%B0%E6%8D%AE%E7%AE%80%E5%8D%95%E5%88%86%E6%9E%90.ipynb)|学员昵称:Yueyec 29 | 30 | ### 第 5 期课程 31 | 32 | - 报告题目:[京东手机销售数据分析](https://www.kaggle.com/ted0001/dm05-998494)|学员昵称:[Ted_Wei](https://www.lanqiao.cn/users/998494/) 🌟 33 | - 报告题目:[通信基站室内分布系统外引小区识别](https://www.kaggle.com/cym1085893/dm05-1085893)|学员昵称:[yiming_chen](https://www.lanqiao.cn/users/1085893/) 34 | 35 | ### 第 6 期课程 36 | 37 | - 报告题目:[大连地区酒店数据分析](https://www.kaggle.com/louplus/dm06-937174)|学员昵称:[Miss_candy](https://www.lanqiao.cn/users/937174/) 🌟 38 | 39 | ### 第 7 期课程 40 | 41 | - 报告题目:[微博搜索“双十一”数据分析](https://www.kaggle.com/lanjie/dm07-1127847)|学员昵称:[灵汐](https://www.lanqiao.cn/users/1127847/) 42 | 43 | ### 第 8 期课程 44 | 45 | - 报告题目:[B站up主“老番茄”基本数据采集分析](https://www.kaggle.com/truwbin/dm08-877339-b-up)|学员昵称:[今天小古不出门](https://www.lanqiao.cn/users/877339/) 🌟 46 | - 报告题目:[下厨房家常菜菜谱分析及新菜谱预测评分](https://www.kaggle.com/fors3c/dm08-ns3c)|学员昵称:ns3c 47 | 48 | ### 第 9 期课程 49 | 50 | - 报告题目:[世界银行国际旅游业指标分析](https://www.kaggle.com/furongrong/dm09-535211)|学员昵称:[RR25](https://www.lanqiao.cn/users/535211/) 🌟 51 | - 报告题目:[猪肉价格数据分析](https://www.kaggle.com/suxiaomo/dm09-1180757)|学员昵称:[苏小墨](https://www.lanqiao.cn/users/1180757/) 52 | - 报告题目:[汽车之家数据分析](https://www.kaggle.com/mengchenshang/dm09-1176812)|学员昵称:[凹润纸](https://www.lanqiao.cn/users/1176812/) 53 | 54 | ### 第 10 期课程 55 | 56 | - 虚位以待 57 | 58 | ### 第 11 期课程 59 | 60 | - 报告题目:[科比职业生涯回顾与模型预测](https://www.kaggle.com/yemujianglin/dm11-1276351)|学员昵称:[夜幕降临_](https://www.lanqiao.cn/users/1276351/) 🌟 61 | 62 | ### 第 12 期课程 63 | 64 | - 虚位以待 65 |
66 | 67 | ### 第 13 期课程 68 | 69 | - 报告题目:[基于 Python 语言的加拿大联邦大选数据分析](https://www.kaggle.com/czz1403/dm13-1204880-python)|学员昵称:[TXZXTLD](https://www.lanqiao.cn/users/1204880/) 🌟 70 | - 报告题目:[新冠疫情社会影响数据分析](https://www.kaggle.com/vincentbao/dm13-812273)|学员昵称:[vincentbao](https://www.lanqiao.cn/users/812273/) 🌟 已经制作成课程:https://www.lanqiao.cn/courses/2791 71 | 72 | 73 | ``` 74 | - 原作者可以提 PR 更新自己的报告内容。 75 | - 实验报告版权归属原学员且授权实验楼独家使用,请勿用于商业用途。 76 | ``` 77 | -------------------------------------------------------------------------------- /Assignments/🥉dm01-hcccom-双色球历史数据统计预测.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# 双色球历史数据统计预测" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "- 报告题目:双色球历史数据统计预测\n", 22 | "- 学员昵称:hcccom\n", 23 | "- 课程期数:第一期" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "© 本文著作权归作者所有,并授权实验楼独家使用,未经实验楼许可,不得转载使用。" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "---" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### 获取数据" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "import random\n", 54 | "import numpy as np\n", 55 | "import requests\n", 56 | "import csv\n", 57 | "from bs4 import BeautifulSoup\n", 58 | "headers = {\n", 59 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}\n", 60 | "res = requests.get(\n", 61 | " 'https://datachart.500.com/ssq/history/newinc/history.php?start=03001&end=18147', headers=headers) # 从03年第一期开始\n", 62 | "res.encoding = 'uft-8'\n", 63 | "soup = BeautifulSoup(res.text, 'lxml')\n", 64 | "data = soup.find_all(attrs={'class': 't_tr1'})\n", 65 | "csvFile = open(\"./ssq004.csv\", 'wt', newline='', encoding='utf-8')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "def gens(x): # 模拟\n", 75 | " random.seed(x)\n", 76 | " a = np.arange(1, 34, 1).tolist()\n", 77 | " red = sorted(random.sample(a, 6))\n", 78 | " b = np.arange(1, 17, 1).tolist()\n", 79 | " blue = random.sample(b, 1)\n", 80 | " red.extend(blue)\n", 81 | " return red" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "with open(r\"ssq004.csv\", 'a', newline='') as f:\n", 91 | " writer = csv.writer(f)\n", 92 | " writer.writerow([\"period\", \"real\", \"date\", \"vis\", 'r1',\n", 93 | " 'r2', 'r3', 'r4', 'r5', 'r6', 'b1']) # 先写入列名\n", 94 | " for i in range(0, len(data)):\n", 95 | " period = data[i].find_all('td')[0].text\n", 96 | " real = [int(data[i].find_all('td')[1].text), int(data[i].find_all('td')[2].text), int(data[i].find_all('td')[3].text), int(data[i].find_all(\n", 97 | " 'td')[4].text), int(data[i].find_all('td')[5].text), int(data[i].find_all('td')[6].text), int(data[i].find_all('td')[7].text)]\n", 98 | " date = data[i].find_all('td')[15].text\n", 99 | " vis = gens(date)\n", 100 | " r1 = int(data[i].find_all('td')[1].text)\n", 101 | " r2 = int(data[i].find_all('td')[2].text)\n", 102 | " r3 = int(data[i].find_all('td')[3].text)\n", 103 | " r4 = int(data[i].find_all('td')[4].text)\n", 104 | " r5 = int(data[i].find_all('td')[5].text)\n", 105 | " r6 = int(data[i].find_all('td')[6].text)\n", 106 | " b1 = int(data[i].find_all('td')[7].text)\n", 107 | "\n", 108 | " writer.writerows(\n", 109 | " [[period, real, date, vis, r1, r2, r3, r4, r5, r6, b1]])\n", 110 | "csvFile.close()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "### 数据处理" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 128 | "[3, 6, 8, 23, 25, 33, 4]" 129 | ] 130 | }, 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "output_type": "execute_result" 134 | } 135 | ], 136 | "source": [ 137 | "import pandas as pd\n", 138 | "import matplotlib.pyplot as plt\n", 139 | "df = pd.read_csv('ssq004.csv', encoding='gbk')\n", 140 | "df.set_index(\"date\", inplace=True)\n", 141 | "df.index = pd.DatetimeIndex(df.index)\n", 142 | "df.sort_index(ascending=True, inplace=True)\n", 143 | "\n", 144 | "\n", 145 | "def get_real(i):\n", 146 | " a = df.real[i]\n", 147 | " lista = a.strip('[]').split(',')\n", 148 | " map(int, lista)\n", 149 | " list_real = [int(x) for x in lista]\n", 150 | " return list_real\n", 151 | "\n", 152 | "\n", 153 | "def get_vis(i): # 产生和时间相关的随机双色球\n", 154 | " b = df.vis[i]\n", 155 | " listb = b.strip('[]').split(',')\n", 156 | " map(int, listb)\n", 157 | " list_vis = [int(x) for x in listb]\n", 158 | " return list_vis\n", 159 | "\n", 160 | "\n", 161 | "get_vis(1)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "# 计算每个球出现的频率" 171 | ] 172 | }, 173 | { 174 | "cell_type": "code", 175 | "execution_count": 10, 176 | "metadata": { 177 | "scrolled": true 178 | }, 179 | "outputs": [], 180 | "source": [ 181 | "df1 = df.drop(columns=['period', 'real', 'vis'])\n", 182 | "df2 = df1.drop(columns=['b1'])\n", 183 | "df3 = df1[['b1']]\n", 184 | "dup = df1[df1.duplicated()].count()\n", 185 | "rd1 = df2.stack().value_counts()\n", 186 | "bd1 = df3['b1'].value_counts()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 11, 192 | "metadata": { 193 | "scrolled": false 194 | }, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "image/png": "\n", 199 | "text/plain": [ 200 | "
" 201 | ] 202 | }, 203 | "metadata": { 204 | "needs_background": "light" 205 | }, 206 | "output_type": "display_data" 207 | }, 208 | { 209 | "data": { 210 | "image/png": "\n", 211 | "text/plain": [ 212 | "
" 213 | ] 214 | }, 215 | "metadata": { 216 | "needs_background": "light" 217 | }, 218 | "output_type": "display_data" 219 | } 220 | ], 221 | "source": [ 222 | "import matplotlib.pyplot as plt\n", 223 | "from pylab import *\n", 224 | "plt.figure(111)\n", 225 | "rd1.plot(kind='bar', align='center')\n", 226 | "plt.xlabel(\"red\")\n", 227 | "plt.ylabel(\"Times\")\n", 228 | "plt.show()\n", 229 | "plt.figure(112)\n", 230 | "bd1.plot(kind='bar')\n", 231 | "plt.xlabel(\"blue\")\n", 232 | "plt.ylabel(\"Times\")\n", 233 | "plt.show()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### 取一组\n", 241 | " " 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 14, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "data": { 251 | "text/html": [ 252 | "
\n", 253 | "\n", 266 | "\n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | "
periodrealvisr1r2r3r4r5r6b1
date
2018-12-0418142[5, 8, 10, 11, 27, 28, 11][11, 13, 16, 24, 29, 31, 8]581011272811
2018-12-0618143[4, 6, 15, 28, 32, 33, 14][10, 14, 15, 16, 24, 33, 14]461528323314
2018-12-0918144[8, 13, 17, 18, 20, 27, 13][3, 7, 13, 17, 19, 30, 13]8131718202713
2018-12-1118145[3, 9, 13, 22, 23, 25, 6][3, 6, 10, 12, 19, 25, 12]39132223256
2018-12-1318146[2, 10, 11, 17, 18, 29, 16][6, 10, 21, 24, 26, 32, 3]2101117182916
\n", 363 | "
" 364 | ], 365 | "text/plain": [ 366 | " period real vis \\\n", 367 | "date \n", 368 | "2018-12-04 18142 [5, 8, 10, 11, 27, 28, 11] [11, 13, 16, 24, 29, 31, 8] \n", 369 | "2018-12-06 18143 [4, 6, 15, 28, 32, 33, 14] [10, 14, 15, 16, 24, 33, 14] \n", 370 | "2018-12-09 18144 [8, 13, 17, 18, 20, 27, 13] [3, 7, 13, 17, 19, 30, 13] \n", 371 | "2018-12-11 18145 [3, 9, 13, 22, 23, 25, 6] [3, 6, 10, 12, 19, 25, 12] \n", 372 | "2018-12-13 18146 [2, 10, 11, 17, 18, 29, 16] [6, 10, 21, 24, 26, 32, 3] \n", 373 | "\n", 374 | " r1 r2 r3 r4 r5 r6 b1 \n", 375 | "date \n", 376 | "2018-12-04 5 8 10 11 27 28 11 \n", 377 | "2018-12-06 4 6 15 28 32 33 14 \n", 378 | "2018-12-09 8 13 17 18 20 27 13 \n", 379 | "2018-12-11 3 9 13 22 23 25 6 \n", 380 | "2018-12-13 2 10 11 17 18 29 16 " 381 | ] 382 | }, 383 | "execution_count": 14, 384 | "metadata": {}, 385 | "output_type": "execute_result" 386 | } 387 | ], 388 | "source": [ 389 | "df.tail(5)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": 15, 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "[]" 401 | ] 402 | }, 403 | "execution_count": 15, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | }, 407 | { 408 | "data": { 409 | "image/png": "\n", 410 | "text/plain": [ 411 | "
" 412 | ] 413 | }, 414 | "metadata": { 415 | "needs_background": "light" 416 | }, 417 | "output_type": "display_data" 418 | } 419 | ], 420 | "source": [ 421 | "x = [1, 2, 3, 4, 5, 6, 7]\n", 422 | "y1 = get_real(-1)\n", 423 | "y2 = get_vis(-1)\n", 424 | "fig, ax = plt.subplots(figsize=(15, 5))\n", 425 | "ax.plot(x, y1, 'ch-', markersize=10)\n", 426 | "ax.plot(x, y2, 'ys-', markersize=10)" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": 167, 432 | "metadata": {}, 433 | "outputs": [ 434 | { 435 | "data": { 436 | "image/png": "\n", 437 | "text/plain": [ 438 | "
" 439 | ] 440 | }, 441 | "metadata": { 442 | "needs_background": "light" 443 | }, 444 | "output_type": "display_data" 445 | }, 446 | { 447 | "data": { 448 | "text/plain": [ 449 | "array([0.95333787, 3.40088556])" 450 | ] 451 | }, 452 | "execution_count": 167, 453 | "metadata": {}, 454 | "output_type": "execute_result" 455 | } 456 | ], 457 | "source": [ 458 | "x_data = np.array(y1)\n", 459 | "\n", 460 | "y_data = np.array(y2)\n", 461 | "\n", 462 | "poly = np.polyfit(x_data, y_data, deg=1)\n", 463 | "\n", 464 | "plt.plot(x_data, y_data, 'o')\n", 465 | "\n", 466 | "plt.plot(x_data, np.polyval(poly, x_data))\n", 467 | "\n", 468 | "plt.show()\n", 469 | "poly" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 7, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAIABJREFUeJzt3Xl81NW9//HXSQgQ1oCEJUEWURFlN1KtrVK0olYxoqjsoi3+Wr3V1nJdbr3a7eotWnurVasVZV9k12IpVq1rbRN2RVyQJROEICRhCZDl/P44EwkhIZNkZr4z33k/H488Er98M/MZB945Od/zPR9jrUVEROJfktcFiIhIeCjQRUR8QoEuIuITCnQREZ9QoIuI+IQCXUTEJxToIiI+oUAXEfEJBbqIiE80ieaTdejQwfbo0SOaTykiEvdyc3P3WGvT6zovqoHeo0cPcnJyovmUIiJxzxizLZTzNOUiIuITCnQREZ9QoIuI+IQCXUTEJxToIiI+EdVVLiIiiWbpmgBTV24mv7CEjLRUpgzvTfagzIg8lwJdRCRClq4JcN/iDZSUlgMQKCzhvsUbACIS6ppyERGJhMNFLH91BZeUv8O45FU0pRSAktJypq7cHJGn1AhdRKShDu2FvV/A3i3VPj6HQ18xDaApVFjD+xVn87l1o/L8wpKIlKNAFxGpjbXB0P68htDeAiX7jj+/TVdo3xPOugra9eDLvz9FZwp4oGzS12EOkJGWGpFyFegiktishYMFx0L6q6rh/QUcKapysoG0U6H9aXDOte5z+17uc7vukJJ67DFfuYvOFPBoxRhml1/69SOkpiQzZXjviLwUBbqI+J+1sP/LE6dFKkP76IFj55pkSOvmQrrrecHQPg1O6eWON2lW93OtegByX4Rv383p7W8lU6tcRETqoaIC9uefOC3y1RbY9wWUHjp2blITSOvuQrr7hcdCu/1pLrSTUxpex1uPwntPwJDJMOwBso2JWIBXp0AXkfhRUQ7FgROnRfYGQ7vs8LFzk5tCux4upE8b6ua2K0O77amQHIH4++cz8MavYcBouPx/wZjwP8dJKNBFJLaUl0HRjpovQu7bCuVHj53bpDm0Cwb16Ze4EXdlaLfJhKTk6NW9Zjb89R7oczWMeBKSor8qXIEuItFXXgqF26tNjQRH3YXboKLs2LkpLVxAp/eG3lccfyGydRdPgvMEHy6F5XdAr2Fw3fORGf2HQIEuIpFRdgT2bat5jXbhDrDlx85t2soFdOd+cPY1x1+IbNUp6lMX9fLpa7Do+9B1CNw4q+6LphGkQBeRhis97KZBalqnXZQHtuLYuc3auJDOPBf6jTr+QmTL9NgO7dpsew/mj4OOfWDMfGja0tNy6gx0Y0xz4C2gWfD8hdbaB40xPYF5QHtgNTDeWnu09kcSkbh09JC74HjCOu0v3AVK7LFzU9u5gD71GzBgzPGh3aJ9fIZ2bfLXwJwb3br08UsgNc3rikIaoR8BhllrDxhjUoB3jDGvAj8FHrfWzjPGPAPcCjwdwVpFJFKO7K/lFvYtsH/n8ee26OACuse3jk2LtO/pLk62aO9N/dG2+2OYORKap8H4pdCyg9cVASEEurXWApWr7lOCHxYYBowJHp8OPIQCXSR2HS6qFtZfHBtxH9x9/LmtOrmw7jWsynK/YHA3b+tN/bFi31aYme3Wqk9YCm2js8Y8FCHNoRtjkoFc4HTgj8DnQKG1tvJSdB4QO69KJFGV7Dt2M031kfahPcef27qLC+kzhx8/NdK+JzRr7U39sa54J8y4xq13v3mF++0khoQU6NbacmCgMSYNWAL0qem0mr7XGDMZmAzQrVu3BpYpIkCVzaKq375ex2ZRfa46PrTb9fD8Al7cOfiVG5kf/AomLoNOZ3td0QnqtcrFWltojHkTOB9IM8Y0CY7SuwL5tXzPs8CzAFlZWTWGvohUUX2zqOMuRlbbLMokQduuVTaLqnJjTdXNoqRxDhfBrJFuumXcIrdSJwaFssolHSgNhnkqcCnwv8AbwPW4lS4TgWWRLFTEV6yFA7uq3cJeZW776P5j59a0WVTlHZGhbBYljXP0EMy5CXZthJvmuovBMSqUEXoXYHpwHj0JWGCtfcUY8xEwzxjza2AN8HwE6xTxj8Id8KeLoGTvsWNJTY7tO9L9m+HdLEoaruwoLBgPO/7p7gA98zKvKzqpUFa5rAcG1XB8CzAkEkWJ+NqX612YXzQFul0Q2c2ipOHKy2Dx9+Gz12DEE9B3pNcV1Ul/g0SirSjPfR4yGVp19LYWqVlFBbx8J3y0DIb/Dwye4HVFIYmBXW1EEkxRHiQ3czfoSOyxFlbeD2tnwcX3wgW3e11RyBToItFWHIA2GbGxS6Cc6M2H4YOn4fzbYei9XldTL/obJRJtRXluqaHEnveegH/8LwwaD8N/E3d7zyjQRaKtKKBAj0W5L8Lffu7W81/9f3EX5qBAF4mu8jLX97KNdsqIKRsWwst3wenfhWufjW6nozBSoItE04Ev3R7hGqHHjs1/hSW3uWbRN86EJk29rqjBFOgi0VS5ZFGBHhu+eAsWTIDO/WH03LjfKkGBLhJNCvTYkZcDc0e7G7vGLYLmbbyuqNEU6CLRVBnomkP31q4PYdZ1rjHFhKW+acyhQBeJpuIANGvri9Fg3Prqc5iRDSktYMIyaN3Z64rCRrf+i0RTUV5MdbhJOEV5rkGFLYcJf3EbovmIAl0kmnRTkXcOFLgwP1wEE1+G9DO9rijsNOUiEk1FeZo/90JJIcy61t3UNWYBZAz0uqKI0AhdJFqOHnLb5mqEHl1HDsDsUVCwGUbPg+4XeF1RxCjQRaKlOOA+K9Cjp/QwzB8LgRwYNR1Ov8TriiJKgS4SLVqDHl3lZbDoVtjyJmQ/DWeP8LqiiNMcuki0aA169FRUwLLb4eNX4IqpMHCM1xVFhQJdJFqKA4Bxe6FL5FgLr06B9fNg2APwjcleVxQ1mnIRiZaiHdCqI0s37GHqys3kF5aQkZbKlOG9yR6kUXvY/P0X8O8/w4V3wrfv9rqaqFKgi0RLUYC9TTpy3+INlJSWAxAoLOG+xRsAFOrh8Pbv4J3HIesWuPQXcbmneWNoykUkWoryWFfc6uswr1RSWs7UlZs9KspH/vWcG533GwVXPpZwYQ4KdJHosBaKA3x+NK3GP84vLIlyQT6zbh6s+Bn0vtKtaEnQfq2J+apFoq1kH5Qe4lDzmjeCykiL7324PbXpZVj6I+h5MVz/AiSneF2RZxToItEQXLJ4/qABpKYc394sNSWZKcN7e1FV/Pv8dVh4C2QOhpvmQEpzryvyVJ2Bbow51RjzhjFmkzHmQ2PMncHjDxljAsaYtcGPKyNfrkicCgb6kAH9eXhkPzLTUjFAZloqD4/spwuiDbH9A5g3FjqcCWNfgmatvK7Ic6GscikD7rbWrjbGtAZyjTGrgn/2uLX20ciVJ+ITVW77z+7aSQHeWDvXu/1ZWneB8UsgtZ3XFcWEOgPdWrsT2Bn8er8xZhOgv40i9VG0A5JSoGW615XEv4JPYOa10Ky1a1DRqqPXFcWMes2hG2N6AIOAD4KH7jDGrDfGTDPG6EekSG2KAq6xRYKuvgibwu0wM9stSZywDNJO9bqimBLy3y5jTCtgEXCXtbYYeBroBQzEjeAfq+X7JhtjcowxOQUFBWEoWSQOFeVBG23K1Sj7d7kGFUcPwPil0OF0ryuKOSEFujEmBRfms621iwGstbusteXW2grgOWBITd9rrX3WWptlrc1KT9evm5KgigPaZbExDu11I/P9u2DsIujc1+uKYlIoq1wM8DywyVr7uyrHu1Q57VpgY/jLE/GBinIozlcv0YY6sh9mX++aO4+eC6ee53VFMSuUVS4XAuOBDcaYtcFj9wOjjTEDAQtsBW6LSIUi8W7/l64psUbo9VdaAnNHQ/5auHEWnHax1xXFtFBWubwD1LQpworwlyMSf5auCZx898Sv90FXoNdLeSm8dDNsfQdGPgdn6VaXumi3RZFGWLomUPfuicXqVFRvFeWw5Db45K9w1ePQf5TXFcUFraESaYSpKzfXvXvi163nNIceEmvhlbtg4yL47i/dVrgSEgW6SCPUtkvicceLAtCsDTRvG6Wq4pi18Lefw+oZ8O2fuSYVEjIFukgj1LZL4nHHi/LURzRUb02F95+EIbfBsJ97XU3cUaCLNMKU4b3r3j2xOE/z56H459Pwxm9gwBi4/JGEbFDRWLooKtIIlRc+61zlkjHIowrjxOqZ8Nd7oc/VMOIJbZHQQAp0kUbKHpRZ++6JpSVw6CuN0E/mwyXw8o+h1zC47nlIViw1lH4MikRSUXDbXK1Br9mnr8GiH0DXIe7GoSbNvK4orinQRSJJa9Brt/VdmD8OOvaBsQugaUuvK4p7CnSRSNIa9JoFVsOcG932t+OXaElnmCjQRSLp6ykXBfrXdn8Ms66DFu3cnuYtO3hdkW8o0EUiqWgHtOyoueFKe79we5onN3Vh3ibD64p8RZeTRSJJ+6AfU5zvwrz8CEx6Fdqf5nVFvqMRukgkFeVp/hzg4B6Yke0aVYxb5C6EStgp0EUixdpgL9EE73t5uAhmjYTCbTBmPmSe63VFvqUpF5FIKdkHpQcT+4Lo0UNuNcuuD+GmudDjQq8r8jUFukikFAdXuCTqHHrZEbfOfMcH7g7QMy/zuiLfU6CLREpRAt9UVF4Gi74Pn/8dRjwJfUd6XVFC0By6SKQkaqBXVLi9WTYth+EPw+DxXleUMDRClwaps4+muEBPSnHr0BOFtbDyPlg7G4beBxf8yOuKEooCXeotpD6a4ubQ22Qk1lawb/wGPngGzr8dLr7H62oSTgL9TZNwCamPpgTXoCfQdMu7f3AdhwZPgOG/UYMKDyjQpd5C6qMpwTXoCRLoOS/AqgfgnJFw1e8V5h5RoEu9hdRHM9FVlAenXBJgCmrDQnjlJ3DGZXDtnyApue7vkYhQoEu9hdRHM9Ed2AW23P8j9M2vwuLJ0P1CuGEGNGnqdUUJrc5AN8acaox5wxizyRjzoTHmzuDx9saYVcaYT4Of20W+XIkF2YMyeXhkPzLTUjFAZloqD4/spwuiVSXCksUt/4AFE6HLABg9F1L0G5rXQlnlUgbcba1dbYxpDeQaY1YBNwN/t9Y+Yoy5F7gX0GXtBHHSPpri/0DPy4G5o+GUXm6zreZtvK5ICGGEbq3daa1dHfx6P7AJyASuAaYHT5sOZEeqSJG4UxnofpxD/3Kja1DRqqPrNtSivdcVSVC95tCNMT2AQcAHQCdr7U5woQ8k0N0TInUoDkDT1v5rrfbV5zDzWkhp4RpUtO7sdUVSRciBboxpBSwC7rLWFtfj+yYbY3KMMTkFBQUNqVEk/lTug+6n5XuFO1yDClvuwrxdd68rkmpCCnRjTAouzGdbaxcHD+8yxnQJ/nkXYHdN32utfdZam2WtzUpPTw9HzSKxz283FR3YDTOz4XCxm2ZJP9PriqQGoaxyMcDzwCZr7e+q/NFyYGLw64nAsvCXJxKn/LQGvWQfzBzpWsiNXeBWtUhMCmWVy4XAeGCDMWZt8Nj9wCPAAmPMrcB2YFRkShSJM6WH4WCBPzoVHTkAs0fBns0weh50O9/riuQk6gx0a+07QG0TgZeEtxwRH/i6sUWcj9BLD8O8MRBYDTdMh9P1zz3WabdFkXDzwxr08lJYeAt88Q/Ifgb6XO11RRIC3fovDXNoLzzeF2ZdD5tedgEgTuUIPV7n0CsqYOmPYPNf4MpHYeBoryuSEGmELg2z/X0o2gFHiuGzVdCqEwwa57ZObdfD6+q8Fc83FVkLK+6GDQtg2AMw5AdeVyT1oBG6NExeDiQ1gZ985Lq5ZwyCdx6H/xvobjz5aFnijtqL8qBlOqQ097qS+nvtIciZBhfeBd++2+tqpJ40QpeGCeRAp77QrBWcdaX7KMqDNbNg9QxYMMGF2sCxbtR+Si+vK46eeF2D/vZj8O7vIetWuPQhf90UlSA0Qpf6qyiHwBromnX88bZdYei9cNcGGPMSdB0C7z0BTwyG6SNg42IoO+JNzdEUj2vQ//Uc/P2X0O8GN2+uMI9LGqFL/e35BI7uh8ysmv88KRnOvMx9FOfDmtlu1L5wErQ4BQaOgcE3Q4fTo1p2VFjrRuinfcfrSkK3di6s+Bn0/h5kP5VYPVB9Ru+c1F9ejvtcfYRekzYZcPEUuHOt22a1+zfh/afgyXPhxatct5vSw5GtN5oOF8HRA/GzBn3Ty7DsR9DzYrh+GiSneF2RNIJG6FJ/gRy3i2D7esyLJyXD6Ze6j/1fwtrZkDsdFt0Kqe1hwGg4dyKkx3nXo3hag/75626teWYW3DQnPi/iynE0Qpf6y8uFzHMb/qt5685uBcWP18L4pdDzIvjXn+CPQ2DaFbBuPpTGacPpr9egx3igb/8nzBsLHXq7/VmatfK6IgkDBbrUz9GDsPvD2ufP6yMpCXp9x91W/tNNcOkv4MCXsGQyPNYbXr0Hdm9q/PNEU9EO9zmWR+g717n9WdpkwPjFkKrukX6hQJf6yV8LtiK0+fP6aNURvnUX3JELE192UzM50+Cp8+H5y2DtHDh6KLzPGQlFAbc+v1WM9nsp+MTdJ9C8rfvtKFbrlAbRHLrUTyB4QTTz3Mg8flKSm4LpeREc3APr5rq59qU/hFfvhf43wLk3Q+e+kXn+xirKcyPfpGSvKznRvm2uQYVJdg0q0nywG6QcR4Eu9ZOX427tb9kh8s/VsgN88z/ggjtg23uQ+6Jb/vjv59yUz7k3Q9+R0LRl5GsJVXEgNufP93/pwrz0INy8IrFu9EogmnKR+snLCc/8eX0YAz0uhOueg7s/huEPw5H9sPwOeLQ3vPITNy8cC4p2xN78+aG9MCPbdR0auyh2f7uRRtMIXUJXnA/788M/f14fLdrDBT+C838IOz5wo/a1c9x8e8ag4Kj9OmjWOvq1VZRD8c7YWoN+ZD/Mug72boGxL8Gp53ldkUSQRugSusobiqI9Qq+JMa57zrXPuFH7Fb912wq8fCc8dhYs/7FrzGBt9Go6sBsqSmNnhF5aAnNugi/Xu5VEp13sdUUSYRqhS+gCOZCUAp37eV3J8VLbwTdugyGT3Q+d3Bdh/QJYPR0693ej9n6joHmbyNYRS2vQy47Cgomw7V247s/Q+wqvK5Io0AhdQpeX68I8Vu8oNMZNKWT/EX62Gb73mBuh/+Wnbl37sttd4Edq1B4ra9Aryt1a/k9XwlWPQ7/rva1HokYjdAlNRTnkr4FBY72uJDTN28J533dbweavdqP2DYvc9r6d+h4btaemhe85i2Kgl6i18Mpd8OES+O6vIGuSd7VI1GmELqHZvckteYuF+fP6MMatmR/xhJtrv+pxd+PPip+5ufYlP4TtH4Rn1F6UByktoXkYf0jUh7Xwt5+7pZ0XTYELf+xNHeIZjdAlNIF67LAYq5q3gaxb3Ef+GnfD0oaXYN0cSO/jRu39b3AraRqiONjYwqu9xP/xW3j/SRhyG3znv7ypQTylEbqEJi/HXXxsf5rXlYRHxiC4+vdw92Y3em/aAv56jxu1L57sbmSq76i9KM+76Zb3n4I3/8d1iLr8ETWoSFAaoUtoAsEdFv0WFM1auRZ5gyfAzvVuZcz6BbB+PnQ4043aB4wObdReFPBmBdDqmbDyPugzAq7+gxpUJDC981K3I/vdHHq8zZ/XV5f+bmXM3R/DNU+5ufCV97sVMgtvhS/ern3UXnYEDu6O/pLFD5fAyz+GXpe45YnJGqMlsjoD3RgzzRiz2xizscqxh4wxAWPM2uDHlZEtUzyVvwaw0DVB7jJs2tKt5vn+Kvjhe3DuJPhsFUy/Cp7Mgnf/4DYOq6pyDXo0lyx+8jdY9AM49Rtw4yxo0ix6zy0xKZQR+ovA5TUcf9xaOzD4sSK8ZUlM+foO0cHe1uGFTufAlb91c+3X/glapsOqB9xc+0s3w5Y3oaKiSqeiKM2hb30HFoyHTmfDmPnuGoAkvDp/P7PWvmWM6RH5UiRmBXJdu7mGrv7wg5RUGHCT+9j9sZtrXzfXTXm06wnpZ7nz2kZhS9rAandLf1p3GLfYrbkXoXFz6HcYY9YHp2TU8sSvrHUj9HherhhuHc+Cyx+Gn34MI/8MbTLhk1fd+vY2GZF97t2bYNZI98N1wtLobGMscaOhV1CeBn4F2ODnx4BbajrRGDMZmAzQrVu3Bj6deKY44NrC+f2CaEOkNIf+o9zHnk/dNrUpqZF7vr1b3Da4yc1cg4pI//CQuNOgEbq1dpe1ttxaWwE8Bww5ybnPWmuzrLVZ6enpDa1TvFI5f941Qh2K/KLDGdDtG5F7/KKAa1BRftSNzNv3jNxzSdxqUKAbY7pU+c9rgY21nStxLpDjRoSdYmyHxURycA/MzIZD+2DcIujYx+uKJEbVOeVijJkLDAU6GGPygAeBocaYgbgpl63AbRGsUbyUl+vWZzdp6nUlielwkWvqXLjdXQBNxJVGErJQVrmMruHw8xGoRWJNeRnsXAuDJ3pdSWI6ehBm3+AuhI6e69rwiZyEbiuT2u3+CEoPaYWLF8qOwPxxkPcvuH4anPFdryuSOKBAl9pV7rCYqQuiUVVeBotuhc9fhxFPwjnXel2RxAnt5SK1y8uFFqdAux5eV5I4Kipg+X/AppfdromDx3tdkcQRBbrULpDj1p/7bYfFWGWt28J33RwYej+c/0OvK5I4o0CXmh0uhoLNmj+Pptd/Df96Fi64Ay7+T6+rkTikQJea5a8GrObPo+Xd/4O3H3X7sl/2a/1WJA2iQJea5f3bfVagR17ONFj133DOSLjq9wpzaTAFutQsLxdOOQNSPWp4nCjWvwSv/BTOGA4jn4WkZK8rkjimQJcTWesuiGr+PLI+XgFLboMe34IbpkNyitcVSZxToMuJCrfDwQJNt0TSljddg4yMge4u0Eju0igJQ4EuJ6q8oUgj9MjY8W+YOwZO6QVjF0Kz1l5XJD6hQJcT5eVCk+bQqa/XlfjPlxtg9nXQqiOMX5LYXaAk7BTocqJADnQZoDndcNvzmds5sWkr16CidWevKxKfUaDL8cpLYec6dSgKt8IdrkGFtTB+KbTr7nVF4kPanEuOt2sjlB1Wh6JwOrDbhfmR/XDzK5B+ptcViU8p0OV4lS3nNEIPj5J9bppl/043Mu/S3+uKxMcU6HK8QC60TIc0NfRutCMHYPYo2PMJjJkf2Z6jIijQpbo87bAYFqWHYd5oCKx2Nw31GuZ1RZIAdFFUjinZB199qvnzxiovhYWT4Iu3IPsp6HO11xVJglCgi1NRARsXua81f95wFRWw9IeweQVc+SgMuMnriiSBaMol0R3YDWtmQu50KNzmuhN1Pc/rquKTtbDibtjwElzy3zDkB15XJAlGgZ6IKirgi39A7gvw8V+gogx6fBsufRDOugqaNPO6wvhjLbz2oNsK91s/gW/f7XVFkoAU6Ink4B5YOxtyX4S9WyC1HXzj/8G5N0OHM7yuLr69/ZhrUnHe9+GSB72uRhKUAt3vrIWt77jR+KaXofwodPsmDL0P+oyAlOZeVxj/PvgTvP4r6H8jXDFVK4TEMwp0vzq0F9bNhZwX3MqV5m0h6xY4dxJ0PMvr6vxj7Rx49T+h9/fgmqcgSesMxDsKdD+xFrb/043GP1wK5Ueg6xDIfhrOzoamLbyu0F8+Wg7LbofThsL10yBZ/5zEW3X+DTTGTAOuAnZba/sGj7UH5gM9gK3ADdbafZErU06qZB+sm++CvOBjaNYGBo93o/HOoW2Bu3RNgKkrN5NfWEJGWipThvcme1BmhAuPY5+9BgtvcUs8b5qjqSuJCaEMKV4EngRmVDl2L/B3a+0jxph7g/99T/jLk1pZ6+7qzH0BNi6GshLIGAwjnoC+10HTliE/1NI1Ae5bvIGS0nIAAoUl3Ld4A4BCvSbb3od54yD9LBj7Ur3+X4tEUp2Bbq19yxjTo9rha4Chwa+nA2+iQI+Ow8Wwfr5bqbJro9tbe8CNbjSeMbBBDzl15eavw7xSSWk5U1duVqBXl78W5twAbTNdgwo10ZYY0tBJv07W2p0A1tqdxpiOtZ1ojJkMTAbo1k0bPjVYYLUbjW9YCKWHoHN/uOpx6Deq0S3M8gtL6nU8YRVshlkj3QXmCcugVbrXFYkcJ+JXcay1zwLPAmRlZdlIP5+vHNnvAjz3Bdd0IqWFm07JmuSmV8K0PC4jLZVADeGdkabGxV/btxVmZINJdmHetqvXFYmcoKGBvssY0yU4Ou8C7A5nUQlv5zq33HDDS3D0AHQ8x+0L0v8GNzoMsynDex83hw6QmpLMlOG9w/5ccal4p2tQUXoIJq1wzZ1FYlBDA305MBF4JPh5WdgqSlRHD7qLm7kvuD3JmzSHc0a60XjX8yJ6s0rlPLlWudTg0F7XoOJAAUxcDp3O8boikVoZa08+C2KMmYu7ANoB2AU8CCwFFgDdgO3AKGvt3rqeLCsry+bk5DSyZJ/Z9aEbja+fD0eKoUNvF+IDbnK35ot3DhfDjBGw6yMYtxB6XuR1RZKgjDG51to6t0ENZZXL6Fr+6JJ6VyVOaYm78Sf3BdjxASQ3dTf+ZE2Cbhfo1vFYUFEB88bAlxvgxtkKc4kLurUtmgo2u9H4urlwuBBOOR0u+zUMGAMtT/G6OqkqKQkGjXcbl/W+3OtqREKiQI+0siPuFvGcabD9PUhKcR1ssia5LWs1Go9dA270ugKRelGgR8qez9yUyto5ULLXNY649CEYOE7rl0UkIhTo4VR2FD5+xY3Gt74NSU2g95VuNN5zqHbiE5GIUqCHw94v3K34a2fDwQJo2w2GPQCDxkHrzl5XJyIJQoHeUOWlrhFwzguw5Q0wSXDmFW403msYJCV7XaGIJBgFen0VbncNldfMhAO7oE0mDL3fbVfbJsPr6kQkgSnQQ1FeBp+udKPxz15zx864zI3GT/+uGhuISExQEp1MUQBWz3Af+/OhVWe46GcweAKkaedIEYktCvTqKsrdKDznBTcqt9bNiV/5WzjzckhO8bpCEZEaKdArFe908+KrZ0DRDmjZES68C86d6NaQi4jEuMQO9IoK2PK6G41vfhVsOfSJqq7cAAAGkUlEQVS8GC77levi3qSp1xWKiIQsMQP9wG43Gs+dDoXboMUpcMHtbt8O7XUtInEqcQK9ogK2vuVG4x+/AhVl0P1bcMl/u71VmjTzukIRkUbxf6Af3OPu4Mx9EfZugeZpMOQ2NxpPP9Pr6kREwsafgW4tbHvXjcY3LYfyo3Dq+XDxPXD2NZCiXpki4j/+CvRDe91e47kvwp5PoFlbOHeSG413Otvr6kREIir+A91a1/UnZ5rrAlR+BDKz4Jo/up6cTVt4XaGISFTEfKAvXROouXlxSaHrw5nzAhRsgqat3e6GWZOgcz+vyxYRibqYDvSlawLct3gDJaXlAAQKDzF38WIGrc6h+86VUFYCGYPg6j9A3+ugWSuPKxYR8U5MB/rUlZspKS2nFYfITn6XMcmvc3bSNg7taA6Db3Sj8YxBXpcpIhITYjrQ8wtLAPhlyouMTH6Hjyq681+lt7C8/JtsGDHK2+JERGJMTAd6RloqgcISnim7mhlll7HW9gIMmWladigiUl1MN7mcMrw3qSnJfGJPZa09HTCkpiQzZXhvr0sTEYk5MT1Czx6UCVDzKhcRETlOowLdGLMV2A+UA2XW2qxwFFVV9qBMBbiISAjCMUL/jrV2TxgeR0REGiGm59BFRCR0jQ10C/zNGJNrjJlc0wnGmMnGmBxjTE5BQUEjn05ERGrT2EC/0Fo7GLgCuN0Yc1H1E6y1z1prs6y1Wenp6Y18OhERqU2jAt1amx/8vBtYAgwJR1EiIlJ/DQ50Y0xLY0zryq+By4CN4SpMRETqpzGrXDoBS4wxlY8zx1r717BUJSIi9dbgQLfWbgEGhLEWERFpBC1bFBHxCQW6iIhPKNBFRHxCgS4i4hMKdBERn4jp7XNFvFJrc3KRGKZAF6nmxObkJdy3eAOAQl1imqZcRKqpbE5eVUlpOVNXbvaoIpHQKNBFqqlsTh7qcZFYoUAXqSajlibktR0XiRUKdJFqKpuTV6Xm5BIPdFFUpBo1J5d4pUAXqYGak0s80pSLiIhPKNBFRHxCgS4i4hMKdBERn1Cgi4j4hLHWRu/JjCkAtkXtCSOjA7DH6yIiyO+vD/z/GvX64l/119jdWpte1zdFNdD9wBiTY63N8rqOSPH76wP/v0a9vvjX0NeoKRcREZ9QoIuI+IQCvf6e9bqACPP76wP/v0a9vvjXoNeoOXQREZ/QCF1ExCcU6PVgjNlqjNlgjFlrjMnxup7GMsZMM8bsNsZsrHKsvTFmlTHm0+Dndl7W2Bi1vL6HjDGB4Hu41hhzpZc1NpYx5lRjzBvGmE3GmA+NMXcGj/vifTzJ6/PF+2iMaW6M+ZcxZl3w9f0ieLynMeaD4Ps33xjTNKTH05RL6IwxW4Esa60v1sAaYy4CDgAzrLV9g8d+C+y11j5ijLkXaGetvcfLOhuqltf3EHDAWvuol7WFizGmC9DFWrvaGNMayAWygZvxwft4ktd3Az54H40xBmhprT1gjEkB3gHuBH4KLLbWzjPGPAOss9Y+XdfjaYSewKy1bwF7qx2+Bpge/Ho67h9PXKrl9fmKtXantXZ18Ov9wCYgE5+8jyd5fb5gnQPB/0wJflhgGLAweDzk90+BXj8W+JsxJtcYM9nrYiKkk7V2J7h/TEBHj+uJhDuMMeuDUzJxORVRE2NMD2AQ8AE+fB+rvT7wyftojEk2xqwFdgOrgM+BQmttWfCUPEL8IaZAr58LrbWDgSuA24O/0kt8eRroBQwEdgKPeVtOeBhjWgGLgLustcVe1xNuNbw+37yP1tpya+1AoCswBOhT02mhPJYCvR6stfnBz7uBJbj/+X6zKzhvWTl/udvjesLKWrsr+A+oAngOH7yHwbnXRcBsa+3i4GHfvI81vT4/vo/W2kLgTeB8IM0YU9lRriuQH8pjKNBDZIxpGbwogzGmJXAZsPHk3xWXlgMTg19PBJZ5WEvYVYZc0LXE+XsYvKj2PLDJWvu7Kn/ki/exttfnl/fRGJNujEkLfp0KXIq7TvAGcH3wtJDfP61yCZEx5jTcqBxcL9Y51trfeFhSoxlj5gJDcTu77QIeBJYCC4BuwHZglLU2Li8s1vL6huJ+TbfAVuC2yrnmeGSM+RbwNrABqAgevh83zxz37+NJXt9ofPA+GmP64y56JuMG2Austb8M5s08oD2wBhhnrT1S5+Mp0EVE/EFTLiIiPqFAFxHxCQW6iIhPKNBFRHxCgS4i4hMKdBERn1Cgi4j4hAJdRMQn/j9vM6GM4ysnuwAAAABJRU5ErkJggg==\n", 480 | "text/plain": [ 481 | "
" 482 | ] 483 | }, 484 | "metadata": { 485 | "needs_background": "light" 486 | }, 487 | "output_type": "display_data" 488 | }, 489 | { 490 | "data": { 491 | "text/plain": [ 492 | "array([-2.44332198e-03, 1.76012475e-01, -4.58949704e+00, 5.31642494e+01,\n", 493 | " -2.57338653e+02, 3.41987935e+02])" 494 | ] 495 | }, 496 | "execution_count": 7, 497 | "metadata": {}, 498 | "output_type": "execute_result" 499 | } 500 | ], 501 | "source": [ 502 | "x_data = np.array(y1)\n", 503 | "\n", 504 | "y_data = np.array(y2)\n", 505 | "\n", 506 | "poly = np.polyfit(x_data, y_data, deg=5)\n", 507 | "\n", 508 | "plt.plot(x_data, y_data, 'o')\n", 509 | "\n", 510 | "plt.plot(x_data, np.polyval(poly, x_data))\n", 511 | "\n", 512 | "plt.show()\n", 513 | "poly" 514 | ] 515 | }, 516 | { 517 | "cell_type": "markdown", 518 | "metadata": {}, 519 | "source": [ 520 | "各种分析过后,仍无法做出预测。。。。。" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": {}, 526 | "source": [ 527 | "结论:双色球是无法预测的" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "---" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "#### 评阅意见反馈" 542 | ] 543 | }, 544 | { 545 | "cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "
\n", 549 | "hcccom 提交的《双色球历史数据统计预测》项目挑战报告初步达到课程挑战要求,但仍然有很多地方值得完善。\n", 550 | "

\n", 551 | "数据采集部分内容不错,能采集完整的双色球投注数据。但缺乏必要的解释和代码注释。数据分析和处理阶段仅对各号球的出现频次做了统计,选择的柱形图虽然合理但内容较为单薄。这里,建议可以对连续 2 球或者多球的出现频次统计分析。或者分析不同位置各号球的出现频次,或许从统计学角度更有意义。\n", 552 | "

\n", 553 | "「取一组」小节之后没有看明白分析的用意,或许是想预测各号球如何出现?不过这样肯定无法完成的。回归分析显然不能用于这里的预测过程。\n", 554 | "

\n", 555 | "总之,该挑战报告有 2 点值得改善的地方:\n", 556 | "\n", 557 | "- 补充陈述内容,让阅读者知道每一步的大致操作用意。整个分析报告几乎没有解释性语句,非常不赞同这样做。数据分析的过程很重要,实际上阐述结论和讲好一个故事更加重要。\n", 558 | "\n", 559 | "- 分析思路没有理清,显然双色球出现是随机事件,这是无法通过回归分析完成的。所以,挑战的选题从一开始就不太理想。实际上,就算是真实的数据分析任务,也不建议去做彩票预测,因为就算从概率上得到了一些高频次组合方式,但没有明确的指导意义。\n", 560 | "\n", 561 | "代码方面,注意不用写重复冗余代码(类似 plt.show() 在同单元格多次出现)。后期在书写代码时注意按照 PEP8 格式化即可。VS Code 等 IDE 带有相关格式化插件,Jupyter Notebook 也可以通过安装 jupyter_contrib_nbextensions 拓展开启相关插件自动完成代码格式化。\n", 562 | "

\n", 563 | "总之,通过该报告可以判定学员初步达到我们课程预设的培养目标,但仍需要继续学习和加深对数据分析各环节的思考。希望 hcccom 后续再通过书籍等拓展更多相关的数据分析和挖掘知识,并结合自己的兴趣及专业特长在数据分析的道路上越走越好。\n", 564 | "
\n", 565 | "\n", 566 | "


\n", 567 | "
楼+ 数据分析和挖掘课程组
\n", 568 | "
2018 年 12 月 17 日
" 569 | ] 570 | }, 571 | { 572 | "cell_type": "markdown", 573 | "metadata": {}, 574 | "source": [ 575 | "---" 576 | ] 577 | } 578 | ], 579 | "metadata": { 580 | "kernelspec": { 581 | "display_name": "Python 3", 582 | "language": "python", 583 | "name": "python3" 584 | }, 585 | "language_info": { 586 | "codemirror_mode": { 587 | "name": "ipython", 588 | "version": 3 589 | }, 590 | "file_extension": ".py", 591 | "mimetype": "text/x-python", 592 | "name": "python", 593 | "nbconvert_exporter": "python", 594 | "pygments_lexer": "ipython3", 595 | "version": "3.7.1" 596 | } 597 | }, 598 | "nbformat": 4, 599 | "nbformat_minor": 2 600 | } 601 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 实验楼在线教育 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Mindmaps/README.md: -------------------------------------------------------------------------------- 1 | - [点击下载全部思维导图](https://minhaskamal.github.io/DownGit/#/home?url=https://github.com/shiyanlou/louplus-dm/tree/master/Mindmaps) 2 | -------------------------------------------------------------------------------- /Mindmaps/louplus-dm-week1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week1.png -------------------------------------------------------------------------------- /Mindmaps/louplus-dm-week2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week2.png -------------------------------------------------------------------------------- /Mindmaps/louplus-dm-week3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week3.png -------------------------------------------------------------------------------- /Mindmaps/louplus-dm-week4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week4.png -------------------------------------------------------------------------------- /Mindmaps/louplus-dm-week5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week5.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |

3 | 蓝桥云课《楼+ 数据分析与挖掘实战》课程仓库|课程报名 4 |
5 | 6 |
7 | 8 | 主分支下方包含最新课程的参考答案,历史开班课程的参考答案移步相应分支查看。 9 | 10 | ### 其他班级 11 | 12 | - [第 12-13 期挑战参考答案](https://github.com/shiyanlou/louplus-dm/tree/v3/Answers) 13 | - [第 07-11 期挑战参考答案](https://github.com/shiyanlou/louplus-dm/tree/v2/Answers) 14 | - [第 01-06 期挑战参考答案](https://github.com/shiyanlou/louplus-dm/tree/master/Answers) 15 | 16 | ### 优秀报告 17 | 18 | - [优秀项目挑战比赛报告](https://github.com/shiyanlou/louplus-dm/tree/master/Assignments) 19 | --------------------------------------------------------------------------------