├── .gitignore
├── Answers
├── week1-challenge-01
│ └── read_challenge.py
├── week1-challenge-02
│ └── sql_challenge.py
├── week1-challenge-03
│ └── github_data.py
├── week1-challenge-04
│ └── shiyanlou_user.py
├── week1-challenge-05
│ ├── scrapy.cfg
│ └── shiyanlou
│ │ ├── __init__.py
│ │ ├── items.py
│ │ ├── middlewares.py
│ │ ├── pipelines.py
│ │ ├── settings.py
│ │ └── spiders
│ │ ├── __init__.py
│ │ ├── github.py
│ │ └── github_next_page.py
├── week2-challenge-01
│ └── titanic.py
├── week2-challenge-02
│ └── earthquake.py
├── week2-challenge-03
│ └── earthquake.py
├── week2-challenge-04
│ ├── carbon_dioxide.py
│ └── carbon_dioxide_2.py
├── week2-challenge-05
│ └── carbon_gdp.py
├── week3-challenge-01
│ └── ols_matrix.py
├── week3-challenge-02
│ └── houseprice.py
├── week3-challenge-03
│ └── linear_regression.py
├── week3-challenge-04
│ └── 手写字符分类预测.ipynb
├── week3-challenge-05
│ └── 使用聚类压缩图像.ipynb
├── week4-challenge-01
│ └── banknote.py
├── week4-challenge-02
│ └── association.py
├── week4-challenge-03
│ └── google_stock.py
├── week4-challenge-04
│ └── production_index.py
├── week4-challenge-05
│ └── chengdu_pm25.py
├── week5-spiders-01
│ └── lianjia_spider.py
└── week5-spiders-02
│ ├── create_sqlite_database.py
│ ├── insert_database.py
│ └── xiecheng_spider.py
├── Assignments
├── README.md
├── 🏅️dm01-stenphen-中国保险业过去五年基础数据分析.ipynb
├── 🏅️dm02-米竹314159-杭州互联网寒冬背景下的数据分析岗现状分析.ipynb
├── 🏅️dm04-Luo2019-链家成都市区挂牌二手房分析.ipynb
├── 🥈dm01-BellaG-上海历史天气数据分析预测.ipynb
├── 🥈dm02-linnecn-医学专业论坛的数据爬取和分析.ipynb
├── 🥈dm04-Yueyec-B-站番剧数据简单分析.ipynb
└── 🥉dm01-hcccom-双色球历史数据统计预测.ipynb
├── LICENSE
├── Mindmaps
├── README.md
├── louplus-dm-week1.png
├── louplus-dm-week2.png
├── louplus-dm-week3.png
├── louplus-dm-week4.png
└── louplus-dm-week5.png
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 | .DS_Store
106 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-01/read_challenge.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
def convert(file):
    """Store the first 1000 rows of a JSON data file in ``user_study.h5``.

    Args:
        file: Anything ``pandas.read_json`` accepts (path, URL, buffer ...).

    The rows are written to the HDF5 file under the key ``data``; nothing is
    returned.
    """
    # Slice first, then persist; the full frame is never written out.
    pd.read_json(file)[:1000].to_hdf('user_study.h5', key='data')
--------------------------------------------------------------------------------
/Answers/week1-challenge-02/sql_challenge.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 | import pandas as pd
3 |
def count(file, user_id):
    """Return the total ``minutes`` recorded for one user.

    Args:
        file: Path of the SQLite database; it must contain a table ``data``
            with ``user_id`` and ``minutes`` columns.
        user_id: Identifier of the user to look up.

    Returns:
        The sum of the ``minutes`` column over the user's rows, or ``0`` when
        the user has no rows.
    """
    # sqlite3 cannot bind numpy scalars; unwrap them to plain Python values.
    if hasattr(user_id, "item"):
        user_id = user_id.item()

    sql_con = sqlite3.connect(file)
    try:
        # Bind the id as a parameter instead of formatting it into the SQL
        # text, so a crafted value cannot change the query.
        df = pd.read_sql(
            "SELECT * FROM data WHERE user_id = ?", sql_con, params=(user_id,)
        )
    finally:
        # Always release the database handle, even if the query fails.
        sql_con.close()

    if df.empty:
        return 0
    return df.minutes.sum()
--------------------------------------------------------------------------------
/Answers/week1-challenge-03/github_data.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import pandas as pd
3 |
def issues(repo):
    """Fetch the issues of a GitHub repository as a DataFrame.

    Args:
        repo: Repository in ``owner/name`` form, e.g. ``"numpy/numpy"``.

    Returns:
        A DataFrame with the columns ``number``, ``title`` and ``user_name``,
        one row per issue returned by the API (only the first page is
        requested). The columns are present even when there are no issues.

    Raises:
        requests.HTTPError: If the API answers with an error status (for
            example when the rate limit is exceeded).
    """
    url = "https://api.github.com/repos/{}/issues".format(repo)
    response = requests.get(url)
    # An error payload is a dict, not a list of issues; fail with a clear
    # HTTP error instead of an obscure TypeError further down.
    response.raise_for_status()

    rows = [
        {'number': issue['number'],
         'title': issue['title'],
         'user_name': issue['user']['login']}
        for issue in response.json()
    ]

    return pd.DataFrame(rows, columns=['number', 'title', 'user_name'])


# Demo call kept from the original script. NOTE: it performs a network
# request whenever this module is executed or imported.
issues("numpy/numpy")
20 |
21 |
22 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-04/shiyanlou_user.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from lxml import html
3 |
4 |
def user_info(user_id):
    """Look up the name and level shown on a user's profile page.

    Args:
        user_id: Value substituted into the profile URL.

    Returns:
        ``(user_name, user_level)`` with the level as ``int`` when the page
        answers with HTTP 200, otherwise ``(None, None)``.
    """
    url = "https://www.lanqiao.cn/users/{}/".format(user_id)
    response = requests.get(url)

    if response.status_code != 200:
        return None, None

    tree = html.fromstring(response.text)
    # Select every div whose class attribute contains "name", then take the
    # text of the spans directly beneath it.
    span_texts = tree.xpath("//div[contains(@class, 'name')]/span/text()")
    user_name = span_texts[0].strip()
    # The first character of the second span is dropped before converting to
    # int (presumably a level prefix such as "L"; confirm against the page).
    user_level = span_texts[1].strip()[1:]
    return user_name, int(user_level)
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = shiyanlou.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = shiyanlou
12 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Answers/week1-challenge-05/shiyanlou/__init__.py
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
class ShiyanlouItem(scrapy.Item):
    """Item carrying the two values scraped for each repository."""
    # define the fields for your item here like:
    # name = scrapy.Field()
    repo_name = scrapy.Field()  # repository name
    update_time = scrapy.Field()  # time of the last update
16 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
class ShiyanlouSpiderMiddleware(object):
    """Pass-through spider middleware.

    Every hook forwards its input unchanged. Hooks that are not defined are
    treated by Scrapy as if the middleware did not modify the passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Hook for each response entering the spider; accepts everything.

        Should return None or raise an exception.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Hook for the spider's results; re-yields each one untouched.

        Must produce an iterable of Request, dict or Item objects.
        """
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Hook for exceptions raised by a spider or by process_spider_input().

        Returns None here; it may alternatively return an iterable of
        Response, dict or Item objects.
        """
        return None

    def process_start_requests(self, start_requests, spider):
        """Hook for the spider's start requests; re-yields them untouched.

        Works like process_spider_output() but has no associated response,
        and must yield requests only (not items).
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
57 |
58 |
class ShiyanlouDownloaderMiddleware(object):
    """Pass-through downloader middleware.

    Every hook leaves requests and responses unchanged. Hooks that are not
    defined are treated by Scrapy as if the middleware did not modify the
    passed objects.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe it to ``spider_opened``."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Hook for each request passing through the downloader middleware.

        Allowed outcomes:
        - return None: continue processing this request (what happens here)
        - return a Response object
        - return a Request object
        - raise IgnoreRequest: process_exception() methods of installed
          downloader middleware will be called
        """
        return None

    def process_response(self, request, response, spider):
        """Hook for each downloaded response; hands it back unchanged.

        Must return a Response object, return a Request object, or raise
        IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Hook for exceptions from a download handler or process_request().

        Allowed outcomes:
        - return None: continue processing this exception (what happens here)
        - return a Response object: stops the process_exception() chain
        - return a Request object: stops the process_exception() chain
        """
        return None

    def spider_opened(self, spider):
        """Log the name of the spider that was opened."""
        message = 'Spider opened: %s' % spider.name
        spider.logger.info(message)
104 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | import pandas as pd
8 |
class ShiyanlouPipeline(object):
    """Collect scraped repositories and write them to a CSV file.

    Rows are buffered in a plain list while the spider runs and turned into a
    DataFrame once, when the spider closes. This avoids ``DataFrame.append``
    (removed in pandas 2.0) and avoids re-sorting the whole frame per item.
    """

    def open_spider(self, spider):
        """Start with an empty buffer when the spider starts."""
        self._rows = []
        # Kept as an attribute for code that inspects ``df``; it is filled in
        # close_spider().
        self.df = pd.DataFrame(columns=['repo_name', 'update_time'])

    def process_item(self, item, spider):
        """Buffer the item's ``repo_name`` and ``update_time``; return the item."""
        self._rows.append((item['repo_name'], item['update_time']))
        return item

    def close_spider(self, spider):
        """Sort by ``update_time`` (newest first) and write the CSV file.

        The file is written to ``../shiyanlou_repo.csv``, relative to the
        current working directory.
        """
        self.df = pd.DataFrame(
            self._rows, columns=['repo_name', 'update_time']
        ).sort_values(by=['update_time'], ascending=False)
        self.df.to_csv("../shiyanlou_repo.csv")
31 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for shiyanlou project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
# Name of this Scrapy project/bot.
BOT_NAME = 'shiyanlou'

# Package that holds the project's spiders, and the package in which newly
# generated spiders are created.
SPIDER_MODULES = ['shiyanlou.spiders']
NEWSPIDER_MODULE = 'shiyanlou.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'shiyanlou (+http://www.yourdomain.com)'
20 |
# Whether to obey robots.txt rules. NOTE: this is turned OFF for this project.
ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | # DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'shiyanlou.middlewares.ShiyanlouSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'shiyanlou.middlewares.ShiyanlouDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# ShiyanlouPipeline is the only pipeline enabled; 300 is the value Scrapy
# uses to order pipelines.
ITEM_PIPELINES = {
    'shiyanlou.pipelines.ShiyanlouPipeline': 300,
}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/spiders/github.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from shiyanlou.items import ShiyanlouItem
4 |
5 |
class GithubSpider(scrapy.Spider):
    """Scrape repository names and update times from four listing pages."""

    name = 'github'
    allowed_domains = ['github.com']

    @property
    def start_urls(self):
        """Generate the URLs of listing pages 1-4.

        The ``after`` cursors below were copied by hand and expire; replace
        them with fresh values from the site when they stop working.
        """
        cursors = [
            '',
            'Y3Vyc29yOnYyOpK5MjAxNy0wNi0wN1QwNjoxOTo1NyswODowMM4FkpYw',
            'Y3Vyc29yOnYyOpK5MjAxNS0wMS0yNVQxMTozMTowNyswODowMM4Bxrsx',
            'Y3Vyc29yOnYyOpK5MjAxNC0xMS0yMFQxMzowMzo1MiswODowMM4BjkvL',
        ]
        template = 'https://github.com/shiyanlou?after={}&tab=repositories'
        return (template.format(cursor) for cursor in cursors)

    def parse(self, response):
        """Yield one ``ShiyanlouItem`` per repository entry on the page."""
        for entry in response.xpath('//li[@itemprop="owns"]'):
            item = ShiyanlouItem()
            repo_name = entry.xpath(
                ".//a[@itemprop='name codeRepository']/text()").extract_first()
            item['repo_name'] = repo_name.strip()
            item['update_time'] = entry.xpath(
                ".//relative-time/@datetime").extract_first()
            yield item
--------------------------------------------------------------------------------
/Answers/week1-challenge-05/shiyanlou/spiders/github_next_page.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from shiyanlou.items import ShiyanlouItem
4 |
5 |
class GithubSpider(scrapy.Spider):
    """Scrape repository names and update times, following the Next link."""

    name = 'github_next_page'
    allowed_domains = ['github.com']

    @property
    def start_urls(self):
        """Return the single entry page; later pages are found while parsing."""
        return ('https://github.com/shiyanlou?tab=repositories', )

    def parse(self, response):
        """Yield the page's items, then a request for the next page if any."""
        repos = response.xpath('//li[@itemprop="owns"]')
        for repo in repos:
            item = ShiyanlouItem()
            item['repo_name'] = repo.xpath(
                ".//a[@itemprop='name codeRepository']/text()"
            ).extract_first().strip()
            item['update_time'] = repo.xpath(
                ".//relative-time/@datetime").extract_first()

            yield item

        # A next page exists as long as the "Next" button is not disabled.
        spans = response.css('div.pagination span.disabled::text')
        if len(spans) == 0 or spans[-1].extract() != 'Next':
            next_url = response.css(
                'div.paginate-container a:last-child::attr(href)'
            ).extract_first()
            # A listing without any pagination link yields no URL; following
            # None would raise instead of simply ending the crawl.
            if next_url:
                yield response.follow(next_url, callback=self.parse)
28 |
--------------------------------------------------------------------------------
/Answers/week2-challenge-01/titanic.py:
--------------------------------------------------------------------------------
1 | from matplotlib import pyplot as plt
2 | import seaborn as sns
3 |
def plot():
    """Draw three side-by-side charts of seaborn's ``titanic`` dataset.

    From left to right: the distribution of the non-missing ages, the number
    of passengers per sex split by ``alive``, and the number of passengers per
    class split by ``alive``.

    Returns:
        The array of the three axes created by ``plt.subplots``.
    """
    titanic = sns.load_dataset("titanic")

    _, axes = plt.subplots(ncols=3, nrows=1, figsize=(15, 4))
    age_ax, sex_ax, class_ax = axes

    ages = titanic.age.dropna()
    sns.distplot(ages, ax=age_ax)
    sns.countplot(x='sex', hue="alive", data=titanic, ax=sex_ax)
    sns.countplot(x="class", hue="alive", data=titanic, ax=class_ax)

    return axes
--------------------------------------------------------------------------------
/Answers/week2-challenge-02/earthquake.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def clean():
    """Load ``earthquake.csv`` and return a de-duplicated, gap-free table.

    Returns:
        A DataFrame with the columns ``time``, ``latitude``, ``longitude``,
        ``depth``, ``mag`` and ``region``, where ``region`` is the text after
        the last ``", "`` of the ``place`` column. Duplicate rows and rows with
        any missing value (including a missing ``place``) are removed.
    """
    # Read the data
    df = pd.read_csv("earthquake.csv")
    # Keep only the required columns
    cleaned = df[['time', 'latitude', 'longitude', 'depth', 'mag']].copy()
    # Split `place` and keep its last part as the region. The vectorised
    # accessor propagates NaN for a missing place instead of raising, and the
    # dropna() below then discards that row.
    cleaned['region'] = df['place'].str.split(', ').str[-1]
    # Remove duplicates and rows with missing values
    return cleaned.drop_duplicates().dropna()
--------------------------------------------------------------------------------
/Answers/week2-challenge-03/earthquake.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def clean():
    """Load ``earthquake.csv`` and return a de-duplicated, gap-free table.

    Returns:
        A DataFrame with the columns ``time``, ``latitude``, ``longitude``,
        ``depth``, ``mag`` and ``region``, where ``region`` is the text after
        the last ``", "`` of the ``place`` column. Duplicate rows and rows with
        any missing value (including a missing ``place``) are removed.
    """
    # Read the data
    df = pd.read_csv("earthquake.csv")
    # Keep only the required columns
    cleaned = df[['time', 'latitude', 'longitude', 'depth', 'mag']].copy()
    # Split `place` and keep its last part as the region. The vectorised
    # accessor propagates NaN for a missing place instead of raising, and the
    # dropna() below then discards that row.
    cleaned['region'] = df['place'].str.split(', ').str[-1]
    # Remove duplicates and rows with missing values
    return cleaned.drop_duplicates().dropna()
21 |
22 |
def mag_region():
    """Find, for each magnitude class, the region with the most earthquakes.

    Magnitudes are binned into the left-closed intervals [0, 2), [2, 5),
    [5, 7), [7, 9) and [9, 15), labelled ``micro``, ``light``, ``strong``,
    ``major`` and ``great``.

    Returns:
        A DataFrame indexed by ``mag`` with the columns ``region`` and
        ``times`` (the integer number of earthquakes), ordered by ``times``
        from largest to smallest. Classes that do not occur in the data are
        absent.
    """
    # Load the cleaned data
    df_clean = clean()
    # Discretise the magnitude; note which side of each interval is closed
    df_clean['mag'] = pd.cut(df_clean.mag, bins=[0, 2, 5, 7, 9, 15], right=False, labels=[
        'micro', 'light', 'strong', 'major', 'great'])

    # Count per (mag, region). observed=True keeps only combinations that
    # actually occur; without it current pandas emits zero-count rows for
    # every unobserved class/region pair, which would surface as bogus
    # results below.
    df_group = df_clean.groupby(by=['mag', 'region'], observed=True).count()
    # Turn the group keys back into columns and drop incomplete rows
    df_reindex = df_group.reset_index().dropna()
    # Sort by count, largest first, then keep the first row per magnitude
    # class, i.e. the region with the highest count for that class
    df_sort = df_reindex.sort_values(
        by='time', ascending=False).drop_duplicates(['mag'])
    # Arrange and rename as required by the task
    df_final = df_sort.set_index('mag')[['region', 'time']].rename(
        columns={"time": "times"})
    # The task requires the counts as int
    df_final['times'] = df_final.times.astype('int')

    return df_final
45 |
--------------------------------------------------------------------------------
/Answers/week2-challenge-04/carbon_dioxide.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def data_clean():
    """Clean the CO2 emission data of ``ClimateChange.xlsx``.

    Steps:
        1. Use the country code as index of both sheets.
        2. Drop the columns that are not needed.
        3. Replace the non-standard missing marker ``..`` by NaN and fill the
           gaps along each row.

    Returns:
        A DataFrame indexed by country code holding the filled yearly values,
        the remaining columns of the ``Country`` sheet and a
        ``Sum emissions`` column; rows with fewer than 10 non-missing values
        are removed.
    """
    workbook = "ClimateChange.xlsx"
    # `sheet_name` replaces the `sheetname` keyword that newer pandas removed.
    df_data = pd.read_excel(workbook, sheet_name='Data')
    df_country = pd.read_excel(workbook, sheet_name='Country')

    # --- Data sheet ---
    # Select the EN.ATM.CO2E.KT series and index it by country code
    df_data_reindex = df_data[df_data['Series code'] == 'EN.ATM.CO2E.KT'].set_index('Country code')
    # Drop the columns that are not needed
    df_data_drop = df_data_reindex.drop(
        labels=['Country name', 'Series code', 'Series name', 'SCALE', 'Decimals'], axis=1)
    # Replace the non-standard missing marker by NaN so it can be filled.
    # float('nan') is used because the `pd.np` alias no longer exists.
    df_data_nan = df_data_drop.replace({'..': float('nan')})
    # Fill forwards, then backwards, along each row
    # (ffill/bfill replace the deprecated fillna(method=...)).
    df_data_fill = df_data_nan.ffill(axis=1).bfill(axis=1)
    # Drop rows that are still entirely empty after filling
    df_data_dropna = df_data_fill.dropna(how='all')

    # --- Country sheet ---
    # Index by country code
    df_country_reindex = df_country.set_index('Country code')
    # Drop the columns that are not needed
    df_country_drop = df_country_reindex.drop(
        labels=['Capital city', 'Region', 'Lending category'], axis=1)

    # --- Merge ---
    # Join both sheets on their index
    df_combine = pd.concat([df_data_dropna, df_country_drop], axis=1)
    # Total emissions per country: sum every column except the last two,
    # which come from the Country sheet
    df_combine['Sum emissions'] = df_combine[list(df_combine)[:-2]].sum(axis=1)
    # Keep only rows with at least 10 non-missing values
    df_clean = df_combine.dropna(thresh=10)

    return df_clean
41 |
def co2():
    """Summarise CO2 emissions per income group.

    Returns:
        A DataFrame indexed by ``Income group`` with the group's total
        (``Sum emissions``), the country with the highest emissions and its
        value, and the country with the lowest emissions and its value.
    """
    # Load the cleaned data
    df_clean = data_clean()

    # Total emissions per income group
    sum_by_groups = df_clean.groupby('Income group')['Sum emissions'].sum()

    fields = ['Income group', 'Country name', 'Sum emissions']
    item_high_list = []
    item_low_list = []

    for group_name in list(sum_by_groups.index):
        members = df_clean[df_clean['Income group'] == group_name]
        # First row when sorted descending: the group's highest emitter
        top = members.sort_values(by='Sum emissions', ascending=False).iloc[0]
        item_high_list.append(tuple(top[field] for field in fields))
        # First row when sorted ascending: the group's lowest emitter
        bottom = members.sort_values(by='Sum emissions').iloc[0]
        item_low_list.append(tuple(bottom[field] for field in fields))

    # Build the two partial tables, both indexed by income group
    highest_df = pd.DataFrame.from_records(
        item_high_list,
        columns=['Income group', 'Highest emission country', 'Highest emissions'],
    ).set_index('Income group')
    lowest_df = pd.DataFrame.from_records(
        item_low_list,
        columns=['Income group', 'Lowest emission country', 'Lowest emissions'],
    ).set_index('Income group')

    # Put totals, highest and lowest side by side
    return pd.concat([sum_by_groups, highest_df, lowest_df], axis=1)
--------------------------------------------------------------------------------
/Answers/week2-challenge-04/carbon_dioxide_2.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def data_clean():
    """Combine total CO2 emissions per country with the country metadata.

    Returns:
        A DataFrame indexed by country code with a ``Sum emissions`` column
        (the row-wise total of the filled EN.ATM.CO2E.KT values) followed by
        the columns of the ``Country`` sheet that are kept.
    """
    workbook = "ClimateChange.xlsx"
    # `sheet_name` replaces the `sheetname` keyword that newer pandas removed.
    data = pd.read_excel(workbook, sheet_name='Data')

    # --- Data sheet ---
    # Select the EN.ATM.CO2E.KT series and index it by country code
    emissions = data[data['Series code'] ==
                     'EN.ATM.CO2E.KT'].set_index('Country code')
    # Drop the columns that are not needed
    emissions = emissions.drop(labels=['Country name', 'Series code',
                                       'Series name', 'SCALE', 'Decimals'], axis=1)
    # Replace the non-standard missing marker by NaN so it can be filled.
    # float('nan') is used because the `pd.np` alias no longer exists.
    emissions = emissions.replace({'..': float('nan')})
    # Fill forwards, then backwards, along each row
    # (ffill/bfill replace the deprecated fillna(method=...)).
    emissions = emissions.ffill(axis=1).bfill(axis=1)
    # Drop rows that are still entirely empty after filling
    emissions = emissions.dropna(how='all')
    # Only the per-country total is needed from here on
    total = emissions.sum(axis=1).rename('Sum emissions')

    # --- Country sheet ---
    # Index by country code
    countries = pd.read_excel(workbook, sheet_name='Country')
    countries = countries.set_index('Country code')
    # Drop the columns that are not needed
    countries = countries.drop(labels=['Capital city', 'Region',
                                       'Lending category'], axis=1)

    # --- Merge ---
    # Join the totals and the country sheet on their index
    return pd.concat([total, countries], axis=1)
33 |
34 |
def co2():
    """Summarise CO2 emissions per income group.

    Steps:
        1. Sum the emissions per income group with groupby.
        2. Sort the data to find each group's highest and lowest emitter.

    Returns:
        A DataFrame indexed by ``Income group`` with the columns
        ``Sum emissions``, ``Highest emission country``,
        ``Highest emissions``, ``Lowest emission country`` and
        ``Lowest emissions``.
    """
    # Load the cleaned data
    df = data_clean()

    # Total per income group. The numeric column is selected explicitly:
    # summing the whole frame would also "sum" (concatenate) the country
    # names on pandas versions that no longer drop non-numeric columns.
    df_sum = df.groupby('Income group')[['Sum emissions']].sum()

    # Highest emitter: first row of each group after sorting descending
    df_max = df.sort_values(by='Sum emissions', ascending=False).groupby(
        'Income group').head(1).set_index('Income group')
    # Rename by column name rather than by position, so the result does not
    # depend on the column order of the cleaned data
    df_max = df_max.rename(columns={
        'Sum emissions': 'Highest emissions',
        'Country name': 'Highest emission country',
    }).reindex(columns=['Highest emission country', 'Highest emissions'])

    # Lowest emitter: first row of each group after sorting ascending
    df_min = df.sort_values(by='Sum emissions').groupby(
        'Income group').head(1).set_index('Income group')
    df_min = df_min.rename(columns={
        'Sum emissions': 'Lowest emissions',
        'Country name': 'Lowest emission country',
    }).reindex(columns=['Lowest emission country', 'Lowest emissions'])

    result = pd.concat([df_sum, df_max, df_min], axis=1)

    return result
--------------------------------------------------------------------------------
/Answers/week2-challenge-05/carbon_gdp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | from matplotlib import pyplot as plt
4 |
5 |
def data_clean():
    """Compute total CO2 emissions and total GDP per country.

    Steps:
        1. Use the country code as index.
        2. Drop the leading descriptive columns.
        3. Replace the non-standard missing marker ``..`` by NaN and fill the
           gaps along each row.

    Returns:
        A DataFrame indexed by country code with the columns ``CO2-SUM`` and
        ``GDP-SUM``; totals that are missing are set to 0.
    """
    # `sheet_name` replaces the `sheetname` keyword that newer pandas removed.
    df_data = pd.read_excel("ClimateChange.xlsx", sheet_name='Data')

    def series_total(series_code, label):
        """Return the row-wise total of one indicator series, named *label*."""
        selected = df_data[df_data['Series code'] ==
                           series_code].set_index('Country code')
        # np.nan is used because the `pd.np` alias no longer exists.
        with_nan = selected.replace({'..': np.nan})
        # The first five remaining columns are skipped; the rest is filled
        # forwards, then backwards, along each row
        # (ffill/bfill replace the deprecated fillna(method=...)).
        filled = with_nan.iloc[:, 5:].ffill(axis=1).bfill(axis=1)
        return filled.sum(axis=1).rename(label)

    # Totals of both indicators, joined on the country code
    df_merge = pd.concat(
        [series_total('EN.ATM.CO2E.KT', 'CO2-SUM'),
         series_total('NY.GDP.MKTP.CD', 'GDP-SUM')],
        axis=1)

    # Fill the remaining gaps with 0
    df_merge_fill = df_merge.fillna(value=0)

    return df_merge_fill
40 |
41 |
def co2_gdp_plot():
    """Plot the min-max normalised CO2 and GDP totals of all countries.

    Steps:
        1. Normalise the data.
        2. Collect the values to return.
        3. Draw the chart.

    Returns:
        ``(axes, china)``: the matplotlib axes of the chart and a list with
        the normalised values of the row indexed ``CHN``, rounded to three
        decimals.
    """
    # Load the cleaned data
    df_clean = data_clean()

    # Min-max normalisation, column by column
    col_min = df_clean.min()
    col_range = df_clean.max() - col_min
    df_max_min = (df_clean - col_min) / col_range

    # Normalised CO2 and GDP values of China
    china = []
    china_rows = df_max_min[df_max_min.index == 'CHN'].values
    for row in china_rows:
        china.extend(np.round(row, 3).tolist())

    # Only these five countries (the permanent members of the UN Security
    # Council) get a tick on the x axis: the tick position is the country's
    # row number, the tick label its code
    countries_labels = ['USA', 'CHN', 'FRA', 'RUS', 'GBR']
    marked = [(position, code)
              for position, code in enumerate(df_max_min.index)
              if code in countries_labels]
    labels_position = [position for position, _ in marked]
    tick_labels = [code for _, code in marked]

    # Draw the chart
    fig, axes = plt.subplots()
    df_max_min.plot(kind='line', title='GDP-CO2', ax=axes)
    plt.xlabel("Countries")
    plt.ylabel("Values")
    plt.xticks(labels_position, tick_labels, rotation='vertical')
    plt.show()

    return axes, china
--------------------------------------------------------------------------------
/Answers/week3-challenge-01/ols_matrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
def caculate_w():
    """Fit ``Manhattan Bridge = w * Brooklyn Bridge + b`` by least squares.

    The data is read from ``nyc-east-river-bicycle-counts.csv`` and the
    coefficients are obtained from the normal equations
    ``(X^T X) W = X^T y``.

    Returns:
        ``(w, b)``: slope and intercept, each rounded to two decimals.

    Raises:
        numpy.linalg.LinAlgError: If ``X^T X`` is singular.
    """
    # Read the data set
    df = pd.read_csv("nyc-east-river-bicycle-counts.csv", index_col=0)

    # Independent variable, with a leading column of ones for the intercept
    x = df['Brooklyn Bridge'].values.astype(float)
    design = np.column_stack((np.ones_like(x), x))

    # Dependent variable
    y = df['Manhattan Bridge'].values.astype(float)

    # Solve the normal equations on plain arrays. This avoids np.matrix, an
    # explicit matrix inverse, and float() on a 1x1 matrix.
    coefficients = np.linalg.solve(design.T.dot(design), design.T.dot(y))
    b = round(float(coefficients[0]), 2)
    w = round(float(coefficients[1]), 2)

    return w, b
25 |
--------------------------------------------------------------------------------
/Answers/week3-challenge-02/houseprice.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.preprocessing import PolynomialFeatures
4 | from sklearn.linear_model import LinearRegression
5 | from sklearn.metrics import mean_absolute_error
6 | from sklearn.model_selection import train_test_split
7 |
8 |
def beijing(n):
    """Return the test MAE of a degree-``n`` polynomial house-price model.

    The three features with the largest absolute Pearson correlation to the
    target are expanded into polynomial features of degree ``n`` and fitted
    with linear regression on a 70/30 train/test split (``random_state=10``).

    Args:
        n: Degree passed to ``PolynomialFeatures``.

    Returns:
        The mean absolute error on the test split.
    """
    # Read the data and drop duplicate rows (the data has no missing values)
    df = pd.read_csv("beijing_house_price.csv")
    df = df.drop_duplicates()
    # Columns are picked by position; the last one picked (original column
    # 10) is the target.
    df = df.iloc[:, [0, 1, 2, 3, 4, 5, 7, 9, 11, 10]]
    target_name = df.columns[-1]

    # Absolute correlation of every column with the target. The target is
    # removed by name, so a feature that ties with its self-correlation of 1
    # cannot push it out of first place and get dropped in its stead.
    pearson = np.abs(df.corr(method='pearson').iloc[-1])
    pearson_max = pearson.drop(target_name).sort_values(ascending=False).iloc[:3]
    features_names = pearson_max.index.values
    features = df[features_names]
    target = df.iloc[:, [9]]

    # Split into training and test data
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=10)

    # Polynomial features: fit on the training split only and reuse the
    # fitted transformer for the test split
    poly_features = PolynomialFeatures(degree=n)
    X_train_features = poly_features.fit_transform(X_train)
    X_test_features = poly_features.transform(X_test)

    # Linear regression on the expanded features
    model = LinearRegression()
    model.fit(X_train_features, y_train)
    y_pred = model.predict(X_test_features)

    # Mean absolute error on the test split
    mae = mean_absolute_error(y_test, y_pred)

    return mae
43 |
--------------------------------------------------------------------------------
/Answers/week3-challenge-03/linear_regression.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 |
4 |
def gradient_descent():
    """Fit ``y = w * x + b`` with batch gradient descent on the MSE loss.

    Reads ``nyc-east-river-bicycle-counts.csv`` from the working directory,
    with ``Brooklyn Bridge`` as x and ``Manhattan Bridge`` as y, starts from
    ``w = b = 0`` and runs 1000 iterations with learning rate 1e-9.

    Returns:
        tuple: ``(w, b)`` after the final iteration.
    """
    # Load the dataset
    df = pd.read_csv("nyc-east-river-bicycle-counts.csv", index_col=0)
    # Independent and dependent variables
    x = np.asarray(df['Brooklyn Bridge'], dtype=float)
    y = np.asarray(df['Manhattan Bridge'], dtype=float)

    w = 0.0  # initial slope
    b = 0.0  # initial intercept
    lr = 0.000000001  # learning rate
    num_iter = 1000  # number of iterations
    for _ in range(num_iter):
        # Residual of the current fit
        residual = y - (w * x + b)
        # Gradients of the mean squared error; np.mean reduces in native code
        # instead of looping over array elements with the builtin sum().
        w_gradient = -2 * np.mean(x * residual)
        b_gradient = -2 * np.mean(residual)
        # Step against the gradient
        w -= lr * w_gradient
        b -= lr * b_gradient

    return w, b
28 |
--------------------------------------------------------------------------------
/Answers/week3-challenge-04/手写字符分类预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | " \n",
8 | "\n",
9 | "# 手写字符分类预测"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "---"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "**以下内容仅保留挑战代码部分,挑战全文请到原课程查看。**"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "---"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "**挑战:使用 `1x5` 的子图样式绘制 Digits 数据集前 `5` 个手写字符的图像。**"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "### 代码开始 ### (3~5 行代码)\n",
47 | "fig, axes = plt.subplots(1, 5, figsize=(12,4))\n",
48 | "for i, image in enumerate(digits.images[:5]):\n",
49 | " axes[i].imshow(image, cmap=plt.cm.gray_r)\n",
50 | "### 代码结束 ###"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "---"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "**挑战:使用 `train_test_split()` 将数据集切分为 80%(训练集) 和 20%(测试集) 两部分。**"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "### 代码开始 ### (≈ 2 行代码)\n",
74 | "from sklearn.model_selection import train_test_split\n",
75 | "\n",
76 | "train_x, test_x, train_y, test_y = train_test_split(digits.data, digits.target, test_size=0.2, random_state=30)\n",
77 | "### 代码结束 ###"
78 | ]
79 | },
80 | {
81 | "cell_type": "markdown",
82 | "metadata": {},
83 | "source": [
84 | "---"
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "metadata": {},
90 | "source": [
91 | "**挑战:使用 `MLPClassifier()` 搭建神经网络结构,并训练手写字符识别模型,最后得到在测试集上的预测准确率。**"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": null,
97 | "metadata": {},
98 | "outputs": [],
99 | "source": [
100 | "from sklearn.neural_network import MLPClassifier\n",
101 | "from sklearn.metrics import accuracy_score\n",
102 | "\n",
103 | "def mpl():\n",
104 | " \"\"\"\n",
105 | " 参数:无\n",
106 | "\n",
107 | " 返回:\n",
108 | " model -- 人工神经网络模型\n",
109 | " score -- 测试集上的预测准确率\n",
110 | " \"\"\"\n",
111 | " ### 代码开始 ### (≈ 2 行代码)\n",
112 | " model = MLPClassifier(\n",
113 | " hidden_layer_sizes=(100, 50),\n",
114 | " activation='relu',\n",
115 | " solver='sgd',\n",
116 | " learning_rate_init=0.02,\n",
117 | " learning_rate='constant',\n",
118 | " max_iter=100,\n",
119 | " random_state=1\n",
120 | " )\n",
121 | "\n",
122 | " model.fit(train_x, train_y)\n",
123 | " score = accuracy_score(test_y, model.predict(test_x))\n",
124 | " ### 代码结束 ###\n",
125 | " return model, score"
126 | ]
127 | },
128 | {
129 | "cell_type": "markdown",
130 | "metadata": {},
131 | "source": [
132 | "---"
133 | ]
134 | },
135 | {
136 | "cell_type": "markdown",
137 | "metadata": {},
138 | "source": [
139 | " "
140 | ]
141 | }
142 | ],
143 | "metadata": {
144 | "kernelspec": {
145 | "display_name": "Python 3",
146 | "language": "python",
147 | "name": "python3"
148 | },
149 | "language_info": {
150 | "codemirror_mode": {
151 | "name": "ipython",
152 | "version": 3
153 | },
154 | "file_extension": ".py",
155 | "mimetype": "text/x-python",
156 | "name": "python",
157 | "nbconvert_exporter": "python",
158 | "pygments_lexer": "ipython3",
159 | "version": "3.7.0"
160 | }
161 | },
162 | "nbformat": 4,
163 | "nbformat_minor": 2
164 | }
165 |
--------------------------------------------------------------------------------
/Answers/week3-challenge-05/使用聚类压缩图像.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | " \n",
8 | "\n",
9 | "# 使用聚类压缩图像"
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "metadata": {},
15 | "source": [
16 | "---"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "**以下内容仅保留挑战代码部分,挑战全文请到原课程查看。**"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "---"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "**挑战:将形状为 $(516, 819, 3)$ 的数据转换为 $(422604, 3)$ 形状的数据。**"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {},
44 | "outputs": [],
45 | "source": [
46 | "\"\"\"数据格式变换\n",
47 | "\"\"\"\n",
48 | "### 代码开始 ###(≈ 1 行代码)\n",
49 | "data = chengdu.reshape(516 * 819, 3)\n",
50 | "### 代码结束 ###"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "metadata": {},
56 | "source": [
57 | "---"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 | "**挑战:计算 `422604` 个像素点中种类的个数。**"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "\"\"\"计算像素点种类个数\n",
74 | "\"\"\"\n",
75 | "def get_variety(data):\n",
76 | " \"\"\"\n",
77 | " 参数:\n",
78 | " 预处理后像素点集合\n",
79 | "\n",
80 | " 返回:\n",
81 | " num_variety -- 像素点种类个数\n",
82 | " \"\"\"\n",
83 | "\n",
84 | " ### 代码开始 ### (≈ 3 行代码)\n",
85 | " temp=data.tolist()\n",
86 | " num_variety=len(set([tuple(t) for t in temp]))\n",
87 | " ### 代码结束 ###\n",
88 | " \n",
89 | " return num_variety"
90 | ]
91 | },
92 | {
93 | "cell_type": "markdown",
94 | "metadata": {},
95 | "source": [
96 | "---"
97 | ]
98 | },
99 | {
100 | "cell_type": "markdown",
101 | "metadata": {},
102 | "source": [
103 | "**挑战:使用 Mini Batch K-Means 聚类方法对像素点进行聚类,并用每一个中心的像素点代替属于该类别的像素点。**"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "from sklearn.cluster import MiniBatchKMeans\n",
113 | "\n",
114 | "### 代码开始 ###(≈ 4 行代码)\n",
115 | "model = MiniBatchKMeans(10)\n",
116 | "model.fit(data)\n",
117 | "predict=model.predict(data)\n",
118 | "### 代码结束 ###\n",
119 | "\n",
120 | "new_colors = model.cluster_centers_[predict]"
121 | ]
122 | },
123 | {
124 | "cell_type": "markdown",
125 | "metadata": {},
126 | "source": [
127 | "---"
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "metadata": {},
133 | "source": [
134 | "**挑战:将聚类后并替换为类别中心点值的像素点,变换为数据处理前的格式,并绘制出图片进行对比展示。**"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": null,
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "fig, ax = plt.subplots(1, 2, figsize=(16, 6))\n",
144 | "\n",
145 | "### 代码开始 ###(≈ 3 行代码)\n",
146 | "new_chengdu = new_colors.reshape(chengdu.shape)\n",
147 | "ax[0].imshow(chengdu)\n",
148 | "ax[1].imshow(new_chengdu)\n",
149 | "### 代码结束 ###"
150 | ]
151 | },
152 | {
153 | "cell_type": "markdown",
154 | "metadata": {},
155 | "source": [
156 | "---"
157 | ]
158 | },
159 | {
160 | "cell_type": "markdown",
161 | "metadata": {},
162 | "source": [
163 | " "
164 | ]
165 | }
166 | ],
167 | "metadata": {
168 | "kernelspec": {
169 | "display_name": "Python 3",
170 | "language": "python",
171 | "name": "python3"
172 | },
173 | "language_info": {
174 | "codemirror_mode": {
175 | "name": "ipython",
176 | "version": 3
177 | },
178 | "file_extension": ".py",
179 | "mimetype": "text/x-python",
180 | "name": "python",
181 | "nbconvert_exporter": "python",
182 | "pygments_lexer": "ipython3",
183 | "version": "3.7.0"
184 | }
185 | },
186 | "nbformat": 4,
187 | "nbformat_minor": 2
188 | }
189 |
--------------------------------------------------------------------------------
/Answers/week4-challenge-01/banknote.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | from sklearn.svm import SVC
4 |
def identify():
    """Label the test banknotes with an SVC trained on the training set.

    Reads ``banknote_train.csv`` and ``banknote_test.csv`` from the working
    directory. Every training column except the last is used as a feature and
    the ``class`` column as the label.

    Returns:
        The test DataFrame with a ``class`` column holding the predictions.
    """
    train = pd.read_csv("banknote_train.csv")
    test = pd.read_csv("banknote_test.csv")

    features = train.iloc[:, :-1]
    labels = train['class']

    classifier = SVC(gamma='auto')
    classifier.fit(features, labels)

    # Predict on the test frame as loaded, then attach the result to it
    predicted = classifier.predict(test)
    test['class'] = predicted

    return test
--------------------------------------------------------------------------------
/Answers/week4-challenge-02/association.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from mlxtend.preprocessing import TransactionEncoder
3 | from mlxtend.frequent_patterns import apriori
4 | from mlxtend.frequent_patterns import association_rules as rules
5 |
def rule():
    """Mine frequent itemsets and association rules from ``shopping_data.csv``.

    Returns:
        tuple: ``(frequent_itemsets, association_rules)`` -- the itemsets with
        support >= 0.05 and the rules derived from them with
        confidence >= 0.2.
    """
    raw = pd.read_csv("shopping_data.csv", header=None)
    # Collapse every CSV row into the list of items it contains
    transactions = raw.stack().groupby(level=0).apply(list).tolist()

    # One-hot encode the transactions
    encoder = TransactionEncoder()
    encoded = encoder.fit_transform(transactions)
    onehot = pd.DataFrame(encoded, columns=encoder.columns_)

    # Frequent itemsets (minimum support 0.05)
    frequent_itemsets = apriori(onehot, min_support=0.05, use_colnames=True)
    # Association rules (minimum confidence 0.2)
    association_rules = rules(
        frequent_itemsets, metric="confidence", min_threshold=0.2)

    return frequent_itemsets, association_rules
--------------------------------------------------------------------------------
/Answers/week4-challenge-03/google_stock.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
def quarter_volume():
    """Aggregate ``GOOGL.csv`` by quarter, ordered by traded volume.

    The first CSV column is parsed as the datetime index. Price columns are
    averaged per quarter and ``Volume`` is summed.

    Returns:
        Quarterly DataFrame sorted by ``Volume`` in descending order.
    """
    prices = pd.read_csv("GOOGL.csv", index_col=0)
    prices.index = pd.to_datetime(prices.index)

    # Mean for every price column, total for the volume
    how = {
        column: 'mean'
        for column in ("Open", "High", "Low", "Close", "Adj Close")
    }
    how["Volume"] = 'sum'

    quarterly = prices.resample('Q').agg(how)
    return quarterly.sort_values(by='Volume', ascending=False)
--------------------------------------------------------------------------------
/Answers/week4-challenge-04/production_index.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from statsmodels.tsa.stattools import arma_order_select_ic
3 |
4 |
def arima():
    """Choose ARIMA orders for the series stored in ``agriculture.csv``.

    The series is differenced once (so ``d`` is 1) and ``p``/``q`` are the
    orders reported under ``'aic_min_order'`` by ``arma_order_select_ic``.

    Returns:
        tuple: ``(p, d, q)``.
    """
    series = pd.read_csv("agriculture.csv", index_col=0)
    differenced = series.diff().dropna()

    # Order selection by AIC on the differenced series
    selection = arma_order_select_ic(differenced, ic='aic')
    p, q = selection['aic_min_order']

    return p, 1, q
11 |
--------------------------------------------------------------------------------
/Answers/week4-challenge-05/chengdu_pm25.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from fbprophet import Prophet
3 |
4 |
def additive():
    """Fit a Prophet model to daily PM2.5 means and forecast 365 more days.

    Reads ``Chengdu_HourlyPM25.csv``, treats ``-999`` as a missing reading
    (filled forward, then backward), resamples the ``Value`` column to daily
    means and fits ``Prophet()`` with its default settings. The forecast is
    written to ``forecast.csv``.

    Returns:
        DataFrame indexed by ``ds`` with columns ``yhat``, ``yhat_lower`` and
        ``yhat_upper``, restricted to dates from 2017-01-01 onward.
    """
    df = pd.read_csv("Chengdu_HourlyPM25.csv")
    # float("nan") replaces the removed pd.np.NaN alias; .ffill()/.bfill()
    # replace the deprecated fillna(method=...) spelling.
    df = df.replace({-999: float("nan")}).ffill().bfill()

    df.index = pd.to_datetime(df['Date (LST)'])
    # Only the measurement column is averaged: newer pandas raises on a mean
    # over text columns instead of silently dropping them.
    df = df[['Value']].resample('D').mean()
    df = df.reset_index()
    df = df.rename(columns={'Date (LST)': 'ds', 'Value': 'y'})

    m = Prophet()  # additive model
    m.fit(df)

    future = m.make_future_dataframe(periods=365, freq='D')  # dates to predict
    forecast = m.predict(future)
    # Keep only the prediction and its uncertainty interval
    forecast = forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
    forecast = forecast.set_index('ds').loc['2017-01-01':]

    forecast.to_csv("forecast.csv")  # persist the result

    return forecast
27 |
28 | additive()
--------------------------------------------------------------------------------
/Answers/week5-spiders-01/lianjia_spider.py:
--------------------------------------------------------------------------------
1 | import requests
2 | from bs4 import BeautifulSoup
3 | import time
4 | from tqdm import tqdm
5 | import re
6 | import sqlite3
7 | import pandas as pd
8 |
9 | '''
10 | 爬虫代码分为三步:
11 | 1. 爬取房屋 id
12 | 2. 根据房屋 id 组合 url,然后依次爬取房屋的具体界面获取信息
13 | 3. 保存到本地
14 | '''
15 |
# Root of the rental listing pages; the crawl loop appends 'pg<N>/' to it
base_url = 'https://sh.lianjia.com/zufang/'
# First listing page, referenced by the commented-out example call
test_url = 'https://sh.lianjia.com/zufang/pg1/'
# Browser-like User-Agent passed to every requests.get() call in this script
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
19 |
# Fetch one listing page and return the house ids on it, e.g. 107100610451
def getHouseURLList(page_url):
    """Return the ``data-id`` values listed on ``page_url``.

    Args:
        page_url: URL of one listing page.

    Returns:
        A list of id strings (empty when the ``#house-lst`` container is
        absent), or ``None`` when the request timed out or was rejected
        with HTTP 403. Both empty results are falsy.
    """
    try:
        r = requests.get(page_url, timeout=5, headers=headers)
    except requests.exceptions.Timeout:
        # Request timed out: signal "no data" to the caller
        return None
    if r.status_code == 403:
        print('访问被拒,请稍后再试')
        # A rejected request has no listing to parse
        return None

    # Name the parser explicitly (the same one getRoom uses)
    soup = BeautifulSoup(r.content, features='lxml')
    containers = soup.select('#house-lst')
    if not containers:
        return []

    # Only direct child tags that carry a data-id attribute; text nodes
    # between the entries are skipped instead of raising on ['data-id'].
    entries = containers[0].find_all(attrs={'data-id': True}, recursive=False)
    return [entry['data-id'] for entry in entries]
36 |
37 | # 示例
38 | # return_list = getHouseURLList(test_url)
39 |
# House ids collected from the listing pages
data_id_list = []

# Run several rounds to collect a more complete set of ids
for _ in range(1):
    # A full crawl would use range(1, 101); only the first page is fetched here
    for i in tqdm(range(1, 2)):
        page_url = base_url + 'pg{}/'.format(i)
        return_list = getHouseURLList(page_url)
        if return_list:
            data_id_list.extend(return_list)
        else:
            # Nothing came back: wait before moving on to the next page
            time.sleep(10)

# Drop duplicate ids
data_id_list = list(set(data_id_list))

# Persist the ids, one per line
with open('house_id_list.txt', 'w') as f:
    id_text = '\n'.join(data_id_list)
    f.write(id_text)
60 |
61 |
# Clean up an area string
def clean_str(s):
    """Return the leading number in ``s`` once non-ASCII characters are removed.

    The result of the non-ASCII removal is now actually used (it was
    previously computed and thrown away), and a decimal part is kept so that
    e.g. ``'89.5平米'`` yields ``'89.5'`` rather than ``'89'``.

    Args:
        s: raw text such as ``'89平米'``.

    Returns:
        The leading digits (with optional ``.digits`` fraction) as a string,
        or ``''`` when the cleaned text does not start with a digit.
    """
    # Remove Chinese (and any other non-ASCII) characters
    ascii_only = re.sub(r'[^\x00-\x7f]', '', s)
    # Keep the numeric prefix; everything from the first other character is dropped
    leading_number = re.match(r'\d+(?:\.\d+)?', ascii_only)
    return leading_number.group(0) if leading_number else ''
72 |
# Container for the data scraped about one rental listing
class Room(object):
    """Data scraped from one listing page.

    Every field starts out empty and is filled in through the ``set*``
    methods; ``done`` records whether scraping finished successfully.
    """

    def __init__(self, url):
        self.done = False
        self.area = 0
        self.url = url
        self.price = ''
        self.isRemoved = ''
        self.special_label = ''
        self.title = ''
        self.floor = ''
        self.is_near_subway = ''
        # Initialised here so reading it before setDirection() is called
        # does not raise AttributeError
        self.direction = ''
        self.publish_time = ''
        self.rooms = ''
        self.toilet = ''
        self.halls = ''
        self.rent_way = ''
        self.location = ''

    # Floor area
    def setArea(self, area):
        """Store the area as a float parsed from the cleaned ``area`` text."""
        self.area = float(clean_str(area))

    # Price
    def setPrice(self, price):
        self.price = price

    # Whether the listing has been taken down
    def setIsRemoved(self, isRemoved):
        self.isRemoved = isRemoved

    # Extra label shown next to the price (e.g. refined decoration)
    def setSpecialLabel(self, special_label):
        self.special_label = special_label

    # Layout (rooms, halls, toilets) and rent mode (whole / shared)
    def setType(self, type):
        """Parse text such as ``'4室2厅3卫 整租'`` into counts and rent mode.

        The first whitespace-separated token holds the counts; anything after
        it is the rent mode ('暂无信息' when absent). Counts missing from the
        text default to 0. The parameter name shadows the builtin but is kept
        for callers that pass it by keyword.
        """
        parts = type.split()
        layout = parts[0] if parts else ''
        rent_way = ' '.join(parts[1:]) if len(parts) > 1 else '暂无信息'

        # Non-ASCII characters act as separators between the numbers
        counts = [int(n) for n in re.sub(r'[^\x00-\x7f]', ' ', layout).split()]
        # Some listings have no toilet or hall: pad up to three counts
        counts += [0] * (3 - len(counts))

        self.rooms, self.halls, self.toilet = counts[:3]
        self.rent_way = rent_way

    # Location
    def setLocation(self, location):
        self.location = location

    # Whether the listing is near a subway
    def setSubway(self, is_near_subway):
        self.is_near_subway = is_near_subway

    # Orientation
    def setDirection(self, direction):
        self.direction = direction

    # Floor
    def setFloor(self, floor):
        self.floor = floor

    # Publish time
    def setPublishTime(self, publish_time):
        self.publish_time = publish_time

    # Listing title
    def setTitle(self, title):
        self.title = title

    # Listing URL
    def setURL(self, URL):
        self.url = URL

    # Whether scraping succeeded
    def setDone(self, done):
        self.done = done

    def __repr__(self):
        return str(self.__dict__)
163 |
# Fetch the listing page at `url` and parse it into a Room
def getRoom(url):
    """Scrape one listing page.

    Args:
        url: address of the listing detail page.

    Returns:
        A ``Room`` with ``done`` set to True when the page was parsed. When
        the request times out (``Room('invalid')``) or is rejected with HTTP
        403, the returned room has ``done`` False so the caller can retry.
    """
    room = Room(url)
    try:
        r = requests.get(url, timeout=5, headers=headers)
    except requests.exceptions.Timeout:
        time.sleep(2)
        print('timeout')
        return Room('invalid')
    if r.status_code == 403:
        print('访问被拒,请稍后再试')
        # Nothing to parse on a rejected request; room.done is still False
        return room
    content = r.content.decode()

    soup = BeautifulSoup(content, features='lxml')

    title = soup.find('h1', class_='main').text
    room.setTitle(title)

    price_div = soup.find('div', class_='price')
    price_list = list(price_div.stripped_strings)  # ['9000', '元/月', '精装修']
    price = ''.join(price_list[:2])
    room.setPrice(price)

    special_label = ' '.join(price_list[2:]) if len(price_list) > 2 else '无'
    room.setSpecialLabel(special_label)
    # Membership test instead of indexing [1], which raised IndexError when
    # the div carried a single class
    isRemoved = '已下架' if 'isRemove' in price_div['class'] else '正在出租'
    room.setIsRemoved(isRemoved)

    room_info = soup.find('div', class_='zf-room')
    room_info_list = list(room_info.stripped_strings)

    # The fixed positions below follow the order of the strings in the
    # 'zf-room' block of the page
    location = "{} {} {}".format(room_info_list[15], room_info_list[16], room_info_list[11])
    room.setLocation(location)
    room.setPublishTime(room_info_list[-1])
    room.setArea(room_info_list[1])
    room.setType(room_info_list[3])  # e.g. 4室2厅3卫
    room.setFloor(room_info_list[5])
    room.setDirection(room_info_list[7])
    room.setSubway(room_info_list[9])

    room.setDone(True)

    return room
207 |
# Connect to the database
conn = sqlite3.connect('lianjia.db')

cursor = conn.cursor()
# Create the table on first use; IF NOT EXISTS lets the script be re-run
# against an existing database without editing this statement out
cursor.execute('''
CREATE TABLE IF NOT EXISTS ROOM(
    url VARCHAR(1000) PRIMARY KEY,
    price Double,
    area Double,
    isRemoved VARCHAR(1000),
    special_label VARCHAR(1000),
    rooms INT,
    halls INT,
    toilet INT,
    rent_way INT,
    location VARCHAR(1000),
    is_near_subway VARCHAR(1000),
    direction VARCHAR(1000),
    floor VARCHAR(1000),
    publish_time VARCHAR(1000),
    title VARCHAR(1000)
)
''')

# Lookup by url; the url is substituted in with str.format by the caller
SELECT_COMMAND = "select * from ROOM where url='{}';"
# Parameterised insert of one full row (15 columns)
INSERT_COMMAND = "insert into ROOM(url, price, area, isRemoved, \
    special_label, rooms, halls, toilet, rent_way, \
    location, is_near_subway, direction, floor, publish_time, title) \
    values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"
238 |
# Build a URL from every stored house id and scrape each listing in turn
def getAllHouseInfo(file_path):
    """Scrape every listing whose id is in ``file_path`` into the ROOM table.

    Listings already present in the table are skipped. Each page is attempted
    at most 5 times; a listing that never succeeds is skipped.

    Args:
        file_path: text file with one house id per line.
    """
    base_url = 'https://sh.lianjia.com/zufang/{}.html'
    with open(file_path) as f:
        # Blank lines would otherwise turn into bogus URLs
        house_ids = [line.strip() for line in f if line.strip()]

    for house_id in tqdm(house_ids):
        url = base_url.format(house_id)

        # Skip listings that are already stored. The url is bound as a
        # parameter rather than formatted into the SQL text, and the cursor
        # is closed on every path.
        cursor = conn.cursor()
        try:
            cursor.execute("select 1 from ROOM where url=?;", (url,))
            already_stored = cursor.fetchone() is not None
        finally:
            cursor.close()
        if already_stored:
            continue

        # Try up to 5 times before giving up on this listing
        for _ in range(5):
            room = getRoom(url)
            if room.done:
                break
        if not room.done:
            continue

        # Insert into the database
        cursor = conn.cursor()
        try:
            cursor.execute(INSERT_COMMAND,
                           (room.url, room.price, room.area, room.isRemoved, room.special_label,
                            room.rooms, room.halls, room.toilet, room.rent_way, room.location,
                            room.is_near_subway, room.direction, room.floor, room.publish_time,
                            room.title))
            # Read rowcount while the cursor is still open
            inserted = cursor.rowcount
        finally:
            cursor.close()
        conn.commit()
        if inserted != 1:
            print('插入错误')
272 |
273 |
# Scrape every listing recorded in the id file
getAllHouseInfo('house_id_list.txt')

csv_path = 'lianjia.csv'

# Dump the whole ROOM table to a local CSV file
cursor = conn.cursor()
cursor.execute('SELECT * FROM ROOM')
data = [list(row) for row in cursor.fetchall()]

# CSV header, in the column order of the ROOM table
name_attribute = [
    'url', 'price', 'area', 'state', 'label',
    'rooms', 'halls', 'toilets', 'rentway', 'location',
    'subway', 'direction', 'floor', 'publishtime', 'title',
]
data_frame = pd.DataFrame(data=data, columns=name_attribute)
data_frame.to_csv(csv_path, encoding='utf_8_sig')
conn.close()
--------------------------------------------------------------------------------
/Answers/week5-spiders-02/create_sqlite_database.py:
--------------------------------------------------------------------------------
1 | import sqlite3
2 |
3 |
# Open (or create) the Ctrip database
conn = sqlite3.connect('xiecheng.db')

# Table of flight tickets: airline name, departure time, arrival time,
# departure airport, arrival airport, aircraft type, on-time rate,
# flight number, price, date.
# IF NOT EXISTS makes the script safe to run against an existing database.
CREATE_COMMAND1 = '''
CREATE table IF NOT EXISTS AIRPLANE (
    company_name varchar(1000),
    start_time varchar(1000),
    arrival_time varchar(1000),
    start_airport varchar(1000),
    arrival_airport varchar(1000),
    airpane_type varchar(1000),
    ontime_rate float,
    airpane_number varchar(1000),
    price float,
    date varchar(1000)
);
'''

# Select statement: look a ticket up by flight number, date and departure time
SELECT_COMMAND = '''
select * from AIRPLANE where airpane_number=? and date=? and start_time=?;
'''

# Insert a new ticket row
INSERT_COMMAND1 = '''
insert into AIRPLANE values(?,?,?,?,?,?,?,?,?,?);
'''

# Table of lowest prices: departure city, arrival city, date, lowest price
CREATE_COMMAND2 = '''
CREATE table IF NOT EXISTS LOWEST_PRICE (
    start_city varchar(1000),
    arrival_city varchar(1000),
    date varchar(1000),
    price float
);
'''

# Insert a lowest-price row
INSERT_COMMAND2 = '''
insert into LOWEST_PRICE values(?,?,?,?);
'''

# Create both tables
cursor = conn.cursor()
cursor.execute(CREATE_COMMAND1)
cursor.close()
cursor = conn.cursor()
cursor.execute(CREATE_COMMAND2)
cursor.close()
conn.commit()
56 |
--------------------------------------------------------------------------------
/Answers/week5-spiders-02/insert_database.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import pandas as pd
3 |
4 |
# First day of the crawl window and the window length in days
base = datetime.date(2018, 10, 30)
numdays = 80

# Every date in the window: the 80 consecutive days starting on October 30
date_list = [
    base + datetime.timedelta(days=offset)
    for offset in range(numdays)
]
10 |
# Fetch the tickets from `start` to `dest` and store them in the database
def getTickets(start, dest, driver, date_list, conn):
    """Scrape tickets for every date in ``date_list`` and insert the new ones.

    A ticket is inserted only when no stored row has the same flight number
    (``x[-2]``), date and departure time (``x[1]``).

    Args:
        start: departure city code.
        dest: arrival city code.
        driver: webdriver handed through to ``get_ticket_info``.
        date_list: dates to query; each is converted with ``str()``.
        conn: open database connection.
    """
    cursor = conn.cursor()
    try:
        for one_day in tqdm_notebook(date_list):
            day = str(one_day)
            # Fetch that day's tickets
            for x in get_ticket_info(start, dest, day, driver):
                stored = cursor.execute(
                    SELECT_COMMAND, (x[-2], day, x[1])).fetchall()
                if not stored:
                    # Not crawled yet: insert the row with the date appended
                    # (a new list is built, the scraped row is not mutated)
                    cursor.execute(INSERT_COMMAND1, x + [day])
            conn.commit()
    finally:
        # Release the cursor even when scraping or an insert fails
        cursor.close()
27 |
28 |
# Chengdu -> Shanghai, then Shanghai -> Chengdu
getTickets('CTU', 'SHA', driver, date_list, conn)
getTickets('SHA', 'CTU', driver, date_list, conn)
32 |
--------------------------------------------------------------------------------
/Answers/week5-spiders-02/xiecheng_spider.py:
--------------------------------------------------------------------------------
1 | # 使用 BeautifulSoup 进行解析
2 | from bs4 import BeautifulSoup
3 | from selenium import webdriver
4 | from selenium.common.exceptions import TimeoutException
5 | import time
6 | from selenium.webdriver.chrome.options import Options
7 | import re
8 | from selenium.webdriver.common.proxy import Proxy, ProxyType
9 |
10 |
11 | '''
12 | dstation: 出发城市代码
13 | astation: 到达城市代码
14 | date: 出发日期,形如 2018-10-30
15 | driver: 创建的 webdriver
16 | '''
def get_ticket_info(dstation, astation, date, driver):
    """Scrape the flight search results for one route and date.

    Args:
        dstation: departure city code.
        astation: arrival city code.
        date: departure date, e.g. ``2018-10-30``.
        driver: an already created webdriver.

    Returns:
        A list of ``[company_name, start_time, arrival_time, start_airport,
        arrival_airport, airpane_type, ontime_rate, airpane_number, price]``
        rows, sorted by price in ascending order. Entries that fail to parse
        are printed and left out.
    """
    url = "http://flights.ctrip.com/booking/%s-%s-day-1.html?DDate1=%s" % (
        dstation, astation, date)
    # Keep retrying until the page loads (no upper bound on attempts)
    while True:
        try:
            driver.get(url)
            break
        except TimeoutException as e:
            pass
    # Give the page time to render
    time.sleep(2)

    # Scroll the window with JavaScript until the bottom is reached
    initial_pagesource = driver.page_source
    while True:
        # Jump to the bottom of the page
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        # Wait 1 second for more results to load
        time.sleep(1)
        # If the HTML is unchanged since the previous scroll, nothing new was
        # loaded, so the bottom has been reached
        if initial_pagesource == driver.page_source:
            break
        initial_pagesource = driver.page_source

    # Parse the final HTML with BeautifulSoup
    soup = BeautifulSoup(initial_pagesource)
    # One element per search result
    result = soup.find_all("div", class_=["search_table_header", ])
    result_list = []
    for ticket_info in result:
        try:
            # Airline name, departure time, arrival time
            company_name, start_time, arrival_time = [
                x.text for x in ticket_info.find_all('strong')]
            # Departure airport, arrival airport
            start_airport, arrival_airport = [
                x.text for x in ticket_info.find_all("div", class_=["airport", ])]
            tmp = [x.text for x in ticket_info.find_all(
                "span", class_=["direction_black_border", ])]
            # Aircraft type and on-time rate (the rate may be missing)
            if len(tmp) == 2:
                airpane_type, ontime_rate = tmp
                # Digits of the text divided by 100
                ontime_rate = float(''.join(filter(str.isdigit, ontime_rate)))/100
            else:
                airpane_type = tmp[0]
                ontime_rate = 0
            # Flight number: text of the third <span> in the entry
            airpane_number = [x.text for x in ticket_info.find_all("span")][2]
            # Price (economy class): digits of the first 'base_price02' span
            price = int([''.join(list(filter(str.isdigit, x.text)))
                         for x in ticket_info.find_all("span", class_=["base_price02", ])][0])
            result_list.append([company_name, start_time, arrival_time, start_airport,
                                arrival_airport, airpane_type, ontime_rate, airpane_number, price])

        except Exception as E:
            # Any failure while parsing an entry is printed and the entry skipped
            print(E)

    # Return the tickets sorted by price
    return sorted(result_list, key=lambda x: x[-1])
78 |
if __name__ == "__main__":
    # Launch Chrome with the local ./chromedriver binary and run one sample
    # query (Chengdu -> Shanghai on 2018-10-30).
    options = Options()
    # Headless mode is left disabled; add "--headless" to the flags to enable it
    chrome_flags = (
        '--no-sandbox',  # bypass OS security model
        'start-maximized',
        'disable-infobars',
        "--disable-extensions",
    )
    for flag in chrome_flags:
        options.add_argument(flag)
    driver = webdriver.Chrome(options=options, executable_path='./chromedriver')
    result_list = get_ticket_info('CTU', 'SHA', '2018-10-30', driver)
    print(result_list)
--------------------------------------------------------------------------------
/Assignments/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
实验楼《楼+ 数据分析与挖掘实战》优秀项目挑战报告|课程报名
4 |
5 |
6 |
7 |
8 | 如果 Github 加载缓慢,可以点击下方链接快速浏览。👇
9 |
10 | ### 第 1 期课程
11 |
12 | - 报告题目:[中国保险业过去五年基础数据分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm01-stenphen-%E4%B8%AD%E5%9B%BD%E4%BF%9D%E9%99%A9%E4%B8%9A%E8%BF%87%E5%8E%BB%E4%BA%94%E5%B9%B4%E5%9F%BA%E7%A1%80%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90.ipynb)|学员昵称:stenphen 🌟
- 报告题目:[上海历史天气数据分析预测](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%88dm01-BellaG-%E4%B8%8A%E6%B5%B7%E5%8E%86%E5%8F%B2%E5%A4%A9%E6%B0%94%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E9%A2%84%E6%B5%8B.ipynb)|学员昵称:BellaG
14 | - 报告题目:[双色球历史数据统计预测](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%89dm01-hcccom-%E5%8F%8C%E8%89%B2%E7%90%83%E5%8E%86%E5%8F%B2%E6%95%B0%E6%8D%AE%E7%BB%9F%E8%AE%A1%E9%A2%84%E6%B5%8B.ipynb)|学员昵称:hcccom
15 |
16 | ### 第 2 期课程
17 |
18 | - 报告题目:[杭州互联网寒冬背景下的数据分析岗现状分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm02-%E7%B1%B3%E7%AB%B9314159-%E6%9D%AD%E5%B7%9E%E4%BA%92%E8%81%94%E7%BD%91%E5%AF%92%E5%86%AC%E8%83%8C%E6%99%AF%E4%B8%8B%E7%9A%84%E6%95%B0%E6%8D%AE%E5%88%86%E6%9E%90%E5%B2%97%E7%8E%B0%E7%8A%B6%E5%88%86%E6%9E%90.ipynb)|学员昵称:米竹314159 🌟
19 | - 报告题目:[医学专业论坛的数据爬取和分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%88dm02-linnecn-%E5%8C%BB%E5%AD%A6%E4%B8%93%E4%B8%9A%E8%AE%BA%E5%9D%9B%E7%9A%84%E6%95%B0%E6%8D%AE%E7%88%AC%E5%8F%96%E5%92%8C%E5%88%86%E6%9E%90.ipynb)|学员昵称:linnecn
20 |
21 | ### 第 3 期课程
22 |
23 | - 虚位以待
24 |
25 | ### 第 4 期课程
26 |
27 | - 报告题目:[链家成都市区挂牌二手房分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%8F%85%EF%B8%8Fdm04-Luo2019-%E9%93%BE%E5%AE%B6%E6%88%90%E9%83%BD%E5%B8%82%E5%8C%BA%E6%8C%82%E7%89%8C%E4%BA%8C%E6%89%8B%E6%88%BF%E5%88%86%E6%9E%90.ipynb)|学员昵称:Luo2019 🌟
28 | - 报告题目:[B 站番剧数据简单分析](https://nbviewer.jupyter.org/github/shiyanlou/louplus-dm/blob/master/Assignments/%F0%9F%A5%88dm04-Yueyec-B-%E7%AB%99%E7%95%AA%E5%89%A7%E6%95%B0%E6%8D%AE%E7%AE%80%E5%8D%95%E5%88%86%E6%9E%90.ipynb)|学员昵称:Yueyec
29 |
30 | ### 第 5 期课程
31 |
32 | - 报告题目:[京东手机销售数据分析](https://www.kaggle.com/ted0001/dm05-998494)|学员昵称:[Ted_Wei](https://www.lanqiao.cn/users/998494/) 🌟
33 | - 报告题目:[通信基站室内分布系统外引小区识别](https://www.kaggle.com/cym1085893/dm05-1085893)|学员昵称:[yiming_chen](https://www.lanqiao.cn/users/1085893/)
34 |
35 | ### 第 6 期课程
36 |
37 | - 报告题目:[大连地区酒店数据分析](https://www.kaggle.com/louplus/dm06-937174)|学员昵称:[Miss_candy](https://www.lanqiao.cn/users/937174/) 🌟
38 |
39 | ### 第 7 期课程
40 |
41 | - 报告题目:[微博搜索“双十一”数据分析](https://www.kaggle.com/lanjie/dm07-1127847)|学员昵称:[灵汐](https://www.lanqiao.cn/users/1127847/)
42 |
43 | ### 第 8 期课程
44 |
45 | - 报告题目:[B站up主“老番茄”基本数据采集分析](https://www.kaggle.com/truwbin/dm08-877339-b-up)|学员昵称:[今天小古不出门](https://www.lanqiao.cn/users/877339/) 🌟
46 | - 报告题目:[下厨房家常菜菜谱分析及新菜谱预测评分](https://www.kaggle.com/fors3c/dm08-ns3c)|学员昵称:ns3c
47 |
48 | ### 第 9 期课程
49 |
50 | - 报告题目:[世界银行国际旅游业指标分析](https://www.kaggle.com/furongrong/dm09-535211)|学员昵称:[RR25](https://www.lanqiao.cn/users/535211/) 🌟
51 | - 报告题目:[猪肉价格数据分析](https://www.kaggle.com/suxiaomo/dm09-1180757)|学员昵称:[苏小墨](https://www.lanqiao.cn/users/1180757/)
52 | - 报告题目:[汽车之家数据分析](https://www.kaggle.com/mengchenshang/dm09-1176812)|学员昵称:[凹润纸](https://www.lanqiao.cn/users/1176812/)
53 |
54 | ### 第 10 期课程
55 |
56 | - 虚位以待
57 |
58 | ### 第 11 期课程
59 |
60 | - 报告题目:[科比职业生涯回顾与模型预测](https://www.kaggle.com/yemujianglin/dm11-1276351)|学员昵称:[夜幕降临_](https://www.lanqiao.cn/users/1276351/) 🌟
61 |
62 | ### 第 12 期课程
63 |
64 | - 虚位以待
65 |
66 |
67 | ### 第 13 期课程
68 |
69 | - 报告题目:[基于 Python 语言的加拿大联邦大选数据分析](https://www.kaggle.com/czz1403/dm13-1204880-python)|学员昵称:[TXZXTLD](https://www.lanqiao.cn/users/1204880/) 🌟
70 | - 报告题目:[新冠疫情社会影响数据分析](https://www.kaggle.com/vincentbao/dm13-812273)|学员昵称:[vincentbao](https://www.lanqiao.cn/users/812273/) 🌟 已经制作成课程:https://www.lanqiao.cn/courses/2791
71 |
72 |
73 | ```
74 | - 原作者可以提 PR 更新自己的报告内容。
75 | - 实验报告版权归属原学员且授权实验楼独家使用,请勿用于商业用途。
76 | ```
77 |
--------------------------------------------------------------------------------
/Assignments/🥉dm01-hcccom-双色球历史数据统计预测.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# 双色球历史数据统计预测"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "---"
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "- 报告题目:双色球历史数据统计预测\n",
22 | "- 学员昵称:hcccom\n",
23 | "- 课程期数:第一期"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "© 本文著作权归作者所有,并授权实验楼独家使用,未经实验楼许可,不得转载使用。"
31 | ]
32 | },
33 | {
34 | "cell_type": "markdown",
35 | "metadata": {},
36 | "source": [
37 | "---"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### 获取数据"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": null,
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "import random\n",
54 | "import numpy as np\n",
55 | "import requests\n",
56 | "import csv\n",
57 | "from bs4 import BeautifulSoup\n",
58 | "headers = {\n",
59 | " 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}\n",
60 | "res = requests.get(\n",
61 | " 'https://datachart.500.com/ssq/history/newinc/history.php?start=03001&end=18147', headers=headers) # 从03年第一期开始\n",
62 | "res.encoding = 'utf-8'\n",
63 | "soup = BeautifulSoup(res.text, 'lxml')\n",
64 | "data = soup.find_all(attrs={'class': 't_tr1'})\n",
65 | "csvFile = open(\"./ssq004.csv\", 'wt', newline='', encoding='utf-8')"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "def gens(x): # 模拟\n",
75 | " random.seed(x)\n",
76 | " a = np.arange(1, 34, 1).tolist()\n",
77 | " red = sorted(random.sample(a, 6))\n",
78 | " b = np.arange(1, 17, 1).tolist()\n",
79 | " blue = random.sample(b, 1)\n",
80 | " red.extend(blue)\n",
81 | " return red"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 8,
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "with open(r\"ssq004.csv\", 'a', newline='') as f:\n",
91 | " writer = csv.writer(f)\n",
92 | " writer.writerow([\"period\", \"real\", \"date\", \"vis\", 'r1',\n",
93 | " 'r2', 'r3', 'r4', 'r5', 'r6', 'b1']) # 先写入列名\n",
94 | " for i in range(0, len(data)):\n",
95 | " period = data[i].find_all('td')[0].text\n",
96 | " real = [int(data[i].find_all('td')[1].text), int(data[i].find_all('td')[2].text), int(data[i].find_all('td')[3].text), int(data[i].find_all(\n",
97 | " 'td')[4].text), int(data[i].find_all('td')[5].text), int(data[i].find_all('td')[6].text), int(data[i].find_all('td')[7].text)]\n",
98 | " date = data[i].find_all('td')[15].text\n",
99 | " vis = gens(date)\n",
100 | " r1 = int(data[i].find_all('td')[1].text)\n",
101 | " r2 = int(data[i].find_all('td')[2].text)\n",
102 | " r3 = int(data[i].find_all('td')[3].text)\n",
103 | " r4 = int(data[i].find_all('td')[4].text)\n",
104 | " r5 = int(data[i].find_all('td')[5].text)\n",
105 | " r6 = int(data[i].find_all('td')[6].text)\n",
106 | " b1 = int(data[i].find_all('td')[7].text)\n",
107 | "\n",
108 | " writer.writerows(\n",
109 | " [[period, real, date, vis, r1, r2, r3, r4, r5, r6, b1]])\n",
110 | "csvFile.close()"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "### 数据处理"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 9,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/plain": [
128 | "[3, 6, 8, 23, 25, 33, 4]"
129 | ]
130 | },
131 | "execution_count": 9,
132 | "metadata": {},
133 | "output_type": "execute_result"
134 | }
135 | ],
136 | "source": [
137 | "import pandas as pd\n",
138 | "import matplotlib.pyplot as plt\n",
139 | "df = pd.read_csv('ssq004.csv', encoding='gbk')\n",
140 | "df.set_index(\"date\", inplace=True)\n",
141 | "df.index = pd.DatetimeIndex(df.index)\n",
142 | "df.sort_index(ascending=True, inplace=True)\n",
143 | "\n",
144 | "\n",
145 | "def get_real(i):\n",
146 | " a = df.real[i]\n",
147 | " lista = a.strip('[]').split(',')\n",
148 | " map(int, lista)\n",
149 | " list_real = [int(x) for x in lista]\n",
150 | " return list_real\n",
151 | "\n",
152 | "\n",
153 | "def get_vis(i): # 产生和时间相关的随机双色球\n",
154 | " b = df.vis[i]\n",
155 | " listb = b.strip('[]').split(',')\n",
156 | " map(int, listb)\n",
157 | " list_vis = [int(x) for x in listb]\n",
158 | " return list_vis\n",
159 | "\n",
160 | "\n",
161 | "get_vis(1)"
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "execution_count": null,
167 | "metadata": {},
168 | "outputs": [],
169 | "source": [
170 | "# 计算每个球出现的频率"
171 | ]
172 | },
173 | {
174 | "cell_type": "code",
175 | "execution_count": 10,
176 | "metadata": {
177 | "scrolled": true
178 | },
179 | "outputs": [],
180 | "source": [
181 | "df1 = df.drop(columns=['period', 'real', 'vis'])\n",
182 | "df2 = df1.drop(columns=['b1'])\n",
183 | "df3 = df1[['b1']]\n",
184 | "dup = df1[df1.duplicated()].count()\n",
185 | "rd1 = df2.stack().value_counts()\n",
186 | "bd1 = df3['b1'].value_counts()"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 11,
192 | "metadata": {
193 | "scrolled": false
194 | },
195 | "outputs": [
196 | {
197 | "data": {
198 | "image/png": "\n",
199 | "text/plain": [
200 | ""
201 | ]
202 | },
203 | "metadata": {
204 | "needs_background": "light"
205 | },
206 | "output_type": "display_data"
207 | },
208 | {
209 | "data": {
210 | "image/png": "\n",
211 | "text/plain": [
212 | ""
213 | ]
214 | },
215 | "metadata": {
216 | "needs_background": "light"
217 | },
218 | "output_type": "display_data"
219 | }
220 | ],
221 | "source": [
222 | "import matplotlib.pyplot as plt\n",
223 | "from pylab import *\n",
224 | "plt.figure(111)\n",
225 | "rd1.plot(kind='bar', align='center')\n",
226 | "plt.xlabel(\"red\")\n",
227 | "plt.ylabel(\"Times\")\n",
228 | "plt.show()\n",
229 | "plt.figure(112)\n",
230 | "bd1.plot(kind='bar')\n",
231 | "plt.xlabel(\"blue\")\n",
232 | "plt.ylabel(\"Times\")\n",
233 | "plt.show()"
234 | ]
235 | },
236 | {
237 | "cell_type": "markdown",
238 | "metadata": {},
239 | "source": [
240 | "### 取一组\n",
241 | " "
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 14,
247 | "metadata": {},
248 | "outputs": [
249 | {
250 | "data": {
251 | "text/html": [
252 | "\n",
253 | "\n",
266 | "
\n",
267 | " \n",
268 | " \n",
269 | " \n",
270 | " period \n",
271 | " real \n",
272 | " vis \n",
273 | " r1 \n",
274 | " r2 \n",
275 | " r3 \n",
276 | " r4 \n",
277 | " r5 \n",
278 | " r6 \n",
279 | " b1 \n",
280 | " \n",
281 | " \n",
282 | " date \n",
283 | " \n",
284 | " \n",
285 | " \n",
286 | " \n",
287 | " \n",
288 | " \n",
289 | " \n",
290 | " \n",
291 | " \n",
292 | " \n",
293 | " \n",
294 | " \n",
295 | " \n",
296 | " \n",
297 | " 2018-12-04 \n",
298 | " 18142 \n",
299 | " [5, 8, 10, 11, 27, 28, 11] \n",
300 | " [11, 13, 16, 24, 29, 31, 8] \n",
301 | " 5 \n",
302 | " 8 \n",
303 | " 10 \n",
304 | " 11 \n",
305 | " 27 \n",
306 | " 28 \n",
307 | " 11 \n",
308 | " \n",
309 | " \n",
310 | " 2018-12-06 \n",
311 | " 18143 \n",
312 | " [4, 6, 15, 28, 32, 33, 14] \n",
313 | " [10, 14, 15, 16, 24, 33, 14] \n",
314 | " 4 \n",
315 | " 6 \n",
316 | " 15 \n",
317 | " 28 \n",
318 | " 32 \n",
319 | " 33 \n",
320 | " 14 \n",
321 | " \n",
322 | " \n",
323 | " 2018-12-09 \n",
324 | " 18144 \n",
325 | " [8, 13, 17, 18, 20, 27, 13] \n",
326 | " [3, 7, 13, 17, 19, 30, 13] \n",
327 | " 8 \n",
328 | " 13 \n",
329 | " 17 \n",
330 | " 18 \n",
331 | " 20 \n",
332 | " 27 \n",
333 | " 13 \n",
334 | " \n",
335 | " \n",
336 | " 2018-12-11 \n",
337 | " 18145 \n",
338 | " [3, 9, 13, 22, 23, 25, 6] \n",
339 | " [3, 6, 10, 12, 19, 25, 12] \n",
340 | " 3 \n",
341 | " 9 \n",
342 | " 13 \n",
343 | " 22 \n",
344 | " 23 \n",
345 | " 25 \n",
346 | " 6 \n",
347 | " \n",
348 | " \n",
349 | " 2018-12-13 \n",
350 | " 18146 \n",
351 | " [2, 10, 11, 17, 18, 29, 16] \n",
352 | " [6, 10, 21, 24, 26, 32, 3] \n",
353 | " 2 \n",
354 | " 10 \n",
355 | " 11 \n",
356 | " 17 \n",
357 | " 18 \n",
358 | " 29 \n",
359 | " 16 \n",
360 | " \n",
361 | " \n",
362 | "
\n",
363 | "
"
364 | ],
365 | "text/plain": [
366 | " period real vis \\\n",
367 | "date \n",
368 | "2018-12-04 18142 [5, 8, 10, 11, 27, 28, 11] [11, 13, 16, 24, 29, 31, 8] \n",
369 | "2018-12-06 18143 [4, 6, 15, 28, 32, 33, 14] [10, 14, 15, 16, 24, 33, 14] \n",
370 | "2018-12-09 18144 [8, 13, 17, 18, 20, 27, 13] [3, 7, 13, 17, 19, 30, 13] \n",
371 | "2018-12-11 18145 [3, 9, 13, 22, 23, 25, 6] [3, 6, 10, 12, 19, 25, 12] \n",
372 | "2018-12-13 18146 [2, 10, 11, 17, 18, 29, 16] [6, 10, 21, 24, 26, 32, 3] \n",
373 | "\n",
374 | " r1 r2 r3 r4 r5 r6 b1 \n",
375 | "date \n",
376 | "2018-12-04 5 8 10 11 27 28 11 \n",
377 | "2018-12-06 4 6 15 28 32 33 14 \n",
378 | "2018-12-09 8 13 17 18 20 27 13 \n",
379 | "2018-12-11 3 9 13 22 23 25 6 \n",
380 | "2018-12-13 2 10 11 17 18 29 16 "
381 | ]
382 | },
383 | "execution_count": 14,
384 | "metadata": {},
385 | "output_type": "execute_result"
386 | }
387 | ],
388 | "source": [
389 | "df.tail(5)"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": 15,
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "data": {
399 | "text/plain": [
400 | "[]"
401 | ]
402 | },
403 | "execution_count": 15,
404 | "metadata": {},
405 | "output_type": "execute_result"
406 | },
407 | {
408 | "data": {
409 | "image/png": "\n",
410 | "text/plain": [
411 | ""
412 | ]
413 | },
414 | "metadata": {
415 | "needs_background": "light"
416 | },
417 | "output_type": "display_data"
418 | }
419 | ],
420 | "source": [
421 | "x = [1, 2, 3, 4, 5, 6, 7]\n",
422 | "y1 = get_real(-1)\n",
423 | "y2 = get_vis(-1)\n",
424 | "fig, ax = plt.subplots(figsize=(15, 5))\n",
425 | "ax.plot(x, y1, 'ch-', markersize=10)\n",
426 | "ax.plot(x, y2, 'ys-', markersize=10)"
427 | ]
428 | },
429 | {
430 | "cell_type": "code",
431 | "execution_count": 167,
432 | "metadata": {},
433 | "outputs": [
434 | {
435 | "data": {
436 | "image/png": "\n",
437 | "text/plain": [
438 | ""
439 | ]
440 | },
441 | "metadata": {
442 | "needs_background": "light"
443 | },
444 | "output_type": "display_data"
445 | },
446 | {
447 | "data": {
448 | "text/plain": [
449 | "array([0.95333787, 3.40088556])"
450 | ]
451 | },
452 | "execution_count": 167,
453 | "metadata": {},
454 | "output_type": "execute_result"
455 | }
456 | ],
457 | "source": [
458 | "x_data = np.array(y1)\n",
459 | "\n",
460 | "y_data = np.array(y2)\n",
461 | "\n",
462 | "poly = np.polyfit(x_data, y_data, deg=1)\n",
463 | "\n",
464 | "plt.plot(x_data, y_data, 'o')\n",
465 | "\n",
466 | "plt.plot(x_data, np.polyval(poly, x_data))\n",
467 | "\n",
468 | "plt.show()\n",
469 | "poly"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 7,
475 | "metadata": {},
476 | "outputs": [
477 | {
478 | "data": {
479 | "image/png": "\n",
480 | "text/plain": [
481 | ""
482 | ]
483 | },
484 | "metadata": {
485 | "needs_background": "light"
486 | },
487 | "output_type": "display_data"
488 | },
489 | {
490 | "data": {
491 | "text/plain": [
492 | "array([-2.44332198e-03, 1.76012475e-01, -4.58949704e+00, 5.31642494e+01,\n",
493 | " -2.57338653e+02, 3.41987935e+02])"
494 | ]
495 | },
496 | "execution_count": 7,
497 | "metadata": {},
498 | "output_type": "execute_result"
499 | }
500 | ],
501 | "source": [
502 | "x_data = np.array(y1)\n",
503 | "\n",
504 | "y_data = np.array(y2)\n",
505 | "\n",
506 | "poly = np.polyfit(x_data, y_data, deg=5)\n",
507 | "\n",
508 | "plt.plot(x_data, y_data, 'o')\n",
509 | "\n",
510 | "plt.plot(x_data, np.polyval(poly, x_data))\n",
511 | "\n",
512 | "plt.show()\n",
513 | "poly"
514 | ]
515 | },
516 | {
517 | "cell_type": "markdown",
518 | "metadata": {},
519 | "source": [
520 | "各种分析过后,仍无法做出预测……"
521 | ]
522 | },
523 | {
524 | "cell_type": "markdown",
525 | "metadata": {},
526 | "source": [
527 | "结论:双色球是无法预测的"
528 | ]
529 | },
530 | {
531 | "cell_type": "markdown",
532 | "metadata": {},
533 | "source": [
534 | "---"
535 | ]
536 | },
537 | {
538 | "cell_type": "markdown",
539 | "metadata": {},
540 | "source": [
541 | "#### 评阅意见反馈"
542 | ]
543 | },
544 | {
545 | "cell_type": "markdown",
546 | "metadata": {},
547 | "source": [
548 | "\n",
549 | "hcccom 提交的《双色球历史数据统计预测》项目挑战报告初步达到课程挑战要求,但仍然有很多地方值得完善。\n",
550 | " \n",
551 | "数据采集部分内容不错,能采集完整的双色球投注数据。但缺乏必要的解释和代码注释。数据分析和处理阶段仅对各号球的出现频次做了统计,选择的柱形图虽然合理但内容较为单薄。这里,建议可以对连续 2 球或者多球的出现频次统计分析。或者分析不同位置各号球的出现频次,或许从统计学角度更有意义。\n",
552 | " \n",
553 | "「取一组」小节之后没有看明白分析的用意,或许是想预测各号球如何出现?不过这样肯定无法完成的。回归分析显然不能用于这里的预测过程。\n",
554 | " \n",
555 | "总之,该挑战报告有 2 点值得改善的地方:\n",
556 | "\n",
557 | "- 补充陈述内容,让阅读者知道每一步的大致操作用意。整个分析报告几乎没有解释性语句,非常不赞同这样做。数据分析的过程很重要,实际上阐述结论和讲好一个故事更加重要。\n",
558 | "\n",
559 | "- 分析思路没有理清,显然双色球出现是随机事件,这是无法通过回归分析完成的。所以,挑战的选题从一开始就不太理想。实际上,就算是真实的数据分析任务,也不建议去做彩票预测,因为就算从概率上得到了一些高频次组合方式,但没有明确的指导意义。\n",
560 | "\n",
561 | "代码方面,注意不要写重复冗余代码(类似 plt.show() 在同单元格多次出现)。后期在书写代码时注意按照 PEP8 格式化即可。VS Code 等 IDE 带有相关格式化插件,Jupyter Notebook 也可以通过安装 jupyter_contrib_nbextensions 扩展开启相关插件自动完成代码格式化。\n",
562 | " \n",
563 | "总之,通过该报告可以判定学员初步达到我们课程预设的培养目标,但仍需要继续学习和加深对数据分析各环节的思考。希望 hcccom 后续再通过书籍等拓展更多相关的数据分析和挖掘知识,并结合自己的兴趣及专业特长在数据分析的道路上越走越好。\n",
564 | "
\n",
565 | "\n",
566 | " \n",
567 | "楼+ 数据分析和挖掘课程组
\n",
568 | "2018 年 12 月 17 日
"
569 | ]
570 | },
571 | {
572 | "cell_type": "markdown",
573 | "metadata": {},
574 | "source": [
575 | "---"
576 | ]
577 | }
578 | ],
579 | "metadata": {
580 | "kernelspec": {
581 | "display_name": "Python 3",
582 | "language": "python",
583 | "name": "python3"
584 | },
585 | "language_info": {
586 | "codemirror_mode": {
587 | "name": "ipython",
588 | "version": 3
589 | },
590 | "file_extension": ".py",
591 | "mimetype": "text/x-python",
592 | "name": "python",
593 | "nbconvert_exporter": "python",
594 | "pygments_lexer": "ipython3",
595 | "version": "3.7.1"
596 | }
597 | },
598 | "nbformat": 4,
599 | "nbformat_minor": 2
600 | }
601 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 实验楼在线教育
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Mindmaps/README.md:
--------------------------------------------------------------------------------
1 | - [点击下载全部思维导图](https://minhaskamal.github.io/DownGit/#/home?url=https://github.com/shiyanlou/louplus-dm/tree/master/Mindmaps)
2 |
--------------------------------------------------------------------------------
/Mindmaps/louplus-dm-week1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week1.png
--------------------------------------------------------------------------------
/Mindmaps/louplus-dm-week2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week2.png
--------------------------------------------------------------------------------
/Mindmaps/louplus-dm-week3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week3.png
--------------------------------------------------------------------------------
/Mindmaps/louplus-dm-week4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week4.png
--------------------------------------------------------------------------------
/Mindmaps/louplus-dm-week5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/shiyanlou/louplus-dm/52764983b7080c3ca760e38c38c9a71cf0c2ed3e/Mindmaps/louplus-dm-week5.png
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
蓝桥云课《楼+ 数据分析与挖掘实战》课程仓库|课程报名
4 |
5 |
6 |
7 |
8 | 主分支下方包含最新课程的参考答案,历史开班课程的参考答案移步相应分支查看。
9 |
10 | ### 其他班级
11 |
12 | - [第 12-13 期挑战参考答案](https://github.com/shiyanlou/louplus-dm/tree/v3/Answers)
13 | - [第 07-11 期挑战参考答案](https://github.com/shiyanlou/louplus-dm/tree/v2/Answers)
14 | - [第 01-06 期挑战参考答案](https://github.com/shiyanlou/louplus-dm/tree/master/Answers)
15 |
16 | ### 优秀报告
17 |
18 | - [优秀项目挑战比赛报告](https://github.com/shiyanlou/louplus-dm/tree/master/Assignments)
19 |
--------------------------------------------------------------------------------