├── .gitignore
├── LICENSE.md
├── Lectures
│   ├── Week_1_Statistics
│   │   ├── Knowledge 1_with notes.pdf
│   │   ├── Knowledge.pdf
│   │   ├── Lec1.R
│   │   ├── List of R material.pdf
│   │   ├── Practice.R
│   │   └── Practice.pdf
│   ├── Week_5_Classification
│   │   ├── python_environment_setup.md
│   │   └── week5_reading.md
│   ├── Week_6_NLP
│   │   └── week6_reading.md
│   ├── Week_7_Clustering
│   │   └── week7_reading.md
│   └── Week_8_Recommender
│       └── week8_reading.md
├── README.md
└── Readings
    ├── DataScienceForBusiness.pdf
    ├── DoingDataScience.pdf
    ├── ElemStatLearn.pdf
    ├── ISLR_Sixth_Printing.pdf
    ├── Introduction_to_Machine_Learning_with_Python.pdf
    ├── Machine_Learning_In_Action.pdf
    ├── NaturalLanguageProcessingWithPython.pdf
    └── Python_Data_Science_Handbook.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | .ipynb_checkpoints
3 | *.pyc
4 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | ## COPYRIGHT POLICY
2 |
3 | All content included on the Site or third-party platforms as part of the class, such as text, graphics, logos, button icons, images, audio clips, video clips, live streams, digital downloads, data compilations, and software, is the property of BitTiger or its content suppliers and protected by copyright laws.
4 |
5 | Any attempt to redistribute or resell BitTiger content will result in the appropriate legal action being taken.
6 |
7 | We thank you in advance for respecting our copyrighted content. For more info see
8 |
9 | https://www.bittiger.io/termsofuse
10 |
11 | https://www.bittiger.io/termsofservice
12 |
13 |
14 | ## COPYRIGHT NOTICE
15 |
16 | All course content produced on BitTiger's official website and in courses on third-party platforms, such as text, graphics, logos, button icons, images, audio clips, video clips, live streams, digital downloads, data compilations, and software, is the property of BitTiger and protected by copyright laws.
17 |
18 | BitTiger will take appropriate legal action against any attempt to distribute or resell BitTiger's proprietary materials.
19 |
20 | We sincerely thank you for respecting our copyrighted content.
21 |
22 | For details, please see
23 |
24 | https://www.bittiger.io/termsofuse
25 |
26 | https://www.bittiger.io/termsofservice
27 |
28 |
--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Knowledge 1_with notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/Knowledge 1_with notes.pdf
--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Knowledge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/Knowledge.pdf
--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Lec1.R:
--------------------------------------------------------------------------------
1 | #Lecture 1
2 | train <- read.csv('train.csv', stringsAsFactors = FALSE)
3 | summary(train)
4 |
5 | str(train)
6 | head(train)
7 |
8 | contVar <- NULL
9 | corr <- NULL
10 | catVar <- NULL
11 | catCorr <- NULL
12 | for (i in 1:dim(train)[2]) {
13 |   if (is.numeric(train[, i])) {
14 | contVar <- c(contVar, i)
15 | corr <- c(corr, cor(train[,i], train$SalePrice, use = 'pairwise.complete.obs'))
16 | } else {
17 | catVar <- c(catVar, i)
18 | }
19 | }
20 |
21 | contVar
22 | length(contVar)
23 | catVar
24 | length(catVar)
25 |
26 | library(corrplot)
27 | trainCont <- train[,contVar]
28 | trainCont <- trainCont[,-1] # drop the Id column
29 | trainCatg <- train[,catVar]
30 | correlations <- cor(trainCont, use = 'pairwise.complete.obs')
31 | corrplot(correlations, method = "square")
32 |
33 | trainCont
34 | trainCatg
35 | # find the 5 most highly correlated factors
36 | # take the absolute value of each correlation
37 | absolute = abs(correlations)
38 | highestCorrelation <- NULL
39 | # sum the absolute correlations in each row
40 | for (i in 1:dim(absolute)[1]) {
41 | highestCorrelation <- c(highestCorrelation, sum(absolute[i,]))
42 | }
43 | highestCorrelation
44 | orderIndex = order(highestCorrelation)
45 | orderIndex
46 | # the 5 variables with the highest total absolute correlation are
47 | # likely the most important factors among the 37 continuous variables
48 | highestIndex = orderIndex[33:37]
49 | highestCorrelation[highestIndex] # their correlations values
50 | highestCorrelationNames = colnames(trainCont[,highestIndex]) #their names
51 | highestCorrelationNames
52 | # Cut the data of these columns
53 | trimmedData = trainCont[, highestIndex]
54 | trimCorrelations <- cor(trimmedData, use = 'pairwise.complete.obs')
55 | corrplot(trimCorrelations, method = "square") # plot the correlations
56 |
57 | # predictive power
58 | # for continuous variables, predictive power is measured by the absolute
59 | # correlation with SalePrice: the higher the absolute correlation, the more
60 | # strongly the variable is (positively or negatively) related to SalePrice,
61 | # and the more predictive it is
62 | contVar <- contVar[-1] # remove Id
63 | corr <- corr[-1]
64 | contVar <- contVar[-37] # remove SalePrice (the response)
65 | corr <- corr[-37]
66 | abscorr = abs(corr)
67 | abscorr
68 | contPredictIndex = order(abscorr)
69 | colnames(train)[contVar]
70 | # top 5 predictive power
71 | rev(colnames(train)[contVar[contPredictIndex[32:36]]])
72 | rev(abscorr[contPredictIndex[32:36]])
73 |
74 | # for categorical variables, predictive power can be measured as the difference
75 | # between the highest and lowest category means of SalePrice: the larger the
76 | # difference, the more predictive the categorical variable
77 | percentage <- sapply(train, function(x) {length(which(is.na(x)))/nrow(train)})
78 | percentage <- percentage[catVar]
79 | catVar <- catVar[percentage < 0.05]
80 | percentage <- percentage[percentage < 0.05]
81 |
82 |
83 | # generate potentially useful features from existing features
84 | # here, the absolute correlation between two variables indicates the potential
85 | # of combining them: if the absolute correlation between A and B is higher
86 | # than that between C and D, then A and B are more likely to combine into a
87 | # useful new feature
88 | contVar <- c(contVar, 81) # add back SalePrice (column 81)
89 | absoluteOrder <- order(absolute)
90 | # each off-diagonal correlation appears twice and the 37 self-correlations equal 1, so they sort last; the top 5 distinct pairs are therefore elements 1323-1332 of the 37*37 = 1369 sorted entries
91 | top5Order <- absoluteOrder[1323:1332]
92 | top5Order <- top5Order[-seq(1, 9, 2)]
93 | x = (top5Order - 1) %/% 37 + 1 # column index (R matrices are column-major)
94 | y = (top5Order - 1) %% 37 + 1 # row index
95 | # the 1st (x, y) pair is the most promising feature combination, the 2nd pair the next, and so on
96 | colnames(train)[contVar[x]]
97 | colnames(train)[contVar[y]]
98 | correlations[x, y] # the diagonal holds the correlation of each selected pair; these are the highest
99 | top5Order
100 | absolute[top5Order]
101 | # check that the values match; they do, so the answer is correct
102 |
103 | meanDifference <- NULL
104 | for (i in 1:length(catVar)) {
105 |   # mean SalePrice per category; tapply avoids the eval(parse()) construction
106 |   tab <- tapply(train$SalePrice, train[[catVar[i]]], mean)
107 |   # record the spread between the highest and lowest category means
108 |   meanDifference <- c(meanDifference, max(tab) - min(tab))
109 | }
110 | meanOrder <- order(meanDifference)
111 | meanDifference[meanOrder]
112 | meanOrder[(length(meanOrder) - 4):length(meanOrder)]
113 | "The 5 Most Powerful Predicting Categorical Variable is"
114 | colnames(train)[catVar[meanOrder[(length(meanOrder) - 4):length(meanOrder)]]]
115 | meanDifference[meanOrder[(length(meanOrder) - 4):length(meanOrder)]]
116 |
--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/List of R material.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/List of R material.pdf
--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Practice.R:
--------------------------------------------------------------------------------
1 | # Vector: logical, integer, numeric (double), and character.
2 |
3 | dbl_var <- c(1, 2.5, 4.5)
4 | # With the L suffix, you get an integer rather than a double
5 | int_var <- c(1L, 6L, 10L)
6 | # Use TRUE and FALSE (or T and F) to create logical vectors
7 | log_var <- c(TRUE, FALSE, T, F)
8 | chr_var <- c("these are", "some strings")
9 |
10 | # Vectors are always flat, even if you nest c()
11 | c(1, c(2, c(3, 4)))
12 | c(1, 2, 3, 4)
13 |
14 | # missing value is NA
15 | t = c(1, NA, 2)
16 | is.na(t)
17 |
18 | # Given a vector, you can determine its type with typeof(),
19 | # or check if it’s a specific type with an “is” function:
20 | # is.character(), is.double(), is.integer(), is.logical(),
21 | # or, more generally, is.atomic().
22 |
23 | typeof(int_var)
24 | is.integer(int_var)
25 | is.atomic(int_var)
26 |
27 | # is.numeric() is a general test for the “numberliness” of a vector
28 | # and returns TRUE for both integer and double vectors.
29 | is.numeric(int_var)
30 | is.numeric(dbl_var)
31 |
32 | # Coercion:
33 | # All elements of a vector must be the same type,
34 | # so when you attempt to combine different types,
35 | # they will be coerced to the most flexible type.
36 | # Types from least to most flexible are: logical, integer, double, and character.
37 |
38 | str(c("a", 1))
39 |
40 | x <- c(FALSE, FALSE, TRUE)
41 | as.numeric(x)
42 |
43 | # List
44 | # Lists are different from vectors,
45 | # because their elements can be of any type, including lists.
46 |
47 | x <- list(1:3, "a", c(TRUE, FALSE, TRUE), c(2.3, 5.9))
48 | str(x)
49 |
50 | # The typeof() a list is list.
51 | # You can test for a list with is.list() and coerce to a list with as.list().
52 | typeof(x)
53 | is.list(x)
54 |
55 | # You can turn a list into a vector with unlist().
56 | # If the elements of a list have different types, unlist() uses the same coercion rules as c().
57 |
58 | unlist(x)
59 |
60 | # Lists are used to build up many of the more complicated data structures in R
61 | # e.g., linear models objects (as produced by lm()) are lists.
62 |
63 | # Matrix
64 | a <- matrix(1:6, ncol = 3, nrow = 2)
65 |
66 | # You can also modify an object in place by setting dim()
67 | c <- 1:6
68 | dim(c) <- c(2, 3)
69 |
70 | # length() generalises to nrow() and ncol() for matrices.
71 | # names() generalises to rownames() and colnames() for matrices, and dimnames(), a list of character vectors, for arrays.
72 |
73 | length(a)
74 | nrow(a)
75 | ncol(a)
76 |
77 | rownames(a) <- c("A", "B")
78 | colnames(a) <- c("a", "b", "c")
79 | a
80 |
81 | # Data frame: the most common way of storing data in R
82 | df <- data.frame(x = 1:3, y = c("a", "b", "c"))
83 | str(df)
84 | # why we don't like factors, see this post: http://www.win-vector.com/blog/2014/09/factors-are-not-first-class-citizens-in-r/
85 | # x = as.factor(c('a', 'b', 'c'))
86 | # x = c(x, 'd')
87 | df <- data.frame(x = 1:3, y = c("a", "b", "c"), stringsAsFactors = FALSE)
88 | str(df)
89 |
90 | is.data.frame(df)
91 |
92 | # Coercion
93 | # You can coerce an object to a data frame with as.data.frame():
94 | # A vector will create a one-column data frame.
95 | # A list will create one column for each element;
96 | # it’s an error if they’re not all the same length.
97 | x
98 | as.data.frame(x)
99 | # A matrix will create a data frame with the same number of columns and rows as the matrix.
100 | a
101 | as.data.frame(a)
102 |
103 | # Material: https://cran.r-project.org/doc/manuals/r-patched/R-intro.html
104 | train <- read.csv("train.csv", stringsAsFactors = FALSE)
105 |
106 | # Understand the data
107 | summary(train)
108 | str(train)
109 | dim(train)
110 | head(train)
111 | colnames(train)
112 | head(rownames(train))
113 | length(unique(train$Id))
114 |
115 | # check how many NAs in each feature
116 | length(which(is.na(train$LotFrontage)))
117 | sapply(train, function(x) {length(which(is.na(x)))})
118 | # same as:
119 | colSums(sapply(train, is.na))
120 | sort(sapply(train, function(x) { sum(is.na(x)) }), decreasing=TRUE)
121 | # The percentage of data missing in train.
122 | sum(is.na(train)) / (nrow(train) * ncol(train))
123 |
124 | # Find the variables with the largest number of missing values
125 | # Ways to treat missing values: (1) treat as truly missing, (2) add a new level,
126 | # (3) imputation, (4) remove all rows with NA
127 |
128 | # categorical variables
129 | table(train$OverallQual)
130 | table(train$OverallQual) / dim(train)[1]
131 | # barplot takes counts of the data, not the raw data itself
132 | barplot(table(train$OverallQual))
133 | barplot(table(train$OverallCond))
134 |
135 | # continuous variables
136 | mean(train$SalePrice)
137 | sd(train$SalePrice)
138 | median(train$SalePrice)
139 | quantile(train$SalePrice, c(0.1, 0.25, 0.5, 0.75, 0.9))
140 |
141 | # let's use pdf
142 | plot(density(train$SalePrice)) # Right skewed
143 | plot(density(log(train$SalePrice))) # looks closer to normally distributed
144 |
145 | # boxplot http://www.physics.csbsju.edu/stats/box2.html
146 | boxplot(train$SalePrice)
147 | # Q1 - 1.5IQR, Q1, median, Q3, Q3 + 1.5IQR, where IQR is interquartile range: Q3 - Q1
148 |
149 | # Explore the relationship between features and the outcome
150 | # Which features from the list could be most useful? Often features are chosen by intuition.
151 | # Talk about an example from work.
152 | # Start from OverallQual
153 | boxplot(log(subset(train, OverallQual <= 5)$SalePrice + 1),
154 | log(subset(train, OverallQual > 5)$SalePrice + 1))
155 | # add more notation https://www.r-bloggers.com/box-plot-with-r-tutorial/
156 | boxplot(log(subset(train, OverallQual <= 5)$SalePrice + 1),
157 | log(subset(train, OverallQual > 5)$SalePrice + 1),
158 | xlab = "OverallQuality", ylab = "log(SalesPrice)",
159 | names = c("Low Qual", "High Qual"), col = c("red", "green"))
160 | # another version of boxplot, called violin plot
161 | # library(vioplot)
162 | # lowQual <- subset(train, OverallQual <= 5, select="SalePrice")
163 | # highQual <- subset(train, OverallQual > 5, select="SalePrice")
164 | # vioplot(log(lowQual$SalePrice), log(highQual$SalePrice), col ='gold')
165 |
166 | # What if we want to have a boxplot for each category of the overallQuality feature
167 | library(lattice)
168 | bwplot(OverallQual ~ SalePrice, data = train)
169 | bwplot(Neighborhood ~ SalePrice, data = train)
170 | bwplot(LotArea ~ SalePrice, data = train) # not well suited to a continuous feature
171 |
172 | # to explore the relationship between a continuous feature and the response, calculate the correlation
173 | with(train, cor(LotArea, SalePrice))
174 | contVar <- NULL
175 | corr <- NULL
176 | for(i in 1:dim(train)[2]) {
177 |   if(is.numeric(train[, i])) {
178 |     contVar <- c(contVar, i)
179 |     # pairwise.complete.obs avoids NA correlations for columns with missing values
180 |     corr <- c(corr, cor(train[, i], train$SalePrice, use = 'pairwise.complete.obs'))
181 |   }
182 | }
183 |
184 | # find all of the continuous columns
185 | contVar <- names(train)[which(sapply(train, is.numeric))]
186 | trainCont <- train[, contVar]
187 | dim(trainCont)
188 |
189 | library(corrplot)
190 | correlations <- cor(trainCont[, -1]) # see a lot of NA
191 | correlations <- cor(trainCont[, -1], use = "pairwise.complete.obs")
192 | corrplot(correlations, method = "square")
193 | # some features are not helpful at all, which could be due to the missingness as well
194 | rowInd <- corrMatOrder(correlations, order = "hclust") # reorder features by hierarchical clustering
195 | corrplot(correlations[rowInd, rowInd], method = "square")
--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Practice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/Practice.pdf
--------------------------------------------------------------------------------
/Lectures/Week_5_Classification/python_environment_setup.md:
--------------------------------------------------------------------------------
1 | # Environment Setup for Data Science in Python
2 |
3 | BitTiger DS501
4 |
5 | ---
6 |
7 | The course will heavily use Python. You do not need to be a Python expert coming into the course (that is what we will teach you!), but it helps to be familiar with its syntax.
8 |
9 | Not everyone knows Python. The resources below will get your Python environment set up and bring you up to speed with Python quickly.
10 |
11 | ## Install Python and its modules
12 | ### Option 1: Install Anaconda (Recommended)
13 |
14 | We will be using [Anaconda](https://store.continuum.io/cshop/anaconda/) with Python version 2.7. Install this and you will have Python along with several useful packages and programs:
15 |
16 | * NumPy
17 | * SciPy
18 | * Pandas
19 | * Sklearn
20 | * MatPlotLib
21 | * IPython
22 |
23 | You can see the [full list of packages](http://docs.continuum.io/anaconda/pkg-docs.html).
24 |
25 | You should install Anaconda now. Read the [docs](https://docs.continuum.io/anaconda/install) if you are not sure how to install it.
26 |
27 | ### Option 2: Use virtualenv (Only for those with some experience)
28 |
29 | If you are familiar with the command line interface (CLI) and [virtualenv](https://virtualenv.pypa.io/en/stable/), and you prefer to manage your Python modules yourself, you can use virtualenv. For example, to create and activate a virtualenv on Linux/macOS:
30 |
31 | ```
32 | $ virtualenv venv
33 | $ source venv/bin/activate
34 | ```
35 |
36 | then use pip to install the packages you need:
37 |
38 | ```
39 | $ pip install numpy
40 | $ pip install scipy
41 | $ pip install pandas
42 | $ pip install scikit-learn
43 | $ pip install matplotlib
44 | $ pip install ipython[all]
45 | ```
46 |
47 | ## Jupyter Notebook (f.k.a. IPython Notebook)
48 |
49 | [Jupyter Notebook](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/) will be our main tool for interactive programming: developing, documenting, and executing code, as well as communicating the results.
50 |
51 | You can start Jupyter Notebook either from the Anaconda GUI or from the CLI. For example, on Linux and macOS:
52 | ```
53 | $ jupyter notebook
54 | ```
55 |
56 | Jupyter Notebook will open in your browser.
57 |
58 |
59 | ## Text Editor (optional)
60 |
61 | You are welcome to use any editor you'd like. Below are two general purpose text editors, commonly used by data scientists and software engineers.
62 |
63 | ### Sublime
64 |
65 | You can download [Sublime Text](http://www.sublimetext.com/2) for free.
66 |
67 | ### Atom
68 |
69 | You can download [Atom](https://atom.io/) for free.
70 |
71 |
72 | ---
73 |
74 |
75 | ## Learn Python
76 |
77 | ### Resources
78 |
79 | If you are new to Python, you might benefit from one of these resources:
80 |
81 | * [Think Python](http://greenteapress.com/wp/think-python/)
82 | * [Dive Into Python](http://www.diveintopython.net/)
83 | * [Learn Python the Hard Way](http://learnpythonthehardway.org/)
84 |
85 | Python also has great documentation.
86 |
87 | The [Python tutorial](https://docs.python.org/2/tutorial/) and [Python library](https://docs.python.org/2/library/) are great resources if you need to look up how something is done in Python.
88 |
89 | ### What you need to know
90 |
91 | ##### Eventually you should be comfortable with everything below; a short warm-up sketch follows the list.
92 |
93 | * Basic data structures and associated methods
94 | * int, float
95 | * string
96 | * list
97 | * dict
98 | * set
99 | * range
100 | * Control structures
101 | * if, elif, else
102 | * while
103 | * for
104 | * break, continue, pass
105 | * Enumerations
106 | * for loops
107 | * list comprehensions
108 | * enumerate
109 | * zip
110 | * Functions
111 | * Declaration
112 | * Calling
113 | * Keyword arguments
114 | * Object orientation
115 | * Classes
116 | * Methods
117 | * Properties (instance variables)
118 | * self
119 | * Modules
120 | * import
121 | * aliasing (`import pandas as pd`)
122 | * global import (`from pandas import *`)
123 | * IO
124 | * Read a file
125 | * Write to a file
126 |
--------------------------------------------------------------------------------
/Lectures/Week_5_Classification/week5_reading.md:
--------------------------------------------------------------------------------
1 | # Readings for Week 5
2 |
3 | ## Machine Learning Theory
4 |
5 | * Logistic Regression
6 | * [Andrew Ng CS229 lecture notes](http://cs229.stanford.edu/notes/cs229-notes1.pdf) Page 1-7, 16-19
7 | * *An Introduction to Statistical Learning* Chapter 4, Page 128-138 (optional)
8 |
9 | * Gradient Descent
10 | * *Machine Learning in Action* Chapter 5, Page 83-90 (Page 90-96 optional)
11 | * [A gradient descent animation in R](http://vis.supstat.com/2013/03/gradient-descent-algorithm-with-r/)
12 |
13 | * Machine Learning Model evaluation
14 | * [Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html)
15 | * [Confusion Matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
16 | * [Receiver Operating Characteristic (ROC)](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
17 | * [Evaluating Machine Learning Models](http://www.oreilly.com/data/free/files/evaluating-machine-learning-models.pdf) (optional)
18 |
19 | ## ML Implementation in Python
20 |
21 | * Sklearn Examples (a minimal sketch follows the list)
22 | * [Logistic Regression Docs](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
23 | * [Logistic Regression Example](http://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html)
24 | * [Confusion matrix](http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py)
25 | * [ROC Curve](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py)
26 | * [More model evaluation](http://scikit-learn.org/stable/modules/model_evaluation.html)
27 |
--------------------------------------------------------------------------------
/Lectures/Week_6_NLP/week6_reading.md:
--------------------------------------------------------------------------------
1 | # Readings for Week 6
2 |
3 | ## Machine Learning Theory
4 |
5 | * Natural Language Processing
6 | * Text feature extraction [Part-I](http://blog.christianperone.com/2011/09/machine-learning-text-feature-extraction-tf-idf-part-i/), [Part-II](http://blog.christianperone.com/2011/10/machine-learning-text-feature-extraction-tf-idf-part-ii/), [Part-III](http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/) (a minimal tf-idf sketch appears at the end of this page)
7 | * [Text classification with NLTK and scikit-learn (slides)](http://www.slideshare.net/ogrisel/statistical-machine-learning-for-text-classification-with-scikitlearn-and-nltk)
8 | * *Natural Language Processing with Python* (ch 3, pg 79-122, optional)
9 |
10 | ## ML Implementation in Python
11 |
12 | * Sklearn
13 | * [Working with text data](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)
14 |
15 | * NLTK (optional)
16 | * [NLTK Book: 1 - 4 Language Processing and Python](http://www.nltk.org/book/ch01.html)
17 | * [NLTK Book: 3.6 Normalizing Text, 3.7 Tokenizing Text](http://www.nltk.org/book/ch03.html)
18 |
--------------------------------------------------------------------------------
/Lectures/Week_7_Clustering/week7_reading.md:
--------------------------------------------------------------------------------
1 | # Readings for Week 7
2 |
3 | ## Machine Learning Theory
4 |
5 | * Basic Math
6 | * Linear Algebra: *Machine Learning in Action*: Appendix-B
7 |
8 | * Clustering
9 | * KMeans: *Introduction to Statistical Learning* Chapter 10.3.1
10 | * KMeans: *Machine Learning in Action*: Chapter 10
11 | * Hierarchical Clustering: *Introduction to Statistical Learning* Chapter 10.3.2
12 |
13 | * Principal Component Analysis (Optional)
14 | * PCA: *Introduction to Statistical Learning* Chapter 10.2
15 | * PCA: *Machine Learning in Action*: Chapter 13
16 |
17 | * Singular Value Decomposition (Optional)
18 | * SVD: *Machine Learning in Action*: Chapter 14
19 | * Extra reading: [Dimensionality Reduction](http://infolab.stanford.edu/~ullman/mmds/ch11.pdf)
20 |
21 | ## ML Implementation in Python
22 |
23 | * Sklearn (a minimal KMeans/PCA sketch follows the list)
24 | * [Clustering Overview](http://scikit-learn.org/stable/modules/clustering.html#)
25 | * [KMeans Example](http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#k-means-clustering)
26 | * [Hierarchical Clustering Example](http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#hierarchical-agglomerative-clustering-ward)
27 | * [PCA Example](http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html)
28 |
--------------------------------------------------------------------------------
/Lectures/Week_8_Recommender/week8_reading.md:
--------------------------------------------------------------------------------
1 | # Readings for Week 8
2 |
3 | ## Machine Learning Theory
4 |
5 | * Recommendation System
6 | * [Mining Massive Datasets ch 9](http://infolab.stanford.edu/~ullman/mmds/ch9.pdf) (especially 9.1, 9.3, 9.5)
7 |
8 | * NMF and Recommender
9 | * [Nonnegative Matrix Factorization and Recommender Systems](http://econometricsense.blogspot.com/2012/10/nonnegative-matrix-factorization-and.html)
10 | * [NMF original paper](http://www.dm.unibo.it/~simoncin/nmfconverge.pdf)
11 |
12 | ## ML Implementation in Python
13 |
14 | * Recommender
15 | * [Matrix Factorization: A Simple Tutorial and Implementation in Python](http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/) (a minimal NMF sketch follows the list)
16 | * Turi (f.k.a. Dato, formerly GraphLab)
17 | * [Recommender with Turi](https://turi.com/learn/userguide/recommender/introduction.html)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BitTiger Data Scientist Mastery Program
2 |
3 | This is the repository for the BitTiger Data Scientist Mastery Program (DS501). This page covers what we are going to teach you and the resources you will need throughout the course.
4 |
5 | The course videos are linked here: [DS501 数据科学家直通车](https://www.bittiger.io/livecourses/NjTjzamKwWpHHECur).
6 |
7 | * __Day:__ Day of the week.
8 | * __Readings:__ Readings for the day.
9 | * __Lecture Notes:__ Folder containing lecture notes and slides.
10 |
11 |
12 | ## Teaching Team
13 |
14 | * Lead Instructors
15 |
16 | * Ella -
17 | * Stone -
18 | * Miao -
19 |
20 | * Teaching Assistants
21 |
22 | * Ye
23 | * Qirui
24 | * Yiqing
25 |
26 | * Course Coordinator
27 |
28 | * Chloe
29 | * Joy
30 |
31 | ## Syllabus
32 |
33 | * [Week 1: Statistical Foundations](#week-1-statistical-foundations)
34 | * [Week 2: Statistical Inference](#week-2-statistical-inference)
35 | * [Week 3: Linear Regression](#week-3-linear-regression)
36 | * [Week 4: Tree Based Models](#week-4-tree-based-models)
37 | * [Week 5: Logistic Regression](#week-5-logistic-regression)
38 | * [Week 6: Natural Language Processing](#week-6-natural-language-processing)
39 | * [Week 7: Clustering](#week-7-clustering)
40 | * [Week 8: Recommendation System](#week-8-recommendation-system)
41 |
42 |
43 | ### Week 1: Statistical Foundations
44 | | | Day (US) | Readings | Lecture Notes | Instructor |
45 | |:--:|:--:|:--:|:--:|:--:|
46 | | __Knowledge__ | Friday | To be posted | To be posted | Ella |
47 | | __Practice__ | Saturday | To be posted | To be posted | Ella |
48 | | __Code Lab__ | Tuesday | To be posted | To be posted | Ella |
49 |
50 | ---
51 |
52 | ### Week 2: Statistical Inference
53 | | | Day (US) | Readings | Lecture Notes | Instructor |
54 | |:--:|:--:|:--:|:--:|:--:|
55 | | __Knowledge__ | Friday | To be posted | To be posted | Ella |
56 | | __Practice__ | Saturday | To be posted | To be posted | Ella |
57 | | __Code Lab__ | Tuesday | To be posted | To be posted | Ella |
58 |
59 | ---
60 |
61 | ### Week 3: Linear Regression
62 | | | Day (US) | Readings | Lecture Notes | Instructor |
63 | |:--:|:--:|:--:|:--:|:--:|
64 | | __Knowledge__ | Friday | To be posted | To be posted | Ella |
65 | | __Practice__ | Saturday | To be posted | To be posted | Ella |
66 | | __Code Lab__ | Tuesday | To be posted | To be posted | Ella |
67 |
68 | ---
69 |
70 | ### Week 4: Tree Based Models
71 | | | Day (US) | Readings | Lecture Notes | Instructor |
72 | |:--:|:--:|:--:|:--:|:--:|
73 | | __Knowledge__ | Friday | To be posted | To be posted | Ella |
74 | | __Practice__ | Saturday | To be posted | To be posted | Ella |
75 | | __Code Lab__ | Tuesday | To be posted | To be posted | Ella |
76 |
77 | ---
78 |
79 | ### Week 5: Logistic Regression
80 | | | Day (US) | Readings | Lecture Notes | Instructor |
81 | |:--:|:--:|:--:|:--:|:--:|
82 | | __Knowledge__ | Friday | To be posted | To be posted | Stone |
83 | | __Practice__ | Saturday | 1. [Python environment setup](./Lectures/Week_5_Classification/python_environment_setup.md) | To be posted | Stone |
84 | | __Code Lab__ | Tuesday | To be posted | To be posted | Stone |
85 |
86 | ---
87 |
88 | ### Week 6: Natural Language Processing
89 | | | Day (US) | Readings | Lecture Notes | Instructor |
90 | |:--:|:--:|:--:|:--:|:--:|
91 | | __Knowledge__ | Friday | To be posted | To be posted | Stone |
92 | | __Practice__ | Saturday | To be posted | To be posted | Stone |
93 | | __Code Lab__ | Tuesday | To be posted | To be posted | Stone |
94 |
95 | ---
96 | ### Week 7: Clustering
97 | | | Day (US) | Readings | Lecture Notes | Instructor |
98 | |:--:|:--:|:--:|:--:|:--:|
99 | | __Knowledge__ | Friday | To be posted | To be posted | Stone |
100 | | __Practice__ | Saturday | To be posted | To be posted | Stone |
101 | | __Code Lab__ | Tuesday | To be posted | To be posted | Stone |
102 | ---
103 |
104 | ### Week 8: Recommendation System
105 | | | Day (US) | Readings | Lecture Notes | Instructor |
106 | |:--:|:--:|:--:|:--:|:--:|
107 | | __Knowledge__ | Friday | To be posted | To be posted | Stone |
108 | | __Practice__ | Saturday | To be posted | To be posted | Stone |
109 | | __Code Lab__ | Tuesday | To be posted | To be posted | Stone |
110 | ---
111 |
112 |
113 | ## Notes
114 | * Git and Github
115 | * [Set up git](https://help.github.com/articles/set-up-git/)
116 | * [Git and Github learning resource](https://help.github.com/articles/git-and-github-learning-resources/)
117 | * [Github overall guides](https://guides.github.com/)
118 |
119 | * Python and Python tools
120 | * Required: [Python environment setup](./Lectures/Week_5_Classification/python_environment_setup.md)
121 | * [Scipy lectures](http://www.scipy-lectures.org/)
122 | * [Scikit-Learn Tutorial](http://scikit-learn.org/stable/tutorial/)
123 |
124 |
125 | ## Textbooks
126 |
127 | * Statistics
128 | * [The Elements of Statistical Learning Data Mining, Inference, and Prediction](./Readings/ElemStatLearn.pdf)
129 | * [An Introduction to Statistical Learning with Applications in R](./Readings/ISLR_Sixth_Printing.pdf)
130 |
131 | * Data Science and Machine Learning in Python
132 | * [Introduction to Machine Learning with Python](./Readings/Introduction_to_Machine_Learning_with_Python.pdf)
133 | * [Machine Learning In Action](./Readings/Machine_Learning_In_Action.pdf)
134 | * [Python Data Science Handbook](./Readings/Python_Data_Science_Handbook.pdf)
135 |
136 | * Python Language
137 | * Introductory: [Learn Python the Hard Way](https://learnpythonthehardway.org/)
138 | * Advanced: [Effective Python](http://www.effectivepython.com/)
139 |
140 | * Business and Data Science
141 | * [Data Science For Business](./Readings/DataScienceForBusiness.pdf)
142 |
143 | * Natural Language Processing
144 | * [Natural Language Processing with Python](./Readings/NaturalLanguageProcessingWithPython.pdf)
145 |
146 | * More...
147 | * [23 Free Data Science Books](http://www.wzchen.com/data-science-books)
148 |
149 |
150 | ## Getting Help
151 |
152 | * [Data Science Stack Exchange](http://datascience.stackexchange.com/)
153 | * [Stats Stack Exchange](http://stats.stackexchange.com/)
154 | * [MetaOptimize: ML and Datascience forum](http://metaoptimize.com/qa)
155 |
156 |
157 | ## Data Science Interview
158 |
159 | * [y-hat: What We Learned Analyzing Hundreds of Data Science Interviews](http://blog.yhat.com/posts/data-science-interviews.html)
160 | * [Quora: How Do I Prepare for a Data Scientist Interview](https://www.quora.com/How-do-I-prepare-for-a-data-scientist-interview/answers/4332208)
161 |
162 |
163 | ## Data Science Blogs
164 |
165 | * [Airbnb](http://nerds.airbnb.com/data/)
166 | * [Uber](https://eng.uber.com/category/uberdata/)
167 | * [StitchFix](http://multithreaded.stitchfix.com/blog)
168 | * [Facebook](https://research.fb.com/blog/)
169 | * [y-hat](http://blog.yhat.com/)
170 | * [Machine Learning Mastery](http://machinelearningmastery.com/blog/)
171 | * [Data Science 101](http://101.datascience.community/)
172 | * [The Data Incubator](http://blog.thedataincubator.com/): good examples of data science projects
173 |
174 |
175 |
176 |
177 |
--------------------------------------------------------------------------------
/Readings/DataScienceForBusiness.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/DataScienceForBusiness.pdf
--------------------------------------------------------------------------------
/Readings/DoingDataScience.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/DoingDataScience.pdf
--------------------------------------------------------------------------------
/Readings/ElemStatLearn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/ElemStatLearn.pdf
--------------------------------------------------------------------------------
/Readings/ISLR_Sixth_Printing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/ISLR_Sixth_Printing.pdf
--------------------------------------------------------------------------------
/Readings/Introduction_to_Machine_Learning_with_Python.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/Introduction_to_Machine_Learning_with_Python.pdf
--------------------------------------------------------------------------------
/Readings/Machine_Learning_In_Action.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/Machine_Learning_In_Action.pdf
--------------------------------------------------------------------------------
/Readings/NaturalLanguageProcessingWithPython.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/NaturalLanguageProcessingWithPython.pdf
--------------------------------------------------------------------------------
/Readings/Python_Data_Science_Handbook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/Python_Data_Science_Handbook.pdf
--------------------------------------------------------------------------------