├── .gitignore
├── LICENSE.md
├── Lectures
│   ├── Week_1_Statistics
│   │   ├── Knowledge 1_with notes.pdf
│   │   ├── Knowledge.pdf
│   │   ├── Lec1.R
│   │   ├── List of R material.pdf
│   │   ├── Practice.R
│   │   └── Practice.pdf
│   ├── Week_5_Classification
│   │   ├── python_environment_setup.md
│   │   └── week5_reading.md
│   ├── Week_6_NLP
│   │   └── week6_reading.md
│   ├── Week_7_Clustering
│   │   └── week7_reading.md
│   └── Week_8_Recommender
│       └── week8_reading.md
├── README.md
└── Readings
    ├── DataScienceForBusiness.pdf
    ├── DoingDataScience.pdf
    ├── ElemStatLearn.pdf
    ├── ISLR_Sixth_Printing.pdf
    ├── Introduction_to_Machine_Learning_with_Python.pdf
    ├── Machine_Learning_In_Action.pdf
    ├── NaturalLanguageProcessingWithPython.pdf
    └── Python_Data_Science_Handbook.pdf

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
.DS_Store
.ipynb_checkpoints
*.pyc

--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
## COPYRIGHT POLICY

All content included on the Site or third-party platforms as part of the class, such as text, graphics, logos, button icons, images, audio clips, video clips, live streams, digital downloads, data compilations, and software, is the property of BitTiger or its content suppliers and is protected by copyright laws.

Any attempt to redistribute or resell BitTiger content will result in the appropriate legal action being taken.

We thank you in advance for respecting our copyrighted content. For more info see

https://www.bittiger.io/termsofuse

https://www.bittiger.io/termsofservice


## COPYRIGHT NOTICE (版权申明)

All course content produced on BitTiger's official website and on third-party platforms, such as text, graphics, logos, button icons, images, audio clips, video clips, live streams, digital downloads, data compilations, and software, is the property of BitTiger and is protected by copyright law.

BitTiger will take appropriate legal action against any attempt to distribute or resell BitTiger's proprietary materials.

We thank you for respecting our copyrighted content.

For details, please see

https://www.bittiger.io/termsofuse

https://www.bittiger.io/termsofservice

--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Knowledge 1_with notes.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/Knowledge 1_with notes.pdf

--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Knowledge.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/Knowledge.pdf

--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Lec1.R:
--------------------------------------------------------------------------------
# Lecture 1
train <- read.csv('train.csv', stringsAsFactors = FALSE)
summary(train)

str(train)
head(train)

# Split the columns into continuous (numeric) and categorical variables,
# recording each numeric column's correlation with SalePrice along the way.
contVar <- NULL
corr <- NULL
catVar <- NULL
catCorr <- NULL
for (i in 1:dim(train)[2]) {
  if (is.numeric(train[, i])) {
    contVar <- c(contVar, i)
    corr <- c(corr, cor(train[, i], train$SalePrice, use = 'pairwise.complete.obs'))
  } else {
    catVar <- c(catVar, i)
  }
}

contVar
length(contVar)
catVar
length(catVar)

library(corrplot)
trainCont <- train[, contVar]
trainCont <- trainCont[, -1]  # drop the Id column
trainCatg <- train[, catVar]
correlations <- cor(trainCont, use = 'pairwise.complete.obs')
corrplot(correlations, method = "square")

trainCont
trainCatg
# First 5 most highly correlated factors:
# take the absolute value of each correlation
absolute <- abs(correlations)
highestCorrelation <- NULL
# sum the absolute correlations in each row
for (i in 1:dim(absolute)[1]) {
  highestCorrelation <- c(highestCorrelation, sum(absolute[i, ]))
}
highestCorrelation
orderIndex <- order(highestCorrelation)
orderIndex
# The 5 variables with the highest total absolute correlation are very likely
# to be the most important correlated factors among the 37 factors.
highestIndex <- orderIndex[33:37]
highestCorrelation[highestIndex]  # their correlation values
highestCorrelationNames <- colnames(trainCont[, highestIndex])  # their names
highestCorrelationNames
# Keep only the data for these columns
trimmedData <- trainCont[, highestIndex]
trimCorrelations <- cor(trimmedData, use = 'pairwise.complete.obs')
corrplot(trimCorrelations, method = "square")  # plot the correlations

# Predictive power
# For continuous variables, predictive power is described by the absolute
# correlation with SalePrice: the higher the absolute correlation, the more
# strongly (positively or negatively) the variable tracks SalePrice.
contVar <- contVar[-1]   # remove Id
corr <- corr[-1]
contVar <- contVar[-37]  # remove SalePrice
corr <- corr[-37]
abscorr <- abs(corr)
abscorr
contPredictIndex <- order(abscorr)
colnames(train)[contVar]
# top 5 by predictive power
rev(colnames(train)[contVar[contPredictIndex[32:36]]])
rev(abscorr[contPredictIndex[32:36]])

# For categorical variables, predictive power can be measured as the difference
# between the highest and lowest mean SalePrice across categories: the larger
# the difference, the higher the variable's predictive power.
percentage <- sapply(train, function(x) {length(which(is.na(x)))/nrow(train)})
percentage <- percentage[catVar]
catVar <- catVar[percentage < 0.05]  # drop categorical variables with >= 5% missing
percentage <- percentage[percentage < 0.05]


# Generate potentially useful features from existing features.
# The absolute correlation between two variables can indicate the potential of
# a combined feature: when the absolute correlation between A and B is higher
# than that between C and D, then A and B are the more promising pair to
# combine into a new feature (a short sketch of extracting such pairs follows).
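# Aside: a minimal alternative sketch (not from the lecture) for reading off
# the top correlated pairs directly, assuming `correlations` from above.
# Zero the diagonal and upper triangle so each pair is counted once, then map
# the largest entries back to row/column indices with arrayInd().
# The lecture's index-arithmetic approach continues below.
absPairs <- abs(correlations)
absPairs[upper.tri(absPairs, diag = TRUE)] <- 0
topIdx <- arrayInd(order(absPairs, decreasing = TRUE)[1:5], dim(absPairs))
cbind(rownames(absPairs)[topIdx[, 1]], colnames(absPairs)[topIdx[, 2]])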
contVar <- c(contVar, 81)  # add SalePrice (column 81) back
absoluteOrder <- order(absolute)
# The 37 x 37 matrix has 1369 entries. The top 37 are the self-correlations
# (all equal to 1), and every off-diagonal correlation appears twice (A-B and
# B-A), so the top 5 distinct pairs occupy the 10 elements 1323-1332.
top5Order <- absoluteOrder[1323:1332]
top5Order <- top5Order[-seq(1, 9, 2)]  # drop one copy of each duplicated pair
x <- ceiling(top5Order / 37)       # column index of each linear entry
y <- (top5Order - 1) %% 37 + 1     # row index of each linear entry
# The 1st x with the 1st y form the potentially most powerful feature pair,
# the 2nd x with the 2nd y the next, and so on.
colnames(train)[contVar[x]]
colnames(train)[contVar[y]]
correlations[x, y]  # the diagonal holds the correlations of the paired factors, which are the highest
top5Order
absolute[top5Order]
# check that they are the same; the answer is correct

# For each categorical variable, compute the mean SalePrice per category
# (sum via xtabs divided by counts) and record the max-min spread.
meanDifference <- NULL
for (i in 1:length(catVar)) {
  xtab <- xtabs(train$SalePrice ~ eval(parse(text = paste("train$", colnames(train)[catVar[i]], sep = ""))))
  count <- table(train[catVar[i]])
  tab <- xtab / count
  meanDifference <- c(meanDifference, max(tab) - min(tab))
}
meanOrder <- order(meanDifference)
meanDifference[meanOrder]
meanOrder[(length(meanOrder) - 4):length(meanOrder)]
"The 5 categorical variables with the highest predictive power are"
colnames(train)[catVar[meanOrder[(length(meanOrder) - 4):length(meanOrder)]]]
meanDifference[meanOrder[(length(meanOrder) - 4):length(meanOrder)]]

--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/List of R material.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/List of R material.pdf

--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Practice.R:
--------------------------------------------------------------------------------
# Vector: logical, integer, numeric (double), and character.

dbl_var <- c(1, 2.5, 4.5)
# With the L suffix, you get an integer rather than a double
int_var <- c(1L, 6L, 10L)
# Use TRUE and FALSE (or T and F) to create logical vectors
log_var <- c(TRUE, FALSE, T, F)
chr_var <- c("these are", "some strings")

# Vectors are always flat, even if you nest c()
c(1, c(2, c(3, 4)))
c(1, 2, 3, 4)

# The missing value is NA
t <- c(1, NA, 2)
is.na(t)

# Given a vector, you can determine its type with typeof(),
# or check if it's a specific type with an "is" function:
# is.character(), is.double(), is.integer(), is.logical(),
# or, more generally, is.atomic().

typeof(int_var)
is.integer(int_var)
is.atomic(int_var)

# is.numeric() is a general test for the "numberliness" of a vector
# and returns TRUE for both integer and double vectors.
is.numeric(int_var)
is.numeric(dbl_var)

# Coercion:
# All elements of a vector must be the same type,
# so when you attempt to combine different types,
# they will be coerced to the most flexible type.
# Types from least to most flexible are: logical, integer, double, and character.

str(c("a", 1))

x <- c(FALSE, FALSE, TRUE)
as.numeric(x)

# List
# Lists are different from vectors,
# because their elements can be of any type, including lists.
x <- list(1:3, "a", c(TRUE, FALSE, TRUE), c(2.3, 5.9))
str(x)

# The typeof() a list is "list".
# You can test for a list with is.list() and coerce to a list with as.list().
typeof(x)
is.list(x)

# You can turn a list into a vector with unlist().
# If the elements of a list have different types, unlist() uses the same coercion rules as c().

unlist(x)

# Lists are used to build up many of the more complicated data structures in R,
# e.g., linear model objects (as produced by lm()) are lists.

# Matrix
a <- matrix(1:6, ncol = 3, nrow = 2)

# You can also modify an object in place by setting dim()
c <- 1:6
dim(c) <- c(2, 3)

# length() generalises to nrow() and ncol() for matrices.
# names() generalises to rownames() and colnames() for matrices, and dimnames(), a list of character vectors, for arrays.

length(a)
nrow(a)
ncol(a)

rownames(a) <- c("A", "B")
colnames(a) <- c("a", "b", "c")
a

# Data frame: the most common way of storing data in R
df <- data.frame(x = 1:3, y = c("a", "b", "c"))
str(df)
# On why we don't like factors, see this post: http://www.win-vector.com/blog/2014/09/factors-are-not-first-class-citizens-in-r/
# x = as.factor(c('a', 'b', 'c'))
# x = c(x, 'd')
df <- data.frame(x = 1:3, y = c("a", "b", "c"), stringsAsFactors = FALSE)
str(df)

is.data.frame(df)

# Coercion
# You can coerce an object to a data frame with as.data.frame():
# A vector will create a one-column data frame.
# A list will create one column for each element;
# it's an error if they're not all the same length.
x
as.data.frame(x)
# A matrix will create a data frame with the same number of columns and rows as the matrix.
a
as.data.frame(a)

# Material: https://cran.r-project.org/doc/manuals/r-patched/R-intro.html
train <- read.csv("train.csv", stringsAsFactors = FALSE)

# Understand the data
summary(train)
str(train)
dim(train)
head(train)
colnames(train)
head(rownames(train))
length(unique(train$Id))

# Check how many NAs are in each feature
length(which(is.na(train$LotFrontage)))
sapply(train, function(x) {length(which(is.na(x)))})
# same as:
colSums(sapply(train, is.na))
sort(sapply(train, function(x) { sum(is.na(x)) }), decreasing=TRUE)
# The percentage of data missing in train:
sum(is.na(train)) / (nrow(train) * ncol(train))

# Find the variables with the largest number of missing values.
# How to treat missing values: (1) treat as actually missing, (2) add a new
# level, (3) imputation, (4) remove all rows with NA.
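# A minimal sketch of options (2) and (3), illustrative rather than lecture
# material (LotFrontage is a numeric column as above; SomeCatVar is a
# hypothetical stand-in for any character column with NAs). Left commented out
# so the results below are unchanged.
# train$LotFrontage[is.na(train$LotFrontage)] <- median(train$LotFrontage, na.rm = TRUE)  # (3) impute the median
# train$SomeCatVar[is.na(train$SomeCatVar)] <- "Missing"                                  # (2) add a new level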
# Categorical variables
table(train$OverallQual)
table(train$OverallQual) / dim(train)[1]
# barplot(counts of the data, not the data itself)
barplot(table(train$OverallQual))
barplot(table(train$OverallCond))

# Continuous variables
mean(train$SalePrice)
sd(train$SalePrice)
median(train$SalePrice)
quantile(train$SalePrice, c(0.1, 0.25, 0.5, 0.75, 0.9))

# Let's use the pdf (probability density function)
plot(density(train$SalePrice))       # right-skewed
plot(density(log(train$SalePrice)))  # looks more normally distributed

# boxplot: http://www.physics.csbsju.edu/stats/box2.html
boxplot(train$SalePrice)
# Q1 - 1.5*IQR, Q1, median, Q3, Q3 + 1.5*IQR, where IQR is the interquartile range: Q3 - Q1

# Explore the relationship between features and the outcome.
# What could be the most useful features from the list? Most of the time,
# features are chosen by intuition. Talk about an example from work.
# Start from OverallQual
boxplot(log(subset(train, OverallQual <= 5)$SalePrice + 1),
        log(subset(train, OverallQual > 5)$SalePrice + 1))
# add more annotation: https://www.r-bloggers.com/box-plot-with-r-tutorial/
boxplot(log(subset(train, OverallQual <= 5)$SalePrice + 1),
        log(subset(train, OverallQual > 5)$SalePrice + 1),
        xlab = "OverallQuality", ylab = "log(SalesPrice)",
        names = c("Low Qual", "High Qual"), col = c("red", "green"))
# Another version of the boxplot, called a violin plot:
# library(vioplot)
# lowQual <- subset(train, OverallQual <= 5, select="SalePrice")
# highQual <- subset(train, OverallQual > 5, select="SalePrice")
# vioplot(log(lowQual$SalePrice), log(highQual$SalePrice), col ='gold')

# What if we want a boxplot for each category of the OverallQual feature?
library(lattice)
bwplot(OverallQual ~ SalePrice, data = train)
bwplot(Neighborhood ~ SalePrice, data = train)
bwplot(LotArea ~ SalePrice, data = train)  # not really good for a continuous feature

# How to explore the relationship between a continuous feature and the
# response: calculate the correlation.
with(train, cor(LotArea, SalePrice))
contVar <- NULL
corr <- NULL
for(i in 1:dim(train)[2]) {
  if(is.numeric(train[, i])) {
    contVar <- c(contVar, i)
    corr <- c(corr, cor(train[, i], train$SalePrice))
  }
}

# Find all of the continuous columns
contVar <- names(train)[which(sapply(train, is.numeric))]
trainCont <- train[, contVar]
dim(trainCont)

library(corrplot)
correlations <- cor(trainCont[, -1])  # see a lot of NAs
correlations <- cor(trainCont[, -1], use = "pairwise.complete.obs")
corrplot(correlations, method = "square")
# Some features are not helpful at all, which could be due to missingness as well.
# Keep only features that correlate strongly (|cor| > 0.5) with at least one
# other feature, then re-plot:
rowInd <- which(rowSums(abs(correlations) > 0.5, na.rm = TRUE) > 1)
corrplot(correlations[rowInd, rowInd], method = "square")

--------------------------------------------------------------------------------
/Lectures/Week_1_Statistics/Practice.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Lectures/Week_1_Statistics/Practice.pdf

--------------------------------------------------------------------------------
/Lectures/Week_5_Classification/python_environment_setup.md:
--------------------------------------------------------------------------------
# Environment Setup for Data Science in Python

BitTiger DS501

---

The course will heavily use Python. It is not necessary to be an expert in Python coming into the course (that is what we will teach you!), but it is helpful to be familiar with its syntax.

Not everyone knows Python. We provide resources here to get you set up with a Python environment and to bring you up to speed with Python quickly.

## Install Python and its modules

### Option 1: Install Anaconda (Recommended)

We will be using [Anaconda](https://store.continuum.io/cshop/anaconda/) with Python version 2.7. Install this and you will have Python along with several useful packages and programs:

* NumPy
* SciPy
* Pandas
* Scikit-learn
* Matplotlib
* IPython

You can see the [full list of packages](http://docs.continuum.io/anaconda/pkg-docs.html).

You should install Anaconda now. Read the [docs](https://docs.continuum.io/anaconda/install) if you don't know how to install it.

### Option 2: Use virtualenv (Only for those with some experience)

If you are familiar with the command line interface (CLI) and [virtualenv](https://virtualenv.pypa.io/en/stable/), and you are inclined to manage your Python modules yourself, you can use virtualenv. For example, to create and activate a virtualenv on Linux/macOS you do

```
$ virtualenv venv
$ source venv/bin/activate
```

and you use pip to install any packages:

```
$ pip install numpy
$ pip install scipy
$ pip install pandas
$ pip install sklearn
$ pip install matplotlib
$ pip install ipython[all]
```

## Jupyter Notebook (f.k.a. IPython Notebook)

[Jupyter Notebook](http://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/) will be our main tool for the interactive programming process, including developing, documenting, and executing code, as well as communicating the results.

You can start Jupyter Notebook either from the Anaconda GUI or from the CLI. For example, on Linux and macOS:

```
$ jupyter notebook
```

Jupyter Notebook will be launched in your browser.


## Text Editor (optional)

You are welcome to use any editor you'd like. Below are two general-purpose text editors commonly used by data scientists and software engineers.

### Sublime

You can download [Sublime Text](http://www.sublimetext.com/2) for free.

### Atom

You can download [Atom](https://atom.io/) for free.


---
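
## Verify Your Setup (optional)

Whichever option you chose, a quick sanity check confirms the environment is ready. The snippet below is a minimal sketch (not part of the official setup): it imports the core packages from the list above and prints their versions. Run it in a Jupyter cell or the Python interpreter; if it finishes without an `ImportError`, you are set.

```
# Minimal environment check: import the core packages and print their versions.
import numpy
import scipy
import pandas
import sklearn
import matplotlib

for pkg in (numpy, scipy, pandas, sklearn, matplotlib):
    print("%s %s" % (pkg.__name__, pkg.__version__))
```

---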

## Learn Python

### Resources

If you are new to Python, you might benefit from looking at one of these resources:

* [Think Python](http://greenteapress.com/wp/think-python/)
* [Dive Into Python](http://www.diveintopython.net/)
* [Learn Python the Hard Way](http://learnpythonthehardway.org/)

Python also has great documentation.

The [Python tutorial](https://docs.python.org/2/tutorial/) and [Python library reference](https://docs.python.org/2/library/) are great resources if you need to look up how something is done in Python.

### What you need to know

##### Eventually you should be comfortable with everything below (a short sampler follows the list).

* Basic data structures and associated methods
    * int, float
    * string
    * list
    * dict
    * set
    * range
* Control structures
    * if, elif, else
    * while
    * for
    * break, continue, pass
* Enumerations
    * for loops
    * list comprehensions
    * enumerate
    * zip
* Functions
    * Declaration
    * Calling
    * Keyword arguments
* Object orientation
    * Classes
    * Methods
    * Properties (instance variables)
    * self
* Modules
    * import
    * aliasing (`import pandas as pd`)
    * global import (`from pandas import *`)
* IO
    * Read a file
    * Write to a file
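
As a short sampler of the items above, here is an illustrative snippet (not course material) exercising list comprehensions, `enumerate`, `zip`, a function with a keyword argument, and file IO; it runs on both Python 2.7 and Python 3:

```
# List comprehension: squares of the even numbers below 10
squares = [n * n for n in range(10) if n % 2 == 0]

# enumerate and zip together (zip stops at the shorter sequence)
names = ["numpy", "scipy", "pandas"]
for i, (name, square) in enumerate(zip(names, squares)):
    print("%d: %s -> %d" % (i, name, square))

# Function with a keyword argument
def greet(name, greeting="Hello"):
    return "%s, %s!" % (greeting, name)

print(greet("DS501"))
print(greet("DS501", greeting="Hi"))

# File IO: write a file, then read it back
with open("demo.txt", "w") as f:
    f.write(greet("world"))
with open("demo.txt") as f:
    print(f.read())
```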

--------------------------------------------------------------------------------
/Lectures/Week_5_Classification/week5_reading.md:
--------------------------------------------------------------------------------
# Readings for Week 5

## Machine Learning Theory

* Logistic Regression
    * [Andrew Ng CS229 lecture notes](http://cs229.stanford.edu/notes/cs229-notes1.pdf), Pages 1-7, 16-19
    * *An Introduction to Statistical Learning*, Chapter 4, Pages 128-138 (optional)

* Gradient Descent
    * *Machine Learning in Action*, Chapter 5, Pages 83-90 (Pages 90-96 optional)
    * [A Gradient Descent animation in R](http://vis.supstat.com/2013/03/gradient-descent-algorithm-with-r/)

* Machine Learning Model Evaluation
    * [Bias-Variance Tradeoff](http://scott.fortmann-roe.com/docs/BiasVariance.html)
    * [Confusion Matrix](https://en.wikipedia.org/wiki/Confusion_matrix)
    * [Receiver Operating Characteristic (ROC)](https://en.wikipedia.org/wiki/Receiver_operating_characteristic)
    * [Evaluating Machine Learning Models](http://www.oreilly.com/data/free/files/evaluating-machine-learning-models.pdf) (optional)

## ML Implementation in Python

* Sklearn Examples
    * [Logistic Regression Docs](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
    * [Logistic Regression Example](http://scikit-learn.org/stable/auto_examples/linear_model/plot_iris_logistic.html)
    * [Confusion matrix](http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py)
    * [ROC Curve](http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#sphx-glr-auto-examples-model-selection-plot-roc-py)
    * [More model evaluation](http://scikit-learn.org/stable/modules/model_evaluation.html)

--------------------------------------------------------------------------------
/Lectures/Week_6_NLP/week6_reading.md:
--------------------------------------------------------------------------------
# Readings for Week 6

## Machine Learning Theory

* Natural Language Processing
    * Text feature extraction [Part I](http://blog.christianperone.com/2011/09/machine-learning-text-feature-extraction-tf-idf-part-i/), [Part II](http://blog.christianperone.com/2011/10/machine-learning-text-feature-extraction-tf-idf-part-ii/), [Part III](http://blog.christianperone.com/2013/09/machine-learning-cosine-similarity-for-vector-space-models-part-iii/)
    * [Text classification with NLTK and scikit-learn (slides)](http://www.slideshare.net/ogrisel/statistical-machine-learning-for-text-classification-with-scikitlearn-and-nltk)
    * *Natural Language Processing with Python*, Chapter 3, Pages 79-122 (optional)

## ML Implementation in Python

* Sklearn
    * [Working with text data](http://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html)

* NLTK (optional)
    * [NLTK Book: 1 - 4 Language Processing and Python](http://www.nltk.org/book/ch01.html)
    * [NLTK Book: 3.6 Normalizing Text, 3.7 Tokenizing Text](http://www.nltk.org/book/ch03.html)

--------------------------------------------------------------------------------
/Lectures/Week_7_Clustering/week7_reading.md:
--------------------------------------------------------------------------------
# Readings for Week 7

## Machine Learning Theory

* Basic Math
    * Linear Algebra: *Machine Learning in Action*, Appendix B

* Clustering
    * K-means: *Introduction to Statistical Learning*, Chapter 10.3.1
    * K-means: *Machine Learning in Action*, Chapter 10
    * Hierarchical Clustering: *Introduction to Statistical Learning*, Chapter 10.3.2

* Principal Component Analysis (optional)
    * PCA: *Introduction to Statistical Learning*, Chapter 10.2
    * PCA: *Machine Learning in Action*, Chapter 13

* Singular Value Decomposition (optional)
    * SVD: *Machine Learning in Action*, Chapter 14
    * Extra reading: [Dimensionality Reduction](http://infolab.stanford.edu/~ullman/mmds/ch11.pdf)

## ML Implementation in Python

* Sklearn
    * [Clustering Overview](http://scikit-learn.org/stable/modules/clustering.html#)
    * [KMeans Example](http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#k-means-clustering)
    * [Hierarchical Clustering Example](http://scikit-learn.org/stable/tutorial/statistical_inference/unsupervised_learning.html#hierarchical-agglomerative-clustering-ward)
    * [PCA Example](http://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html)

--------------------------------------------------------------------------------
/Lectures/Week_8_Recommender/week8_reading.md:
--------------------------------------------------------------------------------
# Readings for Week 8

## Machine Learning Theory

* Recommendation Systems
    * [Mining Massive Datasets, Chapter 9](http://infolab.stanford.edu/~ullman/mmds/ch9.pdf) (especially 9.1, 9.3, 9.5)

* NMF and Recommenders
    * [Nonnegative Matrix Factorization and Recommender Systems](http://econometricsense.blogspot.com/2012/10/nonnegative-matrix-factorization-and.html)
    * [NMF original paper](http://www.dm.unibo.it/~simoncin/nmfconverge.pdf)

## ML Implementation in Python

* Recommender
    * [Matrix Factorization: A Simple Tutorial and Implementation in Python](http://www.quuxlabs.com/blog/2010/09/matrix-factorization-a-simple-tutorial-and-implementation-in-python/)
    * Turi (f.k.a. Dato, which was f.k.a. GraphLab)
        * [Recommender with Turi](https://turi.com/learn/userguide/recommender/introduction.html)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BitTiger Data Scientist Mastery Program

This is the repository for the BitTiger Data Scientist Mastery Program (DS501). This page covers what we are going to teach you and the resources you will need throughout the course.

The course videos are linked here: [DS501 数据科学家直通车 (Data Scientist Mastery Program)](https://www.bittiger.io/livecourses/NjTjzamKwWpHHECur).

* __Day:__ Day of the week.
* __Readings:__ Readings for the day.
* __Lecture Notes:__ The folder contains lecture notes and slides.


## Teaching Team

* Lead Instructors
    * Ella -
    * Stone -
    * Miao -

* Teaching Assistants
    * Ye
    * Qirui
    * Yiqing

* Course Coordinators
    * Chloe
    * Joy

## Syllabus

* [Week 1: Statistical Foundations](#week-1-statistical-foundations)
* [Week 2: Statistical Inference](#week-2-statistical-inference)
* [Week 3: Linear Regression](#week-3-linear-regression)
* [Week 4: Tree Based Models](#week-4-tree-based-models)
* [Week 5: Logistic Regression](#week-5-logistic-regression)
* [Week 6: Natural Language Processing](#week-6-natural-language-processing)
* [Week 7: Clustering](#week-7-clustering)
* [Week 8: Recommendation System](#week-8-recommendation-system)


### Week 1: Statistical Foundations
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Ella |
| __Practice__ | Saturday | To be posted | To be posted | Ella |
| __Code Lab__ | Tuesday | To be posted | To be posted | Ella |

---

### Week 2: Statistical Inference
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Ella |
| __Practice__ | Saturday | To be posted | To be posted | Ella |
| __Code Lab__ | Tuesday | To be posted | To be posted | Ella |

---

### Week 3: Linear Regression
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Ella |
| __Practice__ | Saturday | To be posted | To be posted | Ella |
| __Code Lab__ | Tuesday | To be posted | To be posted | Ella |

---

### Week 4: Tree Based Models
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Ella |
| __Practice__ | Saturday | To be posted | To be posted | Ella |
| __Code Lab__ | Tuesday | To be posted | To be posted | Ella |

---

### Week 5: Logistic Regression
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Stone |
| __Practice__ | Saturday | 1. [Python environment setup](./Lectures/Week_5_Classification/python_environment_setup.md) | To be posted | Stone |
| __Code Lab__ | Tuesday | To be posted | To be posted | Stone |

---

### Week 6: Natural Language Processing
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Stone |
| __Practice__ | Saturday | To be posted | To be posted | Stone |
| __Code Lab__ | Tuesday | To be posted | To be posted | Stone |

---

### Week 7: Clustering
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Stone |
| __Practice__ | Saturday | To be posted | To be posted | Stone |
| __Code Lab__ | Tuesday | To be posted | To be posted | Stone |

---

### Week 8: Recommendation System
| | Day (US) | Readings | Lecture Notes | Instructor |
|:--:|:--:|:--:|:--:|:--:|
| __Knowledge__ | Friday | To be posted | To be posted | Stone |
| __Practice__ | Saturday | To be posted | To be posted | Stone |
| __Code Lab__ | Tuesday | To be posted | To be posted | Stone |

---


## Notes
* Git and Github
    * [Set up git](https://help.github.com/articles/set-up-git/)
    * [Git and Github learning resources](https://help.github.com/articles/git-and-github-learning-resources/)
    * [Github overall guides](https://guides.github.com/)

* Python and Python tools
    * Required: [Python environment setup](./Lectures/Week_5_Classification/python_environment_setup.md)
    * [Scipy lectures](http://www.scipy-lectures.org/)
    * [Scikit-Learn Tutorial](http://scikit-learn.org/stable/tutorial/)


## Textbooks

* Statistics
    * [The Elements of Statistical Learning: Data Mining, Inference, and Prediction](./Readings/ElemStatLearn.pdf)
    * [An Introduction to Statistical Learning with Applications in R](./Readings/ISLR_Sixth_Printing.pdf)

* Data Science and Machine Learning in Python
    * [Introduction to Machine Learning with Python](./Readings/Introduction_to_Machine_Learning_with_Python.pdf)
    * [Machine Learning In Action](./Readings/Machine_Learning_In_Action.pdf)
    * [Python Data Science Handbook](./Readings/Python_Data_Science_Handbook.pdf)

* Python Language
    * Introductory: [Learn Python the Hard Way](https://learnpythonthehardway.org/)
    * Advanced: [Effective Python](http://www.effectivepython.com/)

* Business and Data Science
    * [Data Science For Business](./Readings/DataScienceForBusiness.pdf)

* Natural Language Processing
    * [Natural Language Processing with Python](./Readings/NaturalLanguageProcessingWithPython.pdf)

* More...
    * [23 Free Data Science Books](http://www.wzchen.com/data-science-books)

## Getting Help

* [Data Science Stack Exchange](http://datascience.stackexchange.com/)
* [Stats Stack Exchange](http://stats.stackexchange.com/)
* [MetaOptimize: ML and Data Science forum](http://metaoptimize.com/qa)


## Data Science Interviews

* [y-hat: What We Learned Analyzing Hundreds of Data Science Interviews](http://blog.yhat.com/posts/data-science-interviews.html)
* [Quora: How Do I Prepare for a Data Scientist Interview](https://www.quora.com/How-do-I-prepare-for-a-data-scientist-interview/answers/4332208)


## Data Science Blogs

* [Airbnb](http://nerds.airbnb.com/data/)
* [Uber](https://eng.uber.com/category/uberdata/)
* [StitchFix](http://multithreaded.stitchfix.com/blog)
* [Facebook](https://research.fb.com/blog/)
* [y-hat](http://blog.yhat.com/)
* [Machine Learning Mastery](http://machinelearningmastery.com/blog/)
* [Data Science 101](http://101.datascience.community/)
* [The Data Incubator](http://blog.thedataincubator.com/): good examples of data science projects

--------------------------------------------------------------------------------
/Readings/DataScienceForBusiness.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/DataScienceForBusiness.pdf

--------------------------------------------------------------------------------
/Readings/DoingDataScience.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/DoingDataScience.pdf

--------------------------------------------------------------------------------
/Readings/ElemStatLearn.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/ElemStatLearn.pdf

--------------------------------------------------------------------------------
/Readings/ISLR_Sixth_Printing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/ISLR_Sixth_Printing.pdf

--------------------------------------------------------------------------------
/Readings/Introduction_to_Machine_Learning_with_Python.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/Introduction_to_Machine_Learning_with_Python.pdf

--------------------------------------------------------------------------------
/Readings/Machine_Learning_In_Action.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/Machine_Learning_In_Action.pdf

--------------------------------------------------------------------------------
/Readings/NaturalLanguageProcessingWithPython.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/NaturalLanguageProcessingWithPython.pdf

--------------------------------------------------------------------------------
/Readings/Python_Data_Science_Handbook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LangYujian/DataScience/ec4f872d1f44c3afb89881d31d7c2ee9f584cb5c/Readings/Python_Data_Science_Handbook.pdf