-- Repository listing (kept from the original dump):
--   SQL for analysing Covid data.sql
--   data_time_series_covid19.xlsx
-- File: /SQL for analysing Covid data.sql
--------------------------------------------------------------------------------
-- Exploratory analysis of the COVID-19 time series stored in dbo.Data (T-SQL).

-- Create the database only when it is missing so the script can be re-run.
IF DB_ID(N'COVID') IS NULL
    CREATE DATABASE COVID;
GO

-- USE is resolved when its batch is compiled, so it must live in a separate
-- batch from the CREATE DATABASE above.
USE COVID;
GO

-- Before analysing the data, look for missing (NULL) values so that later
-- aggregates are not silently skewed.
-- CHECK NULL data: returns every row with at least one NULL in the listed
-- columns, including the three measure columns that are zero-filled below.
SELECT *
FROM dbo.Data
WHERE Province IS NULL
   OR Country IS NULL
   OR Latitude IS NULL
   OR Longitude IS NULL
   OR [Date] IS NULL
   OR Confirmed IS NULL
   OR Deaths IS NULL
   OR Recovered IS NULL
   OR Active IS NULL
   OR Incidence_Rate IS NULL
   OR Case_Fatality_Ratio IS NULL;

-- Replace NULL with 0 in the numeric columns listed below.
-- Province, Country, [Date], Confirmed and Deaths are checked above but are
-- not modified here.
-- NOTE(review): 0 is also a valid latitude/longitude; confirm that 0 is an
-- acceptable stand-in for "unknown" coordinates.
UPDATE dbo.Data
SET Longitude = 0
WHERE Longitude IS NULL;

UPDATE dbo.Data
SET Latitude = 0
WHERE Latitude IS NULL;

UPDATE dbo.Data
SET Recovered = 0
WHERE Recovered IS NULL;

UPDATE dbo.Data
SET Active = 0
WHERE Active IS NULL;

UPDATE dbo.Data
SET Incidence_Rate = 0
WHERE Incidence_Rate IS NULL;

UPDATE dbo.Data
SET Case_Fatality_Ratio = 0
WHERE Case_Fatality_Ratio IS NULL;

--- 1. DESCRIPTIVE STATISTICS ---
---- Basic checks first, then the two major characteristics of descriptive
---- statistics: central tendency and dispersion.

/* check first 10 rows (no ORDER BY: which rows come back is arbitrary) */
SELECT TOP (10) *
FROM dbo.Data;

/* check how many rows */
SELECT COUNT(*) AS [Nb of row]
FROM dbo.Data;

/* how many distinct months are present in each year */
SELECT
    DATEPART(YEAR, [Date]) AS [Year],
    COUNT(DISTINCT MONTH([Date])) AS [NB of month]
FROM dbo.Data
GROUP BY DATEPART(YEAR, [Date]);

/* start_date - end_date */
SELECT
    MIN([Date]) AS [start_date],
    MAX([Date]) AS [end_date]
FROM dbo.Data;

/* how many rows in each month */
-- (Year, Month) is unique per group, so ordering by the count as well is
-- unnecessary.
SELECT
    DATEPART(YEAR, [Date]) AS [Year],
    DATEPART(MONTH, [Date]) AS [Month],
    COUNT(*) AS [Nb of row]
FROM dbo.Data
GROUP BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date])
ORDER BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date]);

/* min: confirmed, deaths, recovered per month */
-- The "dealths" spelling in the aliases is kept so result column names do
-- not change.
SELECT
    DATEPART(YEAR, [Date]) AS [Year],
    DATEPART(MONTH, [Date]) AS [Month],
    MIN(Confirmed) AS min_confirmed,
    MIN(Deaths) AS min_dealths,
    MIN(Recovered) AS min_recovered
FROM dbo.Data
GROUP BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date])
ORDER BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date]);

-- max: confirmed, deaths, recovered per month
SELECT
    DATEPART(YEAR, [Date]) AS [Year],
    DATEPART(MONTH, [Date]) AS [Month],
    MAX(Confirmed) AS max_confirmed,
    MAX(Deaths) AS max_dealths,
    MAX(Recovered) AS max_recovered
FROM dbo.Data
GROUP BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date])
ORDER BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date]);

-- The total cases: confirmed, deaths, recovered per month
SELECT
    DATEPART(YEAR, [Date]) AS [Year],
    DATEPART(MONTH, [Date]) AS [Month],
    SUM(Confirmed) AS sum_confirmed,
    SUM(Deaths) AS sum_dealths,
    SUM(Recovered) AS sum_recovered
FROM dbo.Data
GROUP BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date])
ORDER BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date]);

/********* 1.1. The central tendency: an estimate of the "center" of a distribution of values:
-- MEAN
-- MODE
-- MEDIAN
*********/

---------- MEAN ----------
-- Monthly averages rounded to whole cases.
SELECT
    DATEPART(YEAR, [Date]) AS [Year],
    DATEPART(MONTH, [Date]) AS [Month],
    ROUND(AVG(Confirmed), 0) AS avg_confirmed,
    ROUND(AVG(Deaths), 0) AS avg_dealths,
    ROUND(AVG(Recovered), 0) AS avg_recovered
FROM dbo.Data
GROUP BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date])
ORDER BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date]);

---------- MEDIAN ----------
-- Take the largest value in the lower 50 percent of rows.
-- NULLs sort first in ascending order, so they are filtered out before the
-- TOP (50) PERCENT cut; otherwise they would occupy part of the lower half.
SELECT TOP (1) lower_half.Confirmed
FROM (
    SELECT TOP (50) PERCENT Confirmed
    FROM dbo.Data
    WHERE Confirmed IS NOT NULL
    ORDER BY Confirmed ASC
) AS lower_half
ORDER BY lower_half.Confirmed DESC;

---------- MODE ----------
/* Which confirmed-case value occurs most often within a single month? */
/* Original author's observation: the most frequent value falls in February 2020. */
-- Returns exactly one row: the (year, month, value) combination with the
-- highest row count across the whole table. The extra ORDER BY keys make
-- the choice deterministic when several combinations tie.
SELECT TOP (1)
    DATEPART(YEAR, [Date]) AS [Year],
    DATEPART(MONTH, [Date]) AS [Month],
    Confirmed AS confirmed,
    COUNT(*) AS frequency
FROM dbo.Data
WHERE Confirmed IS NOT NULL
GROUP BY DATEPART(YEAR, [Date]), DATEPART(MONTH, [Date]), Confirmed
ORDER BY
    COUNT(*) DESC,
    DATEPART(YEAR, [Date]),
    DATEPART(MONTH, [Date]),
    Confirmed;

/********* 1.2. The dispersion: the spread of the values around the central tendency:
-- RANGE = max value - min value
-- VARIANCE
-- STANDARD DEVIATION
*********/

-- How spread out?
-- Whole-table dispersion: one result set per measure, each giving the total,
-- the mean, the sample variance (VAR) and the sample standard deviation
-- (STDEV), all rounded to 0 decimals.
--- confirmed case
SELECT
    SUM(confirmed)             AS total_confirmed,
    ROUND(AVG(confirmed), 0)   AS average_confirmed,
    ROUND(VAR(confirmed), 0)   AS variance_confirmed,
    ROUND(STDEV(confirmed), 0) AS std_confirmed
FROM dbo.Data;

--- deaths case
SELECT
    SUM(deaths)             AS total_deaths,
    ROUND(AVG(deaths), 0)   AS average_deaths,
    ROUND(VAR(deaths), 0)   AS variance_deaths,
    ROUND(STDEV(deaths), 0) AS std_deaths
FROM dbo.Data;

--- recovered case
SELECT
    SUM(recovered)             AS total_recovered,
    ROUND(AVG(recovered), 0)   AS average_recovered,
    ROUND(VAR(recovered), 0)   AS variance_recovered,
    ROUND(STDEV(recovered), 0) AS std_recovered
FROM dbo.Data;

/* How spread out in each month? */
-- Same four measures, grouped by calendar year and month and listed in
-- chronological order. Each query derives the year/month once in a CTE.
--- confirmed case
WITH confirmed_rows AS (
    SELECT
        DATEPART(YEAR, [Date])  AS yr,
        DATEPART(MONTH, [Date]) AS mon,
        confirmed
    FROM dbo.Data
)
SELECT
    yr  AS [Year],
    mon AS [Month],
    SUM(confirmed)             AS total_confirmed,
    ROUND(AVG(confirmed), 0)   AS average_confirmed,
    ROUND(VAR(confirmed), 0)   AS variance_confirmed,
    ROUND(STDEV(confirmed), 0) AS std_confirmed
FROM confirmed_rows
GROUP BY yr, mon
ORDER BY yr, mon;

--- deaths case
WITH deaths_rows AS (
    SELECT
        DATEPART(YEAR, [Date])  AS yr,
        DATEPART(MONTH, [Date]) AS mon,
        deaths
    FROM dbo.Data
)
SELECT
    yr  AS [Year],
    mon AS [Month],
    SUM(deaths)             AS total_deaths,
    ROUND(AVG(deaths), 0)   AS average_deaths,
    ROUND(VAR(deaths), 0)   AS variance_deaths,
    ROUND(STDEV(deaths), 0) AS std_deaths
FROM deaths_rows
GROUP BY yr, mon
ORDER BY yr, mon;

--- recovered case
WITH recovered_rows AS (
    SELECT
        DATEPART(YEAR, [Date])  AS yr,
        DATEPART(MONTH, [Date]) AS mon,
        recovered
    FROM dbo.Data
)
SELECT
    yr  AS [Year],
    mon AS [Month],
    SUM(recovered)             AS total_recovered,
    ROUND(AVG(recovered), 0)   AS average_recovered,
    ROUND(VAR(recovered), 0)   AS variance_recovered,
    ROUND(STDEV(recovered), 0) AS std_recovered
FROM recovered_rows
GROUP BY yr, mon
ORDER BY yr, mon;

--2. PERCENTILES AND FREQUENCY
--Percentiles: one hundred equal groups; the population is divided across the groups.
--Percentiles help us understand the distribution of data by grouping values into equal-sized buckets.

--Discrete percentile: returns a value that exists in the column.
--It is useful when you want to know which actual column value falls at a percentile.

--Continuous percentile: interpolates the boundary value between the percentiles.
--It is useful when you want the value at the boundary between two percentile buckets.

--TOP 5 (no ORDER BY: which five rows come back is arbitrary)
SELECT TOP (5) *
FROM dbo.Data;

/* What are the top data? */
/* Data interpretation (original author's observation):
it seems the top cases come from month 12, which is not surprising given the seasonal trend of holidays or regions */

--TOP 10 by Confirmed. Author's observation: the most confirmed cases are from India in April and May 2021.
SELECT TOP (10) *
FROM dbo.Data
ORDER BY Confirmed DESC;

--TOP 10 by Deaths. Author's observation: the most deaths are from India, which also has the largest number of confirmed cases.
SELECT TOP (10) *
FROM dbo.Data
ORDER BY Deaths DESC;

--TOP 10 by Recovered. Author's observation: similar to the Deaths ranking.
SELECT TOP (10) *
FROM dbo.Data
ORDER BY Recovered DESC;


/* What about the average of each measure? */
/* Author's observation: the average confirmed, deaths and recovered values are about 1256, 27 and 848
   respectively, but that does not tell the full story, e.g.
    - Are there many days with low cases?
    - Are there many days with high cases?
    - Or are the cases evenly distributed across all days?
*/
-- The "avg_recoverd" spelling is kept so the result column name does not change.
SELECT
    ROUND(AVG(Confirmed), 0) AS avg_confirmed,
    ROUND(AVG(Deaths), 0) AS avg_deaths,
    ROUND(AVG(Recovered), 0) AS avg_recoverd
FROM dbo.Data;

/****** Percentiles answer the questions above and describe the data distribution *******/

/*** Percentile Discrete Function ***/

/* Get the 50th percentile and compare it with the average.
--- confirmed (author's observation): the 50th percentile is about 3 cases, far below the average of 1256
--- the same holds for deaths and recovered */
--- i.e. there are many low values in each measure.
--- PERCENTILE
-- PERCENTILE_DISC ... OVER () is an analytic function and yields the same
-- value on every input row; DISTINCT collapses that to a single result row.
SELECT DISTINCT
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY Confirmed) OVER () AS percentitles_confirmed_50,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY Deaths) OVER () AS percentitles_deaths_50,
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY Recovered) OVER () AS percentitles_recovered_50
FROM dbo.Data;


/* 50th, 60th, 90th and 95th percentiles of confirmed cases */
-- The "_revenues" aliases are kept for compatibility; the values are
-- percentiles of Confirmed, not revenue.
SELECT DISTINCT
    PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY Confirmed) OVER () AS pct_50_revenues,
    PERCENTILE_DISC(0.6) WITHIN GROUP (ORDER BY Confirmed) OVER () AS pct_60_revenues,
    PERCENTILE_DISC(0.9) WITHIN GROUP (ORDER BY Confirmed) OVER () AS pct_90_revenues,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY Confirmed) OVER () AS pct_95_revenues
FROM dbo.Data;

/*** Percentile Continuous Function ***/
-- Interpolated vs. discrete 95th percentile of Confirmed, side by side.
SELECT DISTINCT
    PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY Confirmed) OVER () AS pct_95_cont_confirmed,
    PERCENTILE_DISC(0.95) WITHIN GROUP (ORDER BY Confirmed) OVER () AS pct_95_disc_reconfirmed
FROM dbo.Data;

--- 3.CORRELATION AND RANKS

/* Check the correlation between confirmed, deaths and recovered cases. */
/* Author's observation: the three measures are highly correlated, which makes sense. */
-- Pearson r = (E[xy] - E[x]E[y]) / (sd_x * sd_y).
-- The numerator is a population covariance, so the population standard
-- deviation (STDEVP) is used in the denominator; STDEV (sample) would scale
-- the result by (n-1)/n.
-- Only rows where both measures are present are used, so every aggregate
-- sees the same rows. NULLIF avoids a divide-by-zero when a measure is constant.
-- The CAST to FLOAT is a no-op for FLOAT columns and prevents integer
-- overflow/truncation otherwise (column types are not visible in this script).
-- The figures quoted below were reported by the original author using STDEV;
-- expect marginally different values.

-- confirmed - deaths: 0.7917
SELECT
    (AVG(p.x * p.y) - AVG(p.x) * AVG(p.y))
        / NULLIF(STDEVP(p.x) * STDEVP(p.y), 0) AS [Cor_cf_dt]
FROM (
    SELECT
        CAST(Confirmed AS FLOAT) AS x,
        CAST(Deaths AS FLOAT) AS y
    FROM dbo.Data
    WHERE Confirmed IS NOT NULL
      AND Deaths IS NOT NULL
) AS p;

-- confirmed - recovered: 0.68807
SELECT
    (AVG(p.x * p.y) - AVG(p.x) * AVG(p.y))
        / NULLIF(STDEVP(p.x) * STDEVP(p.y), 0) AS [Cor_cf_rc]
FROM (
    SELECT
        CAST(Confirmed AS FLOAT) AS x,
        CAST(Recovered AS FLOAT) AS y
    FROM dbo.Data
    WHERE Confirmed IS NOT NULL
      AND Recovered IS NOT NULL
) AS p;

-- deaths - recovered: 0.60565
SELECT
    (AVG(p.x * p.y) - AVG(p.x) * AVG(p.y))
        / NULLIF(STDEVP(p.x) * STDEVP(p.y), 0) AS [Cor_dt_rc]
FROM (
    SELECT
        CAST(Deaths AS FLOAT) AS x,
        CAST(Recovered AS FLOAT) AS y
    FROM dbo.Data
    WHERE Deaths IS NOT NULL
      AND Recovered IS NOT NULL
) AS p;


/* Add a row number based on the confirmed-case count (ascending) */
SELECT
    ROW_NUMBER() OVER (ORDER BY Confirmed) AS [Row_number],
    *
FROM dbo.Data;


/* Standing (rank) of each row based on confirmed cases */
-- row_number is unchanged: it numbers rows in ascending Confirmed order, so
-- the largest value gets the largest number.
-- confirmed_rank is the actual standing: 1 = most confirmed cases, with
-- tied values sharing a rank.
SELECT
    ROW_NUMBER() OVER (ORDER BY Confirmed) AS [row_number],
    RANK() OVER (ORDER BY Confirmed DESC) AS confirmed_rank,
    Province,
    Country,
    Confirmed AS confirmed
FROM dbo.Data
ORDER BY Confirmed DESC;


--- 4.LINEAR MODELS
/***************** Linear Models ****************/
/* Linear models such as regression are useful for estimating values,
e.g. estimating how much revenue a marketing campaign with a given cost will bring. */

--- Result of the linear regression (reported by the original author): y = mx + b => y = 0.0136x + 9.9926.
--- That is, for every additional 100 confirmed cases the model predicts about 1 additional death.

-- Ordinary least squares of Deaths (y) on Confirmed (x).
--   slope     = (n*Sxy - Sx*Sy) / (n*Sxx - Sx*Sx)
--   intercept = avg(y) - slope * avg(x)
-- Both queries aggregate over the same rows (Confirmed and Deaths both
-- present), so n, the sums and the averages are consistent with each other.
-- The CAST to FLOAT is a no-op for FLOAT columns and prevents integer
-- overflow/integer division otherwise (column types are not visible in this
-- script). NULLIF returns NULL instead of a divide-by-zero error when the
-- table is empty or Confirmed is constant.

/*********** Computing Slope (Deaths on y-axis and Confirmed on x-axis) *********/
/* Result reported by the original author: 0.01360387 */
SELECT
    (s.n * s.sum_xy - s.sum_x * s.sum_y)
        / NULLIF(s.n * s.sum_xx - s.sum_x * s.sum_x, 0) AS slope
FROM (
    SELECT
        CAST(COUNT(*) AS FLOAT) AS n,
        SUM(CAST(Confirmed AS FLOAT) * Deaths) AS sum_xy,
        SUM(CAST(Confirmed AS FLOAT)) AS sum_x,
        SUM(CAST(Deaths AS FLOAT)) AS sum_y,
        SUM(CAST(Confirmed AS FLOAT) * Confirmed) AS sum_xx
    FROM dbo.Data
    WHERE Confirmed IS NOT NULL
      AND Deaths IS NOT NULL
) AS s;

/*********** Computing Intercept (Deaths on y-axis and Confirmed on x-axis) *********/
--Intercept = avg(y) - slope*avg(x)
/* Result reported by the original author: 9.992565367 */
SELECT
    s.sum_y / NULLIF(s.n, 0)
        - ((s.n * s.sum_xy - s.sum_x * s.sum_y)
            / NULLIF(s.n * s.sum_xx - s.sum_x * s.sum_x, 0))
          * (s.sum_x / NULLIF(s.n, 0)) AS intercept
FROM (
    SELECT
        CAST(COUNT(*) AS FLOAT) AS n,
        SUM(CAST(Confirmed AS FLOAT) * Deaths) AS sum_xy,
        SUM(CAST(Confirmed AS FLOAT)) AS sum_x,
        SUM(CAST(Deaths AS FLOAT)) AS sum_y,
        SUM(CAST(Confirmed AS FLOAT) * Confirmed) AS sum_xx
    FROM dbo.Data
    WHERE Confirmed IS NOT NULL
      AND Deaths IS NOT NULL
) AS s;

--------------------------------------------------------------------------------
-- Companion dataset (kept from the original dump): /data_time_series_covid19.xlsx
-- https://raw.githubusercontent.com/HuongTram11/SQL-for-Data-Analysis/c98e70c4830d6133818a434cb1b433e3f9d5ee56/data_time_series_covid19.xlsx
--------------------------------------------------------------------------------