[\\w.]+)\"\n",
298 | "\n",
299 | "match = re.search(pattern, text)\n",
300 | "if match:\n",
301 | " print(f\"Username: {match.group('username')}\")\n",
302 | " print(f\"Domain: {match.group('domain')}\")\n"
303 | ],
304 | "metadata": {
305 | "colab": {
306 | "base_uri": "https://localhost:8080/"
307 | },
308 | "id": "L9FOB0uiLexp",
309 | "outputId": "c73b8c7f-af37-4016-b404-d9ad7502514f"
310 | },
311 | "execution_count": 24,
312 | "outputs": [
313 | {
314 | "output_type": "stream",
315 | "name": "stdout",
316 | "text": [
317 | "Username: john.doe\n",
318 | "Domain: example.com\n"
319 | ]
320 | }
321 | ]
322 | }
323 | ]
324 | }
--------------------------------------------------------------------------------
/regex/regex_contd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "id": "VxllMcawIjlB"
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import re"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "source": [
31 | "text = \"<div>First content</div><div>Second content</div>\"\n",
32 | "\n",
33 | "# Greedy matching (default)\n",
34 | "greedy = re.findall(r\"<div>(.*)</div>\", text)\n",
35 | "print(f\"Greedy: {greedy}\")\n",
36 | "\n",
37 | "# Non-greedy matching\n",
38 | "non_greedy = re.findall(r\"<div>(.*?)</div>\", text)\n",
39 | "print(f\"Non-greedy: {non_greedy}\")\n"
40 | ],
41 | "metadata": {
42 | "colab": {
43 | "base_uri": "https://localhost:8080/"
44 | },
45 | "id": "Cb35InjGIzW3",
46 | "outputId": "cf4f7b63-8e9c-4ad0-f4f7-a0c6bbd52a03"
47 | },
48 | "execution_count": 2,
49 | "outputs": [
50 | {
51 | "output_type": "stream",
52 | "name": "stdout",
53 | "text": [
54 | "Greedy: ['First content</div><div>Second content']\n",
55 | "Non-greedy: ['First content', 'Second content']\n"
56 | ]
57 | }
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "source": [
63 | "# Password validation\n",
64 | "password = \"Password123\"\n",
65 | "has_uppercase = bool(re.search(r\"(?=.*[A-Z])\", password))\n",
66 | "has_lowercase = bool(re.search(r\"(?=.*[a-z])\", password))\n",
67 | "has_digit = bool(re.search(r\"(?=.*\\d)\", password))\n",
68 | "is_long_enough = len(password) >= 8\n",
69 | "\n",
70 | "if all([has_uppercase, has_lowercase, has_digit, is_long_enough]):\n",
71 | " print(\"Password meets requirements\")\n",
72 | "else:\n",
73 | " print(\"Password does not meet all requirements\")\n"
74 | ],
75 | "metadata": {
76 | "colab": {
77 | "base_uri": "https://localhost:8080/"
78 | },
79 | "id": "N21dkm8NI-TG",
80 | "outputId": "4c0b31c3-422c-4145-e07f-aff64c4a5950"
81 | },
82 | "execution_count": 3,
83 | "outputs": [
84 | {
85 | "output_type": "stream",
86 | "name": "stdout",
87 | "text": [
88 | "Password meets requirements\n"
89 | ]
90 | }
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "source": [],
96 | "metadata": {
97 | "id": "VNTuYz1bI-Jc"
98 | },
99 | "execution_count": 3,
100 | "outputs": []
101 | }
102 | ]
103 | }
--------------------------------------------------------------------------------
/regex/regex_examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "colab": {
22 | "base_uri": "https://localhost:8080/"
23 | },
24 | "id": "YPKB2a5Hsw02",
25 | "outputId": "3da0932d-8614-477c-b6a6-d202c795f3da"
26 | },
27 | "outputs": [
28 | {
29 | "output_type": "stream",
30 | "name": "stdout",
31 | "text": [
32 | "Contact info: 1234567890 and 9876543210.\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "import re\n",
38 | "\n",
39 | "text = \"Contact info: (123)-456-7890 and 987-654-3210.\"\n",
40 | "cleaned_text = re.sub(r'[()-]', '', text)\n",
41 | "print(cleaned_text)\n"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "source": [
47 | "text = \"Please reach out to us at support@example.org or help@example.org.\"\n",
48 | "emails = re.findall(r'\\b[\\w.-]+?@\\w+?\\.\\w+?\\b', text)\n",
49 | "print(emails)\n"
50 | ],
51 | "metadata": {
52 | "colab": {
53 | "base_uri": "https://localhost:8080/"
54 | },
55 | "id": "eGS7s-zTs-9T",
56 | "outputId": "74a8b91b-5af7-48f6-9dd3-d690b9f36b21"
57 | },
58 | "execution_count": null,
59 | "outputs": [
60 | {
61 | "output_type": "stream",
62 | "name": "stdout",
63 | "text": [
64 | "['support@example.org', 'help@example.org']\n"
65 | ]
66 | }
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "source": [
72 | "text = \"This\tis\ta\tstring with multiple unnecessary spaces.\"\n",
73 | "cleaned_text = re.sub(r'\\s+', ' ', text)\n",
74 | "print(cleaned_text)\n"
75 | ],
76 | "metadata": {
77 | "colab": {
78 | "base_uri": "https://localhost:8080/"
79 | },
80 | "id": "dd0K0-LrtmBi",
81 | "outputId": "5436543f-4b19-4081-f8d4-5ec6f64d6d6b"
82 | },
83 | "execution_count": null,
84 | "outputs": [
85 | {
86 | "output_type": "stream",
87 | "name": "stdout",
88 | "text": [
89 | "This is a string with multiple unnecessary spaces.\n"
90 | ]
91 | }
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "source": [
97 | "email = \"test@example.com\"\n",
98 | "if re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', email):\n",
99 | " print(\"Valid email\") # Output: Valid email\n",
100 | "else:\n",
101 | " print(\"Invalid email\")\n"
102 | ],
103 | "metadata": {
104 | "colab": {
105 | "base_uri": "https://localhost:8080/"
106 | },
107 | "id": "j05fGS1UyCfe",
108 | "outputId": "402a5319-ba31-44d8-e870-ccbc35535af3"
109 | },
110 | "execution_count": null,
111 | "outputs": [
112 | {
113 | "output_type": "stream",
114 | "name": "stdout",
115 | "text": [
116 | "Valid email\n"
117 | ]
118 | }
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "source": [
124 | "text = \"This is sentence one. And this is sentence two! Is this sentence three?\"\n",
125 | "sentences = re.split(r'[.!?]', text)\n",
126 | "print(sentences) # Output: ['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n"
127 | ],
128 | "metadata": {
129 | "colab": {
130 | "base_uri": "https://localhost:8080/"
131 | },
132 | "id": "7f68JqnBzBX9",
133 | "outputId": "455de3e7-3cd0-4ffc-ad69-d058e9ecedff"
134 | },
135 | "execution_count": null,
136 | "outputs": [
137 | {
138 | "output_type": "stream",
139 | "name": "stdout",
140 | "text": [
141 | "['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n"
142 | ]
143 | }
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "source": [
149 | "import pandas as pd\n",
150 | "\n",
151 | "data = {\n",
152 | "\t'names': ['Alice123', 'Bob!@#', 'Charlie$$$'],\n",
153 | "\t'emails': ['alice@example.com', 'bob_at_example.com', 'charlie@example.com']\n",
154 | "}\n",
155 | "df = pd.DataFrame(data)\n",
156 | "\n",
157 | "# Remove non-alphabetic characters from names\n",
158 | "df['names'] = df['names'].str.replace(r'[^a-zA-Z]', '', regex=True)\n",
159 | "\n",
160 | "# Validate email addresses\n",
161 | "df['valid_email'] = df['emails'].apply(lambda x: bool(re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', x)))\n",
162 | "\n",
163 | "print(df)\n"
164 | ],
165 | "metadata": {
166 | "colab": {
167 | "base_uri": "https://localhost:8080/"
168 | },
169 | "id": "qboHFiS30UMQ",
170 | "outputId": "eeb42cb5-ebcf-4ebe-f301-74c2c1ac184a"
171 | },
172 | "execution_count": null,
173 | "outputs": [
174 | {
175 | "output_type": "stream",
176 | "name": "stdout",
177 | "text": [
178 | " names emails valid_email\n",
179 | "0 Alice alice@example.com True\n",
180 | "1 Bob bob_at_example.com False\n",
181 | "2 Charlie charlie@example.com True\n"
182 | ]
183 | }
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "source": [],
189 | "metadata": {
190 | "id": "la5oKWfX0U2Z"
191 | },
192 | "execution_count": null,
193 | "outputs": []
194 | }
195 | ]
196 | }
--------------------------------------------------------------------------------
/statistics/Basic_Stats_Functions_Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Import the Built-In `statistics` Module"
21 | ],
22 | "metadata": {
23 | "id": "s8yOidchG5UV"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "source": [
29 | "import statistics"
30 | ],
31 | "metadata": {
32 | "id": "cOmUhMH9bIAb"
33 | },
34 | "execution_count": null,
35 | "outputs": []
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "source": [
40 | "## 1. Mean"
41 | ],
42 | "metadata": {
43 | "id": "Cy_sSAo4bExW"
44 | }
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "colab": {
51 | "base_uri": "https://localhost:8080/"
52 | },
53 | "id": "v-3qQD50a9hT",
54 | "outputId": "2a7d7cd5-d8f9-445d-f56f-59ac7f4e57b6"
55 | },
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "name": "stdout",
60 | "text": [
61 | "Mean: 30\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "data = [10, 20, 30, 40, 50]\n",
67 | "mean = statistics.mean(data)\n",
68 | "print(\"Mean:\", mean)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "source": [
74 | "## 2. Median"
75 | ],
76 | "metadata": {
77 | "id": "obFi961MbQ46"
78 | }
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "data = [15, 20, 35, 40, 50]\n",
84 | "median = statistics.median(data)\n",
85 | "print(\"Median:\", median)"
86 | ],
87 | "metadata": {
88 | "colab": {
89 | "base_uri": "https://localhost:8080/"
90 | },
91 | "id": "FrS8KYaXbPWy",
92 | "outputId": "f07a115a-5f18-462a-ae47-ab3c239db261"
93 | },
94 | "execution_count": null,
95 | "outputs": [
96 | {
97 | "output_type": "stream",
98 | "name": "stdout",
99 | "text": [
100 | "Median: 35\n"
101 | ]
102 | }
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "source": [
108 | "## 3. Mode"
109 | ],
110 | "metadata": {
111 | "id": "b9ybgj32bYKy"
112 | }
113 | },
114 | {
115 | "cell_type": "code",
116 | "source": [
117 | "data = [1, 2, 2, 3, 4, 4, 4]\n",
118 | "mode = statistics.mode(data)\n",
119 | "print(\"Mode:\", mode)"
120 | ],
121 | "metadata": {
122 | "colab": {
123 | "base_uri": "https://localhost:8080/"
124 | },
125 | "id": "AgrG9I5fbWU0",
126 | "outputId": "eebf7a08-b1d0-42f7-f982-3a20b9241082"
127 | },
128 | "execution_count": null,
129 | "outputs": [
130 | {
131 | "output_type": "stream",
132 | "name": "stdout",
133 | "text": [
134 | "Mode: 4\n"
135 | ]
136 | }
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "source": [
142 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n",
143 | "mode = statistics.mode(data)\n",
144 | "print(\"Modes:\", mode)"
145 | ],
146 | "metadata": {
147 | "colab": {
148 | "base_uri": "https://localhost:8080/"
149 | },
150 | "id": "d3D3oyVBccaa",
151 | "outputId": "93c7fa31-ca9c-429f-df8d-d93ca9eef080"
152 | },
153 | "execution_count": null,
154 | "outputs": [
155 | {
156 | "output_type": "stream",
157 | "name": "stdout",
158 | "text": [
159 | "Modes: 2\n"
160 | ]
161 | }
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "source": [
167 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n",
168 | "modes = statistics.multimode(data)\n",
169 | "print(\"Modes:\", modes)"
170 | ],
171 | "metadata": {
172 | "colab": {
173 | "base_uri": "https://localhost:8080/"
174 | },
175 | "id": "62_XzwJhcH3d",
176 | "outputId": "e7cf6cd4-50b3-42a5-b1ad-d45be40c602a"
177 | },
178 | "execution_count": null,
179 | "outputs": [
180 | {
181 | "output_type": "stream",
182 | "name": "stdout",
183 | "text": [
184 | "Modes: [2, 4, 7]\n"
185 | ]
186 | }
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "source": [
192 | "## 4. Standard Deviation"
193 | ],
194 | "metadata": {
195 | "id": "neQiIHTC6CtL"
196 | }
197 | },
198 | {
199 | "cell_type": "code",
200 | "source": [
201 | "data = [12, 15, 22, 29, 35]\n",
202 | "std_dev = statistics.stdev(data)\n",
203 | "print(f\"Standard Deviation: {std_dev:.3f}\")"
204 | ],
205 | "metadata": {
206 | "colab": {
207 | "base_uri": "https://localhost:8080/"
208 | },
209 | "id": "uY-DcaV4cRux",
210 | "outputId": "98166ea5-b57c-4e1b-f526-5cfbb3f9aed7"
211 | },
212 | "execution_count": null,
213 | "outputs": [
214 | {
215 | "output_type": "stream",
216 | "name": "stdout",
217 | "text": [
218 | "Standard Deviation: 9.555\n"
219 | ]
220 | }
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "source": [
226 | "## 5. Variance"
227 | ],
228 | "metadata": {
229 | "id": "q6Ra31AD7jcU"
230 | }
231 | },
232 | {
233 | "cell_type": "code",
234 | "source": [
235 | "data = [8, 10, 12, 14, 16]\n",
236 | "variance = statistics.variance(data)\n",
237 | "print(f\"Variance: {variance:.2f}\")"
238 | ],
239 | "metadata": {
240 | "colab": {
241 | "base_uri": "https://localhost:8080/"
242 | },
243 | "id": "ALOJxc4V6G0a",
244 | "outputId": "ff7c8a7d-8250-4fc3-b48a-a59dc1e15877"
245 | },
246 | "execution_count": null,
247 | "outputs": [
248 | {
249 | "output_type": "stream",
250 | "name": "stdout",
251 | "text": [
252 | "Variance: 10.00\n"
253 | ]
254 | }
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "source": [
260 | "## 6. Covariance"
261 | ],
262 | "metadata": {
263 | "id": "oXGwdDsci1AP"
264 | }
265 | },
266 | {
267 | "cell_type": "code",
268 | "source": [
269 | "data1 = [2, 4, 6, 8, 10]\n",
270 | "data2 = [1, 3, 5, 7, 9]\n",
271 | "covariance = statistics.covariance(data1, data2)\n",
272 | "print(\"Covariance:\", covariance)"
273 | ],
274 | "metadata": {
275 | "id": "5wjqe8n67uoT",
276 | "colab": {
277 | "base_uri": "https://localhost:8080/"
278 | },
279 | "outputId": "7c11b3be-9d00-47ef-b05d-ad6faff58c65"
280 | },
281 | "execution_count": 1,
282 | "outputs": [
283 | {
284 | "output_type": "stream",
285 | "name": "stdout",
286 | "text": [
287 | "Covariance: 10.0\n"
288 | ]
289 | }
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "source": [
295 | "## 7. Quantiles"
296 | ],
297 | "metadata": {
298 | "id": "DqquyE0XmKg-"
299 | }
300 | },
301 | {
302 | "cell_type": "code",
303 | "source": [
304 | "data = [1, 5, 7, 9, 10, 12, 16, 18, 19, 21]\n",
305 | "# Quartiles\n",
306 | "quantiles = statistics.quantiles(data, n=4)\n",
307 | "print(\"Quantiles (Quartiles):\", quantiles)"
308 | ],
309 | "metadata": {
310 | "colab": {
311 | "base_uri": "https://localhost:8080/"
312 | },
313 | "id": "5p1xVng-kwju",
314 | "outputId": "903cb4f8-5bb6-488a-c582-62126fbff758"
315 | },
316 | "execution_count": 4,
317 | "outputs": [
318 | {
319 | "output_type": "stream",
320 | "name": "stdout",
321 | "text": [
322 | "Quantiles (Quartiles): [6.5, 11.0, 18.25]\n"
323 | ]
324 | }
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "source": [
330 | "## 8. Correlation"
331 | ],
332 | "metadata": {
333 | "id": "eUTp6xe2CCVM"
334 | }
335 | },
336 | {
337 | "cell_type": "code",
338 | "source": [
339 | "data1 = [1, 2, 3, 4, 5]\n",
340 | "data2 = [2, 4, 6, 8, 10]\n",
341 | "correlation = statistics.correlation(data1, data2)\n",
342 | "print(\"Correlation:\", correlation)"
343 | ],
344 | "metadata": {
345 | "colab": {
346 | "base_uri": "https://localhost:8080/"
347 | },
348 | "id": "1CFP4t68mO4r",
349 | "outputId": "a43c0f06-8c1e-4229-ab23-1c8a35aef2e7"
350 | },
351 | "execution_count": 5,
352 | "outputs": [
353 | {
354 | "output_type": "stream",
355 | "name": "stdout",
356 | "text": [
357 | "Correlation: 1.0\n"
358 | ]
359 | }
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "source": [
365 | "## 9. Linear Regression"
366 | ],
367 | "metadata": {
368 | "id": "AMq5BdfuFMiB"
369 | }
370 | },
371 | {
372 | "cell_type": "code",
373 | "source": [
374 | "x = [1, 2, 3, 4, 5]\n",
375 | "y = [3, 4, 2, 5, 7]\n",
376 | "slope, intercept = statistics.linear_regression(x, y)\n",
377 | "print(\"Slope:\", slope)\n",
378 | "print(\"Intercept:\", intercept)"
379 | ],
380 | "metadata": {
381 | "colab": {
382 | "base_uri": "https://localhost:8080/"
383 | },
384 | "id": "TJVQAIjACFxz",
385 | "outputId": "e79e8709-0a67-4b51-dbb8-634fdc52ad3a"
386 | },
387 | "execution_count": 7,
388 | "outputs": [
389 | {
390 | "output_type": "stream",
391 | "name": "stdout",
392 | "text": [
393 | "Slope: 0.9\n",
394 | "Intercept: 1.5\n"
395 | ]
396 | }
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "source": [
402 | "## 10. Normal Distribution"
403 | ],
404 | "metadata": {
405 | "id": "EjcEB4bcGILJ"
406 | }
407 | },
408 | {
409 | "cell_type": "code",
410 | "source": [
411 | "# Create a normal distribution with mean 30 and standard deviation 10\n",
412 | "normal_dist = statistics.NormalDist(mu=30, sigma=10)\n",
413 | "\n",
414 | "# Calculate the probability of a value less than or equal to 20\n",
415 | "probability = normal_dist.cdf(20)\n",
416 | "print(f\"Probability (CDF) of 20: {probability:.3f}\")\n",
417 | "\n",
418 | "# Calculate the value at the 97.5th percentile via the inverse CDF (this is the x value, not a z-score)\n",
419 | "z_score = normal_dist.inv_cdf(0.975)\n",
420 | "print(f\"Z-score for 97.5th percentile: {z_score:.3f}\")"
421 | ],
422 | "metadata": {
423 | "colab": {
424 | "base_uri": "https://localhost:8080/"
425 | },
426 | "id": "sYkVaHDtFQ6m",
427 | "outputId": "2f1db0e8-3b5e-4764-8151-02e92b413513"
428 | },
429 | "execution_count": 11,
430 | "outputs": [
431 | {
432 | "output_type": "stream",
433 | "name": "stdout",
434 | "text": [
435 | "Probability (CDF) of 20: 0.159\n",
436 | "Z-score for 97.5th percentile: 49.600\n"
437 | ]
438 | }
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "source": [],
444 | "metadata": {
445 | "id": "w8eCKz1tJy55"
446 | },
447 | "execution_count": null,
448 | "outputs": []
449 | }
450 | ]
451 | }
--------------------------------------------------------------------------------
/statistics/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/statistics/handle_excel_files.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Generating a Sample Excel File"
21 | ],
22 | "metadata": {
23 | "id": "TJ5jHxsTviGe"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "id": "oGEfUQfJolvB",
31 | "colab": {
32 | "base_uri": "https://localhost:8080/"
33 | },
34 | "outputId": "8b18535d-bdfc-4198-a405-2071610bec82"
35 | },
36 | "outputs": [
37 | {
38 | "output_type": "stream",
39 | "name": "stdout",
40 | "text": [
41 | "Sample Excel file 'employee_data.xlsx' generated successfully.\n"
42 | ]
43 | }
44 | ],
45 | "source": [
46 | "import pandas as pd\n",
47 | "\n",
48 | "# Sample employee data\n",
49 | "data = {\n",
50 | " 'employee_id': [101, 102, 103, 104, 105],\n",
51 | " 'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],\n",
52 | " 'department': ['HR', 'Finance', 'IT', 'Sales', 'Marketing'],\n",
53 | " 'salary': [55000, 62000, 72000, 50000, 57000],\n",
54 | " 'performance_score': [3.8, 4.2, 4.5, 3.5, 4.0],\n",
55 | " 'years_at_company': [2, 5, 3, 4, 1]\n",
56 | "}\n",
57 | "\n",
58 | "# Create a DataFrame\n",
59 | "df = pd.DataFrame(data)\n",
60 | "\n",
61 | "# Save to an Excel file\n",
62 | "df.to_excel('employee_data.xlsx', index=False)\n",
63 | "\n",
64 | "print(\"Sample Excel file 'employee_data.xlsx' generated successfully.\")"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "source": [
70 | "## Reading in the Excel File"
71 | ],
72 | "metadata": {
73 | "id": "o0EPIZ8bvlWR"
74 | }
75 | },
76 | {
77 | "cell_type": "code",
78 | "source": [
79 | "# Read Excel file into a DataFrame\n",
80 | "df = pd.read_excel('employee_data.xlsx')\n",
81 | "\n",
82 | "print(df.head())"
83 | ],
84 | "metadata": {
85 | "colab": {
86 | "base_uri": "https://localhost:8080/"
87 | },
88 | "id": "cBRgGP-Wvs-P",
89 | "outputId": "c423480e-8cff-40f5-ef1a-fb3e5a55c0d6"
90 | },
91 | "execution_count": null,
92 | "outputs": [
93 | {
94 | "output_type": "stream",
95 | "name": "stdout",
96 | "text": [
97 | " employee_id name department salary performance_score \\\n",
98 | "0 101 Alice HR 55000 3.8 \n",
99 | "1 102 Bob Finance 62000 4.2 \n",
100 | "2 103 Charlie IT 72000 4.5 \n",
101 | "3 104 David Sales 50000 3.5 \n",
102 | "4 105 Eva Marketing 57000 4.0 \n",
103 | "\n",
104 | " years_at_company \n",
105 | "0 2 \n",
106 | "1 5 \n",
107 | "2 3 \n",
108 | "3 4 \n",
109 | "4 1 \n"
110 | ]
111 | }
112 | ]
113 | },
114 | {
115 | "cell_type": "markdown",
116 | "source": [
117 | "## Exploring and Summarizing Data"
118 | ],
119 | "metadata": {
120 | "id": "ldEpbTVevteH"
121 | }
122 | },
123 | {
124 | "cell_type": "code",
125 | "source": [
126 | "# Get info about the DataFrame\n",
127 | "print(df.info())"
128 | ],
129 | "metadata": {
130 | "colab": {
131 | "base_uri": "https://localhost:8080/"
132 | },
133 | "id": "MCwxQH84vv5u",
134 | "outputId": "49d9df4b-94e6-4fd2-e515-d6ebdf5a2a85"
135 | },
136 | "execution_count": null,
137 | "outputs": [
138 | {
139 | "output_type": "stream",
140 | "name": "stdout",
141 | "text": [
142 | "<class 'pandas.core.frame.DataFrame'>\n",
143 | "RangeIndex: 5 entries, 0 to 4\n",
144 | "Data columns (total 6 columns):\n",
145 | " # Column Non-Null Count Dtype \n",
146 | "--- ------ -------------- ----- \n",
147 | " 0 employee_id 5 non-null int64 \n",
148 | " 1 name 5 non-null object \n",
149 | " 2 department 5 non-null object \n",
150 | " 3 salary 5 non-null int64 \n",
151 | " 4 performance_score 5 non-null float64\n",
152 | " 5 years_at_company 5 non-null int64 \n",
153 | "dtypes: float64(1), int64(3), object(2)\n",
154 | "memory usage: 368.0+ bytes\n",
155 | "None\n"
156 | ]
157 | }
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "source": [
163 | "# Get descriptive statistics\n",
164 | "print(df.describe())"
165 | ],
166 | "metadata": {
167 | "colab": {
168 | "base_uri": "https://localhost:8080/"
169 | },
170 | "id": "UF_BXV17wUyv",
171 | "outputId": "ef65974f-d366-4d99-9b6f-2a776bb9a2ca"
172 | },
173 | "execution_count": null,
174 | "outputs": [
175 | {
176 | "output_type": "stream",
177 | "name": "stdout",
178 | "text": [
179 | " employee_id salary performance_score years_at_company\n",
180 | "count 5.000000 5.000000 5.000000 5.000000\n",
181 | "mean 103.000000 59200.000000 4.000000 3.000000\n",
182 | "std 1.581139 8348.652586 0.380789 1.581139\n",
183 | "min 101.000000 50000.000000 3.500000 1.000000\n",
184 | "25% 102.000000 55000.000000 3.800000 2.000000\n",
185 | "50% 103.000000 57000.000000 4.000000 3.000000\n",
186 | "75% 104.000000 62000.000000 4.200000 4.000000\n",
187 | "max 105.000000 72000.000000 4.500000 5.000000\n"
188 | ]
189 | }
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "source": [
195 | "## Handling Missing Values"
196 | ],
197 | "metadata": {
198 | "id": "rVlV_FmmvwOP"
199 | }
200 | },
201 | {
202 | "cell_type": "code",
203 | "source": [
204 | "# Check for missing values\n",
205 | "missing_values = df.isna().sum()\n",
206 | "print(missing_values)"
207 | ],
208 | "metadata": {
209 | "colab": {
210 | "base_uri": "https://localhost:8080/"
211 | },
212 | "id": "5vMERpSvvzmH",
213 | "outputId": "8c9292f5-aa19-4f19-a2e6-76aaaaee6e82"
214 | },
215 | "execution_count": null,
216 | "outputs": [
217 | {
218 | "output_type": "stream",
219 | "name": "stdout",
220 | "text": [
221 | "employee_id 0\n",
222 | "name 0\n",
223 | "department 0\n",
224 | "salary 0\n",
225 | "performance_score 0\n",
226 | "years_at_company 0\n",
227 | "dtype: int64\n"
228 | ]
229 | }
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "source": [
235 | "# Fill missing performance scores with the average\n",
236 | "df['performance_score'] = df['performance_score'].fillna(df['performance_score'].mean())"
237 | ],
238 | "metadata": {
239 | "id": "f8QMM_3UweNm"
240 | },
241 | "execution_count": null,
242 | "outputs": []
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "source": [
247 | "## Basic Data Manipulation"
248 | ],
249 | "metadata": {
250 | "id": "UoQF_uYxv2kR"
251 | }
252 | },
253 | {
254 | "cell_type": "code",
255 | "source": [
256 | " # Filter employees with a performance score above 4\n",
257 | "high_performers = df[df['performance_score'] > 4]\n",
258 | "print(high_performers)"
259 | ],
260 | "metadata": {
261 | "colab": {
262 | "base_uri": "https://localhost:8080/"
263 | },
264 | "id": "gyWnuUX8v10g",
265 | "outputId": "aad60691-ddfa-461e-efa5-0889de7bf047"
266 | },
267 | "execution_count": null,
268 | "outputs": [
269 | {
270 | "output_type": "stream",
271 | "name": "stdout",
272 | "text": [
273 | " employee_id name department salary performance_score \\\n",
274 | "1 102 Bob Finance 62000 4.2 \n",
275 | "2 103 Charlie IT 72000 4.5 \n",
276 | "\n",
277 | " years_at_company \n",
278 | "1 5 \n",
279 | "2 3 \n"
280 | ]
281 | }
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "source": [
287 | "# Select specific columns\n",
288 | "selected_columns = df[['name', 'department', 'salary']]\n",
289 | "print(selected_columns)"
290 | ],
291 | "metadata": {
292 | "colab": {
293 | "base_uri": "https://localhost:8080/"
294 | },
295 | "id": "2NYJXMEywkNm",
296 | "outputId": "5e3fe3ee-fe57-476d-d45b-84f8c41f3324"
297 | },
298 | "execution_count": null,
299 | "outputs": [
300 | {
301 | "output_type": "stream",
302 | "name": "stdout",
303 | "text": [
304 | " name department salary\n",
305 | "0 Alice HR 55000\n",
306 | "1 Bob Finance 62000\n",
307 | "2 Charlie IT 72000\n",
308 | "3 David Sales 50000\n",
309 | "4 Eva Marketing 57000\n"
310 | ]
311 | }
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "source": [
317 | "# Add a new column for bonus\n",
318 | "df['bonus'] = df['salary'].apply(lambda x: x * 0.10)\n",
319 | "print(df.head())"
320 | ],
321 | "metadata": {
322 | "colab": {
323 | "base_uri": "https://localhost:8080/"
324 | },
325 | "id": "jiVSYWN5wmO-",
326 | "outputId": "2aae7e21-c9b5-4b62-9e99-06265a1dfae1"
327 | },
328 | "execution_count": null,
329 | "outputs": [
330 | {
331 | "output_type": "stream",
332 | "name": "stdout",
333 | "text": [
334 | " employee_id name department salary performance_score \\\n",
335 | "0 101 Alice HR 55000 3.8 \n",
336 | "1 102 Bob Finance 62000 4.2 \n",
337 | "2 103 Charlie IT 72000 4.5 \n",
338 | "3 104 David Sales 50000 3.5 \n",
339 | "4 105 Eva Marketing 57000 4.0 \n",
340 | "\n",
341 | " years_at_company bonus \n",
342 | "0 2 5500.0 \n",
343 | "1 5 6200.0 \n",
344 | "2 3 7200.0 \n",
345 | "3 4 5000.0 \n",
346 | "4 1 5700.0 \n"
347 | ]
348 | }
349 | ]
350 | },
351 | {
352 | "cell_type": "markdown",
353 | "source": [
354 | "## Grouping and Aggregating Data"
355 | ],
356 | "metadata": {
357 | "id": "O3B9eGDsv8Bs"
358 | }
359 | },
360 | {
361 | "cell_type": "code",
362 | "source": [
363 | "# Calculate average salary grouped by department\n",
364 | "average_salary_by_department = df.groupby('department')['salary'].mean().reset_index()\n",
365 | "print(average_salary_by_department)"
366 | ],
367 | "metadata": {
368 | "colab": {
369 | "base_uri": "https://localhost:8080/"
370 | },
371 | "id": "nKeNszX6v9QA",
372 | "outputId": "8e5170dc-51f9-4032-9bfa-b0cd5404e651"
373 | },
374 | "execution_count": null,
375 | "outputs": [
376 | {
377 | "output_type": "stream",
378 | "name": "stdout",
379 | "text": [
380 | " department salary\n",
381 | "0 Finance 62000.0\n",
382 | "1 HR 55000.0\n",
383 | "2 IT 72000.0\n",
384 | "3 Marketing 57000.0\n",
385 | "4 Sales 50000.0\n"
386 | ]
387 | }
388 | ]
389 | }
390 | ]
391 | }
--------------------------------------------------------------------------------
/statistics/probability/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/statistics/probability/joint_and_conditional_pbty.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Step 1: Creating Sample Data"
21 | ],
22 | "metadata": {
23 | "id": "YjO9ZVZIM8Ye"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "colab": {
31 | "base_uri": "https://localhost:8080/"
32 | },
33 | "id": "sAXOTiD9Ltz0",
34 | "outputId": "63fdc6fe-0ae1-4b61-ba92-4e481cc8c561"
35 | },
36 | "outputs": [
37 | {
38 | "output_type": "stream",
39 | "name": "stdout",
40 | "text": [
41 | " Age_Group Sports_Interest\n",
42 | "0 Teen Yes\n",
43 | "1 Teen No\n",
44 | "2 Teen Yes\n",
45 | "3 Adult No\n",
46 | "4 Adult No\n",
47 | "5 Senior Yes\n",
48 | "6 Senior Yes\n",
49 | "7 Senior No\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "import pandas as pd\n",
55 | "\n",
56 | "# Sample data\n",
57 | "data = {\n",
58 | " \"Age_Group\": [\"Teen\", \"Teen\", \"Teen\", \"Adult\", \"Adult\", \"Senior\", \"Senior\", \"Senior\"],\n",
59 | " \"Sports_Interest\": [\"Yes\", \"No\", \"Yes\", \"No\", \"No\", \"Yes\", \"Yes\", \"No\"]\n",
60 | "}\n",
61 | "\n",
62 | "df = pd.DataFrame(data)\n",
63 | "\n",
64 | "# Display the data\n",
65 | "print(df)\n"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "source": [
71 | "## Step 2: Calculating Joint Probability"
72 | ],
73 | "metadata": {
74 | "id": "1VY0hLRKMWMr"
75 | }
76 | },
77 | {
78 | "cell_type": "code",
79 | "source": [
80 | "# Total number of observations\n",
81 | "total_count = len(df)\n",
82 | "\n",
83 | "# Count occurrences where Age_Group is \"Teen\" and Sports_Interest is \"Yes\"\n",
84 | "joint_count = len(df[(df['Age_Group'] == 'Teen') & (df['Sports_Interest'] == 'Yes')])\n",
85 | "\n",
86 | "# Joint probability\n",
87 | "joint_probability = joint_count / total_count\n",
88 | "\n",
89 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_probability}\")\n"
90 | ],
91 | "metadata": {
92 | "colab": {
93 | "base_uri": "https://localhost:8080/"
94 | },
95 | "id": "M32eM5NPMHNd",
96 | "outputId": "35e64e55-358f-471c-b583-ca322b9597c0"
97 | },
98 | "execution_count": null,
99 | "outputs": [
100 | {
101 | "output_type": "stream",
102 | "name": "stdout",
103 | "text": [
104 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n"
105 | ]
106 | }
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "source": [
112 | "## Step 3: Calculating Conditional Probability"
113 | ],
114 | "metadata": {
115 | "id": "OIs1olhPMZgq"
116 | }
117 | },
118 | {
119 | "cell_type": "code",
120 | "source": [
121 | "# Filter data for Age_Group = \"Teen\"\n",
122 | "teen_data = df[df['Age_Group'] == 'Teen']\n",
123 | "\n",
124 | "# Count occurrences of Sports_Interest = \"Yes\" among teens\n",
125 | "conditional_count = len(teen_data[teen_data['Sports_Interest'] == 'Yes'])\n",
126 | "\n",
127 | "# Conditional probability\n",
128 | "conditional_probability = conditional_count / len(teen_data)\n",
129 | "\n",
130 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_probability:.3f}\")\n"
131 | ],
132 | "metadata": {
133 | "colab": {
134 | "base_uri": "https://localhost:8080/"
135 | },
136 | "id": "vMTq6kaKMJdd",
137 | "outputId": "559e5632-7ca7-44bd-9d59-4f2aeb19f50a"
138 | },
139 | "execution_count": null,
140 | "outputs": [
141 | {
142 | "output_type": "stream",
143 | "name": "stdout",
144 | "text": [
145 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n"
146 | ]
147 | }
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "source": [
153 | "## Step 4: Generalizing with Functions"
154 | ],
155 | "metadata": {
156 | "id": "L2uNqq9zM1I2"
157 | }
158 | },
159 | {
160 | "cell_type": "code",
161 | "source": [
162 | "def calculate_joint_probability(df, condition1, condition2):\n",
163 | "    \"\"\"\n",
164 | "    Estimate the joint probability P(A and B) from the rows of a DataFrame.\n",
165 | "\n",
166 | "    Args:\n",
167 | "        df: pandas DataFrame with one observation per row.\n",
168 | "        condition1: (column_name, value) pair describing event A.\n",
169 | "        condition2: (column_name, value) pair describing event B.\n",
170 | "\n",
171 | "    Returns:\n",
172 | "        Fraction of rows in which both columns equal their requested value,\n",
173 | "        or NaN when df has no rows (the probability is then undefined).\n",
174 | "    \"\"\"\n",
175 | "    total_count = len(df)\n",
176 | "    if total_count == 0:\n",
177 | "        # No observations: report \"undefined\" instead of raising ZeroDivisionError\n",
178 | "        return float(\"nan\")\n",
179 | "    matches_both = (df[condition1[0]] == condition1[1]) & (df[condition2[0]] == condition2[1])\n",
180 | "    return int(matches_both.sum()) / total_count\n",
166 | "\n",
167 | "def calculate_conditional_probability(df, given_condition, target_condition):\n",
168 | "    \"\"\"\n",
169 | "    Estimate the conditional probability P(target | given) from a DataFrame.\n",
170 | "\n",
171 | "    Args:\n",
172 | "        df: pandas DataFrame with one observation per row.\n",
173 | "        given_condition: (column_name, value) pair for the conditioning event.\n",
174 | "        target_condition: (column_name, value) pair for the event of interest.\n",
175 | "\n",
176 | "    Returns:\n",
177 | "        Fraction of the rows satisfying given_condition that also satisfy\n",
178 | "        target_condition, or NaN when no row satisfies given_condition\n",
179 | "        (conditioning on an event that never occurs is undefined).\n",
180 | "    \"\"\"\n",
181 | "    subset = df[df[given_condition[0]] == given_condition[1]]\n",
182 | "    given_count = len(subset)\n",
183 | "    if given_count == 0:\n",
184 | "        # The given event was never observed: avoid ZeroDivisionError\n",
185 | "        return float(\"nan\")\n",
186 | "    matches_target = subset[target_condition[0]] == target_condition[1]\n",
187 | "    return int(matches_target.sum()) / given_count\n"
171 | ],
172 | "metadata": {
173 | "id": "VGoD5_-2MMfE"
174 | },
175 | "execution_count": null,
176 | "outputs": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "source": [
181 | "# Each event is a (column, value) pair; name them once so both calls share them\n",
182 | "teen_event, sports_event = (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\")\n",
183 | "joint_prob = calculate_joint_probability(df, teen_event, sports_event)  # P(Teen and Yes)\n",
184 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_prob}\")\n",
185 | "\n",
186 | "conditional_prob = calculate_conditional_probability(df, teen_event, sports_event)  # P(Yes | Teen)\n",
187 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_prob:.3f}\")\n"
188 | ],
189 | "metadata": {
190 | "colab": {
191 | "base_uri": "https://localhost:8080/"
192 | },
193 | "id": "rSEt6qJgMQQN",
194 | "outputId": "0f5e0527-f942-4f4a-8081-7ec6d4591708"
195 | },
196 | "execution_count": null,
197 | "outputs": [
198 | {
199 | "output_type": "stream",
200 | "name": "stdout",
201 | "text": [
202 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n",
203 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n"
204 | ]
205 | }
206 | ]
207 | }
208 | ]
209 | }
--------------------------------------------------------------------------------
/statistics/sparse_data_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "id": "0R9gVhnIMrNH"
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import numpy as np\n",
26 | "from scipy import sparse\n",
27 | "import pandas as pd\n",
28 | "from scipy import stats\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "source": [
34 | "# Users are rows, products are columns; only observed interactions are stored.\n",
35 | "# Each (user, product, value) triple below is one interaction (e.g. a rating).\n",
36 | "interactions = [(0, 0, 4), (3, 3, 5), (1, 1, 7), (0, 2, 9)]\n",
37 | "row, col, data = (np.array(values) for values in zip(*interactions))\n",
38 | "\n",
39 | "# COO (coordinate) format takes the values plus their (row, col) positions\n",
40 | "n_users, n_products = 4, 4\n",
41 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(n_users, n_products))\n",
42 | "\n",
43 | "# Densify for display only -- fine for a 4x4 demo\n",
44 | "print(\"Here's our sparse matrix as a regular array:\")\n",
45 | "print(sparse_matrix.toarray())\n"
46 | ],
47 | "metadata": {
48 | "colab": {
49 | "base_uri": "https://localhost:8080/"
50 | },
51 | "id": "RkAQQ8QCMzM7",
52 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5"
53 | },
54 | "execution_count": 2,
55 | "outputs": [
56 | {
57 | "output_type": "stream",
58 | "name": "stdout",
59 | "text": [
60 | "Here's our sparse matrix as a regular array:\n",
61 | "[[4 0 9 0]\n",
62 | " [0 7 0 0]\n",
63 | " [0 0 0 0]\n",
64 | " [0 0 0 5]]\n"
65 | ]
66 | }
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "source": [
72 | "def calculate_sparse_mean(sparse_matrix):\n",
73 | "    \"\"\"\n",
74 | "    Calculate mean of non-zero elements in a sparse matrix.\n",
75 | "    This is useful when zeros represent 'no data' rather than actual zeros.\n",
76 | "\n",
77 | "    Args:\n",
78 | "        sparse_matrix: a scipy.sparse matrix in any format.\n",
79 | "\n",
80 | "    Returns:\n",
81 | "        Sum of all entries divided by the number of truly non-zero entries,\n",
82 | "        or 0.0 when the matrix has no non-zero entry.\n",
83 | "    \"\"\"\n",
84 | "    # nnz counts *stored* entries, so explicitly stored zeros would inflate the\n",
85 | "    # denominator; count_nonzero() counts only values that are really non-zero.\n",
86 | "    nonzero_count = sparse_matrix.count_nonzero()\n",
87 | "    if nonzero_count == 0:\n",
88 | "        return 0.0\n",
89 | "    return sparse_matrix.sum() / nonzero_count\n",
80 | "\n",
81 | "mean_value = calculate_sparse_mean(sparse_matrix)  # (4 + 5 + 7 + 9) / 4 recorded interactions\n",
82 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n"
83 | ],
84 | "metadata": {
85 | "colab": {
86 | "base_uri": "https://localhost:8080/"
87 | },
88 | "id": "Dz0BFJXXM1ia",
89 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5"
90 | },
91 | "execution_count": 3,
92 | "outputs": [
93 | {
94 | "output_type": "stream",
95 | "name": "stdout",
96 | "text": [
97 | "\n",
98 | "Mean of non-zero elements: 6.25\n"
99 | ]
100 | }
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "source": [
106 | "def analyze_row_patterns(sparse_matrix):\n",
107 | "    \"\"\"\n",
108 | "    Analyze patterns in each row of a sparse matrix.\n",
109 | "\n",
110 | "    Args:\n",
111 | "        sparse_matrix: scipy.sparse matrix; each row is one entity to summarize\n",
112 | "            (a user, in this notebook).\n",
113 | "\n",
114 | "    Returns:\n",
115 | "        Dictionary of 1-D arrays holding one value per row:\n",
116 | "            'activity_sum'      - sum of the row's values\n",
117 | "            'interaction_count' - number of non-zero values in the row\n",
118 | "            'average_value'     - activity_sum / interaction_count, or 0.0 for\n",
119 | "                                  rows without any non-zero value\n",
120 | "    \"\"\"\n",
121 | "    # CSR gives efficient row access. Take a private copy so the clean-up\n",
122 | "    # below never modifies the caller's matrix.\n",
123 | "    csr = sparse_matrix.tocsr(copy=True)\n",
124 | "    # indptr counts *stored* entries: merge duplicate coordinates and drop\n",
125 | "    # explicitly stored zeros so that it counts real interactions only.\n",
126 | "    csr.sum_duplicates()\n",
127 | "    csr.eliminate_zeros()\n",
128 | "\n",
129 | "    row_sums = np.asarray(csr.sum(axis=1)).ravel()\n",
130 | "    row_nonzeros = np.diff(csr.indptr)  # Number of non-zeros per row\n",
131 | "\n",
132 | "    # Average only rows that have data; empty rows keep the 0.0 default\n",
133 | "    row_means = np.zeros(row_sums.shape, dtype=float)\n",
134 | "    has_data = row_nonzeros > 0\n",
135 | "    row_means[has_data] = row_sums[has_data] / row_nonzeros[has_data]\n",
136 | "\n",
137 | "    return {\n",
138 | "        'activity_sum': row_sums,           # Total activity per user\n",
139 | "        'interaction_count': row_nonzeros,  # Number of interactions per user\n",
140 | "        'average_value': row_means          # Average value per user\n",
141 | "    }\n"
128 | ],
129 | "metadata": {
130 | "id": "SF3ygrrvM4Ks"
131 | },
132 | "execution_count": 4,
133 | "outputs": []
134 | },
135 | {
136 | "cell_type": "code",
137 | "source": [
138 | "row_stats = analyze_row_patterns(sparse_matrix)  # not `stats`: that name must keep pointing at scipy.stats\n",
139 | "print(\"\\nUser Statistics:\")\n",
140 | "for i, (sum_val, count, mean) in enumerate(zip(\n",
141 | "    row_stats['activity_sum'],\n",
142 | "    row_stats['interaction_count'],\n",
143 | "    row_stats['average_value']\n",
144 | ")):\n",
145 | "    print(f\"User {i}: {count} interactions, \"\n",
146 | "          f\"total activity = {sum_val}, \"\n",
147 | "          f\"average value = {mean:.2f}\")\n"
148 | ],
149 | "metadata": {
150 | "colab": {
151 | "base_uri": "https://localhost:8080/"
152 | },
153 | "id": "IAzJ8tHRM519",
154 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04"
155 | },
156 | "execution_count": 5,
157 | "outputs": [
158 | {
159 | "output_type": "stream",
160 | "name": "stdout",
161 | "text": [
162 | "\n",
163 | "User Statistics:\n",
164 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n",
165 | "User 1: 1 interactions, total activity = 7, average value = 7.00\n",
166 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n",
167 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n"
168 | ]
169 | }
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "source": [
175 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n",
176 | "    \"\"\"\n",
177 | "    Calculate correlation between columns, considering only overlapping non-zero elements.\n",
178 | "    Like finding which products are often rated similarly.\n",
179 | "\n",
180 | "    Args:\n",
181 | "        sparse_matrix: scipy.sparse matrix (rows are users, columns are products).\n",
182 | "        min_overlap: minimum number of rows in which both columns must be\n",
183 | "            non-zero for the pair to be scored. Values below 2 are treated as 2,\n",
184 | "            because a Pearson correlation needs at least two observations.\n",
185 | "\n",
186 | "    Returns:\n",
187 | "        Symmetric (n_cols, n_cols) float array of Pearson coefficients. Pairs\n",
188 | "        with too little overlap keep the placeholder value 0.0.\n",
189 | "    \"\"\"\n",
190 | "    # Imported locally so the function keeps working even when a notebook cell\n",
191 | "    # has rebound the module-level name `stats` to something else.\n",
192 | "    from scipy.stats import pearsonr\n",
193 | "\n",
194 | "    # Convert to dense format for this calculation\n",
195 | "    # (For very large matrices, you'd want to do this differently)\n",
196 | "    dense_cols = sparse_matrix.toarray().T\n",
197 | "    n_cols = dense_cols.shape[0]\n",
198 | "    correlations = np.zeros((n_cols, n_cols))\n",
199 | "\n",
200 | "    # pearsonr raises ValueError when given fewer than two observations\n",
201 | "    required_overlap = max(min_overlap, 2)\n",
202 | "    # One non-zero mask per column, computed once instead of once per pair\n",
203 | "    has_value = dense_cols != 0\n",
204 | "\n",
205 | "    for i in range(n_cols):\n",
206 | "        for j in range(i, n_cols):\n",
207 | "            # Rows where both columns have non-zero values\n",
208 | "            mask = has_value[i] & has_value[j]\n",
209 | "            if mask.sum() >= required_overlap:\n",
210 | "                corr = pearsonr(dense_cols[i][mask], dense_cols[j][mask])[0]\n",
211 | "                correlations[i, j] = correlations[j, i] = corr\n",
212 | "\n",
213 | "    return correlations"
196 | ],
197 | "metadata": {
198 | "id": "ADRakCn4M8KD"
199 | },
200 | "execution_count": 6,
201 | "outputs": []
202 | },
203 | {
204 | "cell_type": "code",
205 | "source": [
206 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)  # all zeros for the demo matrix: no product column has 2+ ratings\n",
207 | "print(\"\\nCorrelation matrix:\")\n",
208 | "print(corr_matrix)"
209 | ],
210 | "metadata": {
211 | "colab": {
212 | "base_uri": "https://localhost:8080/"
213 | },
214 | "id": "7UuFzRB6M979",
215 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7"
216 | },
217 | "execution_count": 7,
218 | "outputs": [
219 | {
220 | "output_type": "stream",
221 | "name": "stdout",
222 | "text": [
223 | "\n",
224 | "Correlation matrix:\n",
225 | "[[0. 0. 0. 0.]\n",
226 | " [0. 0. 0. 0.]\n",
227 | " [0. 0. 0. 0.]\n",
228 | " [0. 0. 0. 0.]]\n"
229 | ]
230 | }
231 | ]
232 | }
233 | ]
234 | }
--------------------------------------------------------------------------------
/statistics/sparse_data_analysis_v0_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Imports"
21 | ],
22 | "metadata": {
23 | "id": "DPt0ex-tOHxH"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "id": "0R9gVhnIMrNH"
31 | },
32 | "outputs": [],
33 | "source": [
34 | "import numpy as np\n",
35 | "from scipy import sparse\n",
36 | "import pandas as pd\n",
37 | "from scipy import stats\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "source": [
43 | "## Creating a Sparse Matrix"
44 | ],
45 | "metadata": {
46 | "id": "M7kq8YzvOKG-"
47 | }
48 | },
49 | {
50 | "cell_type": "code",
51 | "source": [
52 | "# Users are rows, products are columns; only observed interactions are stored.\n",
53 | "# Each (user, product, value) triple below is one interaction (e.g. a rating).\n",
54 | "interactions = [(0, 0, 4), (3, 3, 5), (1, 1, 7), (0, 2, 9)]\n",
55 | "row, col, data = (np.array(values) for values in zip(*interactions))\n",
56 | "\n",
57 | "# COO (coordinate) format takes the values plus their (row, col) positions\n",
58 | "n_users, n_products = 4, 4\n",
59 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(n_users, n_products))\n",
60 | "\n",
61 | "# Densify for display only -- fine for a 4x4 demo\n",
62 | "print(\"Here's our sparse matrix as a regular array:\")\n",
63 | "print(sparse_matrix.toarray())\n"
64 | ],
65 | "metadata": {
66 | "colab": {
67 | "base_uri": "https://localhost:8080/"
68 | },
69 | "id": "RkAQQ8QCMzM7",
70 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5"
71 | },
72 | "execution_count": null,
73 | "outputs": [
74 | {
75 | "output_type": "stream",
76 | "name": "stdout",
77 | "text": [
78 | "Here's our sparse matrix as a regular array:\n",
79 | "[[4 0 9 0]\n",
80 | " [0 7 0 0]\n",
81 | " [0 0 0 0]\n",
82 | " [0 0 0 5]]\n"
83 | ]
84 | }
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "source": [
90 | "## Basic Statistical Analysis"
91 | ],
92 | "metadata": {
93 | "id": "VRm2aXYiOS3F"
94 | }
95 | },
96 | {
97 | "cell_type": "code",
98 | "source": [
99 | "def calculate_sparse_mean(sparse_matrix):\n",
100 | "    \"\"\"\n",
101 | "    Calculate mean of non-zero elements in a sparse matrix.\n",
102 | "    This is useful when zeros represent 'no data' rather than actual zeros.\n",
103 | "\n",
104 | "    Args:\n",
105 | "        sparse_matrix: a scipy.sparse matrix in any format.\n",
106 | "\n",
107 | "    Returns:\n",
108 | "        Sum of all entries divided by the number of truly non-zero entries,\n",
109 | "        or 0.0 when the matrix has no non-zero entry.\n",
110 | "    \"\"\"\n",
111 | "    # nnz counts *stored* entries, so explicitly stored zeros would inflate the\n",
112 | "    # denominator; count_nonzero() counts only values that are really non-zero.\n",
113 | "    nonzero_count = sparse_matrix.count_nonzero()\n",
114 | "    if nonzero_count == 0:\n",
115 | "        return 0.0\n",
116 | "    return sparse_matrix.sum() / nonzero_count\n",
107 | "\n",
108 | "mean_value = calculate_sparse_mean(sparse_matrix)  # (4 + 5 + 7 + 9) / 4 recorded interactions\n",
109 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n"
110 | ],
111 | "metadata": {
112 | "colab": {
113 | "base_uri": "https://localhost:8080/"
114 | },
115 | "id": "Dz0BFJXXM1ia",
116 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5"
117 | },
118 | "execution_count": null,
119 | "outputs": [
120 | {
121 | "output_type": "stream",
122 | "name": "stdout",
123 | "text": [
124 | "\n",
125 | "Mean of non-zero elements: 6.25\n"
126 | ]
127 | }
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "source": [
133 | "## Handling Row and Column Statistics"
134 | ],
135 | "metadata": {
136 | "id": "fDhW59jyOWl1"
137 | }
138 | },
139 | {
140 | "cell_type": "code",
141 | "source": [
142 | "def analyze_row_patterns(sparse_matrix):\n",
143 | "    \"\"\"\n",
144 | "    Analyze patterns in each row of a sparse matrix.\n",
145 | "\n",
146 | "    Args:\n",
147 | "        sparse_matrix: scipy.sparse matrix; each row is one entity to summarize\n",
148 | "            (a user, in this notebook).\n",
149 | "\n",
150 | "    Returns:\n",
151 | "        Dictionary of 1-D arrays holding one value per row:\n",
152 | "            'activity_sum'      - sum of the row's values\n",
153 | "            'interaction_count' - number of non-zero values in the row\n",
154 | "            'average_value'     - activity_sum / interaction_count, or 0.0 for\n",
155 | "                                  rows without any non-zero value\n",
156 | "    \"\"\"\n",
157 | "    # CSR gives efficient row access. Take a private copy so the clean-up\n",
158 | "    # below never modifies the caller's matrix.\n",
159 | "    csr = sparse_matrix.tocsr(copy=True)\n",
160 | "    # indptr counts *stored* entries: merge duplicate coordinates and drop\n",
161 | "    # explicitly stored zeros so that it counts real interactions only.\n",
162 | "    csr.sum_duplicates()\n",
163 | "    csr.eliminate_zeros()\n",
164 | "\n",
165 | "    row_sums = np.asarray(csr.sum(axis=1)).ravel()\n",
166 | "    row_nonzeros = np.diff(csr.indptr)  # Number of non-zeros per row\n",
167 | "\n",
168 | "    # Average only rows that have data; empty rows keep the 0.0 default\n",
169 | "    row_means = np.zeros(row_sums.shape, dtype=float)\n",
170 | "    has_data = row_nonzeros > 0\n",
171 | "    row_means[has_data] = row_sums[has_data] / row_nonzeros[has_data]\n",
172 | "\n",
173 | "    return {\n",
174 | "        'activity_sum': row_sums,           # Total activity per user\n",
175 | "        'interaction_count': row_nonzeros,  # Number of interactions per user\n",
176 | "        'average_value': row_means          # Average value per user\n",
177 | "    }\n"
164 | ],
165 | "metadata": {
166 | "id": "SF3ygrrvM4Ks"
167 | },
168 | "execution_count": null,
169 | "outputs": []
170 | },
171 | {
172 | "cell_type": "code",
173 | "source": [
174 | "row_stats = analyze_row_patterns(sparse_matrix)  # not `stats`: that name must keep pointing at scipy.stats\n",
175 | "print(\"\\nUser Statistics:\")\n",
176 | "for i, (sum_val, count, mean) in enumerate(zip(\n",
177 | "    row_stats['activity_sum'],\n",
178 | "    row_stats['interaction_count'],\n",
179 | "    row_stats['average_value']\n",
180 | ")):\n",
181 | "    print(f\"User {i}: {count} interactions, \"\n",
182 | "          f\"total activity = {sum_val}, \"\n",
183 | "          f\"average value = {mean:.2f}\")\n"
184 | ],
185 | "metadata": {
186 | "colab": {
187 | "base_uri": "https://localhost:8080/"
188 | },
189 | "id": "IAzJ8tHRM519",
190 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04"
191 | },
192 | "execution_count": null,
193 | "outputs": [
194 | {
195 | "output_type": "stream",
196 | "name": "stdout",
197 | "text": [
198 | "\n",
199 | "User Statistics:\n",
200 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n",
201 | "User 1: 1 interactions, total activity = 7, average value = 7.00\n",
202 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n",
203 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n"
204 | ]
205 | }
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "source": [
211 | "## Correlation Analysis"
212 | ],
213 | "metadata": {
214 | "id": "m5ETMgcxOatl"
215 | }
216 | },
217 | {
218 | "cell_type": "code",
219 | "source": [
220 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n",
221 | "    \"\"\"\n",
222 | "    Calculate correlation between columns, considering only overlapping non-zero elements.\n",
223 | "    Like finding which products are often rated similarly.\n",
224 | "\n",
225 | "    Args:\n",
226 | "        sparse_matrix: scipy.sparse matrix (rows are users, columns are products).\n",
227 | "        min_overlap: minimum number of rows in which both columns must be\n",
228 | "            non-zero for the pair to be scored. Values below 2 are treated as 2,\n",
229 | "            because a Pearson correlation needs at least two observations.\n",
230 | "\n",
231 | "    Returns:\n",
232 | "        Symmetric (n_cols, n_cols) float array of Pearson coefficients. Pairs\n",
233 | "        with too little overlap keep the placeholder value 0.0.\n",
234 | "    \"\"\"\n",
235 | "    # Imported locally so the function keeps working even when a notebook cell\n",
236 | "    # has rebound the module-level name `stats` to something else.\n",
237 | "    from scipy.stats import pearsonr\n",
238 | "\n",
239 | "    # Convert to dense format for this calculation\n",
240 | "    # (For very large matrices, you'd want to do this differently)\n",
241 | "    dense_cols = sparse_matrix.toarray().T\n",
242 | "    n_cols = dense_cols.shape[0]\n",
243 | "    correlations = np.zeros((n_cols, n_cols))\n",
244 | "\n",
245 | "    # pearsonr raises ValueError when given fewer than two observations\n",
246 | "    required_overlap = max(min_overlap, 2)\n",
247 | "    # One non-zero mask per column, computed once instead of once per pair\n",
248 | "    has_value = dense_cols != 0\n",
249 | "\n",
250 | "    for i in range(n_cols):\n",
251 | "        for j in range(i, n_cols):\n",
252 | "            # Rows where both columns have non-zero values\n",
253 | "            mask = has_value[i] & has_value[j]\n",
254 | "            if mask.sum() >= required_overlap:\n",
255 | "                corr = pearsonr(dense_cols[i][mask], dense_cols[j][mask])[0]\n",
256 | "                correlations[i, j] = correlations[j, i] = corr\n",
257 | "\n",
258 | "    return correlations"
241 | ],
242 | "metadata": {
243 | "id": "ADRakCn4M8KD"
244 | },
245 | "execution_count": null,
246 | "outputs": []
247 | },
248 | {
249 | "cell_type": "code",
250 | "source": [
251 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)  # all zeros for the demo matrix: no product column has 2+ ratings\n",
252 | "print(\"\\nCorrelation matrix:\")\n",
253 | "print(corr_matrix)"
254 | ],
255 | "metadata": {
256 | "colab": {
257 | "base_uri": "https://localhost:8080/"
258 | },
259 | "id": "7UuFzRB6M979",
260 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7"
261 | },
262 | "execution_count": null,
263 | "outputs": [
264 | {
265 | "output_type": "stream",
266 | "name": "stdout",
267 | "text": [
268 | "\n",
269 | "Correlation matrix:\n",
270 | "[[0. 0. 0. 0.]\n",
271 | " [0. 0. 0. 0.]\n",
272 | " [0. 0. 0. 0.]\n",
273 | " [0. 0. 0. 0.]]\n"
274 | ]
275 | }
276 | ]
277 | }
278 | ]
279 | }
--------------------------------------------------------------------------------