[\\w.]+)\"\n",
298 | "\n",
299 | "match = re.search(pattern, text)\n",
300 | "if match:\n",
301 | " print(f\"Username: {match.group('username')}\")\n",
302 | " print(f\"Domain: {match.group('domain')}\")\n"
303 | ],
304 | "metadata": {
305 | "colab": {
306 | "base_uri": "https://localhost:8080/"
307 | },
308 | "id": "L9FOB0uiLexp",
309 | "outputId": "c73b8c7f-af37-4016-b404-d9ad7502514f"
310 | },
311 | "execution_count": 24,
312 | "outputs": [
313 | {
314 | "output_type": "stream",
315 | "name": "stdout",
316 | "text": [
317 | "Username: john.doe\n",
318 | "Domain: example.com\n"
319 | ]
320 | }
321 | ]
322 | }
323 | ]
324 | }
--------------------------------------------------------------------------------
/regex/regex_contd.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "id": "VxllMcawIjlB"
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import re"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "source": [
31 | "text = \"<div>First content</div><div>Second content</div>\"\n",
32 | "\n",
33 | "# Greedy matching (default)\n",
34 | "greedy = re.findall(r\"<div>(.*)</div>\", text)\n",
35 | "print(f\"Greedy: {greedy}\")\n",
36 | "\n",
37 | "# Non-greedy matching\n",
38 | "non_greedy = re.findall(r\"<div>(.*?)</div>\", text)\n",
39 | "print(f\"Non-greedy: {non_greedy}\")\n"
40 | ],
41 | "metadata": {
42 | "colab": {
43 | "base_uri": "https://localhost:8080/"
44 | },
45 | "id": "Cb35InjGIzW3",
46 | "outputId": "cf4f7b63-8e9c-4ad0-f4f7-a0c6bbd52a03"
47 | },
48 | "execution_count": 2,
49 | "outputs": [
50 | {
51 | "output_type": "stream",
52 | "name": "stdout",
53 | "text": [
54 | "Greedy: ['First content</div><div>Second content']\n",
55 | "Non-greedy: ['First content', 'Second content']\n"
56 | ]
57 | }
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "source": [
63 | "# Password validation\n",
64 | "password = \"Password123\"\n",
65 | "has_uppercase = bool(re.search(r\"(?=.*[A-Z])\", password))\n",
66 | "has_lowercase = bool(re.search(r\"(?=.*[a-z])\", password))\n",
67 | "has_digit = bool(re.search(r\"(?=.*\\d)\", password))\n",
68 | "is_long_enough = len(password) >= 8\n",
69 | "\n",
70 | "if all([has_uppercase, has_lowercase, has_digit, is_long_enough]):\n",
71 | " print(\"Password meets requirements\")\n",
72 | "else:\n",
73 | " print(\"Password does not meet all requirements\")\n"
74 | ],
75 | "metadata": {
76 | "colab": {
77 | "base_uri": "https://localhost:8080/"
78 | },
79 | "id": "N21dkm8NI-TG",
80 | "outputId": "4c0b31c3-422c-4145-e07f-aff64c4a5950"
81 | },
82 | "execution_count": 3,
83 | "outputs": [
84 | {
85 | "output_type": "stream",
86 | "name": "stdout",
87 | "text": [
88 | "Password meets requirements\n"
89 | ]
90 | }
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "source": [],
96 | "metadata": {
97 | "id": "VNTuYz1bI-Jc"
98 | },
99 | "execution_count": 3,
100 | "outputs": []
101 | }
102 | ]
103 | }
--------------------------------------------------------------------------------
/regex/regex_examples.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": null,
20 | "metadata": {
21 | "colab": {
22 | "base_uri": "https://localhost:8080/"
23 | },
24 | "id": "YPKB2a5Hsw02",
25 | "outputId": "3da0932d-8614-477c-b6a6-d202c795f3da"
26 | },
27 | "outputs": [
28 | {
29 | "output_type": "stream",
30 | "name": "stdout",
31 | "text": [
32 | "Contact info: 1234567890 and 9876543210.\n"
33 | ]
34 | }
35 | ],
36 | "source": [
37 | "import re\n",
38 | "\n",
39 | "text = \"Contact info: (123)-456-7890 and 987-654-3210.\"\n",
40 | "cleaned_text = re.sub(r'[()-]', '', text)\n",
41 | "print(cleaned_text)\n"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "source": [
47 | "text = \"Please reach out to us at support@example.org or help@example.org.\"\n",
48 | "emails = re.findall(r'\\b[\\w.-]+?@\\w+?\\.\\w+?\\b', text)\n",
49 | "print(emails)\n"
50 | ],
51 | "metadata": {
52 | "colab": {
53 | "base_uri": "https://localhost:8080/"
54 | },
55 | "id": "eGS7s-zTs-9T",
56 | "outputId": "74a8b91b-5af7-48f6-9dd3-d690b9f36b21"
57 | },
58 | "execution_count": null,
59 | "outputs": [
60 | {
61 | "output_type": "stream",
62 | "name": "stdout",
63 | "text": [
64 | "['support@example.org', 'help@example.org']\n"
65 | ]
66 | }
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "source": [
72 | "text = \"This\tis\ta\tstring with multiple unnecessary spaces.\"\n",
73 | "cleaned_text = re.sub(r'\\s+', ' ', text)\n",
74 | "print(cleaned_text)\n"
75 | ],
76 | "metadata": {
77 | "colab": {
78 | "base_uri": "https://localhost:8080/"
79 | },
80 | "id": "dd0K0-LrtmBi",
81 | "outputId": "5436543f-4b19-4081-f8d4-5ec6f64d6d6b"
82 | },
83 | "execution_count": null,
84 | "outputs": [
85 | {
86 | "output_type": "stream",
87 | "name": "stdout",
88 | "text": [
89 | "This is a string with multiple unnecessary spaces.\n"
90 | ]
91 | }
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "source": [
97 | "email = \"test@example.com\"\n",
98 | "if re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', email):\n",
99 | " print(\"Valid email\") # Output: Valid email\n",
100 | "else:\n",
101 | " print(\"Invalid email\")\n"
102 | ],
103 | "metadata": {
104 | "colab": {
105 | "base_uri": "https://localhost:8080/"
106 | },
107 | "id": "j05fGS1UyCfe",
108 | "outputId": "402a5319-ba31-44d8-e870-ccbc35535af3"
109 | },
110 | "execution_count": null,
111 | "outputs": [
112 | {
113 | "output_type": "stream",
114 | "name": "stdout",
115 | "text": [
116 | "Valid email\n"
117 | ]
118 | }
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "source": [
124 | "text = \"This is sentence one. And this is sentence two! Is this sentence three?\"\n",
125 | "sentences = re.split(r'[.!?]', text)\n",
126 | "print(sentences) # Output: ['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n"
127 | ],
128 | "metadata": {
129 | "colab": {
130 | "base_uri": "https://localhost:8080/"
131 | },
132 | "id": "7f68JqnBzBX9",
133 | "outputId": "455de3e7-3cd0-4ffc-ad69-d058e9ecedff"
134 | },
135 | "execution_count": null,
136 | "outputs": [
137 | {
138 | "output_type": "stream",
139 | "name": "stdout",
140 | "text": [
141 | "['This is sentence one', ' And this is sentence two', ' Is this sentence three', '']\n"
142 | ]
143 | }
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "source": [
149 | "import pandas as pd\n",
150 | "\n",
151 | "data = {\n",
152 | "\t'names': ['Alice123', 'Bob!@#', 'Charlie$$$'],\n",
153 | "\t'emails': ['alice@example.com', 'bob_at_example.com', 'charlie@example.com']\n",
154 | "}\n",
155 | "df = pd.DataFrame(data)\n",
156 | "\n",
157 | "# Remove non-alphabetic characters from names\n",
158 | "df['names'] = df['names'].str.replace(r'[^a-zA-Z]', '', regex=True)\n",
159 | "\n",
160 | "# Validate email addresses\n",
161 | "df['valid_email'] = df['emails'].apply(lambda x: bool(re.match(r'^\\b[\\w.-]+?@\\w+?\\.\\w+?\\b$', x)))\n",
162 | "\n",
163 | "print(df)\n"
164 | ],
165 | "metadata": {
166 | "colab": {
167 | "base_uri": "https://localhost:8080/"
168 | },
169 | "id": "qboHFiS30UMQ",
170 | "outputId": "eeb42cb5-ebcf-4ebe-f301-74c2c1ac184a"
171 | },
172 | "execution_count": null,
173 | "outputs": [
174 | {
175 | "output_type": "stream",
176 | "name": "stdout",
177 | "text": [
178 | " names emails valid_email\n",
179 | "0 Alice alice@example.com True\n",
180 | "1 Bob bob_at_example.com False\n",
181 | "2 Charlie charlie@example.com True\n"
182 | ]
183 | }
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "source": [],
189 | "metadata": {
190 | "id": "la5oKWfX0U2Z"
191 | },
192 | "execution_count": null,
193 | "outputs": []
194 | }
195 | ]
196 | }
--------------------------------------------------------------------------------
/statistics/Basic_Stats_Functions_Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Import the Built-In `statistics` Module"
21 | ],
22 | "metadata": {
23 | "id": "s8yOidchG5UV"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "source": [
29 | "import statistics"
30 | ],
31 | "metadata": {
32 | "id": "cOmUhMH9bIAb"
33 | },
34 | "execution_count": null,
35 | "outputs": []
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "source": [
40 | "## 1. Mean"
41 | ],
42 | "metadata": {
43 | "id": "Cy_sSAo4bExW"
44 | }
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {
50 | "colab": {
51 | "base_uri": "https://localhost:8080/"
52 | },
53 | "id": "v-3qQD50a9hT",
54 | "outputId": "2a7d7cd5-d8f9-445d-f56f-59ac7f4e57b6"
55 | },
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "name": "stdout",
60 | "text": [
61 | "Mean: 30\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "data = [10, 20, 30, 40, 50]\n",
67 | "mean = statistics.mean(data)\n",
68 | "print(\"Mean:\", mean)"
69 | ]
70 | },
71 | {
72 | "cell_type": "markdown",
73 | "source": [
74 | "## 2. Median"
75 | ],
76 | "metadata": {
77 | "id": "obFi961MbQ46"
78 | }
79 | },
80 | {
81 | "cell_type": "code",
82 | "source": [
83 | "data = [15, 20, 35, 40, 50]\n",
84 | "median = statistics.median(data)\n",
85 | "print(\"Median:\", median)"
86 | ],
87 | "metadata": {
88 | "colab": {
89 | "base_uri": "https://localhost:8080/"
90 | },
91 | "id": "FrS8KYaXbPWy",
92 | "outputId": "f07a115a-5f18-462a-ae47-ab3c239db261"
93 | },
94 | "execution_count": null,
95 | "outputs": [
96 | {
97 | "output_type": "stream",
98 | "name": "stdout",
99 | "text": [
100 | "Median: 35\n"
101 | ]
102 | }
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "source": [
108 | "## 3. Mode"
109 | ],
110 | "metadata": {
111 | "id": "b9ybgj32bYKy"
112 | }
113 | },
114 | {
115 | "cell_type": "code",
116 | "source": [
117 | "data = [1, 2, 2, 3, 4, 4, 4]\n",
118 | "mode = statistics.mode(data)\n",
119 | "print(\"Mode:\", mode)"
120 | ],
121 | "metadata": {
122 | "colab": {
123 | "base_uri": "https://localhost:8080/"
124 | },
125 | "id": "AgrG9I5fbWU0",
126 | "outputId": "eebf7a08-b1d0-42f7-f982-3a20b9241082"
127 | },
128 | "execution_count": null,
129 | "outputs": [
130 | {
131 | "output_type": "stream",
132 | "name": "stdout",
133 | "text": [
134 | "Mode: 4\n"
135 | ]
136 | }
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "source": [
142 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n",
143 | "mode = statistics.mode(data)\n",
144 | "print(\"Modes:\", mode)"
145 | ],
146 | "metadata": {
147 | "colab": {
148 | "base_uri": "https://localhost:8080/"
149 | },
150 | "id": "d3D3oyVBccaa",
151 | "outputId": "93c7fa31-ca9c-429f-df8d-d93ca9eef080"
152 | },
153 | "execution_count": null,
154 | "outputs": [
155 | {
156 | "output_type": "stream",
157 | "name": "stdout",
158 | "text": [
159 | "Modes: 2\n"
160 | ]
161 | }
162 | ]
163 | },
164 | {
165 | "cell_type": "code",
166 | "source": [
167 | "data = [1, 2, 2, 2, 3, 4, 4, 4, 7, 7, 7]\n",
168 | "modes = statistics.multimode(data)\n",
169 | "print(\"Modes:\", modes)"
170 | ],
171 | "metadata": {
172 | "colab": {
173 | "base_uri": "https://localhost:8080/"
174 | },
175 | "id": "62_XzwJhcH3d",
176 | "outputId": "e7cf6cd4-50b3-42a5-b1ad-d45be40c602a"
177 | },
178 | "execution_count": null,
179 | "outputs": [
180 | {
181 | "output_type": "stream",
182 | "name": "stdout",
183 | "text": [
184 | "Modes: [2, 4, 7]\n"
185 | ]
186 | }
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "source": [
192 | "## 4. Standard Deviation"
193 | ],
194 | "metadata": {
195 | "id": "neQiIHTC6CtL"
196 | }
197 | },
198 | {
199 | "cell_type": "code",
200 | "source": [
201 | "data = [12, 15, 22, 29, 35]\n",
202 | "std_dev = statistics.stdev(data)\n",
203 | "print(f\"Standard Deviation: {std_dev:.3f}\")"
204 | ],
205 | "metadata": {
206 | "colab": {
207 | "base_uri": "https://localhost:8080/"
208 | },
209 | "id": "uY-DcaV4cRux",
210 | "outputId": "98166ea5-b57c-4e1b-f526-5cfbb3f9aed7"
211 | },
212 | "execution_count": null,
213 | "outputs": [
214 | {
215 | "output_type": "stream",
216 | "name": "stdout",
217 | "text": [
218 | "Standard Deviation: 9.555\n"
219 | ]
220 | }
221 | ]
222 | },
223 | {
224 | "cell_type": "markdown",
225 | "source": [
226 | "## 5. Variance"
227 | ],
228 | "metadata": {
229 | "id": "q6Ra31AD7jcU"
230 | }
231 | },
232 | {
233 | "cell_type": "code",
234 | "source": [
235 | "data = [8, 10, 12, 14, 16]\n",
236 | "variance = statistics.variance(data)\n",
237 | "print(f\"Variance: {variance:.2f}\")"
238 | ],
239 | "metadata": {
240 | "colab": {
241 | "base_uri": "https://localhost:8080/"
242 | },
243 | "id": "ALOJxc4V6G0a",
244 | "outputId": "ff7c8a7d-8250-4fc3-b48a-a59dc1e15877"
245 | },
246 | "execution_count": null,
247 | "outputs": [
248 | {
249 | "output_type": "stream",
250 | "name": "stdout",
251 | "text": [
252 | "Variance: 10.00\n"
253 | ]
254 | }
255 | ]
256 | },
257 | {
258 | "cell_type": "markdown",
259 | "source": [
260 | "## 6. Covariance"
261 | ],
262 | "metadata": {
263 | "id": "oXGwdDsci1AP"
264 | }
265 | },
266 | {
267 | "cell_type": "code",
268 | "source": [
269 | "data1 = [2, 4, 6, 8, 10]\n",
270 | "data2 = [1, 3, 5, 7, 9]\n",
271 | "covariance = statistics.covariance(data1, data2)\n",
272 | "print(\"Covariance:\", covariance)"
273 | ],
274 | "metadata": {
275 | "id": "5wjqe8n67uoT",
276 | "colab": {
277 | "base_uri": "https://localhost:8080/"
278 | },
279 | "outputId": "7c11b3be-9d00-47ef-b05d-ad6faff58c65"
280 | },
281 | "execution_count": 1,
282 | "outputs": [
283 | {
284 | "output_type": "stream",
285 | "name": "stdout",
286 | "text": [
287 | "Covariance: 10.0\n"
288 | ]
289 | }
290 | ]
291 | },
292 | {
293 | "cell_type": "markdown",
294 | "source": [
295 | "## 7. Quantiles"
296 | ],
297 | "metadata": {
298 | "id": "DqquyE0XmKg-"
299 | }
300 | },
301 | {
302 | "cell_type": "code",
303 | "source": [
304 | "data = [1, 5, 7, 9, 10, 12, 16, 18, 19, 21]\n",
305 | "# Quartiles\n",
306 | "quantiles = statistics.quantiles(data, n=4)\n",
307 | "print(\"Quantiles (Quartiles):\", quantiles)"
308 | ],
309 | "metadata": {
310 | "colab": {
311 | "base_uri": "https://localhost:8080/"
312 | },
313 | "id": "5p1xVng-kwju",
314 | "outputId": "903cb4f8-5bb6-488a-c582-62126fbff758"
315 | },
316 | "execution_count": 4,
317 | "outputs": [
318 | {
319 | "output_type": "stream",
320 | "name": "stdout",
321 | "text": [
322 | "Quantiles (Quartiles): [6.5, 11.0, 18.25]\n"
323 | ]
324 | }
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "source": [
330 | "## 8. Correlation"
331 | ],
332 | "metadata": {
333 | "id": "eUTp6xe2CCVM"
334 | }
335 | },
336 | {
337 | "cell_type": "code",
338 | "source": [
339 | "data1 = [1, 2, 3, 4, 5]\n",
340 | "data2 = [2, 4, 6, 8, 10]\n",
341 | "correlation = statistics.correlation(data1, data2)\n",
342 | "print(\"Correlation:\", correlation)"
343 | ],
344 | "metadata": {
345 | "colab": {
346 | "base_uri": "https://localhost:8080/"
347 | },
348 | "id": "1CFP4t68mO4r",
349 | "outputId": "a43c0f06-8c1e-4229-ab23-1c8a35aef2e7"
350 | },
351 | "execution_count": 5,
352 | "outputs": [
353 | {
354 | "output_type": "stream",
355 | "name": "stdout",
356 | "text": [
357 | "Correlation: 1.0\n"
358 | ]
359 | }
360 | ]
361 | },
362 | {
363 | "cell_type": "markdown",
364 | "source": [
365 | "## 9. Linear Regression"
366 | ],
367 | "metadata": {
368 | "id": "AMq5BdfuFMiB"
369 | }
370 | },
371 | {
372 | "cell_type": "code",
373 | "source": [
374 | "x = [1, 2, 3, 4, 5]\n",
375 | "y = [3, 4, 2, 5, 7]\n",
376 | "slope, intercept = statistics.linear_regression(x, y)\n",
377 | "print(\"Slope:\", slope)\n",
378 | "print(\"Intercept:\", intercept)"
379 | ],
380 | "metadata": {
381 | "colab": {
382 | "base_uri": "https://localhost:8080/"
383 | },
384 | "id": "TJVQAIjACFxz",
385 | "outputId": "e79e8709-0a67-4b51-dbb8-634fdc52ad3a"
386 | },
387 | "execution_count": 7,
388 | "outputs": [
389 | {
390 | "output_type": "stream",
391 | "name": "stdout",
392 | "text": [
393 | "Slope: 0.9\n",
394 | "Intercept: 1.5\n"
395 | ]
396 | }
397 | ]
398 | },
399 | {
400 | "cell_type": "markdown",
401 | "source": [
402 | "## 10. Normal Distribution"
403 | ],
404 | "metadata": {
405 | "id": "EjcEB4bcGILJ"
406 | }
407 | },
408 | {
409 | "cell_type": "code",
410 | "source": [
411 | "# Create a normal distribution with mean 30 and standard deviation 10\n",
412 | "normal_dist = statistics.NormalDist(mu=30, sigma=10)\n",
413 | "\n",
414 | "# Calculate the probability of a value less than or equal to 20\n",
415 | "probability = normal_dist.cdf(20)\n",
416 | "print(f\"Probability (CDF) of 20: {probability:.3f}\")\n",
417 | "\n",
418 | "# Calculate the value at the 97.5th percentile via the inverse CDF (this is an x-value of the distribution, ~49.6, not a standard-normal z-score)\n",
419 | "z_score = normal_dist.inv_cdf(0.975)\n",
420 | "print(f\"Z-score for 97.5th percentile: {z_score:.3f}\")"
421 | ],
422 | "metadata": {
423 | "colab": {
424 | "base_uri": "https://localhost:8080/"
425 | },
426 | "id": "sYkVaHDtFQ6m",
427 | "outputId": "2f1db0e8-3b5e-4764-8151-02e92b413513"
428 | },
429 | "execution_count": 11,
430 | "outputs": [
431 | {
432 | "output_type": "stream",
433 | "name": "stdout",
434 | "text": [
435 | "Probability (CDF) of 20: 0.159\n",
436 | "Z-score for 97.5th percentile: 49.600\n"
437 | ]
438 | }
439 | ]
440 | },
441 | {
442 | "cell_type": "code",
443 | "source": [],
444 | "metadata": {
445 | "id": "w8eCKz1tJy55"
446 | },
447 | "execution_count": null,
448 | "outputs": []
449 | }
450 | ]
451 | }
--------------------------------------------------------------------------------
/statistics/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/statistics/probability/README.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/statistics/probability/joint_and_conditional_pbty.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Step 1: Creating Sample Data"
21 | ],
22 | "metadata": {
23 | "id": "YjO9ZVZIM8Ye"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "colab": {
31 | "base_uri": "https://localhost:8080/"
32 | },
33 | "id": "sAXOTiD9Ltz0",
34 | "outputId": "63fdc6fe-0ae1-4b61-ba92-4e481cc8c561"
35 | },
36 | "outputs": [
37 | {
38 | "output_type": "stream",
39 | "name": "stdout",
40 | "text": [
41 | " Age_Group Sports_Interest\n",
42 | "0 Teen Yes\n",
43 | "1 Teen No\n",
44 | "2 Teen Yes\n",
45 | "3 Adult No\n",
46 | "4 Adult No\n",
47 | "5 Senior Yes\n",
48 | "6 Senior Yes\n",
49 | "7 Senior No\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "import pandas as pd\n",
55 | "\n",
56 | "# Sample data\n",
57 | "data = {\n",
58 | " \"Age_Group\": [\"Teen\", \"Teen\", \"Teen\", \"Adult\", \"Adult\", \"Senior\", \"Senior\", \"Senior\"],\n",
59 | " \"Sports_Interest\": [\"Yes\", \"No\", \"Yes\", \"No\", \"No\", \"Yes\", \"Yes\", \"No\"]\n",
60 | "}\n",
61 | "\n",
62 | "df = pd.DataFrame(data)\n",
63 | "\n",
64 | "# Display the data\n",
65 | "print(df)\n"
66 | ]
67 | },
68 | {
69 | "cell_type": "markdown",
70 | "source": [
71 | "## Step 2: Calculating Joint Probability"
72 | ],
73 | "metadata": {
74 | "id": "1VY0hLRKMWMr"
75 | }
76 | },
77 | {
78 | "cell_type": "code",
79 | "source": [
80 | "# Total number of observations\n",
81 | "total_count = len(df)\n",
82 | "\n",
83 | "# Count occurrences where Age_Group is \"Teen\" and Sports_Interest is \"Yes\"\n",
84 | "joint_count = len(df[(df['Age_Group'] == 'Teen') & (df['Sports_Interest'] == 'Yes')])\n",
85 | "\n",
86 | "# Joint probability\n",
87 | "joint_probability = joint_count / total_count\n",
88 | "\n",
89 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_probability}\")\n"
90 | ],
91 | "metadata": {
92 | "colab": {
93 | "base_uri": "https://localhost:8080/"
94 | },
95 | "id": "M32eM5NPMHNd",
96 | "outputId": "35e64e55-358f-471c-b583-ca322b9597c0"
97 | },
98 | "execution_count": null,
99 | "outputs": [
100 | {
101 | "output_type": "stream",
102 | "name": "stdout",
103 | "text": [
104 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n"
105 | ]
106 | }
107 | ]
108 | },
109 | {
110 | "cell_type": "markdown",
111 | "source": [
112 | "## Step 3: Calculating Conditional Probability"
113 | ],
114 | "metadata": {
115 | "id": "OIs1olhPMZgq"
116 | }
117 | },
118 | {
119 | "cell_type": "code",
120 | "source": [
121 | "# Filter data for Age_Group = \"Teen\"\n",
122 | "teen_data = df[df['Age_Group'] == 'Teen']\n",
123 | "\n",
124 | "# Count occurrences of Sports_Interest = \"Yes\" among teens\n",
125 | "conditional_count = len(teen_data[teen_data['Sports_Interest'] == 'Yes'])\n",
126 | "\n",
127 | "# Conditional probability\n",
128 | "conditional_probability = conditional_count / len(teen_data)\n",
129 | "\n",
130 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_probability:.3f}\")\n"
131 | ],
132 | "metadata": {
133 | "colab": {
134 | "base_uri": "https://localhost:8080/"
135 | },
136 | "id": "vMTq6kaKMJdd",
137 | "outputId": "559e5632-7ca7-44bd-9d59-4f2aeb19f50a"
138 | },
139 | "execution_count": null,
140 | "outputs": [
141 | {
142 | "output_type": "stream",
143 | "name": "stdout",
144 | "text": [
145 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n"
146 | ]
147 | }
148 | ]
149 | },
150 | {
151 | "cell_type": "markdown",
152 | "source": [
153 | "## Step 4: Generalizing with Functions"
154 | ],
155 | "metadata": {
156 | "id": "L2uNqq9zM1I2"
157 | }
158 | },
159 | {
160 | "cell_type": "code",
161 | "source": [
162 | "def calculate_joint_probability(df, condition1, condition2):\n",
163 | " total_count = len(df)\n",
164 | " joint_count = len(df[(df[condition1[0]] == condition1[1]) & (df[condition2[0]] == condition2[1])])\n",
165 | " return joint_count / total_count\n",
166 | "\n",
167 | "def calculate_conditional_probability(df, given_condition, target_condition):\n",
168 | " subset = df[df[given_condition[0]] == given_condition[1]]\n",
169 | " conditional_count = len(subset[subset[target_condition[0]] == target_condition[1]])\n",
170 | " return conditional_count / len(subset)\n"
171 | ],
172 | "metadata": {
173 | "id": "VGoD5_-2MMfE"
174 | },
175 | "execution_count": null,
176 | "outputs": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "source": [
181 | "# Joint Probability of \"Teen\" and \"Sports_Interest = Yes\"\n",
182 | "joint_prob = calculate_joint_probability(df, (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\"))\n",
183 | "print(f\"Joint Probability (Teen and Sports Interest Yes): {joint_prob}\")\n",
184 | "\n",
185 | "# Conditional Probability of \"Sports_Interest = Yes\" given \"Age_Group = Teen\"\n",
186 | "conditional_prob = calculate_conditional_probability(df, (\"Age_Group\", \"Teen\"), (\"Sports_Interest\", \"Yes\"))\n",
187 | "print(f\"Conditional Probability (Sports Interest Yes | Age Group Teen): {conditional_prob:.3f}\")\n"
188 | ],
189 | "metadata": {
190 | "colab": {
191 | "base_uri": "https://localhost:8080/"
192 | },
193 | "id": "rSEt6qJgMQQN",
194 | "outputId": "0f5e0527-f942-4f4a-8081-7ec6d4591708"
195 | },
196 | "execution_count": null,
197 | "outputs": [
198 | {
199 | "output_type": "stream",
200 | "name": "stdout",
201 | "text": [
202 | "Joint Probability (Teen and Sports Interest Yes): 0.25\n",
203 | "Conditional Probability (Sports Interest Yes | Age Group Teen): 0.667\n"
204 | ]
205 | }
206 | ]
207 | }
208 | ]
209 | }
--------------------------------------------------------------------------------
/statistics/sparse_data_analysis.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "id": "0R9gVhnIMrNH"
22 | },
23 | "outputs": [],
24 | "source": [
25 | "import numpy as np\n",
26 | "from scipy import sparse\n",
27 | "import pandas as pd\n",
28 | "from scipy import stats\n"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "source": [
34 | "# Create a sparse matrix where rows are users and columns are products\n",
35 | "# Only storing the actual interactions\n",
36 | "row = np.array([0, 3, 1, 0]) # User IDs\n",
37 | "col = np.array([0, 3, 1, 2]) # Product IDs\n",
38 | "data = np.array([4, 5, 7, 9]) # Interaction values (like ratings)\n",
39 | "\n",
40 | "# Create the sparse matrix\n",
41 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(4, 4))\n",
42 | "\n",
43 | "# seeing the sparse matrix as a regular matrix\n",
44 | "print(\"Here's our sparse matrix as a regular array:\")\n",
45 | "print(sparse_matrix.toarray())\n"
46 | ],
47 | "metadata": {
48 | "colab": {
49 | "base_uri": "https://localhost:8080/"
50 | },
51 | "id": "RkAQQ8QCMzM7",
52 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5"
53 | },
54 | "execution_count": 2,
55 | "outputs": [
56 | {
57 | "output_type": "stream",
58 | "name": "stdout",
59 | "text": [
60 | "Here's our sparse matrix as a regular array:\n",
61 | "[[4 0 9 0]\n",
62 | " [0 7 0 0]\n",
63 | " [0 0 0 0]\n",
64 | " [0 0 0 5]]\n"
65 | ]
66 | }
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "source": [
72 | "def calculate_sparse_mean(sparse_matrix):\n",
73 | " \"\"\"\n",
74 | " Calculate mean of non-zero elements in a sparse matrix.\n",
75 | " This is useful when zeros represent 'no data' rather than actual zeros.\n",
76 | " \"\"\"\n",
77 | " if sparse_matrix.nnz == 0: # nnz is the number of non-zero elements\n",
78 | " return 0.0\n",
79 | " return sparse_matrix.sum() / sparse_matrix.nnz\n",
80 | "\n",
81 | "mean_value = calculate_sparse_mean(sparse_matrix)\n",
82 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n"
83 | ],
84 | "metadata": {
85 | "colab": {
86 | "base_uri": "https://localhost:8080/"
87 | },
88 | "id": "Dz0BFJXXM1ia",
89 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5"
90 | },
91 | "execution_count": 3,
92 | "outputs": [
93 | {
94 | "output_type": "stream",
95 | "name": "stdout",
96 | "text": [
97 | "\n",
98 | "Mean of non-zero elements: 6.25\n"
99 | ]
100 | }
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "source": [
106 | "def analyze_row_patterns(sparse_matrix):\n",
107 | " \"\"\"\n",
108 | " Analyze patterns in each row of a sparse matrix.\n",
109 | " Returns dictionary with various row statistics.\n",
110 | " \"\"\"\n",
111 | " # Convert to CSR format for efficient row operations\n",
112 | " csr_matrix = sparse_matrix.tocsr()\n",
113 | "\n",
114 | " # Calculate statistics\n",
115 | " row_sums = np.array(csr_matrix.sum(axis=1)).flatten()\n",
116 | " row_nonzeros = np.diff(csr_matrix.indptr) # Number of non-zeros per row\n",
117 | "\n",
118 | " # Calculate means, handling empty rows\n",
119 | " row_means = np.zeros_like(row_sums, dtype=float)\n",
120 | " mask = row_nonzeros > 0\n",
121 | " row_means[mask] = row_sums[mask] / row_nonzeros[mask]\n",
122 | "\n",
123 | " return {\n",
124 | " 'activity_sum': row_sums, # Total activity per user\n",
125 | " 'interaction_count': row_nonzeros, # Number of interactions per user\n",
126 | " 'average_value': row_means # Average value per user\n",
127 | " }\n"
128 | ],
129 | "metadata": {
130 | "id": "SF3ygrrvM4Ks"
131 | },
132 | "execution_count": 4,
133 | "outputs": []
134 | },
135 | {
136 | "cell_type": "code",
137 | "source": [
138 | "row_stats = analyze_row_patterns(sparse_matrix)  # renamed: 'stats' would shadow scipy.stats imported above\n",
139 | "print(\"\\nUser Statistics:\")\n",
140 | "for i, (sum_val, count, mean) in enumerate(zip(\n",
141 | "    row_stats['activity_sum'],\n",
142 | "    row_stats['interaction_count'],\n",
143 | "    row_stats['average_value']\n",
144 | ")):\n",
145 | "    print(f\"User {i}: {count} interactions, \"\n",
146 | "          f\"total activity = {sum_val}, \"\n",
147 | "          f\"average value = {mean:.2f}\")\n"
148 | ],
149 | "metadata": {
150 | "colab": {
151 | "base_uri": "https://localhost:8080/"
152 | },
153 | "id": "IAzJ8tHRM519",
154 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04"
155 | },
156 | "execution_count": 5,
157 | "outputs": [
158 | {
159 | "output_type": "stream",
160 | "name": "stdout",
161 | "text": [
162 | "\n",
163 | "User Statistics:\n",
164 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n",
165 | "User 1: 1 interactions, total activity = 7, average value = 7.00\n",
166 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n",
167 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n"
168 | ]
169 | }
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "source": [
175 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n",
176 | " \"\"\"\n",
177 | " Calculate correlation between columns, considering only overlapping non-zero elements.\n",
178 | " Like finding which products are often rated similarly.\n",
179 | " \"\"\"\n",
180 | " # Convert to dense format for this calculation\n",
181 | " # (For very large matrices, you'd want to do this differently)\n",
182 | " dense_cols = sparse_matrix.toarray().T\n",
183 | " n_cols = dense_cols.shape[0]\n",
184 | " correlations = np.zeros((n_cols, n_cols))\n",
185 | "\n",
186 | " for i in range(n_cols):\n",
187 | " for j in range(i, n_cols):\n",
188 | " # Find where both columns have non-zero values\n",
189 | " mask = (dense_cols[i] != 0) & (dense_cols[j] != 0)\n",
190 | " if mask.sum() >= min_overlap:\n",
191 | " corr = stats.pearsonr(dense_cols[i][mask],\n",
192 | " dense_cols[j][mask])[0]\n",
193 | " correlations[i, j] = correlations[j, i] = corr\n",
194 | "\n",
195 | " return correlations"
196 | ],
197 | "metadata": {
198 | "id": "ADRakCn4M8KD"
199 | },
200 | "execution_count": 6,
201 | "outputs": []
202 | },
203 | {
204 | "cell_type": "code",
205 | "source": [
206 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)\n",
207 | "print(\"\\nCorrelation matrix:\")\n",
208 | "print(corr_matrix)"
209 | ],
210 | "metadata": {
211 | "colab": {
212 | "base_uri": "https://localhost:8080/"
213 | },
214 | "id": "7UuFzRB6M979",
215 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7"
216 | },
217 | "execution_count": 7,
218 | "outputs": [
219 | {
220 | "output_type": "stream",
221 | "name": "stdout",
222 | "text": [
223 | "\n",
224 | "Correlation matrix:\n",
225 | "[[0. 0. 0. 0.]\n",
226 | " [0. 0. 0. 0.]\n",
227 | " [0. 0. 0. 0.]\n",
228 | " [0. 0. 0. 0.]]\n"
229 | ]
230 | }
231 | ]
232 | }
233 | ]
234 | }
--------------------------------------------------------------------------------
/statistics/sparse_data_analysis_v0_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": []
7 | },
8 | "kernelspec": {
9 | "name": "python3",
10 | "display_name": "Python 3"
11 | },
12 | "language_info": {
13 | "name": "python"
14 | }
15 | },
16 | "cells": [
17 | {
18 | "cell_type": "markdown",
19 | "source": [
20 | "## Imports"
21 | ],
22 | "metadata": {
23 | "id": "DPt0ex-tOHxH"
24 | }
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": null,
29 | "metadata": {
30 | "id": "0R9gVhnIMrNH"
31 | },
32 | "outputs": [],
33 | "source": [
34 | "import numpy as np\n",
35 | "from scipy import sparse\n",
36 | "import pandas as pd\n",
37 | "from scipy import stats\n"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "source": [
43 | "## Creating a Sparse Matrix"
44 | ],
45 | "metadata": {
46 | "id": "M7kq8YzvOKG-"
47 | }
48 | },
49 | {
50 | "cell_type": "code",
51 | "source": [
52 | "# Create a sparse matrix where rows are users and columns are products\n",
53 | "# Only storing the actual interactions\n",
54 | "row = np.array([0, 3, 1, 0]) # User IDs\n",
55 | "col = np.array([0, 3, 1, 2]) # Product IDs\n",
56 | "data = np.array([4, 5, 7, 9]) # Interaction values (like ratings)\n",
57 | "\n",
58 | "# Create the sparse matrix\n",
59 | "sparse_matrix = sparse.coo_matrix((data, (row, col)), shape=(4, 4))\n",
60 | "\n",
61 | "# seeing the sparse matrix as a regular matrix\n",
62 | "print(\"Here's our sparse matrix as a regular array:\")\n",
63 | "print(sparse_matrix.toarray())\n"
64 | ],
65 | "metadata": {
66 | "colab": {
67 | "base_uri": "https://localhost:8080/"
68 | },
69 | "id": "RkAQQ8QCMzM7",
70 | "outputId": "96ee974b-f672-4c59-a965-4626b9bc1cf5"
71 | },
72 | "execution_count": null,
73 | "outputs": [
74 | {
75 | "output_type": "stream",
76 | "name": "stdout",
77 | "text": [
78 | "Here's our sparse matrix as a regular array:\n",
79 | "[[4 0 9 0]\n",
80 | " [0 7 0 0]\n",
81 | " [0 0 0 0]\n",
82 | " [0 0 0 5]]\n"
83 | ]
84 | }
85 | ]
86 | },
87 | {
88 | "cell_type": "markdown",
89 | "source": [
90 | "## Basic Statistical Analysis"
91 | ],
92 | "metadata": {
93 | "id": "VRm2aXYiOS3F"
94 | }
95 | },
96 | {
97 | "cell_type": "code",
98 | "source": [
99 | "def calculate_sparse_mean(sparse_matrix):\n",
100 | "    \"\"\"\n",
101 | "    Mean over the explicitly stored (non-zero) entries of a sparse matrix,\n",
102 | "    for data where zero means 'no observation' rather than a measured zero.\n",
103 | "    Returns 0.0 when the matrix stores no entries at all.\n",
104 | "    \"\"\"\n",
105 | "    stored = sparse_matrix.nnz  # count of explicitly stored entries\n",
106 | "    return sparse_matrix.sum() / stored if stored else 0.0\n",
107 | "\n",
108 | "mean_value = calculate_sparse_mean(sparse_matrix)\n",
109 | "print(f\"\\nMean of non-zero elements: {mean_value:.2f}\")\n"
110 | ],
111 | "metadata": {
112 | "colab": {
113 | "base_uri": "https://localhost:8080/"
114 | },
115 | "id": "Dz0BFJXXM1ia",
116 | "outputId": "d3b9092d-2218-477a-80c5-551fcbf19cd5"
117 | },
118 | "execution_count": null,
119 | "outputs": [
120 | {
121 | "output_type": "stream",
122 | "name": "stdout",
123 | "text": [
124 | "\n",
125 | "Mean of non-zero elements: 6.25\n"
126 | ]
127 | }
128 | ]
129 | },
130 | {
131 | "cell_type": "markdown",
132 | "source": [
133 | "## Handling Row and Column Statistics"
134 | ],
135 | "metadata": {
136 | "id": "fDhW59jyOWl1"
137 | }
138 | },
139 | {
140 | "cell_type": "code",
141 | "source": [
142 | "def analyze_row_patterns(sparse_matrix):\n",
143 | "    \"\"\"\n",
144 | "    Per-row summary statistics for a sparse matrix.\n",
145 | "    Returns a dict with each row's total, its count of stored entries,\n",
146 | "    and its mean over stored entries (0.0 for empty rows).\n",
147 | "    \"\"\"\n",
148 | "    # CSR keeps each row's entries contiguous, making row-wise work efficient\n",
149 | "    by_row = sparse_matrix.tocsr()\n",
150 | "\n",
151 | "    totals = np.asarray(by_row.sum(axis=1)).ravel()\n",
152 | "    counts = np.diff(by_row.indptr)  # stored entries per row, via indptr gaps\n",
153 | "\n",
154 | "    # Mean over stored entries only; rows with no entries keep 0.0\n",
155 | "    means = np.zeros_like(totals, dtype=float)\n",
156 | "    occupied = counts > 0\n",
157 | "    means[occupied] = totals[occupied] / counts[occupied]\n",
158 | "\n",
159 | "    return {\n",
160 | "        'activity_sum': totals,           # Total activity per user\n",
161 | "        'interaction_count': counts,      # Number of interactions per user\n",
162 | "        'average_value': means            # Average value per user\n",
163 | "    }\n"
164 | ],
165 | "metadata": {
166 | "id": "SF3ygrrvM4Ks"
167 | },
168 | "execution_count": null,
169 | "outputs": []
170 | },
171 | {
172 | "cell_type": "code",
173 | "source": [
174 | "row_stats = analyze_row_patterns(sparse_matrix)  # don't shadow scipy's `stats` module\n",
175 | "print(\"\\nUser Statistics:\")\n",
176 | "for i, (sum_val, count, mean) in enumerate(zip(\n",
177 | "    row_stats['activity_sum'],\n",
178 | "    row_stats['interaction_count'],\n",
179 | "    row_stats['average_value']\n",
180 | ")):\n",
181 | "    print(f\"User {i}: {count} interactions, \"\n",
182 | "          f\"total activity = {sum_val}, \"\n",
183 | "          f\"average value = {mean:.2f}\")\n"
184 | ],
185 | "metadata": {
186 | "colab": {
187 | "base_uri": "https://localhost:8080/"
188 | },
189 | "id": "IAzJ8tHRM519",
190 | "outputId": "52a67420-34d2-4d81-ce3f-4af60177be04"
191 | },
192 | "execution_count": null,
193 | "outputs": [
194 | {
195 | "output_type": "stream",
196 | "name": "stdout",
197 | "text": [
198 | "\n",
199 | "User Statistics:\n",
200 | "User 0: 2 interactions, total activity = 13, average value = 6.50\n",
201 | "User 1: 1 interactions, total activity = 7, average value = 7.00\n",
202 | "User 2: 0 interactions, total activity = 0, average value = 0.00\n",
203 | "User 3: 1 interactions, total activity = 5, average value = 5.00\n"
204 | ]
205 | }
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "source": [
211 | "## Correlation Analysis"
212 | ],
213 | "metadata": {
214 | "id": "m5ETMgcxOatl"
215 | }
216 | },
217 | {
218 | "cell_type": "code",
219 | "source": [
220 | "def calculate_sparse_correlation(sparse_matrix, min_overlap=2):\n",
221 | "    \"\"\"\n",
222 | "    Pearson correlation for each column pair, using only rows where both\n",
223 | "    columns have non-zero values; pairs below min_overlap rows stay 0.\n",
224 | "    \"\"\"\n",
225 | "    # Local import guards against the name `stats` being rebound elsewhere in the notebook\n",
226 | "    from scipy.stats import pearsonr\n",
227 | "    dense_cols = sparse_matrix.toarray().T\n",
228 | "    n_cols = dense_cols.shape[0]\n",
229 | "    correlations = np.zeros((n_cols, n_cols))\n",
230 | "\n",
231 | "    for i in range(n_cols):\n",
232 | "        for j in range(i, n_cols):\n",
233 | "            # Find where both columns have non-zero values\n",
234 | "            mask = (dense_cols[i] != 0) & (dense_cols[j] != 0)\n",
235 | "            if mask.sum() >= min_overlap:\n",
236 | "                corr = pearsonr(dense_cols[i][mask],\n",
237 | "                                dense_cols[j][mask])[0]\n",
238 | "                correlations[i, j] = correlations[j, i] = corr\n",
239 | "\n",
240 | "    return correlations"
241 | ],
242 | "metadata": {
243 | "id": "ADRakCn4M8KD"
244 | },
245 | "execution_count": null,
246 | "outputs": []
247 | },
248 | {
249 | "cell_type": "code",
250 | "source": [
251 | "corr_matrix = calculate_sparse_correlation(sparse_matrix)\n",
252 | "print(\"\\nCorrelation matrix:\")\n",
253 | "print(corr_matrix)"
254 | ],
255 | "metadata": {
256 | "colab": {
257 | "base_uri": "https://localhost:8080/"
258 | },
259 | "id": "7UuFzRB6M979",
260 | "outputId": "af68a7bc-e862-40bc-ead4-eac28fd5b1f7"
261 | },
262 | "execution_count": null,
263 | "outputs": [
264 | {
265 | "output_type": "stream",
266 | "name": "stdout",
267 | "text": [
268 | "\n",
269 | "Correlation matrix:\n",
270 | "[[0. 0. 0. 0.]\n",
271 | " [0. 0. 0. 0.]\n",
272 | " [0. 0. 0. 0.]\n",
273 | " [0. 0. 0. 0.]]\n"
274 | ]
275 | }
276 | ]
277 | }
278 | ]
279 | }
--------------------------------------------------------------------------------
/vibe-coding/speed-reader/README.md:
--------------------------------------------------------------------------------
1 | # RSVP Speed Reader 🚀
2 |
3 | A minimalist command-line speed reading application using RSVP (Rapid Serial Visual Presentation) technique. Read faster, comprehend better, and eliminate distractions with this clean terminal-based reader.
4 |
5 | ## Features
6 |
7 | - **Focused Reading**: Words appear one at a time in a fixed position, eliminating eye movement
8 | - **Variable Speed**: Adjustable reading speed from 50 to 1000+ words per minute
9 | - **Real-time Controls**: Pause, navigate, and adjust speed while reading
10 | - **Adaptive Display**: Uses 40% center area of your terminal with clean borders
11 | - **Progress Tracking**: Visual progress indicator and word count
12 | - **Multiple Input Methods**: Read from files or paste text directly
13 | - **Clean Interface**: Static controls, distraction-free design
14 | - **Smart Timing**: Longer words get slightly more display time
15 |
16 | ## Interface Overview
17 |
18 | ```
19 | Speed: 300 WPM Progress: 45/150
20 | ┌────────────────────────────────────────────┐
21 | │ │
22 | │ │
23 | │ c u r r e n t │
24 | │ │
25 | │ │
26 | └────────────────────────────────────────────┘
27 |
28 | SPACE = Pause/Resume
29 | ↑/↓ = Speed Up/Down
30 | ←/→ = Previous/Next
31 | Q = Quit
32 | ```
33 |
34 | ## 🚀 Quick Start
35 |
36 | ### Installation
37 |
38 | ```bash
39 | # Clone or download the script
40 | git clone https://github.com/yourusername/rsvp-speed-reader.git
41 | cd rsvp-speed-reader
42 |
43 | # Or download directly
44 | curl -O https://raw.githubusercontent.com/yourusername/rsvp-speed-reader/main/rsvp_reader.py
45 | ```
46 |
47 | ### Basic Usage
48 |
49 | ```bash
50 | # Read a text file
51 | python3 rsvp_reader.py -f your_document.txt
52 |
53 | # Read with custom speed (300 WPM)
54 | python3 rsvp_reader.py -f book.txt -w 300
55 |
56 | # Read text directly from command line
57 | python3 rsvp_reader.py "Your text content here to speed read through"
58 |
59 | # Read with word chunks (2 words at a time)
60 | python3 rsvp_reader.py -f article.txt -c 2 -w 200
61 |
62 | # Try with sample text (no arguments)
63 | python3 rsvp_reader.py
64 |
65 | # Make it executable
66 | chmod +x rsvp_reader.py
67 |
68 | # Then run
69 | ./rsvp_reader.py
70 | ```
71 |
72 | ## 🎮 Controls
73 |
74 | While reading, use these keyboard controls:
75 |
76 | | Key | Action |
77 | |-----|--------|
78 | | `SPACE` | Pause/Resume reading |
79 | | `↑` | Increase speed by 25 WPM |
80 | | `↓` | Decrease speed by 25 WPM |
81 | | `→` | Skip to next word |
82 | | `←` | Go back to previous word |
83 | | `Q` | Quit application |
84 |
85 | ## ⚙️ Command Line Options
86 |
87 | ```bash
88 | python3 rsvp_reader.py [OPTIONS] [INPUT]
89 | ```
90 |
91 | ### Arguments
92 |
93 | - `INPUT` - Text file path or direct text string (optional)
94 |
95 | ### Options
96 |
97 | - `-f, --file` - Treat input as file path
98 | - `-w, --wpm INTEGER` - Words per minute (default: 150)
99 | - `-c, --chunk INTEGER` - Words per chunk (default: 1)
100 | - `-h, --help` - Show help message
101 |
102 | ### Examples with Options
103 |
104 | ```bash
105 | # Slow reading with larger chunks
106 | python3 rsvp_reader.py -f textbook.txt -w 120 -c 3
107 |
108 | # Fast reading for familiar content
109 | python3 rsvp_reader.py -f news.txt -w 500
110 |
111 | # Medium pace for technical content
112 | python3 rsvp_reader.py -f documentation.txt -w 200 -c 2
113 | ```
114 |
115 | ## 📋 Requirements
116 |
117 | - **Python 3.6+** - No additional packages required
118 | - **Terminal** - Works in any terminal that supports ANSI escape codes
119 | - **Operating System** - Linux, macOS, Windows (with proper terminal)
120 |
121 |
122 | ### Recommended Starting Points
123 |
124 | - **New to speed reading**: Start at 150 WPM
125 | - **Experienced reader**: Start at 250 WPM
126 | - **Technical content**: Use 120-180 WPM with chunks
127 | - **Light reading**: 300-400 WPM is comfortable
128 |
129 |
130 | ## 🐛 Troubleshooting
131 |
132 | ### Common Issues
133 |
134 | **Terminal not clearing properly**
135 | ```bash
136 | # Reset terminal
137 | reset
138 | # Or try
139 | tput reset
140 | ```
141 |
142 | **Keyboard controls not responding**
143 | - Ensure terminal has focus
144 | - Try running in different terminal emulator
145 | - Check if running over SSH with proper terminal settings
146 |
147 | **Words appearing off-center**
148 | - Resize terminal window
149 | - Try different terminal dimensions
150 | - App adapts to terminal size automatically
151 |
152 | **Speed too fast/slow to start**
153 | ```bash
154 | # Start with comfortable speed
155 | python3 rsvp_reader.py -f file.txt -w 150
156 |
157 | # Adjust in real-time with ↑/↓ keys
158 | ```
159 |
160 | ## Acknowledgments
161 |
162 | - Built through collaborative "vibe coding"
163 | - Inspired by RSVP research and speed reading techniques
164 |
165 | Happy Speed Reading! 📚⚡
166 |
--------------------------------------------------------------------------------