├── Deep Learning └── 2h_intro_to_nn_march_2024.ipynb ├── Machine Learning ├── ML.ipynb └── loan_train_data.csv ├── Natural Language Processing ├── Intro to NLP.pdf ├── Preprocesare_text.ipynb ├── Reprezentarea_Cuvintelor.ipynb └── Solutie Reprezentarea_Cuvintelor.ipynb ├── Python and Data Visualisation ├── Python.ipynb ├── csv_pretty_print.ipynb └── csv_pretty_print_sol.ipynb ├── README.md └── Transformers └── Transfer_Learning_&_Transformers_NITRO2024.ipynb /Machine Learning/ML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "b89ea37c-03c2-484b-8bf5-7543dc8746f2", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import pandas as pd\n", 11 | "import matplotlib.pyplot as plt" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "id": "bab41031-3c93-4d3d-9f82-b3b638e7d44c", 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "df = pd.read_csv('https://storage.googleapis.com/datasets-3456/loan_train_data.csv')" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "id": "1953ac39-1a06-4f1b-8c5a-0ce462f67132", 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "df" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "id": "96644df6-5639-433c-98db-6ab1f39d3c80", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "df['ApplicantIncome']" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "id": "ee6419f6-4981-48ea-b010-48d5440fe632", 48 | "metadata": { 49 | "scrolled": true 50 | }, 51 | "outputs": [], 52 | "source": [ 53 | "df['ApplicantIncome'] * 2 " 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "id": "c8111922-62cd-4d7b-84fa-ec1052462c15", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df['ApplicantIncome'] > 5000" 64 | ] 
65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "id": "3c7a3e16-385d-4641-a588-f1b2c89a376e", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "df['ApplicantIncome'].loc[df['ApplicantIncome'] > 5000]" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "id": "58c3318b-0ec4-4f70-9564-f63616376247", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "df" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "id": "3effef68-66c5-4c23-b3c5-c512d0f4da3f", 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [ 93 | "df['Property_Area']" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "id": "160673d9-a7cc-4e0f-83ba-7d4a58e5c690", 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "df['Property_Area'].unique()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "id": "e4803bd2-9101-469e-ad5f-1000a0a1a44d", 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "df['Property_Area'].value_counts()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "9381d606-2a61-4935-b5f6-720f566de3bd", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "plt.bar(df['Property_Area'].value_counts().index, df['Property_Area'].value_counts())" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "id": "c79f69f1-dc15-482c-b47c-ef0c29a5b6ca", 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "df['ApplicantIncome']" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "ff8018bc-d357-4e7a-a512-991785566a63", 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "plt.hist(df['ApplicantIncome'], bins=80)" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": null, 149 | "id": "f6403e40-8eab-4cbc-8313-61f1eff1888f", 150 | 
"metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "df['Gender'].unique() # NaN -> Not a Number" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "id": "ee0049e2-727c-453b-9040-4b692f9e2230", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "df['GE'] = 0" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "id": "24a655f0-a4bf-44d5-a636-1fc801d0ec66", 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "df" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "id": "97b54b6c-2695-4492-ad58-43fff3b25165", 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "df.loc[df['Gender'] == 'Female', 'GE'] = 1\n", 184 | "df.loc[df['Gender'] == 'Male', 'GE'] = 2" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "id": "5818e494-00cd-4912-92b4-a9429731d908", 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "df" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": null, 200 | "id": "bdbd29c0-f63d-40a7-af7e-bd787398a576", 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "df.drop(columns=['Loan_ID', 'Gender'], inplace=True)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "id": "b353ab98-4fd8-43d4-b4a8-3794ba04ec2f", 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "df" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": null, 220 | "id": "457857c3-2b4a-4c53-9ee2-f56501bae616", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "df['Married'].value_counts()" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "id": "492b10dd-0aa7-419a-bde1-429de03c9722", 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "df['Married'] = df['Married'].map({'No': 0, 'Yes': 1})" 235 | 
] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "id": "ed6c6909-eb44-4770-ba32-357b65bca7e4", 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "df" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "id": "61774e90-09d4-40f5-af83-9008f1181cde", 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "df['Dependents'].value_counts()" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "id": "3fc79e96-d907-4486-8dda-3d015f9f5e6f", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "df['Dependents'] = df['Dependents'].map({'0': 0, '1': 1, '2': 2, '3+': 3})" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "id": "f1c24d13-cfee-4f79-a096-37218c5569cc", 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "df" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "id": "6d1f4b85-f0d2-4c3f-921a-9211ee70297f", 281 | "metadata": {}, 282 | "outputs": [], 283 | "source": [ 284 | "df['Education'].value_counts()" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "0c63f1a0-b429-401b-9124-df5b8201c135", 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "df['Education'] = df['Education'].map({'Graduate': 1, 'Not Graduate': 0})" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "id": "147a32cd-125c-42d7-a0c7-5ccff47b7664", 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "df" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "id": "55b1640f-1f4e-493c-b485-26473c17f815", 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "df['Property_Area'].value_counts()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "id": 
"e7997516-ea6c-4c4a-9509-599d4695fa6d", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "df = pd.get_dummies(df, columns=['Property_Area'])" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": null, 330 | "id": "f504b309-4281-42ce-bb8b-e3cff58bcc3a", 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "df.info()" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "id": "69697549-38b2-4203-bad3-39928c1ea725", 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "df['LoanAmount']" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "id": "94cc0376-1166-44e4-8129-2ec4fa94f5f8", 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "df['LoanAmount'].mean()" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "id": "b5941bc0-a767-4c35-85c4-f4f1531336fc", 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [ 364 | "df['LoanAmount'].isna()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "6558f40b-599d-4b55-9235-15ebb4651439", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [ 374 | "df.loc[df['LoanAmount'].isna(), 'LoanAmount'] = df['LoanAmount'].mean()" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": null, 380 | "id": "8f7af119-d80f-4942-a8b9-21465c6d5abd", 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "df.info()" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": null, 390 | "id": "aaa0d12b-d3f9-4e27-b1bc-81a469864612", 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "df.loc[df['Loan_Amount_Term'].isna(), 'Loan_Amount_Term'] = df['Loan_Amount_Term'].mean()" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": null, 400 | "id": "d63c8a20-6c8e-4879-9a73-a826003e70e0", 401 | "metadata": {}, 402 | 
"outputs": [], 403 | "source": [ 404 | "df.loc[df['Credit_History'].isna(), 'Credit_History'] = df['Credit_History'].mode()[0]" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": null, 410 | "id": "a1f9f820-95b1-4127-bd9d-10f46937f460", 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "df" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "3917713f-78b3-4074-8328-ad7811f7ae7e", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [ 424 | "X = df[ [\n", 425 | " 'Married',\n", 426 | " 'Dependents',\n", 427 | " 'Education',\n", 428 | " 'ApplicantIncome',\n", 429 | " 'CoapplicantIncome',\n", 430 | " 'LoanAmount',\n", 431 | " 'Loan_Amount_Term',\n", 432 | " 'Credit_History',\n", 433 | " 'Property_Area_Rural',\n", 434 | " 'Property_Area_Semiurban',\n", 435 | " 'Property_Area_Urban',\n", 436 | "] ].copy()\n", 437 | "X" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "id": "b89b9994-583c-4e2e-9633-016845cf11aa", 444 | "metadata": {}, 445 | "outputs": [], 446 | "source": [ 447 | "X.info()" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "id": "47cbc95c-17e2-40c7-917c-f9effd0fa51a", 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "X.loc[X['Dependents'].isna(), 'Dependents'] = X['Dependents'].mode()[0]" 458 | ] 459 | }, 460 | { 461 | "cell_type": "code", 462 | "execution_count": null, 463 | "id": "cc358a31-c08a-42cd-9e14-32cfa3ad6fd2", 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "X.loc[X['Married'].isna(), 'Married'] = X['Married'].mode()[0]" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "id": "465c1369-f583-46fe-bb19-5952d5fb33a4", 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "X.info()" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "id": 
"c982dd48-6e8a-4494-b701-b9735ad70a36", 484 | "metadata": { 485 | "scrolled": true 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "y = df['Loan_Status'].map({'Y': 1, 'N': 0})\n", 490 | "y" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "id": "6746ca5c-8795-409d-b27d-11a83b8de6db", 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "from sklearn.neighbors import KNeighborsClassifier" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "id": "7cea4463-e91c-4792-8156-f2074b6c3f87", 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "X.info()" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": null, 516 | "id": "351a1823-180c-4bdf-bc72-b1ff21647dd0", 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "model = KNeighborsClassifier()\n", 521 | "\n", 522 | "model.fit(X, y)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "id": "507c8fc9-472f-43f3-8416-9c23ce07993a", 529 | "metadata": {}, 530 | "outputs": [], 531 | "source": [ 532 | "model.score(X, y)" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "id": "128a22ba-f336-4de5-b792-646fbd2d30bd", 539 | "metadata": {}, 540 | "outputs": [], 541 | "source": [ 542 | "from sklearn.ensemble import RandomForestClassifier\n", 543 | "\n", 544 | "model = RandomForestClassifier()\n", 545 | "\n", 546 | "model.fit(X, y)\n", 547 | "\n", 548 | "model.score(X, y)" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "id": "facb230b-4f95-40e4-ba97-a381e889a039", 555 | "metadata": {}, 556 | "outputs": [], 557 | "source": [ 558 | "from sklearn.model_selection import train_test_split\n", 559 | "\n", 560 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": null, 566 | 
"id": "b6dd3ecb-e6b4-4301-865b-06e96782725b", 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "model = RandomForestClassifier()\n", 571 | "\n", 572 | "model.fit(X_train, y_train)\n", 573 | "\n", 574 | "print(model.score(X_train, y_train))\n", 575 | "print(model.score(X_test, y_test))" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": null, 581 | "id": "4da92ef1-3338-4bc5-b2d4-9e46b73b28f6", 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [] 585 | } 586 | ], 587 | "metadata": { 588 | "kernelspec": { 589 | "display_name": "Python 3 (ipykernel)", 590 | "language": "python", 591 | "name": "python3" 592 | }, 593 | "language_info": { 594 | "codemirror_mode": { 595 | "name": "ipython", 596 | "version": 3 597 | }, 598 | "file_extension": ".py", 599 | "mimetype": "text/x-python", 600 | "name": "python", 601 | "nbconvert_exporter": "python", 602 | "pygments_lexer": "ipython3", 603 | "version": "3.11.7" 604 | } 605 | }, 606 | "nbformat": 4, 607 | "nbformat_minor": 5 608 | } 609 | -------------------------------------------------------------------------------- /Machine Learning/loan_train_data.csv: -------------------------------------------------------------------------------- 1 | Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status 2 | LP001002,Male,No,0,Graduate,No,5849,0,,360,1,Urban,Y 3 | LP001003,Male,Yes,1,Graduate,No,4583,1508,128,360,1,Rural,N 4 | LP001005,Male,Yes,0,Graduate,Yes,3000,0,66,360,1,Urban,Y 5 | LP001006,Male,Yes,0,Not Graduate,No,2583,2358,120,360,1,Urban,Y 6 | LP001008,Male,No,0,Graduate,No,6000,0,141,360,1,Urban,Y 7 | LP001011,Male,Yes,2,Graduate,Yes,5417,4196,267,360,1,Urban,Y 8 | LP001013,Male,Yes,0,Not Graduate,No,2333,1516,95,360,1,Urban,Y 9 | LP001014,Male,Yes,3+,Graduate,No,3036,2504,158,360,0,Semiurban,N 10 | LP001018,Male,Yes,2,Graduate,No,4006,1526,168,360,1,Urban,Y 11 | 
LP001020,Male,Yes,1,Graduate,No,12841,10968,349,360,1,Semiurban,N 12 | LP001024,Male,Yes,2,Graduate,No,3200,700,70,360,1,Urban,Y 13 | LP001027,Male,Yes,2,Graduate,,2500,1840,109,360,1,Urban,Y 14 | LP001028,Male,Yes,2,Graduate,No,3073,8106,200,360,1,Urban,Y 15 | LP001029,Male,No,0,Graduate,No,1853,2840,114,360,1,Rural,N 16 | LP001030,Male,Yes,2,Graduate,No,1299,1086,17,120,1,Urban,Y 17 | LP001032,Male,No,0,Graduate,No,4950,0,125,360,1,Urban,Y 18 | LP001034,Male,No,1,Not Graduate,No,3596,0,100,240,,Urban,Y 19 | LP001036,Female,No,0,Graduate,No,3510,0,76,360,0,Urban,N 20 | LP001038,Male,Yes,0,Not Graduate,No,4887,0,133,360,1,Rural,N 21 | LP001041,Male,Yes,0,Graduate,,2600,3500,115,,1,Urban,Y 22 | LP001043,Male,Yes,0,Not Graduate,No,7660,0,104,360,0,Urban,N 23 | LP001046,Male,Yes,1,Graduate,No,5955,5625,315,360,1,Urban,Y 24 | LP001047,Male,Yes,0,Not Graduate,No,2600,1911,116,360,0,Semiurban,N 25 | LP001050,,Yes,2,Not Graduate,No,3365,1917,112,360,0,Rural,N 26 | LP001052,Male,Yes,1,Graduate,,3717,2925,151,360,,Semiurban,N 27 | LP001066,Male,Yes,0,Graduate,Yes,9560,0,191,360,1,Semiurban,Y 28 | LP001068,Male,Yes,0,Graduate,No,2799,2253,122,360,1,Semiurban,Y 29 | LP001073,Male,Yes,2,Not Graduate,No,4226,1040,110,360,1,Urban,Y 30 | LP001086,Male,No,0,Not Graduate,No,1442,0,35,360,1,Urban,N 31 | LP001087,Female,No,2,Graduate,,3750,2083,120,360,1,Semiurban,Y 32 | LP001091,Male,Yes,1,Graduate,,4166,3369,201,360,,Urban,N 33 | LP001095,Male,No,0,Graduate,No,3167,0,74,360,1,Urban,N 34 | LP001097,Male,No,1,Graduate,Yes,4692,0,106,360,1,Rural,N 35 | LP001098,Male,Yes,0,Graduate,No,3500,1667,114,360,1,Semiurban,Y 36 | LP001100,Male,No,3+,Graduate,No,12500,3000,320,360,1,Rural,N 37 | LP001106,Male,Yes,0,Graduate,No,2275,2067,,360,1,Urban,Y 38 | LP001109,Male,Yes,0,Graduate,No,1828,1330,100,,0,Urban,N 39 | LP001112,Female,Yes,0,Graduate,No,3667,1459,144,360,1,Semiurban,Y 40 | LP001114,Male,No,0,Graduate,No,4166,7210,184,360,1,Urban,Y 41 | LP001116,Male,No,0,Not 
Graduate,No,3748,1668,110,360,1,Semiurban,Y 42 | LP001119,Male,No,0,Graduate,No,3600,0,80,360,1,Urban,N 43 | LP001120,Male,No,0,Graduate,No,1800,1213,47,360,1,Urban,Y 44 | LP001123,Male,Yes,0,Graduate,No,2400,0,75,360,,Urban,Y 45 | LP001131,Male,Yes,0,Graduate,No,3941,2336,134,360,1,Semiurban,Y 46 | LP001136,Male,Yes,0,Not Graduate,Yes,4695,0,96,,1,Urban,Y 47 | LP001137,Female,No,0,Graduate,No,3410,0,88,,1,Urban,Y 48 | LP001138,Male,Yes,1,Graduate,No,5649,0,44,360,1,Urban,Y 49 | LP001144,Male,Yes,0,Graduate,No,5821,0,144,360,1,Urban,Y 50 | LP001146,Female,Yes,0,Graduate,No,2645,3440,120,360,0,Urban,N 51 | LP001151,Female,No,0,Graduate,No,4000,2275,144,360,1,Semiurban,Y 52 | LP001155,Female,Yes,0,Not Graduate,No,1928,1644,100,360,1,Semiurban,Y 53 | LP001157,Female,No,0,Graduate,No,3086,0,120,360,1,Semiurban,Y 54 | LP001164,Female,No,0,Graduate,No,4230,0,112,360,1,Semiurban,N 55 | LP001179,Male,Yes,2,Graduate,No,4616,0,134,360,1,Urban,N 56 | LP001186,Female,Yes,1,Graduate,Yes,11500,0,286,360,0,Urban,N 57 | LP001194,Male,Yes,2,Graduate,No,2708,1167,97,360,1,Semiurban,Y 58 | LP001195,Male,Yes,0,Graduate,No,2132,1591,96,360,1,Semiurban,Y 59 | LP001197,Male,Yes,0,Graduate,No,3366,2200,135,360,1,Rural,N 60 | LP001198,Male,Yes,1,Graduate,No,8080,2250,180,360,1,Urban,Y 61 | LP001199,Male,Yes,2,Not Graduate,No,3357,2859,144,360,1,Urban,Y 62 | LP001205,Male,Yes,0,Graduate,No,2500,3796,120,360,1,Urban,Y 63 | LP001206,Male,Yes,3+,Graduate,No,3029,0,99,360,1,Urban,Y 64 | LP001207,Male,Yes,0,Not Graduate,Yes,2609,3449,165,180,0,Rural,N 65 | LP001213,Male,Yes,1,Graduate,No,4945,0,,360,0,Rural,N 66 | LP001222,Female,No,0,Graduate,No,4166,0,116,360,0,Semiurban,N 67 | LP001225,Male,Yes,0,Graduate,No,5726,4595,258,360,1,Semiurban,N 68 | LP001228,Male,No,0,Not Graduate,No,3200,2254,126,180,0,Urban,N 69 | LP001233,Male,Yes,1,Graduate,No,10750,0,312,360,1,Urban,Y 70 | LP001238,Male,Yes,3+,Not Graduate,Yes,7100,0,125,60,1,Urban,Y 71 | 
LP001241,Female,No,0,Graduate,No,4300,0,136,360,0,Semiurban,N 72 | LP001243,Male,Yes,0,Graduate,No,3208,3066,172,360,1,Urban,Y 73 | LP001245,Male,Yes,2,Not Graduate,Yes,1875,1875,97,360,1,Semiurban,Y 74 | LP001248,Male,No,0,Graduate,No,3500,0,81,300,1,Semiurban,Y 75 | LP001250,Male,Yes,3+,Not Graduate,No,4755,0,95,,0,Semiurban,N 76 | LP001253,Male,Yes,3+,Graduate,Yes,5266,1774,187,360,1,Semiurban,Y 77 | LP001255,Male,No,0,Graduate,No,3750,0,113,480,1,Urban,N 78 | LP001256,Male,No,0,Graduate,No,3750,4750,176,360,1,Urban,N 79 | LP001259,Male,Yes,1,Graduate,Yes,1000,3022,110,360,1,Urban,N 80 | LP001263,Male,Yes,3+,Graduate,No,3167,4000,180,300,0,Semiurban,N 81 | LP001264,Male,Yes,3+,Not Graduate,Yes,3333,2166,130,360,,Semiurban,Y 82 | LP001265,Female,No,0,Graduate,No,3846,0,111,360,1,Semiurban,Y 83 | LP001266,Male,Yes,1,Graduate,Yes,2395,0,,360,1,Semiurban,Y 84 | LP001267,Female,Yes,2,Graduate,No,1378,1881,167,360,1,Urban,N 85 | LP001273,Male,Yes,0,Graduate,No,6000,2250,265,360,,Semiurban,N 86 | LP001275,Male,Yes,1,Graduate,No,3988,0,50,240,1,Urban,Y 87 | LP001279,Male,No,0,Graduate,No,2366,2531,136,360,1,Semiurban,Y 88 | LP001280,Male,Yes,2,Not Graduate,No,3333,2000,99,360,,Semiurban,Y 89 | LP001282,Male,Yes,0,Graduate,No,2500,2118,104,360,1,Semiurban,Y 90 | LP001289,Male,No,0,Graduate,No,8566,0,210,360,1,Urban,Y 91 | LP001310,Male,Yes,0,Graduate,No,5695,4167,175,360,1,Semiurban,Y 92 | LP001316,Male,Yes,0,Graduate,No,2958,2900,131,360,1,Semiurban,Y 93 | LP001318,Male,Yes,2,Graduate,No,6250,5654,188,180,1,Semiurban,Y 94 | LP001319,Male,Yes,2,Not Graduate,No,3273,1820,81,360,1,Urban,Y 95 | LP001322,Male,No,0,Graduate,No,4133,0,122,360,1,Semiurban,Y 96 | LP001325,Male,No,0,Not Graduate,No,3620,0,25,120,1,Semiurban,Y 97 | LP001326,Male,No,0,Graduate,,6782,0,,360,,Urban,N 98 | LP001327,Female,Yes,0,Graduate,No,2484,2302,137,360,1,Semiurban,Y 99 | LP001333,Male,Yes,0,Graduate,No,1977,997,50,360,1,Semiurban,Y 100 | LP001334,Male,Yes,0,Not 
Graduate,No,4188,0,115,180,1,Semiurban,Y 101 | LP001343,Male,Yes,0,Graduate,No,1759,3541,131,360,1,Semiurban,Y 102 | LP001345,Male,Yes,2,Not Graduate,No,4288,3263,133,180,1,Urban,Y 103 | LP001349,Male,No,0,Graduate,No,4843,3806,151,360,1,Semiurban,Y 104 | LP001350,Male,Yes,,Graduate,No,13650,0,,360,1,Urban,Y 105 | LP001356,Male,Yes,0,Graduate,No,4652,3583,,360,1,Semiurban,Y 106 | LP001357,Male,,,Graduate,No,3816,754,160,360,1,Urban,Y 107 | LP001367,Male,Yes,1,Graduate,No,3052,1030,100,360,1,Urban,Y 108 | LP001369,Male,Yes,2,Graduate,No,11417,1126,225,360,1,Urban,Y 109 | LP001370,Male,No,0,Not Graduate,,7333,0,120,360,1,Rural,N 110 | LP001379,Male,Yes,2,Graduate,No,3800,3600,216,360,0,Urban,N 111 | LP001384,Male,Yes,3+,Not Graduate,No,2071,754,94,480,1,Semiurban,Y 112 | LP001385,Male,No,0,Graduate,No,5316,0,136,360,1,Urban,Y 113 | LP001387,Female,Yes,0,Graduate,,2929,2333,139,360,1,Semiurban,Y 114 | LP001391,Male,Yes,0,Not Graduate,No,3572,4114,152,,0,Rural,N 115 | LP001392,Female,No,1,Graduate,Yes,7451,0,,360,1,Semiurban,Y 116 | LP001398,Male,No,0,Graduate,,5050,0,118,360,1,Semiurban,Y 117 | LP001401,Male,Yes,1,Graduate,No,14583,0,185,180,1,Rural,Y 118 | LP001404,Female,Yes,0,Graduate,No,3167,2283,154,360,1,Semiurban,Y 119 | LP001405,Male,Yes,1,Graduate,No,2214,1398,85,360,,Urban,Y 120 | LP001421,Male,Yes,0,Graduate,No,5568,2142,175,360,1,Rural,N 121 | LP001422,Female,No,0,Graduate,No,10408,0,259,360,1,Urban,Y 122 | LP001426,Male,Yes,,Graduate,No,5667,2667,180,360,1,Rural,Y 123 | LP001430,Female,No,0,Graduate,No,4166,0,44,360,1,Semiurban,Y 124 | LP001431,Female,No,0,Graduate,No,2137,8980,137,360,0,Semiurban,Y 125 | LP001432,Male,Yes,2,Graduate,No,2957,0,81,360,1,Semiurban,Y 126 | LP001439,Male,Yes,0,Not Graduate,No,4300,2014,194,360,1,Rural,Y 127 | LP001443,Female,No,0,Graduate,No,3692,0,93,360,,Rural,Y 128 | LP001448,,Yes,3+,Graduate,No,23803,0,370,360,1,Rural,Y 129 | LP001449,Male,No,0,Graduate,No,3865,1640,,360,1,Rural,Y 130 | 
LP001451,Male,Yes,1,Graduate,Yes,10513,3850,160,180,0,Urban,N 131 | LP001465,Male,Yes,0,Graduate,No,6080,2569,182,360,,Rural,N 132 | LP001469,Male,No,0,Graduate,Yes,20166,0,650,480,,Urban,Y 133 | LP001473,Male,No,0,Graduate,No,2014,1929,74,360,1,Urban,Y 134 | LP001478,Male,No,0,Graduate,No,2718,0,70,360,1,Semiurban,Y 135 | LP001482,Male,Yes,0,Graduate,Yes,3459,0,25,120,1,Semiurban,Y 136 | LP001487,Male,No,0,Graduate,No,4895,0,102,360,1,Semiurban,Y 137 | LP001488,Male,Yes,3+,Graduate,No,4000,7750,290,360,1,Semiurban,N 138 | LP001489,Female,Yes,0,Graduate,No,4583,0,84,360,1,Rural,N 139 | LP001491,Male,Yes,2,Graduate,Yes,3316,3500,88,360,1,Urban,Y 140 | LP001492,Male,No,0,Graduate,No,14999,0,242,360,0,Semiurban,N 141 | LP001493,Male,Yes,2,Not Graduate,No,4200,1430,129,360,1,Rural,N 142 | LP001497,Male,Yes,2,Graduate,No,5042,2083,185,360,1,Rural,N 143 | LP001498,Male,No,0,Graduate,No,5417,0,168,360,1,Urban,Y 144 | LP001504,Male,No,0,Graduate,Yes,6950,0,175,180,1,Semiurban,Y 145 | LP001507,Male,Yes,0,Graduate,No,2698,2034,122,360,1,Semiurban,Y 146 | LP001508,Male,Yes,2,Graduate,No,11757,0,187,180,1,Urban,Y 147 | LP001514,Female,Yes,0,Graduate,No,2330,4486,100,360,1,Semiurban,Y 148 | LP001516,Female,Yes,2,Graduate,No,14866,0,70,360,1,Urban,Y 149 | LP001518,Male,Yes,1,Graduate,No,1538,1425,30,360,1,Urban,Y 150 | LP001519,Female,No,0,Graduate,No,10000,1666,225,360,1,Rural,N 151 | LP001520,Male,Yes,0,Graduate,No,4860,830,125,360,1,Semiurban,Y 152 | LP001528,Male,No,0,Graduate,No,6277,0,118,360,0,Rural,N 153 | LP001529,Male,Yes,0,Graduate,Yes,2577,3750,152,360,1,Rural,Y 154 | LP001531,Male,No,0,Graduate,No,9166,0,244,360,1,Urban,N 155 | LP001532,Male,Yes,2,Not Graduate,No,2281,0,113,360,1,Rural,N 156 | LP001535,Male,No,0,Graduate,No,3254,0,50,360,1,Urban,Y 157 | LP001536,Male,Yes,3+,Graduate,No,39999,0,600,180,0,Semiurban,Y 158 | LP001541,Male,Yes,1,Graduate,No,6000,0,160,360,,Rural,Y 159 | LP001543,Male,Yes,1,Graduate,No,9538,0,187,360,1,Urban,Y 160 | 
LP001546,Male,No,0,Graduate,,2980,2083,120,360,1,Rural,Y 161 | LP001552,Male,Yes,0,Graduate,No,4583,5625,255,360,1,Semiurban,Y 162 | LP001560,Male,Yes,0,Not Graduate,No,1863,1041,98,360,1,Semiurban,Y 163 | LP001562,Male,Yes,0,Graduate,No,7933,0,275,360,1,Urban,N 164 | LP001565,Male,Yes,1,Graduate,No,3089,1280,121,360,0,Semiurban,N 165 | LP001570,Male,Yes,2,Graduate,No,4167,1447,158,360,1,Rural,Y 166 | LP001572,Male,Yes,0,Graduate,No,9323,0,75,180,1,Urban,Y 167 | LP001574,Male,Yes,0,Graduate,No,3707,3166,182,,1,Rural,Y 168 | LP001577,Female,Yes,0,Graduate,No,4583,0,112,360,1,Rural,N 169 | LP001578,Male,Yes,0,Graduate,No,2439,3333,129,360,1,Rural,Y 170 | LP001579,Male,No,0,Graduate,No,2237,0,63,480,0,Semiurban,N 171 | LP001580,Male,Yes,2,Graduate,No,8000,0,200,360,1,Semiurban,Y 172 | LP001581,Male,Yes,0,Not Graduate,,1820,1769,95,360,1,Rural,Y 173 | LP001585,,Yes,3+,Graduate,No,51763,0,700,300,1,Urban,Y 174 | LP001586,Male,Yes,3+,Not Graduate,No,3522,0,81,180,1,Rural,N 175 | LP001594,Male,Yes,0,Graduate,No,5708,5625,187,360,1,Semiurban,Y 176 | LP001603,Male,Yes,0,Not Graduate,Yes,4344,736,87,360,1,Semiurban,N 177 | LP001606,Male,Yes,0,Graduate,No,3497,1964,116,360,1,Rural,Y 178 | LP001608,Male,Yes,2,Graduate,No,2045,1619,101,360,1,Rural,Y 179 | LP001610,Male,Yes,3+,Graduate,No,5516,11300,495,360,0,Semiurban,N 180 | LP001616,Male,Yes,1,Graduate,No,3750,0,116,360,1,Semiurban,Y 181 | LP001630,Male,No,0,Not Graduate,No,2333,1451,102,480,0,Urban,N 182 | LP001633,Male,Yes,1,Graduate,No,6400,7250,180,360,0,Urban,N 183 | LP001634,Male,No,0,Graduate,No,1916,5063,67,360,,Rural,N 184 | LP001636,Male,Yes,0,Graduate,No,4600,0,73,180,1,Semiurban,Y 185 | LP001637,Male,Yes,1,Graduate,No,33846,0,260,360,1,Semiurban,N 186 | LP001639,Female,Yes,0,Graduate,No,3625,0,108,360,1,Semiurban,Y 187 | LP001640,Male,Yes,0,Graduate,Yes,39147,4750,120,360,1,Semiurban,Y 188 | LP001641,Male,Yes,1,Graduate,Yes,2178,0,66,300,0,Rural,N 189 | LP001643,Male,Yes,0,Graduate,No,2383,2138,58,360,,Rural,Y 190 
| LP001644,,Yes,0,Graduate,Yes,674,5296,168,360,1,Rural,Y 191 | LP001647,Male,Yes,0,Graduate,No,9328,0,188,180,1,Rural,Y 192 | LP001653,Male,No,0,Not Graduate,No,4885,0,48,360,1,Rural,Y 193 | LP001656,Male,No,0,Graduate,No,12000,0,164,360,1,Semiurban,N 194 | LP001657,Male,Yes,0,Not Graduate,No,6033,0,160,360,1,Urban,N 195 | LP001658,Male,No,0,Graduate,No,3858,0,76,360,1,Semiurban,Y 196 | LP001664,Male,No,0,Graduate,No,4191,0,120,360,1,Rural,Y 197 | LP001665,Male,Yes,1,Graduate,No,3125,2583,170,360,1,Semiurban,N 198 | LP001666,Male,No,0,Graduate,No,8333,3750,187,360,1,Rural,Y 199 | LP001669,Female,No,0,Not Graduate,No,1907,2365,120,,1,Urban,Y 200 | LP001671,Female,Yes,0,Graduate,No,3416,2816,113,360,,Semiurban,Y 201 | LP001673,Male,No,0,Graduate,Yes,11000,0,83,360,1,Urban,N 202 | LP001674,Male,Yes,1,Not Graduate,No,2600,2500,90,360,1,Semiurban,Y 203 | LP001677,Male,No,2,Graduate,No,4923,0,166,360,0,Semiurban,Y 204 | LP001682,Male,Yes,3+,Not Graduate,No,3992,0,,180,1,Urban,N 205 | LP001688,Male,Yes,1,Not Graduate,No,3500,1083,135,360,1,Urban,Y 206 | LP001691,Male,Yes,2,Not Graduate,No,3917,0,124,360,1,Semiurban,Y 207 | LP001692,Female,No,0,Not Graduate,No,4408,0,120,360,1,Semiurban,Y 208 | LP001693,Female,No,0,Graduate,No,3244,0,80,360,1,Urban,Y 209 | LP001698,Male,No,0,Not Graduate,No,3975,2531,55,360,1,Rural,Y 210 | LP001699,Male,No,0,Graduate,No,2479,0,59,360,1,Urban,Y 211 | LP001702,Male,No,0,Graduate,No,3418,0,127,360,1,Semiurban,N 212 | LP001708,Female,No,0,Graduate,No,10000,0,214,360,1,Semiurban,N 213 | LP001711,Male,Yes,3+,Graduate,No,3430,1250,128,360,0,Semiurban,N 214 | LP001713,Male,Yes,1,Graduate,Yes,7787,0,240,360,1,Urban,Y 215 | LP001715,Male,Yes,3+,Not Graduate,Yes,5703,0,130,360,1,Rural,Y 216 | LP001716,Male,Yes,0,Graduate,No,3173,3021,137,360,1,Urban,Y 217 | LP001720,Male,Yes,3+,Not Graduate,No,3850,983,100,360,1,Semiurban,Y 218 | LP001722,Male,Yes,0,Graduate,No,150,1800,135,360,1,Rural,N 219 | 
LP001726,Male,Yes,0,Graduate,No,3727,1775,131,360,1,Semiurban,Y 220 | LP001732,Male,Yes,2,Graduate,,5000,0,72,360,0,Semiurban,N 221 | LP001734,Female,Yes,2,Graduate,No,4283,2383,127,360,,Semiurban,Y 222 | LP001736,Male,Yes,0,Graduate,No,2221,0,60,360,0,Urban,N 223 | LP001743,Male,Yes,2,Graduate,No,4009,1717,116,360,1,Semiurban,Y 224 | LP001744,Male,No,0,Graduate,No,2971,2791,144,360,1,Semiurban,Y 225 | LP001749,Male,Yes,0,Graduate,No,7578,1010,175,,1,Semiurban,Y 226 | LP001750,Male,Yes,0,Graduate,No,6250,0,128,360,1,Semiurban,Y 227 | LP001751,Male,Yes,0,Graduate,No,3250,0,170,360,1,Rural,N 228 | LP001754,Male,Yes,,Not Graduate,Yes,4735,0,138,360,1,Urban,N 229 | LP001758,Male,Yes,2,Graduate,No,6250,1695,210,360,1,Semiurban,Y 230 | LP001760,Male,,,Graduate,No,4758,0,158,480,1,Semiurban,Y 231 | LP001761,Male,No,0,Graduate,Yes,6400,0,200,360,1,Rural,Y 232 | LP001765,Male,Yes,1,Graduate,No,2491,2054,104,360,1,Semiurban,Y 233 | LP001768,Male,Yes,0,Graduate,,3716,0,42,180,1,Rural,Y 234 | LP001770,Male,No,0,Not Graduate,No,3189,2598,120,,1,Rural,Y 235 | LP001776,Female,No,0,Graduate,No,8333,0,280,360,1,Semiurban,Y 236 | LP001778,Male,Yes,1,Graduate,No,3155,1779,140,360,1,Semiurban,Y 237 | LP001784,Male,Yes,1,Graduate,No,5500,1260,170,360,1,Rural,Y 238 | LP001786,Male,Yes,0,Graduate,,5746,0,255,360,,Urban,N 239 | LP001788,Female,No,0,Graduate,Yes,3463,0,122,360,,Urban,Y 240 | LP001790,Female,No,1,Graduate,No,3812,0,112,360,1,Rural,Y 241 | LP001792,Male,Yes,1,Graduate,No,3315,0,96,360,1,Semiurban,Y 242 | LP001798,Male,Yes,2,Graduate,No,5819,5000,120,360,1,Rural,Y 243 | LP001800,Male,Yes,1,Not Graduate,No,2510,1983,140,180,1,Urban,N 244 | LP001806,Male,No,0,Graduate,No,2965,5701,155,60,1,Urban,Y 245 | LP001807,Male,Yes,2,Graduate,Yes,6250,1300,108,360,1,Rural,Y 246 | LP001811,Male,Yes,0,Not Graduate,No,3406,4417,123,360,1,Semiurban,Y 247 | LP001813,Male,No,0,Graduate,Yes,6050,4333,120,180,1,Urban,N 248 | LP001814,Male,Yes,2,Graduate,No,9703,0,112,360,1,Urban,Y 249 | 
LP001819,Male,Yes,1,Not Graduate,No,6608,0,137,180,1,Urban,Y 250 | LP001824,Male,Yes,1,Graduate,No,2882,1843,123,480,1,Semiurban,Y 251 | LP001825,Male,Yes,0,Graduate,No,1809,1868,90,360,1,Urban,Y 252 | LP001835,Male,Yes,0,Not Graduate,No,1668,3890,201,360,0,Semiurban,N 253 | LP001836,Female,No,2,Graduate,No,3427,0,138,360,1,Urban,N 254 | LP001841,Male,No,0,Not Graduate,Yes,2583,2167,104,360,1,Rural,Y 255 | LP001843,Male,Yes,1,Not Graduate,No,2661,7101,279,180,1,Semiurban,Y 256 | LP001844,Male,No,0,Graduate,Yes,16250,0,192,360,0,Urban,N 257 | LP001846,Female,No,3+,Graduate,No,3083,0,255,360,1,Rural,Y 258 | LP001849,Male,No,0,Not Graduate,No,6045,0,115,360,0,Rural,N 259 | LP001854,Male,Yes,3+,Graduate,No,5250,0,94,360,1,Urban,N 260 | LP001859,Male,Yes,0,Graduate,No,14683,2100,304,360,1,Rural,N 261 | LP001864,Male,Yes,3+,Not Graduate,No,4931,0,128,360,,Semiurban,N 262 | LP001865,Male,Yes,1,Graduate,No,6083,4250,330,360,,Urban,Y 263 | LP001868,Male,No,0,Graduate,No,2060,2209,134,360,1,Semiurban,Y 264 | LP001870,Female,No,1,Graduate,No,3481,0,155,36,1,Semiurban,N 265 | LP001871,Female,No,0,Graduate,No,7200,0,120,360,1,Rural,Y 266 | LP001872,Male,No,0,Graduate,Yes,5166,0,128,360,1,Semiurban,Y 267 | LP001875,Male,No,0,Graduate,No,4095,3447,151,360,1,Rural,Y 268 | LP001877,Male,Yes,2,Graduate,No,4708,1387,150,360,1,Semiurban,Y 269 | LP001882,Male,Yes,3+,Graduate,No,4333,1811,160,360,0,Urban,Y 270 | LP001883,Female,No,0,Graduate,,3418,0,135,360,1,Rural,N 271 | LP001884,Female,No,1,Graduate,No,2876,1560,90,360,1,Urban,Y 272 | LP001888,Female,No,0,Graduate,No,3237,0,30,360,1,Urban,Y 273 | LP001891,Male,Yes,0,Graduate,No,11146,0,136,360,1,Urban,Y 274 | LP001892,Male,No,0,Graduate,No,2833,1857,126,360,1,Rural,Y 275 | LP001894,Male,Yes,0,Graduate,No,2620,2223,150,360,1,Semiurban,Y 276 | LP001896,Male,Yes,2,Graduate,No,3900,0,90,360,1,Semiurban,Y 277 | LP001900,Male,Yes,1,Graduate,No,2750,1842,115,360,1,Semiurban,Y 278 | 
LP001903,Male,Yes,0,Graduate,No,3993,3274,207,360,1,Semiurban,Y 279 | LP001904,Male,Yes,0,Graduate,No,3103,1300,80,360,1,Urban,Y 280 | LP001907,Male,Yes,0,Graduate,No,14583,0,436,360,1,Semiurban,Y 281 | LP001908,Female,Yes,0,Not Graduate,No,4100,0,124,360,,Rural,Y 282 | LP001910,Male,No,1,Not Graduate,Yes,4053,2426,158,360,0,Urban,N 283 | LP001914,Male,Yes,0,Graduate,No,3927,800,112,360,1,Semiurban,Y 284 | LP001915,Male,Yes,2,Graduate,No,2301,985.7999878,78,180,1,Urban,Y 285 | LP001917,Female,No,0,Graduate,No,1811,1666,54,360,1,Urban,Y 286 | LP001922,Male,Yes,0,Graduate,No,20667,0,,360,1,Rural,N 287 | LP001924,Male,No,0,Graduate,No,3158,3053,89,360,1,Rural,Y 288 | LP001925,Female,No,0,Graduate,Yes,2600,1717,99,300,1,Semiurban,N 289 | LP001926,Male,Yes,0,Graduate,No,3704,2000,120,360,1,Rural,Y 290 | LP001931,Female,No,0,Graduate,No,4124,0,115,360,1,Semiurban,Y 291 | LP001935,Male,No,0,Graduate,No,9508,0,187,360,1,Rural,Y 292 | LP001936,Male,Yes,0,Graduate,No,3075,2416,139,360,1,Rural,Y 293 | LP001938,Male,Yes,2,Graduate,No,4400,0,127,360,0,Semiurban,N 294 | LP001940,Male,Yes,2,Graduate,No,3153,1560,134,360,1,Urban,Y 295 | LP001945,Female,No,,Graduate,No,5417,0,143,480,0,Urban,N 296 | LP001947,Male,Yes,0,Graduate,No,2383,3334,172,360,1,Semiurban,Y 297 | LP001949,Male,Yes,3+,Graduate,,4416,1250,110,360,1,Urban,Y 298 | LP001953,Male,Yes,1,Graduate,No,6875,0,200,360,1,Semiurban,Y 299 | LP001954,Female,Yes,1,Graduate,No,4666,0,135,360,1,Urban,Y 300 | LP001955,Female,No,0,Graduate,No,5000,2541,151,480,1,Rural,N 301 | LP001963,Male,Yes,1,Graduate,No,2014,2925,113,360,1,Urban,N 302 | LP001964,Male,Yes,0,Not Graduate,No,1800,2934,93,360,0,Urban,N 303 | LP001972,Male,Yes,,Not Graduate,No,2875,1750,105,360,1,Semiurban,Y 304 | LP001974,Female,No,0,Graduate,No,5000,0,132,360,1,Rural,Y 305 | LP001977,Male,Yes,1,Graduate,No,1625,1803,96,360,1,Urban,Y 306 | LP001978,Male,No,0,Graduate,No,4000,2500,140,360,1,Rural,Y 307 | LP001990,Male,No,0,Not Graduate,No,2000,0,,360,1,Urban,N 308 
| LP001993,Female,No,0,Graduate,No,3762,1666,135,360,1,Rural,Y 309 | LP001994,Female,No,0,Graduate,No,2400,1863,104,360,0,Urban,N 310 | LP001996,Male,No,0,Graduate,No,20233,0,480,360,1,Rural,N 311 | LP001998,Male,Yes,2,Not Graduate,No,7667,0,185,360,,Rural,Y 312 | LP002002,Female,No,0,Graduate,No,2917,0,84,360,1,Semiurban,Y 313 | LP002004,Male,No,0,Not Graduate,No,2927,2405,111,360,1,Semiurban,Y 314 | LP002006,Female,No,0,Graduate,No,2507,0,56,360,1,Rural,Y 315 | LP002008,Male,Yes,2,Graduate,Yes,5746,0,144,84,,Rural,Y 316 | LP002024,,Yes,0,Graduate,No,2473,1843,159,360,1,Rural,N 317 | LP002031,Male,Yes,1,Not Graduate,No,3399,1640,111,180,1,Urban,Y 318 | LP002035,Male,Yes,2,Graduate,No,3717,0,120,360,1,Semiurban,Y 319 | LP002036,Male,Yes,0,Graduate,No,2058,2134,88,360,,Urban,Y 320 | LP002043,Female,No,1,Graduate,No,3541,0,112,360,,Semiurban,Y 321 | LP002050,Male,Yes,1,Graduate,Yes,10000,0,155,360,1,Rural,N 322 | LP002051,Male,Yes,0,Graduate,No,2400,2167,115,360,1,Semiurban,Y 323 | LP002053,Male,Yes,3+,Graduate,No,4342,189,124,360,1,Semiurban,Y 324 | LP002054,Male,Yes,2,Not Graduate,No,3601,1590,,360,1,Rural,Y 325 | LP002055,Female,No,0,Graduate,No,3166,2985,132,360,,Rural,Y 326 | LP002065,Male,Yes,3+,Graduate,No,15000,0,300,360,1,Rural,Y 327 | LP002067,Male,Yes,1,Graduate,Yes,8666,4983,376,360,0,Rural,N 328 | LP002068,Male,No,0,Graduate,No,4917,0,130,360,0,Rural,Y 329 | LP002082,Male,Yes,0,Graduate,Yes,5818,2160,184,360,1,Semiurban,Y 330 | LP002086,Female,Yes,0,Graduate,No,4333,2451,110,360,1,Urban,N 331 | LP002087,Female,No,0,Graduate,No,2500,0,67,360,1,Urban,Y 332 | LP002097,Male,No,1,Graduate,No,4384,1793,117,360,1,Urban,Y 333 | LP002098,Male,No,0,Graduate,No,2935,0,98,360,1,Semiurban,Y 334 | LP002100,Male,No,,Graduate,No,2833,0,71,360,1,Urban,Y 335 | LP002101,Male,Yes,0,Graduate,,63337,0,490,180,1,Urban,Y 336 | LP002103,,Yes,1,Graduate,Yes,9833,1833,182,180,1,Urban,Y 337 | LP002106,Male,Yes,,Graduate,Yes,5503,4490,70,,1,Semiurban,Y 338 | 
LP002110,Male,Yes,1,Graduate,,5250,688,160,360,1,Rural,Y 339 | LP002112,Male,Yes,2,Graduate,Yes,2500,4600,176,360,1,Rural,Y 340 | LP002113,Female,No,3+,Not Graduate,No,1830,0,,360,0,Urban,N 341 | LP002114,Female,No,0,Graduate,No,4160,0,71,360,1,Semiurban,Y 342 | LP002115,Male,Yes,3+,Not Graduate,No,2647,1587,173,360,1,Rural,N 343 | LP002116,Female,No,0,Graduate,No,2378,0,46,360,1,Rural,N 344 | LP002119,Male,Yes,1,Not Graduate,No,4554,1229,158,360,1,Urban,Y 345 | LP002126,Male,Yes,3+,Not Graduate,No,3173,0,74,360,1,Semiurban,Y 346 | LP002128,Male,Yes,2,Graduate,,2583,2330,125,360,1,Rural,Y 347 | LP002129,Male,Yes,0,Graduate,No,2499,2458,160,360,1,Semiurban,Y 348 | LP002130,Male,Yes,,Not Graduate,No,3523,3230,152,360,0,Rural,N 349 | LP002131,Male,Yes,2,Not Graduate,No,3083,2168,126,360,1,Urban,Y 350 | LP002137,Male,Yes,0,Graduate,No,6333,4583,259,360,,Semiurban,Y 351 | LP002138,Male,Yes,0,Graduate,No,2625,6250,187,360,1,Rural,Y 352 | LP002139,Male,Yes,0,Graduate,No,9083,0,228,360,1,Semiurban,Y 353 | LP002140,Male,No,0,Graduate,No,8750,4167,308,360,1,Rural,N 354 | LP002141,Male,Yes,3+,Graduate,No,2666,2083,95,360,1,Rural,Y 355 | LP002142,Female,Yes,0,Graduate,Yes,5500,0,105,360,0,Rural,N 356 | LP002143,Female,Yes,0,Graduate,No,2423,505,130,360,1,Semiurban,Y 357 | LP002144,Female,No,,Graduate,No,3813,0,116,180,1,Urban,Y 358 | LP002149,Male,Yes,2,Graduate,No,8333,3167,165,360,1,Rural,Y 359 | LP002151,Male,Yes,1,Graduate,No,3875,0,67,360,1,Urban,N 360 | LP002158,Male,Yes,0,Not Graduate,No,3000,1666,100,480,0,Urban,N 361 | LP002160,Male,Yes,3+,Graduate,No,5167,3167,200,360,1,Semiurban,Y 362 | LP002161,Female,No,1,Graduate,No,4723,0,81,360,1,Semiurban,N 363 | LP002170,Male,Yes,2,Graduate,No,5000,3667,236,360,1,Semiurban,Y 364 | LP002175,Male,Yes,0,Graduate,No,4750,2333,130,360,1,Urban,Y 365 | LP002178,Male,Yes,0,Graduate,No,3013,3033,95,300,,Urban,Y 366 | LP002180,Male,No,0,Graduate,Yes,6822,0,141,360,1,Rural,Y 367 | LP002181,Male,No,0,Not 
Graduate,No,6216,0,133,360,1,Rural,N 368 | LP002187,Male,No,0,Graduate,No,2500,0,96,480,1,Semiurban,N 369 | LP002188,Male,No,0,Graduate,No,5124,0,124,,0,Rural,N 370 | LP002190,Male,Yes,1,Graduate,No,6325,0,175,360,1,Semiurban,Y 371 | LP002191,Male,Yes,0,Graduate,No,19730,5266,570,360,1,Rural,N 372 | LP002194,Female,No,0,Graduate,Yes,15759,0,55,360,1,Semiurban,Y 373 | LP002197,Male,Yes,2,Graduate,No,5185,0,155,360,1,Semiurban,Y 374 | LP002201,Male,Yes,2,Graduate,Yes,9323,7873,380,300,1,Rural,Y 375 | LP002205,Male,No,1,Graduate,No,3062,1987,111,180,0,Urban,N 376 | LP002209,Female,No,0,Graduate,,2764,1459,110,360,1,Urban,Y 377 | LP002211,Male,Yes,0,Graduate,No,4817,923,120,180,1,Urban,Y 378 | LP002219,Male,Yes,3+,Graduate,No,8750,4996,130,360,1,Rural,Y 379 | LP002223,Male,Yes,0,Graduate,No,4310,0,130,360,,Semiurban,Y 380 | LP002224,Male,No,0,Graduate,No,3069,0,71,480,1,Urban,N 381 | LP002225,Male,Yes,2,Graduate,No,5391,0,130,360,1,Urban,Y 382 | LP002226,Male,Yes,0,Graduate,,3333,2500,128,360,1,Semiurban,Y 383 | LP002229,Male,No,0,Graduate,No,5941,4232,296,360,1,Semiurban,Y 384 | LP002231,Female,No,0,Graduate,No,6000,0,156,360,1,Urban,Y 385 | LP002234,Male,No,0,Graduate,Yes,7167,0,128,360,1,Urban,Y 386 | LP002236,Male,Yes,2,Graduate,No,4566,0,100,360,1,Urban,N 387 | LP002237,Male,No,1,Graduate,,3667,0,113,180,1,Urban,Y 388 | LP002239,Male,No,0,Not Graduate,No,2346,1600,132,360,1,Semiurban,Y 389 | LP002243,Male,Yes,0,Not Graduate,No,3010,3136,,360,0,Urban,N 390 | LP002244,Male,Yes,0,Graduate,No,2333,2417,136,360,1,Urban,Y 391 | LP002250,Male,Yes,0,Graduate,No,5488,0,125,360,1,Rural,Y 392 | LP002255,Male,No,3+,Graduate,No,9167,0,185,360,1,Rural,Y 393 | LP002262,Male,Yes,3+,Graduate,No,9504,0,275,360,1,Rural,Y 394 | LP002263,Male,Yes,0,Graduate,No,2583,2115,120,360,,Urban,Y 395 | LP002265,Male,Yes,2,Not Graduate,No,1993,1625,113,180,1,Semiurban,Y 396 | LP002266,Male,Yes,2,Graduate,No,3100,1400,113,360,1,Urban,Y 397 | 
LP002272,Male,Yes,2,Graduate,No,3276,484,135,360,,Semiurban,Y 398 | LP002277,Female,No,0,Graduate,No,3180,0,71,360,0,Urban,N 399 | LP002281,Male,Yes,0,Graduate,No,3033,1459,95,360,1,Urban,Y 400 | LP002284,Male,No,0,Not Graduate,No,3902,1666,109,360,1,Rural,Y 401 | LP002287,Female,No,0,Graduate,No,1500,1800,103,360,0,Semiurban,N 402 | LP002288,Male,Yes,2,Not Graduate,No,2889,0,45,180,0,Urban,N 403 | LP002296,Male,No,0,Not Graduate,No,2755,0,65,300,1,Rural,N 404 | LP002297,Male,No,0,Graduate,No,2500,20000,103,360,1,Semiurban,Y 405 | LP002300,Female,No,0,Not Graduate,No,1963,0,53,360,1,Semiurban,Y 406 | LP002301,Female,No,0,Graduate,Yes,7441,0,194,360,1,Rural,N 407 | LP002305,Female,No,0,Graduate,No,4547,0,115,360,1,Semiurban,Y 408 | LP002308,Male,Yes,0,Not Graduate,No,2167,2400,115,360,1,Urban,Y 409 | LP002314,Female,No,0,Not Graduate,No,2213,0,66,360,1,Rural,Y 410 | LP002315,Male,Yes,1,Graduate,No,8300,0,152,300,0,Semiurban,N 411 | LP002317,Male,Yes,3+,Graduate,No,81000,0,360,360,0,Rural,N 412 | LP002318,Female,No,1,Not Graduate,Yes,3867,0,62,360,1,Semiurban,N 413 | LP002319,Male,Yes,0,Graduate,,6256,0,160,360,,Urban,Y 414 | LP002328,Male,Yes,0,Not Graduate,No,6096,0,218,360,0,Rural,N 415 | LP002332,Male,Yes,0,Not Graduate,No,2253,2033,110,360,1,Rural,Y 416 | LP002335,Female,Yes,0,Not Graduate,No,2149,3237,178,360,0,Semiurban,N 417 | LP002337,Female,No,0,Graduate,No,2995,0,60,360,1,Urban,Y 418 | LP002341,Female,No,1,Graduate,No,2600,0,160,360,1,Urban,N 419 | LP002342,Male,Yes,2,Graduate,Yes,1600,20000,239,360,1,Urban,N 420 | LP002345,Male,Yes,0,Graduate,No,1025,2773,112,360,1,Rural,Y 421 | LP002347,Male,Yes,0,Graduate,No,3246,1417,138,360,1,Semiurban,Y 422 | LP002348,Male,Yes,0,Graduate,No,5829,0,138,360,1,Rural,Y 423 | LP002357,Female,No,0,Not Graduate,No,2720,0,80,,0,Urban,N 424 | LP002361,Male,Yes,0,Graduate,No,1820,1719,100,360,1,Urban,Y 425 | LP002362,Male,Yes,1,Graduate,No,7250,1667,110,,0,Urban,N 426 | 
LP002364,Male,Yes,0,Graduate,No,14880,0,96,360,1,Semiurban,Y 427 | LP002366,Male,Yes,0,Graduate,No,2666,4300,121,360,1,Rural,Y 428 | LP002367,Female,No,1,Not Graduate,No,4606,0,81,360,1,Rural,N 429 | LP002368,Male,Yes,2,Graduate,No,5935,0,133,360,1,Semiurban,Y 430 | LP002369,Male,Yes,0,Graduate,No,2920,16.12000084,87,360,1,Rural,Y 431 | LP002370,Male,No,0,Not Graduate,No,2717,0,60,180,1,Urban,Y 432 | LP002377,Female,No,1,Graduate,Yes,8624,0,150,360,1,Semiurban,Y 433 | LP002379,Male,No,0,Graduate,No,6500,0,105,360,0,Rural,N 434 | LP002386,Male,No,0,Graduate,,12876,0,405,360,1,Semiurban,Y 435 | LP002387,Male,Yes,0,Graduate,No,2425,2340,143,360,1,Semiurban,Y 436 | LP002390,Male,No,0,Graduate,No,3750,0,100,360,1,Urban,Y 437 | LP002393,Female,,,Graduate,No,10047,0,,240,1,Semiurban,Y 438 | LP002398,Male,No,0,Graduate,No,1926,1851,50,360,1,Semiurban,Y 439 | LP002401,Male,Yes,0,Graduate,No,2213,1125,,360,1,Urban,Y 440 | LP002403,Male,No,0,Graduate,Yes,10416,0,187,360,0,Urban,N 441 | LP002407,Female,Yes,0,Not Graduate,Yes,7142,0,138,360,1,Rural,Y 442 | LP002408,Male,No,0,Graduate,No,3660,5064,187,360,1,Semiurban,Y 443 | LP002409,Male,Yes,0,Graduate,No,7901,1833,180,360,1,Rural,Y 444 | LP002418,Male,No,3+,Not Graduate,No,4707,1993,148,360,1,Semiurban,Y 445 | LP002422,Male,No,1,Graduate,No,37719,0,152,360,1,Semiurban,Y 446 | LP002424,Male,Yes,0,Graduate,No,7333,8333,175,300,,Rural,Y 447 | LP002429,Male,Yes,1,Graduate,Yes,3466,1210,130,360,1,Rural,Y 448 | LP002434,Male,Yes,2,Not Graduate,No,4652,0,110,360,1,Rural,Y 449 | LP002435,Male,Yes,0,Graduate,,3539,1376,55,360,1,Rural,N 450 | LP002443,Male,Yes,2,Graduate,No,3340,1710,150,360,0,Rural,N 451 | LP002444,Male,No,1,Not Graduate,Yes,2769,1542,190,360,,Semiurban,N 452 | LP002446,Male,Yes,2,Not Graduate,No,2309,1255,125,360,0,Rural,N 453 | LP002447,Male,Yes,2,Not Graduate,No,1958,1456,60,300,,Urban,Y 454 | LP002448,Male,Yes,0,Graduate,No,3948,1733,149,360,0,Rural,N 455 | LP002449,Male,Yes,0,Graduate,No,2483,2466,90,180,0,Rural,Y 
456 | LP002453,Male,No,0,Graduate,Yes,7085,0,84,360,1,Semiurban,Y 457 | LP002455,Male,Yes,2,Graduate,No,3859,0,96,360,1,Semiurban,Y 458 | LP002459,Male,Yes,0,Graduate,No,4301,0,118,360,1,Urban,Y 459 | LP002467,Male,Yes,0,Graduate,No,3708,2569,173,360,1,Urban,N 460 | LP002472,Male,No,2,Graduate,No,4354,0,136,360,1,Rural,Y 461 | LP002473,Male,Yes,0,Graduate,No,8334,0,160,360,1,Semiurban,N 462 | LP002478,,Yes,0,Graduate,Yes,2083,4083,160,360,,Semiurban,Y 463 | LP002484,Male,Yes,3+,Graduate,No,7740,0,128,180,1,Urban,Y 464 | LP002487,Male,Yes,0,Graduate,No,3015,2188,153,360,1,Rural,Y 465 | LP002489,Female,No,1,Not Graduate,,5191,0,132,360,1,Semiurban,Y 466 | LP002493,Male,No,0,Graduate,No,4166,0,98,360,0,Semiurban,N 467 | LP002494,Male,No,0,Graduate,No,6000,0,140,360,1,Rural,Y 468 | LP002500,Male,Yes,3+,Not Graduate,No,2947,1664,70,180,0,Urban,N 469 | LP002501,,Yes,0,Graduate,No,16692,0,110,360,1,Semiurban,Y 470 | LP002502,Female,Yes,2,Not Graduate,,210,2917,98,360,1,Semiurban,Y 471 | LP002505,Male,Yes,0,Graduate,No,4333,2451,110,360,1,Urban,N 472 | LP002515,Male,Yes,1,Graduate,Yes,3450,2079,162,360,1,Semiurban,Y 473 | LP002517,Male,Yes,1,Not Graduate,No,2653,1500,113,180,0,Rural,N 474 | LP002519,Male,Yes,3+,Graduate,No,4691,0,100,360,1,Semiurban,Y 475 | LP002522,Female,No,0,Graduate,Yes,2500,0,93,360,,Urban,Y 476 | LP002524,Male,No,2,Graduate,No,5532,4648,162,360,1,Rural,Y 477 | LP002527,Male,Yes,2,Graduate,Yes,16525,1014,150,360,1,Rural,Y 478 | LP002529,Male,Yes,2,Graduate,No,6700,1750,230,300,1,Semiurban,Y 479 | LP002530,,Yes,2,Graduate,No,2873,1872,132,360,0,Semiurban,N 480 | LP002531,Male,Yes,1,Graduate,Yes,16667,2250,86,360,1,Semiurban,Y 481 | LP002533,Male,Yes,2,Graduate,No,2947,1603,,360,1,Urban,N 482 | LP002534,Female,No,0,Not Graduate,No,4350,0,154,360,1,Rural,Y 483 | LP002536,Male,Yes,3+,Not Graduate,No,3095,0,113,360,1,Rural,Y 484 | LP002537,Male,Yes,0,Graduate,No,2083,3150,128,360,1,Semiurban,Y 485 | 
LP002541,Male,Yes,0,Graduate,No,10833,0,234,360,1,Semiurban,Y 486 | LP002543,Male,Yes,2,Graduate,No,8333,0,246,360,1,Semiurban,Y 487 | LP002544,Male,Yes,1,Not Graduate,No,1958,2436,131,360,1,Rural,Y 488 | LP002545,Male,No,2,Graduate,No,3547,0,80,360,0,Rural,N 489 | LP002547,Male,Yes,1,Graduate,No,18333,0,500,360,1,Urban,N 490 | LP002555,Male,Yes,2,Graduate,Yes,4583,2083,160,360,1,Semiurban,Y 491 | LP002556,Male,No,0,Graduate,No,2435,0,75,360,1,Urban,N 492 | LP002560,Male,No,0,Not Graduate,No,2699,2785,96,360,,Semiurban,Y 493 | LP002562,Male,Yes,1,Not Graduate,No,5333,1131,186,360,,Urban,Y 494 | LP002571,Male,No,0,Not Graduate,No,3691,0,110,360,1,Rural,Y 495 | LP002582,Female,No,0,Not Graduate,Yes,17263,0,225,360,1,Semiurban,Y 496 | LP002585,Male,Yes,0,Graduate,No,3597,2157,119,360,0,Rural,N 497 | LP002586,Female,Yes,1,Graduate,No,3326,913,105,84,1,Semiurban,Y 498 | LP002587,Male,Yes,0,Not Graduate,No,2600,1700,107,360,1,Rural,Y 499 | LP002588,Male,Yes,0,Graduate,No,4625,2857,111,12,,Urban,Y 500 | LP002600,Male,Yes,1,Graduate,Yes,2895,0,95,360,1,Semiurban,Y 501 | LP002602,Male,No,0,Graduate,No,6283,4416,209,360,0,Rural,N 502 | LP002603,Female,No,0,Graduate,No,645,3683,113,480,1,Rural,Y 503 | LP002606,Female,No,0,Graduate,No,3159,0,100,360,1,Semiurban,Y 504 | LP002615,Male,Yes,2,Graduate,No,4865,5624,208,360,1,Semiurban,Y 505 | LP002618,Male,Yes,1,Not Graduate,No,4050,5302,138,360,,Rural,N 506 | LP002619,Male,Yes,0,Not Graduate,No,3814,1483,124,300,1,Semiurban,Y 507 | LP002622,Male,Yes,2,Graduate,No,3510,4416,243,360,1,Rural,Y 508 | LP002624,Male,Yes,0,Graduate,No,20833,6667,480,360,,Urban,Y 509 | LP002625,,No,0,Graduate,No,3583,0,96,360,1,Urban,N 510 | LP002626,Male,Yes,0,Graduate,Yes,2479,3013,188,360,1,Urban,Y 511 | LP002634,Female,No,1,Graduate,No,13262,0,40,360,1,Urban,Y 512 | LP002637,Male,No,0,Not Graduate,No,3598,1287,100,360,1,Rural,N 513 | LP002640,Male,Yes,1,Graduate,No,6065,2004,250,360,1,Semiurban,Y 514 | 
LP002643,Male,Yes,2,Graduate,No,3283,2035,148,360,1,Urban,Y 515 | LP002648,Male,Yes,0,Graduate,No,2130,6666,70,180,1,Semiurban,N 516 | LP002652,Male,No,0,Graduate,No,5815,3666,311,360,1,Rural,N 517 | LP002659,Male,Yes,3+,Graduate,No,3466,3428,150,360,1,Rural,Y 518 | LP002670,Female,Yes,2,Graduate,No,2031,1632,113,480,1,Semiurban,Y 519 | LP002682,Male,Yes,,Not Graduate,No,3074,1800,123,360,0,Semiurban,N 520 | LP002683,Male,No,0,Graduate,No,4683,1915,185,360,1,Semiurban,N 521 | LP002684,Female,No,0,Not Graduate,No,3400,0,95,360,1,Rural,N 522 | LP002689,Male,Yes,2,Not Graduate,No,2192,1742,45,360,1,Semiurban,Y 523 | LP002690,Male,No,0,Graduate,No,2500,0,55,360,1,Semiurban,Y 524 | LP002692,Male,Yes,3+,Graduate,Yes,5677,1424,100,360,1,Rural,Y 525 | LP002693,Male,Yes,2,Graduate,Yes,7948,7166,480,360,1,Rural,Y 526 | LP002697,Male,No,0,Graduate,No,4680,2087,,360,1,Semiurban,N 527 | LP002699,Male,Yes,2,Graduate,Yes,17500,0,400,360,1,Rural,Y 528 | LP002705,Male,Yes,0,Graduate,No,3775,0,110,360,1,Semiurban,Y 529 | LP002706,Male,Yes,1,Not Graduate,No,5285,1430,161,360,0,Semiurban,Y 530 | LP002714,Male,No,1,Not Graduate,No,2679,1302,94,360,1,Semiurban,Y 531 | LP002716,Male,No,0,Not Graduate,No,6783,0,130,360,1,Semiurban,Y 532 | LP002717,Male,Yes,0,Graduate,No,1025,5500,216,360,,Rural,Y 533 | LP002720,Male,Yes,3+,Graduate,No,4281,0,100,360,1,Urban,Y 534 | LP002723,Male,No,2,Graduate,No,3588,0,110,360,0,Rural,N 535 | LP002729,Male,No,1,Graduate,No,11250,0,196,360,,Semiurban,N 536 | LP002731,Female,No,0,Not Graduate,Yes,18165,0,125,360,1,Urban,Y 537 | LP002732,Male,No,0,Not Graduate,,2550,2042,126,360,1,Rural,Y 538 | LP002734,Male,Yes,0,Graduate,No,6133,3906,324,360,1,Urban,Y 539 | LP002738,Male,No,2,Graduate,No,3617,0,107,360,1,Semiurban,Y 540 | LP002739,Male,Yes,0,Not Graduate,No,2917,536,66,360,1,Rural,N 541 | LP002740,Male,Yes,3+,Graduate,No,6417,0,157,180,1,Rural,Y 542 | LP002741,Female,Yes,1,Graduate,No,4608,2845,140,180,1,Semiurban,Y 543 | 
LP002743,Female,No,0,Graduate,No,2138,0,99,360,0,Semiurban,N 544 | LP002753,Female,No,1,Graduate,,3652,0,95,360,1,Semiurban,Y 545 | LP002755,Male,Yes,1,Not Graduate,No,2239,2524,128,360,1,Urban,Y 546 | LP002757,Female,Yes,0,Not Graduate,No,3017,663,102,360,,Semiurban,Y 547 | LP002767,Male,Yes,0,Graduate,No,2768,1950,155,360,1,Rural,Y 548 | LP002768,Male,No,0,Not Graduate,No,3358,0,80,36,1,Semiurban,N 549 | LP002772,Male,No,0,Graduate,No,2526,1783,145,360,1,Rural,Y 550 | LP002776,Female,No,0,Graduate,No,5000,0,103,360,0,Semiurban,N 551 | LP002777,Male,Yes,0,Graduate,No,2785,2016,110,360,1,Rural,Y 552 | LP002778,Male,Yes,2,Graduate,Yes,6633,0,,360,0,Rural,N 553 | LP002784,Male,Yes,1,Not Graduate,No,2492,2375,,360,1,Rural,Y 554 | LP002785,Male,Yes,1,Graduate,No,3333,3250,158,360,1,Urban,Y 555 | LP002788,Male,Yes,0,Not Graduate,No,2454,2333,181,360,0,Urban,N 556 | LP002789,Male,Yes,0,Graduate,No,3593,4266,132,180,0,Rural,N 557 | LP002792,Male,Yes,1,Graduate,No,5468,1032,26,360,1,Semiurban,Y 558 | LP002794,Female,No,0,Graduate,No,2667,1625,84,360,,Urban,Y 559 | LP002795,Male,Yes,3+,Graduate,Yes,10139,0,260,360,1,Semiurban,Y 560 | LP002798,Male,Yes,0,Graduate,No,3887,2669,162,360,1,Semiurban,Y 561 | LP002804,Female,Yes,0,Graduate,No,4180,2306,182,360,1,Semiurban,Y 562 | LP002807,Male,Yes,2,Not Graduate,No,3675,242,108,360,1,Semiurban,Y 563 | LP002813,Female,Yes,1,Graduate,Yes,19484,0,600,360,1,Semiurban,Y 564 | LP002820,Male,Yes,0,Graduate,No,5923,2054,211,360,1,Rural,Y 565 | LP002821,Male,No,0,Not Graduate,Yes,5800,0,132,360,1,Semiurban,Y 566 | LP002832,Male,Yes,2,Graduate,No,8799,0,258,360,0,Urban,N 567 | LP002833,Male,Yes,0,Not Graduate,No,4467,0,120,360,,Rural,Y 568 | LP002836,Male,No,0,Graduate,No,3333,0,70,360,1,Urban,Y 569 | LP002837,Male,Yes,3+,Graduate,No,3400,2500,123,360,0,Rural,N 570 | LP002840,Female,No,0,Graduate,No,2378,0,9,360,1,Urban,N 571 | LP002841,Male,Yes,0,Graduate,No,3166,2064,104,360,0,Urban,N 572 | 
LP002842,Male,Yes,1,Graduate,No,3417,1750,186,360,1,Urban,Y 573 | LP002847,Male,Yes,,Graduate,No,5116,1451,165,360,0,Urban,N 574 | LP002855,Male,Yes,2,Graduate,No,16666,0,275,360,1,Urban,Y 575 | LP002862,Male,Yes,2,Not Graduate,No,6125,1625,187,480,1,Semiurban,N 576 | LP002863,Male,Yes,3+,Graduate,No,6406,0,150,360,1,Semiurban,N 577 | LP002868,Male,Yes,2,Graduate,No,3159,461,108,84,1,Urban,Y 578 | LP002872,,Yes,0,Graduate,No,3087,2210,136,360,0,Semiurban,N 579 | LP002874,Male,No,0,Graduate,No,3229,2739,110,360,1,Urban,Y 580 | LP002877,Male,Yes,1,Graduate,No,1782,2232,107,360,1,Rural,Y 581 | LP002888,Male,No,0,Graduate,,3182,2917,161,360,1,Urban,Y 582 | LP002892,Male,Yes,2,Graduate,No,6540,0,205,360,1,Semiurban,Y 583 | LP002893,Male,No,0,Graduate,No,1836,33837,90,360,1,Urban,N 584 | LP002894,Female,Yes,0,Graduate,No,3166,0,36,360,1,Semiurban,Y 585 | LP002898,Male,Yes,1,Graduate,No,1880,0,61,360,,Rural,N 586 | LP002911,Male,Yes,1,Graduate,No,2787,1917,146,360,0,Rural,N 587 | LP002912,Male,Yes,1,Graduate,No,4283,3000,172,84,1,Rural,N 588 | LP002916,Male,Yes,0,Graduate,No,2297,1522,104,360,1,Urban,Y 589 | LP002917,Female,No,0,Not Graduate,No,2165,0,70,360,1,Semiurban,Y 590 | LP002925,,No,0,Graduate,No,4750,0,94,360,1,Semiurban,Y 591 | LP002926,Male,Yes,2,Graduate,Yes,2726,0,106,360,0,Semiurban,N 592 | LP002928,Male,Yes,0,Graduate,No,3000,3416,56,180,1,Semiurban,Y 593 | LP002931,Male,Yes,2,Graduate,Yes,6000,0,205,240,1,Semiurban,N 594 | LP002933,,No,3+,Graduate,Yes,9357,0,292,360,1,Semiurban,Y 595 | LP002936,Male,Yes,0,Graduate,No,3859,3300,142,180,1,Rural,Y 596 | LP002938,Male,Yes,0,Graduate,Yes,16120,0,260,360,1,Urban,Y 597 | LP002940,Male,No,0,Not Graduate,No,3833,0,110,360,1,Rural,Y 598 | LP002941,Male,Yes,2,Not Graduate,Yes,6383,1000,187,360,1,Rural,N 599 | LP002943,Male,No,,Graduate,No,2987,0,88,360,0,Semiurban,N 600 | LP002945,Male,Yes,0,Graduate,Yes,9963,0,180,360,1,Rural,Y 601 | LP002948,Male,Yes,2,Graduate,No,5780,0,192,360,1,Urban,Y 602 | 
LP002949,Female,No,3+,Graduate,,416,41667,350,180,,Urban,N 603 | LP002950,Male,Yes,0,Not Graduate,,2894,2792,155,360,1,Rural,Y 604 | LP002953,Male,Yes,3+,Graduate,No,5703,0,128,360,1,Urban,Y 605 | LP002958,Male,No,0,Graduate,No,3676,4301,172,360,1,Rural,Y 606 | LP002959,Female,Yes,1,Graduate,No,12000,0,496,360,1,Semiurban,Y 607 | LP002960,Male,Yes,0,Not Graduate,No,2400,3800,,180,1,Urban,N 608 | LP002961,Male,Yes,1,Graduate,No,3400,2500,173,360,1,Semiurban,Y 609 | LP002964,Male,Yes,2,Not Graduate,No,3987,1411,157,360,1,Rural,Y 610 | LP002974,Male,Yes,0,Graduate,No,3232,1950,108,360,1,Rural,Y 611 | LP002978,Female,No,0,Graduate,No,2900,0,71,360,1,Rural,Y 612 | LP002979,Male,Yes,3+,Graduate,No,4106,0,40,180,1,Rural,Y 613 | LP002983,Male,Yes,1,Graduate,No,8072,240,253,360,1,Urban,Y 614 | LP002984,Male,Yes,2,Graduate,No,7583,0,187,360,1,Urban,Y 615 | LP002990,Female,No,0,Graduate,Yes,4583,0,133,360,0,Semiurban,N 616 | -------------------------------------------------------------------------------- /Natural Language Processing/Intro to NLP.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Nitro-Language-Processing/Workshops-2024/948bf767c185cecec348729482d1f878c74499d1/Natural Language Processing/Intro to NLP.pdf -------------------------------------------------------------------------------- /Natural Language Processing/Preprocesare_text.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "# Preprocesare\n", 21 | "\n", 22 | "Limbajul scris conține o serie de elemente care nu transmit neapărat informații relevante pentru problema pe care 
încercăm să o rezolvăm. Mai jos este o listă de acțiuni pe care le putem face asupra textului pentru a elimina caracterele nedorite:\n", 23 | "- tokenizare (împărțim în cuvinte / tokeni)\n", 24 | "- transformarea tuturor literelor în litere mici\n", 25 | "- eliminarea cifrelor / transformarea lor în cuvinte\n", 26 | "- eliminarea semnelor de punctuație\n", 27 | "- eliminarea (sau înlocuirea) linkurilor [LINK], tagurilor [TAG] și mențiunilor [USER]\n", 28 | "- eliminarea (sau înlocuirea) emoticoanelor ( :) :D) și emojiurilor (💙 🐱)\n", 29 | "- eliminarea cuvintelor de legătură (stopwords)\n", 30 | "- transformarea cuvintelor în cea mai simplă formă a lor (lematizare / stemming)" 31 | ], 32 | "metadata": { 33 | "id": "dl4d-KNd7vcv" 34 | } 35 | }, 36 | { 37 | "cell_type": "code", 38 | "source": [ 39 | "import nltk\n", 40 | "nltk.download('punkt')\n", 41 | "nltk.download('stopwords')\n", 42 | "nltk.download('wordnet')\n", 43 | "nltk.download('twitter_samples')" 44 | ], 45 | "metadata": { 46 | "id": "rsxlLK-y8IeR", 47 | "colab": { 48 | "base_uri": "https://localhost:8080/" 49 | }, 50 | "outputId": "978b5762-4b3f-49bf-ecaa-e5926eb392a9" 51 | }, 52 | "execution_count": 2, 53 | "outputs": [ 54 | { 55 | "output_type": "stream", 56 | "name": "stderr", 57 | "text": [ 58 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 59 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 60 | "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", 61 | "[nltk_data] Unzipping corpora/stopwords.zip.\n", 62 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 63 | "[nltk_data] Downloading package twitter_samples to /root/nltk_data...\n", 64 | "[nltk_data] Unzipping corpora/twitter_samples.zip.\n" 65 | ] 66 | }, 67 | { 68 | "output_type": "execute_result", 69 | "data": { 70 | "text/plain": [ 71 | "True" 72 | ] 73 | }, 74 | "metadata": {}, 75 | "execution_count": 2 76 | } 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "source": [ 82 | "Vom 
folosi un set de date pentru sentiment analysis:" 83 | ], 84 | "metadata": { 85 | "id": "d9IEhS5P78D6" 86 | } 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 3, 91 | "metadata": { 92 | "colab": { 93 | "base_uri": "https://localhost:8080/" 94 | }, 95 | "id": "3oyMrHP6FlpR", 96 | "outputId": "a1fbc94c-dc24-4d7e-ab08-7eb11b3f0331" 97 | }, 98 | "outputs": [ 99 | { 100 | "output_type": "execute_result", 101 | "data": { 102 | "text/plain": [ 103 | "['#FollowFriday @France_Inte @PKuchly57 @Milipol_Paris for being top engaged members in my community this week :)',\n", 104 | " '@Lamb2ja Hey James! How odd :/ Please call our Contact Centre on 02392441234 and we will be able to assist you :) Many thanks!',\n", 105 | " '@DespiteOfficial we had a listen last night :) As You Bleed is an amazing track. When are you in Scotland?!',\n", 106 | " '@97sides CONGRATS :)',\n", 107 | " 'yeaaaah yippppy!!! my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days',\n", 108 | " '@BhaktisBanter @PallaviRuhail This one is irresistible :)\\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM',\n", 109 | " \"We don't like to keep our lovely customers waiting for long! We hope you enjoy! Happy Friday! - LWWF :) https://t.co/smyYriipxI\",\n", 110 | " '@Impatientraider On second thought, there’s just not enough time for a DD :) But new shorts entering system. 
Sheep must be buying.',\n", 111 | " 'Jgh , but we have to go to Bayan :D bye',\n", 112 | " 'As an act of mischievousness, am calling the ETL layer of our in-house warehousing app Katamari.\\n\\nWell… as the name implies :p.',\n", 113 | " '#FollowFriday @wncer1 @Defense_gouv for being top influencers in my community this week :)',\n", 114 | " \"Who Wouldn't Love These Big....Juicy....Selfies :) - http://t.co/QVzjgd1uFo http://t.co/oWBL11eQRY\",\n", 115 | " '@Mish23615351 follow @jnlazts & http://t.co/RCvcYYO0Iq follow u back :)',\n", 116 | " \"@jjulieredburn Perfect, so you already know what's waiting for you :)\",\n", 117 | " 'Great new opportunity for junior triathletes aged 12 and 13 at the Gatorade series! Get your entries in :) http://t.co/of3DyOzML0',\n", 118 | " 'Laying out a greetings card range for print today - love my job :-)',\n", 119 | " \"Friend's lunch... yummmm :)\\n#Nostalgia #TBS #KU.\",\n", 120 | " \"@RookieSenpai @arcadester it is the id conflict thanks for the help :D here's the screenshot of it working\",\n", 121 | " '@oohdawg_ Hi liv :))',\n", 122 | " 'Hello I need to know something can u fm me on Twitter?? 
— sure thing :) dm me x http://t.co/W6Dy130BV7']" 123 | ] 124 | }, 125 | "metadata": {}, 126 | "execution_count": 3 127 | } 128 | ], 129 | "source": [ 130 | "from nltk.corpus import twitter_samples\n", 131 | "\n", 132 | "tweets = twitter_samples.strings('positive_tweets.json')\n", 133 | "negative_tweets = twitter_samples.strings('negative_tweets.json')\n", 134 | "tweets[:20]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "source": [ 140 | "Din această listă ne vom uita la tweetul numărul 5 (indexul 4):" 141 | ], 142 | "metadata": { 143 | "id": "M3Qn2ruK-VXD" 144 | } 145 | }, 146 | { 147 | "cell_type": "code", 148 | "source": [ 149 | "text = tweets[4]\n", 150 | "text" 151 | ], 152 | "metadata": { 153 | "colab": { 154 | "base_uri": "https://localhost:8080/", 155 | "height": 36 156 | }, 157 | "id": "4UBV6kuD8yci", 158 | "outputId": "13f49cae-b989-4e38-8036-e5f53704581c" 159 | }, 160 | "execution_count": 4, 161 | "outputs": [ 162 | { 163 | "output_type": "execute_result", 164 | "data": { 165 | "text/plain": [ 166 | "'yeaaaah yippppy!!! my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days'" 167 | ], 168 | "application/vnd.google.colaboratory.intrinsic+json": { 169 | "type": "string" 170 | } 171 | }, 172 | "metadata": {}, 173 | "execution_count": 4 174 | } 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "source": [ 180 | "## RegEx\n", 181 | "\n", 182 | "Un [RegEx](https://www.w3schools.com/python/python_regex.asp) (_Regular Expression_ / _Expresie Regulată_) reprezintă un șir de caractere care definește un șablon de căutare. Poate fi folosit pentru a identifica un subșir într-un string, pentru a-l înlocui sau pentru a împărți textul în jurul lui.\n", 183 | "\n", 184 | "Puteți vedea cum funcționează un regex pe un text anume folosind acest link: https://pythex.org/." 
185 | ], 186 | "metadata": { 187 | "id": "gw3eZLKkU-fP" 188 | } 189 | }, 190 | { 191 | "cell_type": "code", 192 | "source": [ 193 | "import re\n", 194 | "\n", 195 | "txt = \"The rain in Spain stays mainly in the plain\"\n", 196 | "x = re.search(\"Spai.\", txt)\n", 197 | "x" 198 | ], 199 | "metadata": { 200 | "colab": { 201 | "base_uri": "https://localhost:8080/" 202 | }, 203 | "id": "JY3TpT-V5QX3", 204 | "outputId": "701cfb31-3d46-46bf-e10f-45cc3475b37e" 205 | }, 206 | "execution_count": 5, 207 | "outputs": [ 208 | { 209 | "output_type": "execute_result", 210 | "data": { 211 | "text/plain": [ 212 | "" 213 | ] 214 | }, 215 | "metadata": {}, 216 | "execution_count": 5 217 | } 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "source": [ 223 | "Căutarea unui string nu returnează nimic dacă nu găsește niciun match, altfel returnează un obiect cu matchul exact și poziția la care se află. Stringul devine relevant când folosim alte simboluri pentru pattern matching, cum ar fi:\n", 224 | "- . 
- orice caracter\n", 225 | "- \\+ - una sau mai multe apariții ale caracterului anterior\n", 226 | "- \\* - un număr nedefinit de apariții ale caracterului anterior, incluzând 0" 227 | ], 228 | "metadata": { 229 | "id": "6XX0_VEtWJAr" 230 | } 231 | }, 232 | { 233 | "cell_type": "code", 234 | "source": [ 235 | "x = re.split(\" +.\", txt)\n", 236 | "print(x)" 237 | ], 238 | "metadata": { 239 | "colab": { 240 | "base_uri": "https://localhost:8080/" 241 | }, 242 | "id": "37jcxol7D7yu", 243 | "outputId": "fe47a815-68f8-44a6-98fe-5d175fc44a12" 244 | }, 245 | "execution_count": 6, 246 | "outputs": [ 247 | { 248 | "output_type": "stream", 249 | "name": "stdout", 250 | "text": [ 251 | "['The', 'ain', 'n', 'pain', 'tays', 'ainly', 'n', 'he', 'lain']\n" 252 | ] 253 | } 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "source": [ 259 | "Alte caractere speciale:\n", 260 | "- \\d - cifre\n", 261 | "- \\D - nu cifre\n", 262 | "- \\s - spațiu\n", 263 | "- \\S - nu spațiu\n", 264 | "- \\w - litere mici, majuscule, cifre și caracterul \"_\"\n", 265 | "- \\W - tot ce nu e \\w\n", 266 | "- [a-m] - setul de caractere din interior. Poate include intervale\n", 267 | "\n", 268 | "Librăria completă poate fi găsită aici https://docs.python.org/3/library/re.html." 
269 | ], 270 | "metadata": { 271 | "id": "7c-ppOwgEszZ" 272 | } 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "source": [ 277 | "Putem folosi un regex să identificăm toate cuvintele care includ secvența \"ai\":\n" 278 | ], 279 | "metadata": { 280 | "id": "iZBBAPXQtMAz" 281 | } 282 | }, 283 | { 284 | "cell_type": "code", 285 | "source": [ 286 | "x = re.findall(\"\\w*ai\\w\", txt)\n", 287 | "print(x)" 288 | ], 289 | "metadata": { 290 | "colab": { 291 | "base_uri": "https://localhost:8080/" 292 | }, 293 | "id": "Oxme5CAus_uJ", 294 | "outputId": "a2f96a3e-51bb-4b8e-d790-2106cc7008bc" 295 | }, 296 | "execution_count": 7, 297 | "outputs": [ 298 | { 299 | "output_type": "stream", 300 | "name": "stdout", 301 | "text": [ 302 | "['rain', 'Spain', 'main', 'plain']\n" 303 | ] 304 | } 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "source": [ 310 | "## Tokenizare" 311 | ], 312 | "metadata": { 313 | "id": "tn5YBCwU-aWa" 314 | } 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "source": [ 319 | "Putem separa cuvintele în funcție de spațiu:" 320 | ], 321 | "metadata": { 322 | "id": "IJD7UdyZ-Yuo" 323 | } 324 | }, 325 | { 326 | "cell_type": "code", 327 | "source": [ 328 | "text.split()" 329 | ], 330 | "metadata": { 331 | "colab": { 332 | "base_uri": "https://localhost:8080/" 333 | }, 334 | "id": "NOCqYz_4-IKO", 335 | "outputId": "a76e8c6e-b127-4000-f252-63b2487f078a" 336 | }, 337 | "execution_count": 8, 338 | "outputs": [ 339 | { 340 | "output_type": "execute_result", 341 | "data": { 342 | "text/plain": [ 343 | "['yeaaaah',\n", 344 | " 'yippppy!!!',\n", 345 | " 'my',\n", 346 | " 'accnt',\n", 347 | " 'verified',\n", 348 | " 'rqst',\n", 349 | " 'has',\n", 350 | " 'succeed',\n", 351 | " 'got',\n", 352 | " 'a',\n", 353 | " 'blue',\n", 354 | " 'tick',\n", 355 | " 'mark',\n", 356 | " 'on',\n", 357 | " 'my',\n", 358 | " 'fb',\n", 359 | " 'profile',\n", 360 | " ':)',\n", 361 | " 'in',\n", 362 | " '15',\n", 363 | " 'days']" 364 | ] 365 | }, 366 | "metadata": {}, 
367 | "execution_count": 8 368 | } 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "source": [ 374 | "Sau cu ajutorul unei funcții:" 375 | ], 376 | "metadata": { 377 | "id": "NKLq4kvk-qGk" 378 | } 379 | }, 380 | { 381 | "cell_type": "code", 382 | "source": [ 383 | "from nltk import word_tokenize\n", 384 | "\n", 385 | "word_tokenize(text)" 386 | ], 387 | "metadata": { 388 | "colab": { 389 | "base_uri": "https://localhost:8080/" 390 | }, 391 | "id": "PdfxhJDV-L6Z", 392 | "outputId": "14d8585c-ba7a-47ee-b38b-235b3968b1c6" 393 | }, 394 | "execution_count": 9, 395 | "outputs": [ 396 | { 397 | "output_type": "execute_result", 398 | "data": { 399 | "text/plain": [ 400 | "['yeaaaah',\n", 401 | " 'yippppy',\n", 402 | " '!',\n", 403 | " '!',\n", 404 | " '!',\n", 405 | " 'my',\n", 406 | " 'accnt',\n", 407 | " 'verified',\n", 408 | " 'rqst',\n", 409 | " 'has',\n", 410 | " 'succeed',\n", 411 | " 'got',\n", 412 | " 'a',\n", 413 | " 'blue',\n", 414 | " 'tick',\n", 415 | " 'mark',\n", 416 | " 'on',\n", 417 | " 'my',\n", 418 | " 'fb',\n", 419 | " 'profile',\n", 420 | " ':',\n", 421 | " ')',\n", 422 | " 'in',\n", 423 | " '15',\n", 424 | " 'days']" 425 | ] 426 | }, 427 | "metadata": {}, 428 | "execution_count": 9 429 | } 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "source": [ 435 | "word_tokenize(\"we don't like to keep our lovely customers waiting for long! we hope you enjoy! happy friday! 
- lwwf :) https://t.co/smyyriipxi\")" 436 | ], 437 | "metadata": { 438 | "colab": { 439 | "base_uri": "https://localhost:8080/" 440 | }, 441 | "id": "WMwTYeKdSeZg", 442 | "outputId": "4149de7f-ce64-4dd9-d39e-1007a85aa6f9" 443 | }, 444 | "execution_count": 10, 445 | "outputs": [ 446 | { 447 | "output_type": "execute_result", 448 | "data": { 449 | "text/plain": [ 450 | "['we',\n", 451 | " 'do',\n", 452 | " \"n't\",\n", 453 | " 'like',\n", 454 | " 'to',\n", 455 | " 'keep',\n", 456 | " 'our',\n", 457 | " 'lovely',\n", 458 | " 'customers',\n", 459 | " 'waiting',\n", 460 | " 'for',\n", 461 | " 'long',\n", 462 | " '!',\n", 463 | " 'we',\n", 464 | " 'hope',\n", 465 | " 'you',\n", 466 | " 'enjoy',\n", 467 | " '!',\n", 468 | " 'happy',\n", 469 | " 'friday',\n", 470 | " '!',\n", 471 | " '-',\n", 472 | " 'lwwf',\n", 473 | " ':',\n", 474 | " ')',\n", 475 | " 'https',\n", 476 | " ':',\n", 477 | " '//t.co/smyyriipxi']" 478 | ] 479 | }, 480 | "metadata": {}, 481 | "execution_count": 10 482 | } 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "source": [ 488 | "Putem separa propozițiile între ele tot cu ajutorul unei funcții:" 489 | ], 490 | "metadata": { 491 | "id": "z0F6UHtj-x7a" 492 | } 493 | }, 494 | { 495 | "cell_type": "code", 496 | "source": [ 497 | "from nltk import sent_tokenize\n", 498 | "\n", 499 | "sent_tokenize(text)" 500 | ], 501 | "metadata": { 502 | "colab": { 503 | "base_uri": "https://localhost:8080/" 504 | }, 505 | "id": "X1-tSrGR-Ldo", 506 | "outputId": "90fdedcf-a779-4d78-ccc9-4cad0e7d611e" 507 | }, 508 | "execution_count": 11, 509 | "outputs": [ 510 | { 511 | "output_type": "execute_result", 512 | "data": { 513 | "text/plain": [ 514 | "['yeaaaah yippppy!!!',\n", 515 | " 'my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days']" 516 | ] 517 | }, 518 | "metadata": {}, 519 | "execution_count": 11 520 | } 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "source": [ 526 | "## Litere mici (lowercase)\n", 527 
| "\n", 528 | "Transformarea literelor în minuscule este cea mai simplă operație:" 529 | ], 530 | "metadata": { 531 | "id": "wfdewSsB--21" 532 | } 533 | }, 534 | { 535 | "cell_type": "code", 536 | "source": [ 537 | "text.lower()" 538 | ], 539 | "metadata": { 540 | "colab": { 541 | "base_uri": "https://localhost:8080/", 542 | "height": 36 543 | }, 544 | "id": "pnJNtg65_AJj", 545 | "outputId": "38b45281-f026-4980-a899-ed019a5c35b8" 546 | }, 547 | "execution_count": 12, 548 | "outputs": [ 549 | { 550 | "output_type": "execute_result", 551 | "data": { 552 | "text/plain": [ 553 | "'yeaaaah yippppy!!! my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days'" 554 | ], 555 | "application/vnd.google.colaboratory.intrinsic+json": { 556 | "type": "string" 557 | } 558 | }, 559 | "metadata": {}, 560 | "execution_count": 12 561 | } 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "source": [ 567 | "## Eliminarea cifrelor" 568 | ], 569 | "metadata": { 570 | "id": "cnAi1PAP_VJv" 571 | } 572 | }, 573 | { 574 | "cell_type": "code", 575 | "source": [ 576 | "\"63\".isdigit()" 577 | ], 578 | "metadata": { 579 | "colab": { 580 | "base_uri": "https://localhost:8080/" 581 | }, 582 | "id": "USTJIfgw_dMH", 583 | "outputId": "b664dee3-608b-468f-d4d1-5469b8bd95e4" 584 | }, 585 | "execution_count": 13, 586 | "outputs": [ 587 | { 588 | "output_type": "execute_result", 589 | "data": { 590 | "text/plain": [ 591 | "True" 592 | ] 593 | }, 594 | "metadata": {}, 595 | "execution_count": 13 596 | } 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "source": [ 602 | "\"not5\".isdigit()" 603 | ], 604 | "metadata": { 605 | "id": "0MnC2qjUSxzV", 606 | "outputId": "1d05b63c-ced3-48f4-a21f-aece4631c912", 607 | "colab": { 608 | "base_uri": "https://localhost:8080/" 609 | } 610 | }, 611 | "execution_count": 14, 612 | "outputs": [ 613 | { 614 | "output_type": "execute_result", 615 | "data": { 616 | "text/plain": [ 617 | "False" 618 | ] 619 | }, 620 | 
"metadata": {}, 621 | "execution_count": 14 622 | } 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "source": [ 628 | "Pentru a transforma numerele în cuvinte putem folosi librăria num2words:" 629 | ], 630 | "metadata": { 631 | "id": "DPnhn5xzBf9T" 632 | } 633 | }, 634 | { 635 | "cell_type": "code", 636 | "source": [ 637 | "!pip install num2words" 638 | ], 639 | "metadata": { 640 | "colab": { 641 | "base_uri": "https://localhost:8080/" 642 | }, 643 | "id": "bXRBmKlZBeFc", 644 | "outputId": "e94a777f-75e6-4dba-c53c-21f6337e7dee" 645 | }, 646 | "execution_count": 15, 647 | "outputs": [ 648 | { 649 | "output_type": "stream", 650 | "name": "stdout", 651 | "text": [ 652 | "Collecting num2words\n", 653 | " Downloading num2words-0.5.13-py3-none-any.whl (143 kB)\n", 654 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/143.3 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.2/143.3 kB\u001b[0m \u001b[31m2.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.3/143.3 kB\u001b[0m \u001b[31m2.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 655 | "\u001b[?25hCollecting docopt>=0.6.2 (from num2words)\n", 656 | " Downloading docopt-0.6.2.tar.gz (25 kB)\n", 657 | " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 658 | "Building wheels for collected packages: docopt\n", 659 | " Building wheel for docopt (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 660 | " Created wheel for docopt: filename=docopt-0.6.2-py2.py3-none-any.whl size=13706 sha256=3a47730838a9cf79616653383a1cd0c426f372fceeca26ff52b7debd4b838b71\n", 661 | " Stored in directory: /root/.cache/pip/wheels/fc/ab/d4/5da2067ac95b36618c629a5f93f809425700506f72c9732fac\n", 662 | "Successfully built docopt\n", 663 | "Installing collected packages: docopt, num2words\n", 664 | "Successfully installed docopt-0.6.2 num2words-0.5.13\n" 665 | ] 666 | } 667 | ] 668 | }, 669 | { 670 | "cell_type": "code", 671 | "source": [ 672 | "from num2words import num2words\n", 673 | "\n", 674 | "num2words(15)" 675 | ], 676 | "metadata": { 677 | "colab": { 678 | "base_uri": "https://localhost:8080/", 679 | "height": 36 680 | }, 681 | "id": "4pB0jEDjCIv7", 682 | "outputId": "d7573fcc-ec55-41a1-df29-9121548db02d" 683 | }, 684 | "execution_count": 16, 685 | "outputs": [ 686 | { 687 | "output_type": "execute_result", 688 | "data": { 689 | "text/plain": [ 690 | "'fifteen'" 691 | ], 692 | "application/vnd.google.colaboratory.intrinsic+json": { 693 | "type": "string" 694 | } 695 | }, 696 | "metadata": {}, 697 | "execution_count": 16 698 | } 699 | ] 700 | }, 701 | { 702 | "cell_type": "markdown", 703 | "source": [ 704 | "Cum putem identifica toate numerele dintr-un text?" 705 | ], 706 | "metadata": { 707 | "id": "HY0WljcjZZMC" 708 | } 709 | }, 710 | { 711 | "cell_type": "code", 712 | "source": [ 713 | "text = \"yeaaaah yippppy!!! 
my accnt verified rqst has succeed got a blue tick mark on my fb profile :) in 15 days\"" 714 | ], 715 | "metadata": { 716 | "id": "nqK41Kwofk_O" 717 | }, 718 | "execution_count": 30, 719 | "outputs": [] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "source": [ 724 | "# TODO: Parcurge tokenii din text si afiseaza numerele\n", 725 | "def get_numbers_1(text):\n", 726 | " \"\"\" Varianta C++ \"\"\"\n", 727 | " numbers = []\n", 728 | " for token in word_tokenize(text):\n", 729 | " if token.isdigit():\n", 730 | " numbers.append(token)\n", 731 | " return numbers\n", 732 | "\n", 733 | "\n", 734 | "def get_numbers_2(text):\n", 735 | " \"\"\" Varianta cu list comprehension \"\"\"\n", 736 | " return [token for token in word_tokenize(text) if token.isdigit()]\n", 737 | "\n", 738 | "\n", 739 | "def get_numbers_3(text):\n", 740 | " \"\"\" Varianta cu RegEx \"\"\"\n", 741 | " return re.findall(r\"\\d+\", text)\n", 742 | "\n", 743 | "\n", 744 | "get_numbers_3(text)" 745 | ], 746 | "metadata": { 747 | "id": "Ml0uzTHyZVfE", 748 | "outputId": "0ee08ef9-2636-4c72-d41d-a0f4071c1483", 749 | "colab": { 750 | "base_uri": "https://localhost:8080/" 751 | } 752 | }, 753 | "execution_count": 31, 754 | "outputs": [ 755 | { 756 | "output_type": "execute_result", 757 | "data": { 758 | "text/plain": [ 759 | "['15']" 760 | ] 761 | }, 762 | "metadata": {}, 763 | "execution_count": 31 764 | } 765 | ] 766 | }, 767 | { 768 | "cell_type": "code", 769 | "source": [ 770 | "# TODO: Creeaza o lista cu toate cuvintele din text inafara de numere\n", 771 | "def filter_numbers(text):\n", 772 | " return [token for token in word_tokenize(text) if not token.isdigit()]\n", 773 | "\n", 774 | "\n", 775 | "filter_numbers(text)" 776 | ], 777 | "metadata": { 778 | "id": "MkfpwWgZZ-AI", 779 | "outputId": "c167ac37-1cf3-411b-fa98-dda4ab58b53f", 780 | "colab": { 781 | "base_uri": "https://localhost:8080/" 782 | } 783 | }, 784 | "execution_count": 35, 785 | "outputs": [ 786 | { 787 | "output_type": "execute_result", 
788 | "data": { 789 | "text/plain": [ 790 | "['yeaaaah',\n", 791 | " 'yippppy',\n", 792 | " '!',\n", 793 | " '!',\n", 794 | " '!',\n", 795 | " 'my',\n", 796 | " 'accnt',\n", 797 | " 'verified',\n", 798 | " 'rqst',\n", 799 | " 'has',\n", 800 | " 'succeed',\n", 801 | " 'got',\n", 802 | " 'a',\n", 803 | " 'blue',\n", 804 | " 'tick',\n", 805 | " 'mark',\n", 806 | " 'on',\n", 807 | " 'my',\n", 808 | " 'fb',\n", 809 | " 'profile',\n", 810 | " ':',\n", 811 | " ')',\n", 812 | " 'in',\n", 813 | " 'days']" 814 | ] 815 | }, 816 | "metadata": {}, 817 | "execution_count": 35 818 | } 819 | ] 820 | }, 821 | { 822 | "cell_type": "markdown", 823 | "source": [ 824 | "## Eliminarea semnelor de punctuatie\n", 825 | "\n", 826 | "Exista o librarie care deja contine toate semnele de punctuatie:" 827 | ], 828 | "metadata": { 829 | "id": "piTV6d9O8uG_" 830 | } 831 | }, 832 | { 833 | "cell_type": "code", 834 | "source": [ 835 | "from string import punctuation\n", 836 | "\n", 837 | "print(punctuation)" 838 | ], 839 | "metadata": { 840 | "colab": { 841 | "base_uri": "https://localhost:8080/" 842 | }, 843 | "id": "Y6nY6Ca_bvr_", 844 | "outputId": "ba94d037-0161-49eb-91f5-e6827415b6bc" 845 | }, 846 | "execution_count": 37, 847 | "outputs": [ 848 | { 849 | "output_type": "stream", 850 | "name": "stdout", 851 | "text": [ 852 | "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~\n" 853 | ] 854 | } 855 | ] 856 | }, 857 | { 858 | "cell_type": "markdown", 859 | "source": [ 860 | "Elimina toate semnele de punctuatie din text:" 861 | ], 862 | "metadata": { 863 | "id": "L859kvPqcC-J" 864 | } 865 | }, 866 | { 867 | "cell_type": "code", 868 | "source": [ 869 | "# TODO: Creeaza o lista cu toate cuvintele din text inafara de semnele de punctuatie\n", 870 | "def filter_punctuation(text):\n", 871 | " return [token for token in word_tokenize(text) if token not in punctuation]\n", 872 | "\n", 873 | "\n", 874 | "filter_punctuation(text)" 875 | ], 876 | "metadata": { 877 | "id": "Nr8nvT14cJq_", 878 | "outputId": 
"ded616dc-e316-41c2-ab98-06972ff6fd23", 879 | "colab": { 880 | "base_uri": "https://localhost:8080/" 881 | } 882 | }, 883 | "execution_count": 38, 884 | "outputs": [ 885 | { 886 | "output_type": "execute_result", 887 | "data": { 888 | "text/plain": [ 889 | "['yeaaaah',\n", 890 | " 'yippppy',\n", 891 | " 'my',\n", 892 | " 'accnt',\n", 893 | " 'verified',\n", 894 | " 'rqst',\n", 895 | " 'has',\n", 896 | " 'succeed',\n", 897 | " 'got',\n", 898 | " 'a',\n", 899 | " 'blue',\n", 900 | " 'tick',\n", 901 | " 'mark',\n", 902 | " 'on',\n", 903 | " 'my',\n", 904 | " 'fb',\n", 905 | " 'profile',\n", 906 | " 'in',\n", 907 | " '15',\n", 908 | " 'days']" 909 | ] 910 | }, 911 | "metadata": {}, 912 | "execution_count": 38 913 | } 914 | ] 915 | }, 916 | { 917 | "cell_type": "markdown", 918 | "source": [ 919 | "## Eliminarea linkurilor, tagurilor si mentiunilor\n", 920 | "\n", 921 | "Linkurile incep (de obicei) cu caracterele \"http\", tagurile cu \"#\" si mentiunile cu \"@\". Cum ne putem folosi de informatia asta pentru a le elimina din text?" 
922 | ], 923 | "metadata": { 924 | "id": "9HJ7C9UrcORA" 925 | } 926 | }, 927 | { 928 | "cell_type": "code", 929 | "source": [ 930 | "# TODO: Elimina linkurile, tagurile si mentiunile din text\n", 931 | "text = \"@BhaktisBanter @PallaviRuhail This one is irresistible :)\\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM\"\n", 932 | "\n", 933 | "def filter_other(text):\n", 934 | " return re.sub(r\"(@|#|http)(\\w|:|/|\\.)*\", \"\", text)\n", 935 | "\n", 936 | "\n", 937 | "filter_other(text)" 938 | ], 939 | "metadata": { 940 | "id": "w-wsmugZcUlc", 941 | "outputId": "404f37b9-f059-47f8-c6d8-e34abd8c530b", 942 | "colab": { 943 | "base_uri": "https://localhost:8080/", 944 | "height": 36 945 | } 946 | }, 947 | "execution_count": 65, 948 | "outputs": [ 949 | { 950 | "output_type": "execute_result", 951 | "data": { 952 | "text/plain": [ 953 | "' This one is irresistible :)\\n '" 954 | ], 955 | "application/vnd.google.colaboratory.intrinsic+json": { 956 | "type": "string" 957 | } 958 | }, 959 | "metadata": {}, 960 | "execution_count": 65 961 | } 962 | ] 963 | }, 964 | { 965 | "cell_type": "markdown", 966 | "source": [ 967 | "## Emoticoane & emojiuri\n", 968 | "\n", 969 | "Tokenizarea se bazează de obicei pe spații și punctuație, ceea ce înseamnă că nu știe să gestioneze emoticoanele. 
O variantă este să ne creăm propriul regex care să identifice simbolurile să să le înlocuiască cu emoția corespunzătoare.\n", 970 | "\n", 971 | "Un scurt exemplu:" 972 | ], 973 | "metadata": { 974 | "id": "izNSmtGue7gD" 975 | } 976 | }, 977 | { 978 | "cell_type": "code", 979 | "source": [ 980 | "emoticons = {\n", 981 | " \"happy\": r\":[\\)|D+]\",\n", 982 | " \"laugh\": r\":\\)\\)+\",\n", 983 | " \"sad\": r\":\\(+\"\n", 984 | "}" 985 | ], 986 | "metadata": { 987 | "id": "JVKGAPTzKrwP" 988 | }, 989 | "execution_count": 66, 990 | "outputs": [] 991 | }, 992 | { 993 | "cell_type": "markdown", 994 | "source": [ 995 | "Pentru emoticoane putem folosi biblioteca [emoji](https://pypi.org/project/emoji/):" 996 | ], 997 | "metadata": { 998 | "id": "dL_3jA0j-HrL" 999 | } 1000 | }, 1001 | { 1002 | "cell_type": "code", 1003 | "source": [ 1004 | "!pip install emoji" 1005 | ], 1006 | "metadata": { 1007 | "colab": { 1008 | "base_uri": "https://localhost:8080/" 1009 | }, 1010 | "id": "_2jGoaJnQUGD", 1011 | "outputId": "0071f7f6-5086-4441-de28-0f40fd0064e5" 1012 | }, 1013 | "execution_count": 67, 1014 | "outputs": [ 1015 | { 1016 | "output_type": "stream", 1017 | "name": "stdout", 1018 | "text": [ 1019 | "Collecting emoji\n", 1020 | " Downloading emoji-2.11.0-py2.py3-none-any.whl (433 kB)\n", 1021 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/433.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.6/433.8 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m430.1/433.8 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m433.8/433.8 kB\u001b[0m \u001b[31m5.8 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", 1022 | "\u001b[?25hInstalling collected packages: emoji\n", 1023 | "Successfully installed emoji-2.11.0\n" 1024 | ] 1025 | } 1026 | ] 1027 | }, 1028 | { 1029 | "cell_type": "code", 1030 | "source": [ 1031 | "import emoji\n", 1032 | "\n", 1033 | "print(tweets[24])\n", 1034 | "emoji.demojize(tweets[24])" 1035 | ], 1036 | "metadata": { 1037 | "colab": { 1038 | "base_uri": "https://localhost:8080/", 1039 | "height": 53 1040 | }, 1041 | "id": "MgVVPDioOeFx", 1042 | "outputId": "93d31552-6015-4444-f813-00fc994a2f0f" 1043 | }, 1044 | "execution_count": 68, 1045 | "outputs": [ 1046 | { 1047 | "output_type": "stream", 1048 | "name": "stdout", 1049 | "text": [ 1050 | "💅🏽💋 - :)))) haven't seen you in years\n" 1051 | ] 1052 | }, 1053 | { 1054 | "output_type": "execute_result", 1055 | "data": { 1056 | "text/plain": [ 1057 | "\":nail_polish_medium_skin_tone::kiss_mark: - :)))) haven't seen you in years\"" 1058 | ], 1059 | "application/vnd.google.colaboratory.intrinsic+json": { 1060 | "type": "string" 1061 | } 1062 | }, 1063 | "metadata": {}, 1064 | "execution_count": 68 1065 | } 1066 | ] 1067 | }, 1068 | { 1069 | "cell_type": "markdown", 1070 | "source": [ 1071 | "## Eliminarea cuvintelor de legatura (stopwords)" 1072 | ], 1073 | "metadata": { 1074 | "id": "PL8o18mSf7Fo" 1075 | } 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "source": [ 1080 | "from nltk.corpus import stopwords\n", 1081 | "\n", 1082 | "stop_words = stopwords.words('english')\n", 1083 | "stop_words[:10]" 1084 | ], 1085 | "metadata": { 1086 | "colab": { 1087 | "base_uri": "https://localhost:8080/" 1088 | }, 1089 | "id": "LISzqvjfgD98", 1090 | "outputId": "9a461832-7410-461f-9c00-99993f71bd5c" 1091 | }, 1092 | "execution_count": 69, 1093 | "outputs": [ 1094 | { 1095 | "output_type": "execute_result", 1096 | "data": { 1097 | "text/plain": [ 1098 | "['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', \"you're\"]" 1099 | ] 1100 | }, 1101 | "metadata": {}, 1102 | 
"execution_count": 69 1103 | } 1104 | ] 1105 | }, 1106 | { 1107 | "cell_type": "markdown", 1108 | "source": [ 1109 | "Elimina toate cuvintele din text care se regasesc in lista de cuvinte de legatura:" 1110 | ], 1111 | "metadata": { 1112 | "id": "g4XC4Lm6gbss" 1113 | } 1114 | }, 1115 | { 1116 | "cell_type": "code", 1117 | "source": [ 1118 | "# TODO: Elimina stopwords din text\n", 1119 | "def filter_stopwords(text):\n", 1120 | " return ' '.join([token for token in word_tokenize(text) if token not in stop_words])\n", 1121 | "\n", 1122 | "filter_stopwords(text)" 1123 | ], 1124 | "metadata": { 1125 | "id": "DZlrpdIggb4u", 1126 | "outputId": "f74f09b1-1f71-4a5e-d07c-8483c7d796db", 1127 | "colab": { 1128 | "base_uri": "https://localhost:8080/", 1129 | "height": 36 1130 | } 1131 | }, 1132 | "execution_count": 71, 1133 | "outputs": [ 1134 | { 1135 | "output_type": "execute_result", 1136 | "data": { 1137 | "text/plain": [ 1138 | "'@ BhaktisBanter @ PallaviRuhail This one irresistible : ) # FlipkartFashionFriday http : //t.co/EbZ0L2VENM'" 1139 | ], 1140 | "application/vnd.google.colaboratory.intrinsic+json": { 1141 | "type": "string" 1142 | } 1143 | }, 1144 | "metadata": {}, 1145 | "execution_count": 71 1146 | } 1147 | ] 1148 | }, 1149 | { 1150 | "cell_type": "markdown", 1151 | "source": [ 1152 | "## Lematizare sau Stemming\n", 1153 | "\n", 1154 | "Ne amintim ca lematizarea aduce un cuvant la forma sa de dictionar, in timp ce stemmingul elimina o serie de prefixe / sufixe predefinite:" 1155 | ], 1156 | "metadata": { 1157 | "id": "HmORcnTsgyV8" 1158 | } 1159 | }, 1160 | { 1161 | "cell_type": "markdown", 1162 | "source": [ 1163 | "![1_HLQgkMt5-g5WO5VpNuTl_g.jpeg](https://miro.medium.com/max/564/1*HLQgkMt5-g5WO5VpNuTl_g.jpeg)" 1164 | ], 1165 | "metadata": { 1166 | "id": "gV_8FD4Jj8df" 1167 | } 1168 | }, 1169 | { 1170 | "cell_type": "code", 1171 | "source": [ 1172 | "from nltk.stem import WordNetLemmatizer\n", 1173 | "\n", 1174 | "lemmatizer = WordNetLemmatizer()\n", 1175 | 
"\n", 1176 | "print(\"leaves :\", lemmatizer.lemmatize(\"leaves\"))" 1177 | ], 1178 | "metadata": { 1179 | "colab": { 1180 | "base_uri": "https://localhost:8080/" 1181 | }, 1182 | "id": "dVqVnRi0kOXP", 1183 | "outputId": "15fdeb29-2a07-41e6-cb0e-fd0a1348e92d" 1184 | }, 1185 | "execution_count": 72, 1186 | "outputs": [ 1187 | { 1188 | "output_type": "stream", 1189 | "name": "stdout", 1190 | "text": [ 1191 | "leaves : leaf\n" 1192 | ] 1193 | } 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "source": [ 1199 | "from nltk.stem import PorterStemmer\n", 1200 | "\n", 1201 | "stemmer = PorterStemmer()\n", 1202 | "\n", 1203 | "print(\"leaves :\", stemmer.stem(\"leaves\"))" 1204 | ], 1205 | "metadata": { 1206 | "colab": { 1207 | "base_uri": "https://localhost:8080/" 1208 | }, 1209 | "id": "4VguW1v_kV6L", 1210 | "outputId": "051c138f-8311-49f6-e988-fea9199083a1" 1211 | }, 1212 | "execution_count": 73, 1213 | "outputs": [ 1214 | { 1215 | "output_type": "stream", 1216 | "name": "stdout", 1217 | "text": [ 1218 | "leaves : leav\n" 1219 | ] 1220 | } 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "markdown", 1225 | "source": [ 1226 | "# Exercitiu\n", 1227 | "\n", 1228 | "Creaza o functie care primeste un tweet ca parametru si returneaza o lista de cuvinte asupra carora am aplicat toate operatiile de preprocesare discutate până acum." 
1229 | ], 1230 | "metadata": { 1231 | "id": "_BbhEbABcXBN" 1232 | } 1233 | }, 1234 | { 1235 | "cell_type": "code", 1236 | "source": [ 1237 | "def preprocess(text):\n", 1238 | " return [lemmatizer.lemmatize(token.lower()) for token in emoji.demojize(text).split() \\\n", 1239 | " if not token.isdigit() and \\\n", 1240 | " token not in punctuation and \\\n", 1241 | " token[0] not in ['#', '@'] and token[:3] not in ['htt', 'www'] and \\\n", 1242 | " token not in stop_words]\n", 1243 | "\n", 1244 | "preprocess(text)" 1245 | ], 1246 | "metadata": { 1247 | "id": "TbT0qgRkcmSp", 1248 | "outputId": "8b13c830-fba4-41a2-a402-b5f6b52eef04", 1249 | "colab": { 1250 | "base_uri": "https://localhost:8080/" 1251 | } 1252 | }, 1253 | "execution_count": 76, 1254 | "outputs": [ 1255 | { 1256 | "output_type": "execute_result", 1257 | "data": { 1258 | "text/plain": [ 1259 | "['this', 'one', 'irresistible', ':)']" 1260 | ] 1261 | }, 1262 | "metadata": {}, 1263 | "execution_count": 76 1264 | } 1265 | ] 1266 | }, 1267 | { 1268 | "cell_type": "markdown", 1269 | "source": [ 1270 | "# Exercitiu\n", 1271 | "\n", 1272 | "Testează mai multe combinații de preprocesări și embeddings pe setul de date de azi folosind unul din modelele de data trecută." 
1273 | ], 1274 | "metadata": { 1275 | "id": "PLfjtrVxFu9r" 1276 | } 1277 | }, 1278 | { 1279 | "cell_type": "code", 1280 | "source": [], 1281 | "metadata": { 1282 | "id": "MFlXZfQoFu91" 1283 | }, 1284 | "execution_count": null, 1285 | "outputs": [] 1286 | } 1287 | ] 1288 | } -------------------------------------------------------------------------------- /Natural Language Processing/Reprezentarea_Cuvintelor.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [] 7 | }, 8 | "kernelspec": { 9 | "name": "python3", 10 | "display_name": "Python 3" 11 | }, 12 | "language_info": { 13 | "name": "python" 14 | } 15 | }, 16 | "cells": [ 17 | { 18 | "cell_type": "markdown", 19 | "source": [ 20 | "# Reprezentarea Cuvintelor (sau _Word Embeddings_)\n", 21 | "\n", 22 | "Acest laborator prezintă conceptele cheie și pașii pentru implementarea unei modalități de reprezentare a textelor sau cuvintelor ca vectori." 23 | ], 24 | "metadata": { 25 | "id": "q_G6xYEW1Eja" 26 | } 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "source": [ 31 | "## Setul de date" 32 | ], 33 | "metadata": { 34 | "id": "xnF1IrxEd43O" 35 | } 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "source": [ 40 | "Primul set de date pe care îl vom folosi este _common_texts_. Acesta conține o listă de documente, unde fiecare document conține o serie de cuvinte cheie prezentate tot ca o listă. 
Setul este mic și multe cuvinte se repetă, ceea ce îl face ușor de urmărit:" 41 | ], 42 | "metadata": { 43 | "id": "Gti7OAr-dmW-" 44 | } 45 | }, 46 | { 47 | "cell_type": "code", 48 | "source": [ 49 | "from gensim.test.utils import common_texts\n", 50 | "\n", 51 | "common_texts" 52 | ], 53 | "metadata": { 54 | "colab": { 55 | "base_uri": "https://localhost:8080/" 56 | }, 57 | "id": "S4UJhYwhZ8vL", 58 | "outputId": "c5c7626b-12cd-4eed-9681-ad4b7f952a18" 59 | }, 60 | "execution_count": 1, 61 | "outputs": [ 62 | { 63 | "output_type": "execute_result", 64 | "data": { 65 | "text/plain": [ 66 | "[['human', 'interface', 'computer'],\n", 67 | " ['survey', 'user', 'computer', 'system', 'response', 'time'],\n", 68 | " ['eps', 'user', 'interface', 'system'],\n", 69 | " ['system', 'human', 'system', 'eps'],\n", 70 | " ['user', 'response', 'time'],\n", 71 | " ['trees'],\n", 72 | " ['graph', 'trees'],\n", 73 | " ['graph', 'minors', 'trees'],\n", 74 | " ['graph', 'minors', 'survey']]" 75 | ] 76 | }, 77 | "metadata": {}, 78 | "execution_count": 1 79 | } 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "source": [ 85 | "text = [\n", 86 | " 'human interface computer',\n", 87 | " 'survey user computer system response time',\n", 88 | " 'eps user interface system',\n", 89 | " 'system human system eps',\n", 90 | " 'user response time',\n", 91 | " 'trees',\n", 92 | " 'graph trees',\n", 93 | " 'graph minors trees',\n", 94 | " 'graph minors survey'\n", 95 | "]" 96 | ], 97 | "metadata": { 98 | "id": "Y5HHDl5WDkOy" 99 | }, 100 | "execution_count": 2, 101 | "outputs": [] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "source": [ 106 | "În general o să folosim seturi de date mai mari, care ne transmit mai multe informații. Momentan folosim acest set de date fiindcă se mișcă mai rapid."
107 | ], 108 | "metadata": { 109 | "id": "pTqWJZnvAKOU" 110 | } 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "source": [ 115 | "## Bag of Words\n", 116 | "\n", 117 | "Nu vom implementa niciun model manual, vom folosi implementările deja existente. Pentru Bag of Words, aceasta se numește _CountVectorizer_:" 118 | ], 119 | "metadata": { 120 | "id": "kGGceCGWaRcM" 121 | } 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "from sklearn.feature_extraction.text import CountVectorizer" 127 | ], 128 | "metadata": { 129 | "id": "sddR0EQSBSot" 130 | }, 131 | "execution_count": 3, 132 | "outputs": [] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "source": [ 137 | "_CountVectorizer_ este clasa pe care o vom folosi pentru a traduce fiecare propoziție din setul de date în varianta numerică a acesteia. Pentru asta trebuie să creăm o instanță a clasei noastre:" 138 | ], 139 | "metadata": { 140 | "id": "VC4FZiO0BS0C" 141 | } 142 | }, 143 | { 144 | "cell_type": "code", 145 | "source": [ 146 | "vectorizer = CountVectorizer()" 147 | ], 148 | "metadata": { 149 | "id": "R0g_8f2rB0L-" 150 | }, 151 | "execution_count": 9, 152 | "outputs": [] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "source": [ 157 | "_vectorizer_ este numele pe care îl dăm instanței noastre; cu ajutorul ei vom obține lista de valori care ne transmit informațiile despre text.\n", 158 | "Pentru a aplica modelul pe textul nostru trebuie să apelăm funcția _fit_transform_ din interiorul instanței:" 159 | ], 160 | "metadata": { 161 | "id": "uGTP4eJdB0Ya" 162 | } 163 | }, 164 | { 165 | "cell_type": "code", 166 | "source": [ 167 | "X = vectorizer.fit_transform(text)" 168 | ], 169 | "metadata": { 170 | "id": "bP1R_KMoCi2g" 171 | }, 172 | "execution_count": 10, 173 | "outputs": [] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "source": [ 178 | "Funcția aplicată mai sus extrage lista de cuvinte din text și calculează de câte ori apare fiecare cuvânt în fiecare propoziție.
Putem vedea lista de cuvinte folosind altă funcție din instanță:" 179 | ], 180 | "metadata": { 181 | "id": "do2dfaBtFV0o" 182 | } 183 | }, 184 | { 185 | "cell_type": "code", 186 | "source": [ 187 | "vectorizer.get_feature_names_out()" 188 | ], 189 | "metadata": { 190 | "colab": { 191 | "base_uri": "https://localhost:8080/" 192 | }, 193 | "id": "ADV-Mh37E9DF", 194 | "outputId": "04c5c669-0a81-4721-ac56-2a1ff7d7621a" 195 | }, 196 | "execution_count": 11, 197 | "outputs": [ 198 | { 199 | "output_type": "execute_result", 200 | "data": { 201 | "text/plain": [ 202 | "array(['computer', 'eps', 'graph', 'human', 'interface', 'minors',\n", 203 | " 'response', 'survey', 'system', 'time', 'trees', 'user'],\n", 204 | " dtype=object)" 205 | ] 206 | }, 207 | "metadata": {}, 208 | "execution_count": 11 209 | } 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "source": [ 215 | "X este lista noastră de valori. Fiecare linie reprezintă o propoziție, fiecare coloană reprezintă un cuvânt, iar valorile aflate la intersecție ne spun de câte ori apare fiecare cuvânt în fiecare propoziție." 
216 | ], 217 | "metadata": { 218 | "id": "YoLQlpAkCjCU" 219 | } 220 | }, 221 | { 222 | "cell_type": "code", 223 | "source": [ 224 | "X.toarray()" 225 | ], 226 | "metadata": { 227 | "colab": { 228 | "base_uri": "https://localhost:8080/" 229 | }, 230 | "id": "BbtXiGHUEqJC", 231 | "outputId": "ac946455-c4b1-41cb-d02c-ba884ed30930" 232 | }, 233 | "execution_count": 12, 234 | "outputs": [ 235 | { 236 | "output_type": "execute_result", 237 | "data": { 238 | "text/plain": [ 239 | "array([[1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n", 240 | " [1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1],\n", 241 | " [0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1],\n", 242 | " [0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0],\n", 243 | " [0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1],\n", 244 | " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],\n", 245 | " [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0],\n", 246 | " [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0],\n", 247 | " [0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]])" 248 | ] 249 | }, 250 | "metadata": {}, 251 | "execution_count": 12 252 | } 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "source": [ 258 | "Gata! Acum știm vectorizarea fiecărei propoziții folosind Bag of Words!" 259 | ], 260 | "metadata": { 261 | "id": "8-D0Tn4DFzmC" 262 | } 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "source": [ 267 | "### EXERCIȚIU:\n", 268 | "\n" 269 | ], 270 | "metadata": { 271 | "id": "9_roanSbF6rO" 272 | } 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "source": [ 277 | "La fel ca majoritatea claselor, CountVectorizer are o serie de parametri\n", 278 | "\n", 279 | "\n", 280 | " cu valori predefinite. Printre acestea se numără și _binary=False_, care numără de câte ori apar cuvintele. Setează valoarea acestui parametru ca _True_, antrenează din nou pe textul dat și afișează noua listă de valori (X)." 
281 | ], 282 | "metadata": { 283 | "id": "9-i_TTioGIRF" 284 | } 285 | }, 286 | { 287 | "cell_type": "code", 288 | "source": [ 289 | "# TODO: creează o instanță cu parametrul binary setat pe True\n", 290 | "\n", 291 | "# TODO: antrenează din nou pe text\n", 292 | "\n", 293 | "# TODO: afișează noua listă de valori (X)\n" 294 | ], 295 | "metadata": { 296 | "id": "4ylruyKlGGmP", 297 | "colab": { 298 | "base_uri": "https://localhost:8080/" 299 | }, 300 | "outputId": "5409e802-2d62-4446-bc92-fcb599de1410", 301 | "is_executing": true 302 | }, 303 | "execution_count": null, 304 | "outputs": [] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "source": [ 309 | "## TFIDF\n", 310 | "\n", 311 | "TFIDF este antrenat și apelat exact la fel ca Bag of Words, doar că clasa pe care o folosim se numește _TfidfVectorizer_. Creează lista de valori numerice corespunzătoare textului folosind metoda TFIDF din cadrul clasei de mai jos:" 312 | ], 313 | "metadata": { 314 | "id": "SErBvUe7aP7v" 315 | } 316 | }, 317 | { 318 | "cell_type": "code", 319 | "source": [ 320 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 321 | "\n", 322 | "# TODO: La fel ca mai sus, creează o instanță a clasei TfidfVectorizer\n", 323 | "\n", 324 | "# TODO: antreneaz-o pe text\n", 325 | "\n", 326 | "# TODO: afișează lista de valori (X)\n" 327 | ], 328 | "metadata": { 329 | "colab": { 330 | "base_uri": "https://localhost:8080/" 331 | }, 332 | "id": "7KNFe_r9aPTA", 333 | "outputId": "76dca847-0ae1-423c-8cbc-41309bc25aa4", 334 | "is_executing": true 335 | }, 336 | "execution_count": null, 337 | "outputs": [] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "source": [ 342 | "## Word2Vec\n", 343 | "\n", 344 | "Word2Vec funcționează puțin diferit. 
Îl vom antrena pe lista de cuvinte (în loc de propoziții) și îi spunem să ia în considerare cuvintele care apar minim o dată:" 345 | ], 346 | "metadata": { 347 | "id": "z2x8GG6haW7u" 348 | } 349 | }, 350 | { 351 | "cell_type": "code", 352 | "source": [ 353 | "from gensim.models import Word2Vec\n", 354 | "\n", 355 | "vectorizer = Word2Vec(common_texts, min_count=1).wv" 356 | ], 357 | "metadata": { 358 | "id": "lK8fWrimIjeh" 359 | }, 360 | "execution_count": 15, 361 | "outputs": [] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "source": [ 366 | "Putem vedea lista de cuvinte folosind următoarea funcție:" 367 | ], 368 | "metadata": { 369 | "id": "_862fHDlJ3bs" 370 | } 371 | }, 372 | { 373 | "cell_type": "code", 374 | "source": [ 375 | "vectorizer.key_to_index" 376 | ], 377 | "metadata": { 378 | "colab": { 379 | "base_uri": "https://localhost:8080/" 380 | }, 381 | "id": "Ay0NKjqaI6T9", 382 | "outputId": "5df0cc7a-0e22-4e06-db64-73b24481e0cd" 383 | }, 384 | "execution_count": 16, 385 | "outputs": [ 386 | { 387 | "output_type": "execute_result", 388 | "data": { 389 | "text/plain": [ 390 | "{'system': 0,\n", 391 | " 'graph': 1,\n", 392 | " 'trees': 2,\n", 393 | " 'user': 3,\n", 394 | " 'minors': 4,\n", 395 | " 'eps': 5,\n", 396 | " 'time': 6,\n", 397 | " 'response': 7,\n", 398 | " 'survey': 8,\n", 399 | " 'computer': 9,\n", 400 | " 'interface': 10,\n", 401 | " 'human': 11}" 402 | ] 403 | }, 404 | "metadata": {}, 405 | "execution_count": 16 406 | } 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "source": [ 412 | "Noua noastră instanță vine cu o serie de funcții noi pe care le putem descoperi. 
Folosește funcția _most_similar(word)_ pentru a vedea cuvintele care seamănă cel mai mult cu cuvântul _system_:" 413 | ], 414 | "metadata": { 415 | "id": "2JDO0s5iKMoE" 416 | } 417 | }, 418 | { 419 | "cell_type": "code", 420 | "source": [ 421 | "# TODO: găsește cuvintele cele mai apropiate de \"system\"\n" 422 | ], 423 | "metadata": { 424 | "id": "KstuGaPeapWO", 425 | "colab": { 426 | "base_uri": "https://localhost:8080/" 427 | }, 428 | "outputId": "b1e02131-9d9e-430d-dc4d-6d292b4898ed", 429 | "is_executing": true 430 | }, 431 | "execution_count": null, 432 | "outputs": [] 433 | }, 434 | { 435 | "cell_type": "markdown", 436 | "source": [ 437 | "Folosește funcția _similarity(word1, word2)_ pentru a vedea cât de apropiate sunt cuvintele _human_ și _computer_:" 438 | ], 439 | "metadata": { 440 | "id": "QiUhEidcK8Fs" 441 | } 442 | }, 443 | { 444 | "cell_type": "code", 445 | "source": [ 446 | "# TODO: găsește similaritatea între \"human\" și \"computer\"\n" 447 | ], 448 | "metadata": { 449 | "id": "loQCmdJMap5S", 450 | "colab": { 451 | "base_uri": "https://localhost:8080/" 452 | }, 453 | "outputId": "893f29aa-f3ea-499d-b60f-d200e57f64c6", 454 | "is_executing": true 455 | }, 456 | "execution_count": null, 457 | "outputs": [] 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "source": [ 462 | "Ce se întâmplă dacă încercăm să calculăm _king + woman - man_ ?" 
463 | ], 464 | "metadata": { 465 | "id": "R32-lcScLYcK" 466 | } 467 | }, 468 | { 469 | "cell_type": "code", 470 | "source": [ 471 | "vectorizer.most_similar(positive=[\"king\", \"woman\"], negative=[\"man\"])" 472 | ], 473 | "metadata": { 474 | "id": "RYeUYNunax23", 475 | "colab": { 476 | "base_uri": "https://localhost:8080/", 477 | "height": 287 478 | }, 479 | "outputId": "cec5aa9f-1fa5-4621-cbc6-b5a7c040465a" 480 | }, 481 | "execution_count": 19, 482 | "outputs": [ 483 | { 484 | "output_type": "error", 485 | "ename": "KeyError", 486 | "evalue": "\"Key 'king' not present in vocabulary\"", 487 | "traceback": [ 488 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", 489 | "\u001B[0;31mKeyError\u001B[0m Traceback (most recent call last)", 490 | "\u001B[0;32m\u001B[0m in \u001B[0;36m\u001B[0;34m()\u001B[0m\n\u001B[0;32m----> 1\u001B[0;31m \u001B[0mvectorizer\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmost_similar\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mpositive\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;34m\"king\"\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0;34m\"woman\"\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mnegative\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0;34m\"man\"\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m", 491 | "\u001B[0;32m/usr/local/lib/python3.10/dist-packages/gensim/models/keyedvectors.py\u001B[0m in \u001B[0;36mmost_similar\u001B[0;34m(self, positive, negative, topn, clip_start, clip_end, restrict_vocab, indexer)\u001B[0m\n\u001B[1;32m 839\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 840\u001B[0m \u001B[0;31m# compute the weighted average of all keys\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 841\u001B[0;31m \u001B[0mmean\u001B[0m \u001B[0;34m=\u001B[0m 
\u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mget_mean_vector\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mkeys\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mweight\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mpre_normalize\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mpost_normalize\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mTrue\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mignore_missing\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;32mFalse\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 842\u001B[0m all_keys = [\n\u001B[1;32m 843\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mget_index\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mfor\u001B[0m \u001B[0mkey\u001B[0m \u001B[0;32min\u001B[0m \u001B[0mkeys\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0m_KEY_TYPES\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32mand\u001B[0m \u001B[0mself\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mhas_index_for\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", 492 | "\u001B[0;32m/usr/local/lib/python3.10/dist-packages/gensim/models/keyedvectors.py\u001B[0m in \u001B[0;36mget_mean_vector\u001B[0;34m(self, keys, weights, pre_normalize, post_normalize, ignore_missing)\u001B[0m\n\u001B[1;32m 516\u001B[0m \u001B[0mtotal_weight\u001B[0m \u001B[0;34m+=\u001B[0m \u001B[0mabs\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mweights\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0midx\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 517\u001B[0m \u001B[0;32melif\u001B[0m \u001B[0;32mnot\u001B[0m \u001B[0mignore_missing\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m--> 518\u001B[0;31m \u001B[0;32mraise\u001B[0m 
\u001B[0mKeyError\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0;34mf\"Key '{key}' not present in vocabulary\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 519\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 520\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0mtotal_weight\u001B[0m \u001B[0;34m>\u001B[0m \u001B[0;36m0\u001B[0m\u001B[0;34m:\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", 493 | "\u001B[0;31mKeyError\u001B[0m: \"Key 'king' not present in vocabulary\"" 494 | ] 495 | } 496 | ] 497 | }, 498 | { 499 | "cell_type": "markdown", 500 | "source": [ 501 | "Modelul nostru nu a învățat aceste cuvinte, așa că nu știe ce să facă cu ele. Încearcă să calculezi diferența între alte 3 cuvinte din lista de cuvinte știute:" 502 | ], 503 | "metadata": { 504 | "id": "1YwnAzxCLm10" 505 | } 506 | }, 507 | { 508 | "cell_type": "code", 509 | "source": [ 510 | "# TODO\n" 511 | ], 512 | "metadata": { 513 | "id": "a3NITmGVa3rT", 514 | "colab": { 515 | "base_uri": "https://localhost:8080/" 516 | }, 517 | "outputId": "7c82f3be-6cdf-4539-d293-f56f946f914c", 518 | "is_executing": true 519 | }, 520 | "execution_count": null, 521 | "outputs": [] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "source": [ 526 | "## BONUS" 527 | ], 528 | "metadata": { 529 | "id": "RSHYVd2chjjK" 530 | } 531 | }, 532 | { 533 | "cell_type": "markdown", 534 | "source": [ 535 | "Pentru următoarele exerciții vom folosi un set de date care conține 3.000.000 de cuvinte sub forma unei serii de știri extrase de pe Google. Întrucât setul este foarte mare și ar dura extrem de mult timp să îl antrenăm singuri, vom downloada modelul direct antrenat ca să ne uităm cum funcționează. 
Ne așteptăm ca downloadul să dureze minim 10 minute, deci aveți grijă când rulați această celulă:" 536 | ], 537 | "metadata": { 538 | "id": "MLOcFNf1ewu1" 539 | } 540 | }, 541 | { 542 | "cell_type": "code", 543 | "source": [ 544 | "import gensim.downloader as api\n", 545 | "\n", 546 | "model = api.load(\"word2vec-google-news-300\")" 547 | ], 548 | "metadata": { 549 | "colab": { 550 | "base_uri": "https://localhost:8080/" 551 | }, 552 | "id": "S5LAkwBJafdl", 553 | "outputId": "4d5a8f51-1f41-42e4-b858-ef9980515e69" 554 | }, 555 | "execution_count": 21, 556 | "outputs": [ 557 | { 558 | "output_type": "stream", 559 | "name": "stdout", 560 | "text": [ 561 | "[==================================================] 100.0% 1662.8/1662.8MB downloaded\n" 562 | ] 563 | } 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "source": [ 569 | "model.most_similar('system')" 570 | ], 571 | "metadata": { 572 | "id": "AEtcroBuTdzA", 573 | "colab": { 574 | "base_uri": "https://localhost:8080/" 575 | }, 576 | "outputId": "a0aa1b4d-360f-45dd-8924-fbf9e2be7bd4" 577 | }, 578 | "execution_count": 22, 579 | "outputs": [ 580 | { 581 | "output_type": "execute_result", 582 | "data": { 583 | "text/plain": [ 584 | "[('systems', 0.7227916717529297),\n", 585 | " ('sytem', 0.7129376530647278),\n", 586 | " ('sys_tem', 0.5871982574462891),\n", 587 | " ('System', 0.5275423526763916),\n", 588 | " ('mechanism', 0.5058810114860535),\n", 589 | " ('sysem', 0.5027822852134705),\n", 590 | " ('systen', 0.49969804286956787),\n", 591 | " ('system.The', 0.49599188566207886),\n", 592 | " ('sytems', 0.4949610233306885),\n", 593 | " ('computerized', 0.47604817152023315)]" 594 | ] 595 | }, 596 | "metadata": {}, 597 | "execution_count": 22 598 | } 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "source": [ 604 | "model.similarity('human', 'computer')" 605 | ], 606 | "metadata": { 607 | "id": "bV4_PiS-UANM", 608 | "colab": { 609 | "base_uri": "https://localhost:8080/" 610 | }, 611 | "outputId": 
"d26de90b-47fc-49ff-b947-e8aa3ff38310" 612 | }, 613 | "execution_count": 23, 614 | "outputs": [ 615 | { 616 | "output_type": "execute_result", 617 | "data": { 618 | "text/plain": [ 619 | "0.18846479" 620 | ] 621 | }, 622 | "metadata": {}, 623 | "execution_count": 23 624 | } 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "source": [ 630 | "model.most_similar(positive=[\"king\", \"woman\"], negative=[\"man\"])" 631 | ], 632 | "metadata": { 633 | "id": "vf3ihweJUGh7", 634 | "colab": { 635 | "base_uri": "https://localhost:8080/" 636 | }, 637 | "outputId": "a769604f-34d7-44e1-d2ab-d41d7a7fb909" 638 | }, 639 | "execution_count": 24, 640 | "outputs": [ 641 | { 642 | "output_type": "execute_result", 643 | "data": { 644 | "text/plain": [ 645 | "[('queen', 0.7118193507194519),\n", 646 | " ('monarch', 0.6189674139022827),\n", 647 | " ('princess', 0.5902431011199951),\n", 648 | " ('crown_prince', 0.5499460697174072),\n", 649 | " ('prince', 0.5377321839332581),\n", 650 | " ('kings', 0.5236844420433044),\n", 651 | " ('Queen_Consort', 0.5235945582389832),\n", 652 | " ('queens', 0.5181134343147278),\n", 653 | " ('sultan', 0.5098593831062317),\n", 654 | " ('monarchy', 0.5087411999702454)]" 655 | ] 656 | }, 657 | "metadata": {}, 658 | "execution_count": 24 659 | } 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "source": [ 665 | "model.most_similar(positive=['human'], negative=['computer', 'time'])" 666 | ], 667 | "metadata": { 668 | "id": "-FucrYuOUQVO", 669 | "colab": { 670 | "base_uri": "https://localhost:8080/" 671 | }, 672 | "outputId": "dfba8e2d-ac6f-425c-edce-768fb57bb0af" 673 | }, 674 | "execution_count": 25, 675 | "outputs": [ 676 | { 677 | "output_type": "execute_result", 678 | "data": { 679 | "text/plain": [ 680 | "[('non_toxigenic_C.', 0.3579254448413849),\n", 681 | " ('allotransplantation', 0.3464714586734772),\n", 682 | " ('speciesism', 0.3178519308567047),\n", 683 | " ('Atlantic_salmon_Salmo', 0.29254624247550964),\n", 684 | " ('nonhuman_animals', 
0.2893490791320801),\n", 685 | " ('Sus_scrofa', 0.28739631175994873),\n", 686 | " ('K.Kahne_###-###', 0.2862851619720459),\n", 687 | " ('Neurotrophic_Factor', 0.2850346565246582),\n", 688 | " ('palmitoleic_acid', 0.28359511494636536),\n", 689 | " ('unbridled_individualism', 0.2827090919017792)]" 690 | ] 691 | }, 692 | "metadata": {}, 693 | "execution_count": 25 694 | } 695 | ] 696 | }, 697 | { 698 | "cell_type": "markdown", 699 | "source": [ 700 | "Vom instala modulul _wikipedia_ pentru a putea accesa paginile direct din cod:" 701 | ], 702 | "metadata": { 703 | "id": "mM-LSfLZMYPy" 704 | } 705 | }, 706 | { 707 | "cell_type": "code", 708 | "source": [ 709 | "! pip install wikipedia" 710 | ], 711 | "metadata": { 712 | "id": "ClB4fP-bniCy", 713 | "colab": { 714 | "base_uri": "https://localhost:8080/" 715 | }, 716 | "outputId": "df428dd3-9b81-4d93-d6cd-040e06e2d215" 717 | }, 718 | "execution_count": 26, 719 | "outputs": [ 720 | { 721 | "output_type": "stream", 722 | "name": "stdout", 723 | "text": [ 724 | "Collecting wikipedia\n", 725 | " Downloading wikipedia-1.4.0.tar.gz (27 kB)\n", 726 | " Preparing metadata (setup.py) ... 
\u001B[?25l\u001B[?25hdone\n", 727 | "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.10/dist-packages (from wikipedia) (4.12.3)\n", 728 | "Requirement already satisfied: requests<3.0.0,>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from wikipedia) (2.31.0)\n", 729 | "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (3.3.2)\n", 730 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (3.6)\n", 731 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (2.0.7)\n", 732 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.0.0->wikipedia) (2024.2.2)\n", 733 | "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4->wikipedia) (2.5)\n", 734 | "Building wheels for collected packages: wikipedia\n", 735 | " Building wheel for wikipedia (setup.py) ... \u001B[?25l\u001B[?25hdone\n", 736 | " Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11680 sha256=9f750bcf936db970e0f2f06fd203378246c04e12d47bd617799022094f2b9912\n", 737 | " Stored in directory: /root/.cache/pip/wheels/5e/b6/c5/93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de\n", 738 | "Successfully built wikipedia\n", 739 | "Installing collected packages: wikipedia\n", 740 | "Successfully installed wikipedia-1.4.0\n" 741 | ] 742 | } 743 | ] 744 | }, 745 | { 746 | "cell_type": "markdown", 747 | "source": [ 748 | "1. Descarcă un articol de pe wikipedia. 
Înlocuiește _page\\_title_ cu un titlu de pagină de pe wikipedia:" 749 | ], 750 | "metadata": { 751 | "id": "ZPnKnhbchtLQ" 752 | } 753 | }, 754 | { 755 | "cell_type": "code", 756 | "source": [ 757 | "import wikipedia\n", 758 | "\n", 759 | "page_title = \"\" # TODO: Alege un titlu de pe wikipedia\n", 760 | "page = wikipedia.page(page_title, auto_suggest=False)\n", 761 | "\n", 762 | "print(page.content)" 763 | ], 764 | "metadata": { 765 | "id": "FM3ziGq8hxA2", 766 | "colab": { 767 | "base_uri": "https://localhost:8080/" 768 | }, 769 | "outputId": "1d1b3481-42aa-485f-c986-3f1ba8136d97" 770 | }, 771 | "execution_count": null, 772 | "outputs": [] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "source": [ 777 | "2. Descoperă câte cuvinte de pe pagina de wikipedia apar în modelul tău și câte nu." 778 | ], 779 | "metadata": { 780 | "id": "ETw3pflZNH5a" 781 | } 782 | }, 783 | { 784 | "cell_type": "code", 785 | "source": [ 786 | "import nltk\n", 787 | "nltk.download('punkt')" 788 | ], 789 | "metadata": { 790 | "id": "jJDGt_4pWIOi", 791 | "colab": { 792 | "base_uri": "https://localhost:8080/" 793 | }, 794 | "outputId": "43c4c840-e246-4dc5-df8c-3535534c4cf3" 795 | }, 796 | "execution_count": 29, 797 | "outputs": [ 798 | { 799 | "output_type": "stream", 800 | "name": "stderr", 801 | "text": [ 802 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 803 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n" 804 | ] 805 | }, 806 | { 807 | "output_type": "execute_result", 808 | "data": { 809 | "text/plain": [ 810 | "True" 811 | ] 812 | }, 813 | "metadata": {}, 814 | "execution_count": 29 815 | } 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "source": [ 821 | "# TODO: Câte cuvinte de pe pagina de wiki apar în model și câte nu?\n", 822 | "from nltk import word_tokenize\n", 823 | "\n", 824 | "words = list(set(word_tokenize(page.content.lower())))\n", 825 | "words[:10]" 826 | ], 827 | "metadata": { 828 | "id": "hMH6kzu8NSs6", 829 | "colab": { 830 | 
"base_uri": "https://localhost:8080/" 831 | }, 832 | "outputId": "ff64f382-e0a2-48f7-d3ea-48443e71bad3" 833 | }, 834 | "execution_count": 37, 835 | "outputs": [ 836 | { 837 | "output_type": "execute_result", 838 | "data": { 839 | "text/plain": [ 840 | "['eleanor',\n", 841 | " 'banas',\n", 842 | " 'considerable',\n", 843 | " 'forester',\n", 844 | " 'impulses',\n", 845 | " 'snow',\n", 846 | " 'whom',\n", 847 | " 'archetype',\n", 848 | " 'andalusia',\n", 849 | " 'uses']" 850 | ] 851 | }, 852 | "metadata": {}, 853 | "execution_count": 37 854 | } 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "source": [ 860 | "lista_cuvinte = list(model.key_to_index.keys())\n", 861 | "lista_cuvinte[:10]" 862 | ], 863 | "metadata": { 864 | "id": "tRJWtH_oZbb4", 865 | "colab": { 866 | "base_uri": "https://localhost:8080/" 867 | }, 868 | "outputId": "d7003f5c-625b-49b0-8cd3-08d1fa33c1f6" 869 | }, 870 | "execution_count": 38, 871 | "outputs": [ 872 | { 873 | "output_type": "execute_result", 874 | "data": { 875 | "text/plain": [ 876 | "['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']" 877 | ] 878 | }, 879 | "metadata": {}, 880 | "execution_count": 38 881 | } 882 | ] 883 | }, 884 | { 885 | "cell_type": "markdown", 886 | "source": [ 887 | "3. Determină similaritatea între toate cuvintele de pe pagina de wiki. Afișează top 3 cele mai apropiate perechi de cuvinte și top 3 cele mai diferite." 
888 | ], 889 | "metadata": { 890 | "id": "HCsC5DckNfA4" 891 | } 892 | }, 893 | { 894 | "cell_type": "code", 895 | "source": [ 896 | "# TODO: Determină similaritatea între toate cuvintele din textul de pe wiki\n", 897 | "\n", 898 | "# TODO: Afișează cele mai apropiate 3 perechi de cuvinte din text\n", 899 | "\n", 900 | "# TODO: Afișează cele mai diferite 3 perechi de cuvinte din text\n" 901 | ], 902 | "metadata": { 903 | "id": "9CirEW9vN2BB", 904 | "colab": { 905 | "base_uri": "https://localhost:8080/" 906 | }, 907 | "outputId": "8e1be9e0-dcd3-4ee3-a1a7-60c89f28645d" 908 | }, 909 | "execution_count": null, 910 | "outputs": [] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "source": [ 915 | "4. Pentru următoarele cuvinte: _user_, _survey_, _system_, _computer_ determină cel mai apropiat cuvânt folosind modelul încărcat pentru exercițiul bonus și modelul antrenat la începutul laboratorului. Observi diferențele?" 916 | ], 917 | "metadata": { 918 | "id": "9yT2NJk-N0Yw" 919 | } 920 | }, 921 | { 922 | "cell_type": "code", 923 | "source": [ 924 | "# TODO: Cel mai apropiat cuvânt de \"user\" folosind cele 2 modele\n", 925 | "\n", 926 | "# TODO: Cel mai apropiat cuvânt de \"survey\" folosind cele 2 modele\n", 927 | "\n", 928 | "# TODO: Cel mai apropiat cuvânt de \"system\" folosind cele 2 modele\n", 929 | "\n", 930 | "# TODO: Cel mai apropiat cuvânt de \"computer\" folosind cele 2 modele\n" 931 | ], 932 | "metadata": { 933 | "id": "4I-swOk3PMqA", 934 | "colab": { 935 | "base_uri": "https://localhost:8080/" 936 | }, 937 | "outputId": "3397a2c5-5302-4666-e3ea-f616e643375a" 938 | }, 939 | "execution_count": 68, 940 | "outputs": [ 941 | { 942 | "output_type": "stream", 943 | "name": "stdout", 944 | "text": [ 945 | "Modelul antrenat de noi | Model preantrenat\n", 946 | "('eps', 0.13147011399269104) ('users', 0.7195653319358826)\n", 947 | "('trees', 0.19912061095237732) ('surveys', 0.8096452355384827)\n", 948 | "('computer', 0.21617141366004944) ('systems', 
0.7227916717529297)\n", 949 | "('system', 0.21617139875888824) ('computers', 0.7979379892349243)\n" 950 | ] 951 | } 952 | ] 953 | } 954 | ] 955 | } 956 | -------------------------------------------------------------------------------- /Python and Data Visualisation/Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "3e8a43b5-b1b1-4b5c-8561-6c2d20acbc3d", 6 | "metadata": {}, 7 | "source": [ 8 | "## Data Types" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "d522be57-dd08-41ae-b38f-c9a091f8476a", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "x = 3" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "25e417f3-3355-478e-9962-1e220b888963", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "type(x)" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "id": "b04eddbc-6eaf-401b-bc24-482639fab6df", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "y = 3.3" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "45fe65a5-db86-44af-85ac-b8c4c747da24", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "type(y)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "826bcb9a-7fdb-4c2f-ae56-17e602b46fa1", 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "b = True" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "id": "865e5778-051e-4721-8479-7fc6b3253414", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "type(b)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "e063f5b0-3475-4568-9bc2-6f47f5175a7a", 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "y" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | 
"execution_count": null, 84 | "id": "d954ec14-3dda-429a-a3ab-5b01474f158a", 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "x" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "id": "35cf3a07-70af-4835-9c48-99b6abbe1d58", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "x ** 2" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "id": "320d09fc-7e34-4897-8099-5f26c9e586cd", 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "x / 2" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "id": "0314f2ad-a11f-4a0a-8399-7e58ddbc5340", 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "x // 2" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "6f1e6f22-145e-41fe-85dd-6a444220cdec", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "True and True" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": null, 134 | "id": "c803a3a2-91c8-436f-87da-2b8adddde7e7", 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "True or False" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "id": "4fb9685d-3404-4cef-a8c5-32216be33805", 144 | "metadata": {}, 145 | "source": [ 146 | "## Data Structures" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "c609ede8-2f27-4a54-8be5-3fa43f1d1558", 152 | "metadata": {}, 153 | "source": [ 154 | "### Liste" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "e2cd52b1-c5a2-4614-bd48-6cc8b2d10be9", 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "l = [10, 11, 12, 13, 14, 15, 16]" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "id": "e5e1f4fe-7b65-4dd9-a99b-4e3836409a5c", 170 | "metadata": {}, 171 | "source": [ 172 | "#### Accesare" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 
177 | "execution_count": null, 178 | "id": "f6a75c60-29e0-4740-9670-913ec785bdba", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "l[0]" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "id": "00728562-ae95-47c4-9345-91c8b2a1e2fd", 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "l[-1]" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "9611a2b8-257e-41f1-9542-42858f0d2f02", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "l[-2]" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "98368a75-a89b-40bf-994c-92d6a2b00e86", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "l[1]" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "id": "efd17c96-dd91-4901-b1e6-886728b763c8", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "l[3]" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "id": "57e4683f-2d5f-4453-8266-1653a304874b", 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "l[1:4] " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "id": "6a7e21be-4f59-4da8-9606-3389c0932a5d", 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "l" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "78f0a6e0-d840-4294-a7ac-5749d812fbd3", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "l[2:6]" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "id": "58280072-5b73-46dc-ac76-5b05e0d9242e", 259 | "metadata": {}, 260 | "outputs": [], 261 | "source": [ 262 | "l[:5]" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "deffc91c-f94d-46dc-9f34-5d482e87627e", 269 | "metadata": {}, 270 | 
"outputs": [], 271 | "source": [ 272 | "l[-5:]" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "id": "a72b9aa7-dee6-4c5a-8b18-5d0fa1d0b76d", 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "l[1:6:2]" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "id": "bcecafae-3b69-4902-8683-0ef727ebedb7", 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "l[::-1]" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "id": "e5cd32f7-db85-41fc-bb4c-93d9f70a7d83", 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "l[5:1]" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "176faa04-99d2-4d3e-bb3d-bbfd760ef6d1", 308 | "metadata": {}, 309 | "source": [ 310 | "#### Inserare" 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "e6b9a024-aeeb-46e8-b0fc-afc981692509", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "l = [10, 11, 12, 13]" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "id": "1c31b098-bfe1-434c-8c2f-8a9582581d79", 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "l.append(100)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "b0bffaa6-16df-4db3-a501-2fb5a83f150b", 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "l" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "id": "81be5ac8-aded-4ede-92c3-fea801e0ede8", 347 | "metadata": {}, 348 | "outputs": [], 349 | "source": [ 350 | "l.insert(2, 200)" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": null, 356 | "id": "8d9988b3-ec72-46c7-94ce-d313fba55b90", 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "l" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "id": 
"eb2bb975-7bca-4f83-9217-dd610a0b70e2", 366 | "metadata": {}, 367 | "source": [ 368 | "#### Modificare" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "id": "555f3337-4b5f-4896-b58f-bb6d3c619dc2", 375 | "metadata": {}, 376 | "outputs": [], 377 | "source": [ 378 | "l[0] = 111" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": null, 384 | "id": "f0aa21d9-1275-40a0-9241-5191d8546c1c", 385 | "metadata": {}, 386 | "outputs": [], 387 | "source": [ 388 | "l" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": null, 394 | "id": "23402c1f-e0d6-42f5-915e-53bbc83810bc", 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "l = [1, 2, 3]\n", 399 | "l" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "id": "70e219bf-0d64-4ead-9fde-4c94aaf892ae", 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "l2 = l" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "id": "d9bb4c7e-1151-42b8-a89a-9cf32f2e51c1", 416 | "metadata": {}, 417 | "outputs": [], 418 | "source": [ 419 | "l2[0] = 100" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "4b7bb8e8-f886-4eb4-b7b7-f16a0f5bfcb0", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [ 429 | "l2" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "87b00d31-4369-43c7-86d2-6715426526f2", 436 | "metadata": {}, 437 | "outputs": [], 438 | "source": [ 439 | "l" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": null, 445 | "id": "4a9f45bf-6c8c-4eef-87e9-3b03bd639d4f", 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "l2 = l[:]\n", 450 | "\n", 451 | "l2[0] = 111" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": null, 457 | "id": "63099a58-9719-4300-ad6b-d5c69d473f58", 458 | "metadata": 
{}, 459 | "outputs": [], 460 | "source": [ 461 | "l" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "id": "565d17ac-f679-4890-b0d3-fa9a29048a07", 468 | "metadata": {}, 469 | "outputs": [], 470 | "source": [ 471 | "l2" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "id": "1c8fe4d0-247e-4a8d-be18-a7a61ee3eabf", 477 | "metadata": {}, 478 | "source": [ 479 | "#### Eliminare" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "id": "ad2a11a1-a75d-4341-b38e-5a366f539f4a", 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "l = [1, 2, 3]" 490 | ] 491 | }, 492 | { 493 | "cell_type": "code", 494 | "execution_count": null, 495 | "id": "197433f8-bb72-40a8-a3e7-b76b977d8899", 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "l.pop()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "id": "69483241-d0ff-47ab-b1ac-55253cf968f5", 506 | "metadata": {}, 507 | "outputs": [], 508 | "source": [ 509 | "l" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "id": "dff097f1-d6fd-43a7-85ab-6d7d21607ab1", 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "l = [1, 2, 3]" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "id": "c21954e2-3636-4620-a5a0-4fc190c1e12b", 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "l.pop(1)" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "id": "371fe0c0-e09a-4432-a7cc-9a1d23d594f7", 536 | "metadata": {}, 537 | "outputs": [], 538 | "source": [ 539 | "l" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "id": "f7f9bef4-ac63-42ae-9920-fb4941f5af66", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "l = [1, 2, 3, 3.3, True]" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | 
"id": "c0ad5ebd-d2b4-4d46-b0d4-18a1d8981ed9", 555 | "metadata": {}, 556 | "source": [ 557 | "### Dictionare" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "id": "78f61fcf-28d7-4e04-bb2e-bff6722d2cdb", 564 | "metadata": {}, 565 | "outputs": [], 566 | "source": [ 567 | "d = {\n", 568 | " 'user1': 10,\n", 569 | " 'user2': 20,\n", 570 | " 'user3': 30\n", 571 | "}" 572 | ] 573 | }, 574 | { 575 | "cell_type": "markdown", 576 | "id": "7e91c06f-c9fd-496e-a3ea-4a8e012efa00", 577 | "metadata": {}, 578 | "source": [ 579 | "#### Accesare" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "id": "23632935-6be5-4065-902a-74fa72c3a7be", 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "d['user1']" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": null, 595 | "id": "c7a58a82-a0b5-4bb8-a117-de25eb971c44", 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "d['user4']" 600 | ] 601 | }, 602 | { 603 | "cell_type": "code", 604 | "execution_count": null, 605 | "id": "ebe24139-1c39-4868-9dad-b5fae75608d5", 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "'user4' in d" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "id": "8673771e-5bc8-4f6f-af17-b123e3aae6d4", 615 | "metadata": {}, 616 | "source": [ 617 | "#### Inserare" 618 | ] 619 | }, 620 | { 621 | "cell_type": "code", 622 | "execution_count": null, 623 | "id": "1726dfb3-1172-4eb0-afd6-fd3ae9f20663", 624 | "metadata": {}, 625 | "outputs": [], 626 | "source": [ 627 | "d['user4'] = 40" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "f8fdce3e-74ca-4f4e-9cb3-21d7ef139d5f", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "d" 638 | ] 639 | }, 640 | { 641 | "cell_type": "code", 642 | "execution_count": null, 643 | "id": "218f7ad0-e1c6-4b98-8896-b280ded12243", 644 | "metadata": {}, 645 | "outputs": [], 
646 | "source": [] 647 | }, 648 | { 649 | "cell_type": "markdown", 650 | "id": "946449d9-ce89-4540-be1e-197af49557d6", 651 | "metadata": {}, 652 | "source": [ 653 | "#### Modificare" 654 | ] 655 | }, 656 | { 657 | "cell_type": "code", 658 | "execution_count": null, 659 | "id": "9ea464b1-1d77-4bbb-8312-2bc550b14fad", 660 | "metadata": {}, 661 | "outputs": [], 662 | "source": [ 663 | "d" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "id": "f6820990-700a-4d02-abf6-dda69979ebb9", 670 | "metadata": {}, 671 | "outputs": [], 672 | "source": [ 673 | "d['user1'] = 100" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "id": "e80cb95d-1cf4-41f1-ac7f-8093b66c3bf2", 680 | "metadata": {}, 681 | "outputs": [], 682 | "source": [ 683 | "d" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "id": "455c598d-2568-4048-939a-cc1002daf993", 689 | "metadata": {}, 690 | "source": [ 691 | "#### Eliminare" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "id": "abd781b4-fcc4-45af-a058-71289d0fb89b", 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "d.pop('user1')" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "id": "720c5467-39a6-4057-b803-ba7b47ebc68f", 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "d" 712 | ] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": null, 717 | "id": "f4fb8442-846f-4e82-ab25-05c8846ef43a", 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [ 721 | "d.pop('user5')" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": null, 727 | "id": "b236130e-774e-4265-a7f6-d01b6e1e202f", 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [ 731 | "d.pop('user5', False)" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "id": "ab2a24b0-037b-48d8-9f56-b03278021126", 737 | "metadata": {}, 738 | 
"source": [ 739 | "## Functii" 740 | ] 741 | }, 742 | { 743 | "cell_type": "code", 744 | "execution_count": null, 745 | "id": "1708c418-679f-4d4f-b7dc-25ab3201a3f9", 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "def compute(x, y):\n", 750 | " result = (x ** 2 + y ** 2) ** 0.5\n", 751 | "\n", 752 | " return result" 753 | ] 754 | }, 755 | { 756 | "cell_type": "code", 757 | "execution_count": null, 758 | "id": "6e8b75b6-08a3-4d5a-9eb7-ba5df3a8bb56", 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "compute(2, 3)" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "id": "2e0fc639-73ea-4302-a88b-3b2a88b69df9", 769 | "metadata": {}, 770 | "outputs": [], 771 | "source": [ 772 | "def hello():\n", 773 | " print(\"hello\")" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": null, 779 | "id": "05f9147e-b9c2-43e1-ba9d-7543423ee802", 780 | "metadata": {}, 781 | "outputs": [], 782 | "source": [ 783 | "hello()" 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "id": "e5af5879-2ab9-4f0a-afeb-ea90c080843b", 790 | "metadata": {}, 791 | "outputs": [], 792 | "source": [ 793 | "rezultat = hello()" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": null, 799 | "id": "e2ea9124-d5e1-4e1c-910a-21cbfab39243", 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "type(rezultat)" 804 | ] 805 | }, 806 | { 807 | "cell_type": "code", 808 | "execution_count": null, 809 | "id": "7dbc7ece-a2cd-4288-817a-774c4a29879d", 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [ 813 | "b = None" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "id": "64313522-2259-492a-a605-9096b8a72884", 820 | "metadata": {}, 821 | "outputs": [], 822 | "source": [ 823 | "def hello(loud=False):\n", 824 | " if loud:\n", 825 | " print(\"HELLO\")\n", 826 | " else:\n", 827 | " print(\"hello\")" 828 | 
] 829 | }, 830 | { 831 | "cell_type": "code", 832 | "execution_count": null, 833 | "id": "3208ec9b-1737-4985-9941-ff4852d20dd9", 834 | "metadata": {}, 835 | "outputs": [], 836 | "source": [ 837 | "hello(True)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "id": "4d5f02c2-a28d-4b65-b7e2-459d3dc03a2e", 844 | "metadata": {}, 845 | "outputs": [], 846 | "source": [ 847 | "a = 5\n", 848 | "\n", 849 | "if a < 3:\n", 850 | " print('S')\n", 851 | "elif a < 7:\n", 852 | " print('M')\n", 853 | "else:\n", 854 | " print('L')" 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "id": "a5e016c1-e4f2-4640-a8f3-49cadd008ca2", 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "for i in range(5):\n", 865 | " print(i)" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "id": "4fe35a9d-264f-4c45-bd2e-73f2e082cfd7", 872 | "metadata": {}, 873 | "outputs": [], 874 | "source": [ 875 | "for i in range(2,6):\n", 876 | " print(i)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "code", 881 | "execution_count": null, 882 | "id": "09de1c93-1787-43c9-a008-45c3f6c72901", 883 | "metadata": {}, 884 | "outputs": [], 885 | "source": [ 886 | "for i in range(10, 100, 20):\n", 887 | " print(i)" 888 | ] 889 | }, 890 | { 891 | "cell_type": "markdown", 892 | "id": "cce404bf-7da0-4eda-adfd-95ef1fd4bb2a", 893 | "metadata": {}, 894 | "source": [ 895 | "## Siruri de Caractere" 896 | ] 897 | }, 898 | { 899 | "cell_type": "code", 900 | "execution_count": null, 901 | "id": "ea925be3-e032-4b97-8cef-b4449d6a8401", 902 | "metadata": {}, 903 | "outputs": [], 904 | "source": [ 905 | "s1 = 'Hello'\n", 906 | "s2 = \"World\"" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "id": "6e51010f-e9ee-4998-ac47-34c76dbccc73", 913 | "metadata": {}, 914 | "outputs": [], 915 | "source": [ 916 | "s1 + \" \" + s2" 917 | ] 918 | }, 919 | { 920 | "cell_type": "code", 
921 | "execution_count": null, 922 | "id": "eb6043b9-bcc4-4573-90c3-2b71cdefa5cf", 923 | "metadata": {}, 924 | "outputs": [], 925 | "source": [ 926 | "s1 * 3" 927 | ] 928 | }, 929 | { 930 | "cell_type": "code", 931 | "execution_count": null, 932 | "id": "8d34de3f-63f8-45f7-85f7-c0e6544ce64d", 933 | "metadata": {}, 934 | "outputs": [], 935 | "source": [ 936 | "s1.lower()" 937 | ] 938 | }, 939 | { 940 | "cell_type": "code", 941 | "execution_count": null, 942 | "id": "785013ae-44cf-499e-82cc-97e21abacc23", 943 | "metadata": {}, 944 | "outputs": [], 945 | "source": [ 946 | "s1.upper()" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "id": "c7abe100-b43a-4da8-ace4-ce888609d0a4", 953 | "metadata": {}, 954 | "outputs": [], 955 | "source": [ 956 | "s1.rjust(20)" 957 | ] 958 | }, 959 | { 960 | "cell_type": "code", 961 | "execution_count": null, 962 | "id": "6b6333ee-69d8-46fe-9825-105ddce42b8b", 963 | "metadata": {}, 964 | "outputs": [], 965 | "source": [ 966 | "s1.ljust(20)" 967 | ] 968 | }, 969 | { 970 | "cell_type": "code", 971 | "execution_count": null, 972 | "id": "b23efa06-923f-45c9-b13b-0a4568a67aa0", 973 | "metadata": {}, 974 | "outputs": [], 975 | "source": [ 976 | "sent = \"ana are mere\"" 977 | ] 978 | }, 979 | { 980 | "cell_type": "code", 981 | "execution_count": null, 982 | "id": "b7653bef-30f0-4472-8b11-94f853c4ed58", 983 | "metadata": {}, 984 | "outputs": [], 985 | "source": [ 986 | "sent.split()" 987 | ] 988 | }, 989 | { 990 | "cell_type": "code", 991 | "execution_count": null, 992 | "id": "f4354ccd-caac-4464-9d5a-1a78d40d6258", 993 | "metadata": {}, 994 | "outputs": [], 995 | "source": [ 996 | "s = \"a,b,c,d\"\n", 997 | "s.split(',')" 998 | ] 999 | }, 1000 | { 1001 | "cell_type": "code", 1002 | "execution_count": null, 1003 | "id": "9de60e4c-41fe-439f-b604-9a9db87695d8", 1004 | "metadata": {}, 1005 | "outputs": [], 1006 | "source": [ 1007 | "l = ['a', 'b', 'c', 'd']" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": 
"code", 1012 | "execution_count": null, 1013 | "id": "d54c2d22-80d8-4c7f-a006-c7776be0ce93", 1014 | "metadata": {}, 1015 | "outputs": [], 1016 | "source": [ 1017 | "'-'.join(l)" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "code", 1022 | "execution_count": null, 1023 | "id": "f9d625f4-f9dc-45f3-9a72-454048839761", 1024 | "metadata": {}, 1025 | "outputs": [], 1026 | "source": [] 1027 | } 1028 | ], 1029 | "metadata": { 1030 | "kernelspec": { 1031 | "display_name": "Python 3 (ipykernel)", 1032 | "language": "python", 1033 | "name": "python3" 1034 | }, 1035 | "language_info": { 1036 | "codemirror_mode": { 1037 | "name": "ipython", 1038 | "version": 3 1039 | }, 1040 | "file_extension": ".py", 1041 | "mimetype": "text/x-python", 1042 | "name": "python", 1043 | "nbconvert_exporter": "python", 1044 | "pygments_lexer": "ipython3", 1045 | "version": "3.11.7" 1046 | } 1047 | }, 1048 | "nbformat": 4, 1049 | "nbformat_minor": 5 1050 | } 1051 | -------------------------------------------------------------------------------- /Python and Data Visualisation/csv_pretty_print.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b1e1d195-10fc-417c-867f-41d0a4c065eb", 6 | "metadata": {}, 7 | "source": [ 8 | "# CSV Pretty Print" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "2c5ff7a4-5884-4415-8817-2ca0e5614549", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "csv_1 = 'col_1,col_2,col_3,color\\n1,3456,2134,red\\n123,0,0,green\\n349587,-1,14,red'" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "236c2975-e8e1-4533-b31c-934858b8ffbb", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "csv_2 = 
'Sell,List,Living,Rooms,Beds,Baths,Age,Acres,Taxes\\n142,160,28,10,5,3,60,0.28,3167\\n175,180,18,8,4,1,12,0.43,4033\\n129,132,13,6,3,1,41,0.33,1471\\n138,140,17,7,3,1,22,0.46,3204\\n232,240,25,8,4,3,5,2.05,3613\\n135,140,18,7,4,3,9,0.57,3028\\n150,160,20,8,4,3,18,4.00,3131\\n207,225,22,8,4,2,16,2.22,5158\\n271,285,30,10,5,2,30,0.53,5702\\n89,90,10,5,3,1,43,0.30,2054\\n153,157,22,8,3,3,18,0.38,4127\\n87,90,16,7,3,1,50,0.65,1445\\n234,238,25,8,4,2,2,1.61,2087\\n106,116,20,8,4,1,13,0.22,2818\\n175,180,22,8,4,2,15,2.06,3917\\n165,170,17,8,4,2,33,0.46,2220\\n166,170,23,9,4,2,37,0.27,3498\\n136,140,19,7,3,1,22,0.63,3607\\n148,160,17,7,3,2,13,0.36,3648\\n151,153,19,8,4,2,24,0.34,3561'" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "id": "af3ab21d-eac4-4089-b073-fcc4da1e4a05", 34 | "metadata": {}, 35 | "source": [ 36 | "## 1. Parsare\n", 37 | "\n", 38 | "Scrieti o functie care sa parseze un sir de caractere care contine un CSV." 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "id": "e4f8d8b2-72f8-466c-a27e-f258a5cdfd32", 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "def parse_csv(csv_string):\n", 49 | " pass" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "9cea7b26-bcdb-4eb1-978c-e19d33217fe6", 55 | "metadata": {}, 56 | "source": [ 57 | "Exemplu:" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "id": "da4515b4-3255-4a73-9a35-62135c0d0e60", 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "table_1 = [\n", 68 | " ['col_1', 'col_2', 'col_3', 'color'],\n", 69 | " ['1', '3456', '2134', 'red'],\n", 70 | " ['123', '0', '0', 'green'],\n", 71 | " ['349587', '-1', '14', 'red']\n", 72 | "]" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "0010d07b-8785-4dc0-b434-7c99275c7b9c", 78 | "metadata": {}, 79 | "source": [ 80 | "**Hint**: `.split()`" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "id": 
"3234c7b1-fdb2-42ae-9a3f-8ad29d9ddf46", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "parse_csv(csv_1)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "f61c6c4f-eea6-4fa1-b998-4ac478c3dd22", 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "parse_csv(csv_2)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "id": "b38b94cd-2461-42c6-9e12-fdfcbcf44784", 106 | "metadata": {}, 107 | "source": [ 108 | "## 2. Afisare\n", 109 | "\n", 110 | "Scrieti o functie care afiseaza un CSV intr-un format *human-readable*." 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "id": "46e6a694-6c15-47af-ae77-e8ad2029751b", 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "def print_csv(csv_string):\n", 121 | " pass" 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "id": "cc923a69-a65e-41b7-b746-a82965f7d911", 127 | "metadata": {}, 128 | "source": [ 129 | "Exemplu:\n", 130 | "\n", 131 | "```\n", 132 | "col_1 col_2 col_3 color\n", 133 | "-----------------------------\n", 134 | "1 3456 2134 red\n", 135 | "123 0 0 green\n", 136 | "349587 -1 14 red\n", 137 | "```" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "id": "b8dd4c1b-aae4-4f72-9234-64d77d44c4bc", 143 | "metadata": {}, 144 | "source": [ 145 | "**Note**: Puteti considera o valoare constanta pentru latimea coloanelor (de exemplu 8 caractere).\n", 146 | "\n", 147 | "**Hint**: Va puteti folosi de functia scrisa la exercitiul anterior." 148 | ] 149 | }, 150 | { 151 | "cell_type": "markdown", 152 | "id": "b8bb1bd9-c477-4faa-ac5e-db2136c2e85c", 153 | "metadata": {}, 154 | "source": [ 155 | "## 3. 
Bonus Challenge\n", 156 | "\n", 157 | "Rescrieti functia de la exercitiul 2 astfel incat latimea coloanelor sa fie determinata dinamic, in functie de latimea celei mai lungi valori.\n", 158 | "\n", 159 | "Exemplu fara latime dinamica:\n", 160 | "\n", 161 | "```\n", 162 | "col_1 col_2 col_3 \n", 163 | "------------------------------------------------------\n", 164 | "a 2 4 \n", 165 | "b 4 16 \n", 166 | "really_long_value 8 32\n", 167 | "```\n", 168 | "\n", 169 | "Exemplu cu latime dinamica:\n", 170 | "\n", 171 | "```\n", 172 | "col_1 col_2 col_3 \n", 173 | "---------------------------------\n", 174 | "a 2 4 \n", 175 | "b 4 16 \n", 176 | "really_long_value 8 32\n", 177 | "```" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "id": "1e441502-35cd-4a2b-acc9-b0f967ba6fc6", 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "def print_csv(csv_string):\n", 188 | " pass" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "id": "58fcd3e4-3cac-4ced-b5ed-5540498c8394", 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [] 198 | } 199 | ], 200 | "metadata": { 201 | "kernelspec": { 202 | "display_name": "Python 3 (ipykernel)", 203 | "language": "python", 204 | "name": "python3" 205 | }, 206 | "language_info": { 207 | "codemirror_mode": { 208 | "name": "ipython", 209 | "version": 3 210 | }, 211 | "file_extension": ".py", 212 | "mimetype": "text/x-python", 213 | "name": "python", 214 | "nbconvert_exporter": "python", 215 | "pygments_lexer": "ipython3", 216 | "version": "3.11.7" 217 | }, 218 | "toc-autonumbering": true, 219 | "vscode": { 220 | "interpreter": { 221 | "hash": "d7228b2cc0e7476a93584e7e1e912b2ec970aeeec9283636b1f7a7ce72c064cb" 222 | } 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 5 227 | } 228 | -------------------------------------------------------------------------------- /Python and Data Visualisation/csv_pretty_print_sol.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "b1e1d195-10fc-417c-867f-41d0a4c065eb", 6 | "metadata": {}, 7 | "source": [ 8 | "# CSV Pretty Print" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "id": "2c5ff7a4-5884-4415-8817-2ca0e5614549", 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "csv_1 = 'col_1,col_2,col_3,color\\n1,3456,2134,red\\n123,0,0,green\\n349587,-1,14,red'" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "e7443b92-2c6f-400e-bffe-bd17e8f323b2", 25 | "metadata": {}, 26 | "outputs": [ 27 | { 28 | "name": "stdout", 29 | "output_type": "stream", 30 | "text": [ 31 | "col_1,col_2,col_3,color\n", 32 | "1,3456,2134,red\n", 33 | "123,0,0,green\n", 34 | "349587,-1,14,red\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "print(csv_1)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "236c2975-e8e1-4533-b31c-934858b8ffbb", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "csv_2 = 'Sell,List,Living,Rooms,Beds,Baths,Age,Acres,Taxes\\n142,160,28,10,5,3,60,0.28,3167\\n175,180,18,8,4,1,12,0.43,4033\\n129,132,13,6,3,1,41,0.33,1471\\n138,140,17,7,3,1,22,0.46,3204\\n232,240,25,8,4,3,5,2.05,3613\\n135,140,18,7,4,3,9,0.57,3028\\n150,160,20,8,4,3,18,4.00,3131\\n207,225,22,8,4,2,16,2.22,5158\\n271,285,30,10,5,2,30,0.53,5702\\n89,90,10,5,3,1,43,0.30,2054\\n153,157,22,8,3,3,18,0.38,4127\\n87,90,16,7,3,1,50,0.65,1445\\n234,238,25,8,4,2,2,1.61,2087\\n106,116,20,8,4,1,13,0.22,2818\\n175,180,22,8,4,2,15,2.06,3917\\n165,170,17,8,4,2,33,0.46,2220\\n166,170,23,9,4,2,37,0.27,3498\\n136,140,19,7,3,1,22,0.63,3607\\n148,160,17,7,3,2,13,0.36,3648\\n151,153,19,8,4,2,24,0.34,3561'" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "id": "af3ab21d-eac4-4089-b073-fcc4da1e4a05", 55 | "metadata": {}, 56 | "source": [ 57 | "## 1. 
Parsare\n", 58 | "\n", 59 | "Scrieti o functie care sa parseze un sir de caractere care contine un CSV." 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 4, 65 | "id": "160e6ba3-3f58-4cb4-848f-e36180f86136", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "rows = csv_1.split('\\n')\n", 70 | "tabel = []\n", 71 | "for row in rows:\n", 72 | " tabel.append(row.split(','))" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "id": "f8c647f0-07d7-408a-8328-4b63c400208f", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "[['col_1', 'col_2', 'col_3', 'color'],\n", 85 | " ['1', '3456', '2134', 'red'],\n", 86 | " ['123', '0', '0', 'green'],\n", 87 | " ['349587', '-1', '14', 'red']]" 88 | ] 89 | }, 90 | "execution_count": 5, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "tabel" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 6, 102 | "id": "e4f8d8b2-72f8-466c-a27e-f258a5cdfd32", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "def parse_csv(csv_string):\n", 107 | " rows = csv_string.split('\\n')\n", 108 | " tabel = []\n", 109 | " for row in rows:\n", 110 | " tabel.append(row.split(','))\n", 111 | " return tabel" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "9cea7b26-bcdb-4eb1-978c-e19d33217fe6", 117 | "metadata": {}, 118 | "source": [ 119 | "Exemplu:" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 7, 125 | "id": "da4515b4-3255-4a73-9a35-62135c0d0e60", 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "table_1 = [\n", 130 | " ['col_1', 'col_2', 'col_3', 'color'],\n", 131 | " ['1', '3456', '2134', 'red'],\n", 132 | " ['123', '0', '0', 'green'],\n", 133 | " ['349587', '-1', '14', 'red']\n", 134 | "]" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "0010d07b-8785-4dc0-b434-7c99275c7b9c", 140 | 
"metadata": {}, 141 | "source": [ 142 | "**Hint**: `.split()`" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 8, 148 | "id": "3234c7b1-fdb2-42ae-9a3f-8ad29d9ddf46", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "[['col_1', 'col_2', 'col_3', 'color'],\n", 155 | " ['1', '3456', '2134', 'red'],\n", 156 | " ['123', '0', '0', 'green'],\n", 157 | " ['349587', '-1', '14', 'red']]" 158 | ] 159 | }, 160 | "execution_count": 8, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "parse_csv(csv_1)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 9, 172 | "id": "f61c6c4f-eea6-4fa1-b998-4ac478c3dd22", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/plain": [ 178 | "[['Sell', 'List', 'Living', 'Rooms', 'Beds', 'Baths', 'Age', 'Acres', 'Taxes'],\n", 179 | " ['142', '160', '28', '10', '5', '3', '60', '0.28', '3167'],\n", 180 | " ['175', '180', '18', '8', '4', '1', '12', '0.43', '4033'],\n", 181 | " ['129', '132', '13', '6', '3', '1', '41', '0.33', '1471'],\n", 182 | " ['138', '140', '17', '7', '3', '1', '22', '0.46', '3204'],\n", 183 | " ['232', '240', '25', '8', '4', '3', '5', '2.05', '3613'],\n", 184 | " ['135', '140', '18', '7', '4', '3', '9', '0.57', '3028'],\n", 185 | " ['150', '160', '20', '8', '4', '3', '18', '4.00', '3131'],\n", 186 | " ['207', '225', '22', '8', '4', '2', '16', '2.22', '5158'],\n", 187 | " ['271', '285', '30', '10', '5', '2', '30', '0.53', '5702'],\n", 188 | " ['89', '90', '10', '5', '3', '1', '43', '0.30', '2054'],\n", 189 | " ['153', '157', '22', '8', '3', '3', '18', '0.38', '4127'],\n", 190 | " ['87', '90', '16', '7', '3', '1', '50', '0.65', '1445'],\n", 191 | " ['234', '238', '25', '8', '4', '2', '2', '1.61', '2087'],\n", 192 | " ['106', '116', '20', '8', '4', '1', '13', '0.22', '2818'],\n", 193 | " ['175', '180', '22', '8', '4', '2', '15', '2.06', '3917'],\n", 194 | " 
['165', '170', '17', '8', '4', '2', '33', '0.46', '2220'],\n", 195 | " ['166', '170', '23', '9', '4', '2', '37', '0.27', '3498'],\n", 196 | " ['136', '140', '19', '7', '3', '1', '22', '0.63', '3607'],\n", 197 | " ['148', '160', '17', '7', '3', '2', '13', '0.36', '3648'],\n", 198 | " ['151', '153', '19', '8', '4', '2', '24', '0.34', '3561']]" 199 | ] 200 | }, 201 | "execution_count": 9, 202 | "metadata": {}, 203 | "output_type": "execute_result" 204 | } 205 | ], 206 | "source": [ 207 | "parse_csv(csv_2)" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "id": "b38b94cd-2461-42c6-9e12-fdfcbcf44784", 213 | "metadata": {}, 214 | "source": [ 215 | "## 2. Afisare\n", 216 | "\n", 217 | "Scrieti o functie care afiseaza un CSV intr-un format *human-readable*." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 10, 223 | "id": "46e6a694-6c15-47af-ae77-e8ad2029751b", 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "def print_csv(csv_string):\n", 228 | " tabel = parse_csv(csv_string)\n", 229 | " for cell in tabel[0]:\n", 230 | " print(cell.ljust(8), end='')\n", 231 | " print()\n", 232 | " print('-' * 8 * len(tabel[0]))\n", 233 | " for row in tabel[1:]:\n", 234 | " for cell in row:\n", 235 | " print(cell.ljust(8), end='')\n", 236 | " print()" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 11, 242 | "id": "02d0998b-a1f8-4fd6-9a03-a88a61762a62", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "col_1 col_2 col_3 color \n", 250 | "--------------------------------\n", 251 | "1 3456 2134 red \n", 252 | "123 0 0 green \n", 253 | "349587 -1 14 red \n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "print_csv(csv_1)" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": 12, 264 | "id": "12dc3542-b1de-454f-b1c2-cc0b989e7ab4", 265 | "metadata": {}, 266 | "outputs": [ 267 | { 268 | "name": "stdout", 269 
| "output_type": "stream", 270 | "text": [ 271 | "Sell List Living Rooms Beds Baths Age Acres Taxes \n", 272 | "------------------------------------------------------------------------\n", 273 | "142 160 28 10 5 3 60 0.28 3167 \n", 274 | "175 180 18 8 4 1 12 0.43 4033 \n", 275 | "129 132 13 6 3 1 41 0.33 1471 \n", 276 | "138 140 17 7 3 1 22 0.46 3204 \n", 277 | "232 240 25 8 4 3 5 2.05 3613 \n", 278 | "135 140 18 7 4 3 9 0.57 3028 \n", 279 | "150 160 20 8 4 3 18 4.00 3131 \n", 280 | "207 225 22 8 4 2 16 2.22 5158 \n", 281 | "271 285 30 10 5 2 30 0.53 5702 \n", 282 | "89 90 10 5 3 1 43 0.30 2054 \n", 283 | "153 157 22 8 3 3 18 0.38 4127 \n", 284 | "87 90 16 7 3 1 50 0.65 1445 \n", 285 | "234 238 25 8 4 2 2 1.61 2087 \n", 286 | "106 116 20 8 4 1 13 0.22 2818 \n", 287 | "175 180 22 8 4 2 15 2.06 3917 \n", 288 | "165 170 17 8 4 2 33 0.46 2220 \n", 289 | "166 170 23 9 4 2 37 0.27 3498 \n", 290 | "136 140 19 7 3 1 22 0.63 3607 \n", 291 | "148 160 17 7 3 2 13 0.36 3648 \n", 292 | "151 153 19 8 4 2 24 0.34 3561 \n" 293 | ] 294 | } 295 | ], 296 | "source": [ 297 | "print_csv(csv_2)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "id": "cc923a69-a65e-41b7-b746-a82965f7d911", 303 | "metadata": {}, 304 | "source": [ 305 | "Exemplu:\n", 306 | "\n", 307 | "```\n", 308 | "col_1 col_2 col_3 color\n", 309 | "-----------------------------\n", 310 | "1 3456 2134 red\n", 311 | "123 0 0 green\n", 312 | "349587 -1 14 red\n", 313 | "```" 314 | ] 315 | }, 316 | { 317 | "cell_type": "markdown", 318 | "id": "b8dd4c1b-aae4-4f72-9234-64d77d44c4bc", 319 | "metadata": {}, 320 | "source": [ 321 | "**Note**: Puteti considera o valoare constanta pentru latimea coloanelor (de exemplu 8 caractere).\n", 322 | "\n", 323 | "**Hint**: Va puteti folosi de functia scrisa la exercitiul anterior." 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "id": "b8bb1bd9-c477-4faa-ac5e-db2136c2e85c", 329 | "metadata": {}, 330 | "source": [ 331 | "## 3. 
Bonus Challenge\n", 332 | "\n", 333 | "Rescrieti functia de la exercitiul 2 astfel incat latimea coloanelor sa fie determinata dinamic, in functie de latimea celei mai lungi valori.\n", 334 | "\n", 335 | "Exemplu fara latime dinamica:\n", 336 | "\n", 337 | "```\n", 338 | "col_1 col_2 col_3 \n", 339 | "------------------------------------------------------\n", 340 | "a 2 4 \n", 341 | "b 4 16 \n", 342 | "really_long_value 8 32\n", 343 | "```\n", 344 | "\n", 345 | "Exemplu cu latime dinamica:\n", 346 | "\n", 347 | "```\n", 348 | "col_1 col_2 col_3 \n", 349 | "---------------------------------\n", 350 | "a 2 4 \n", 351 | "b 4 16 \n", 352 | "really_long_value 8 32\n", 353 | "```" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 13, 359 | "id": "1e441502-35cd-4a2b-acc9-b0f967ba6fc6", 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "def print_csv(csv_string):\n", 364 | " pass" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": null, 370 | "id": "58fcd3e4-3cac-4ced-b5ed-5540498c8394", 371 | "metadata": {}, 372 | "outputs": [], 373 | "source": [] 374 | } 375 | ], 376 | "metadata": { 377 | "kernelspec": { 378 | "display_name": "Python 3 (ipykernel)", 379 | "language": "python", 380 | "name": "python3" 381 | }, 382 | "language_info": { 383 | "codemirror_mode": { 384 | "name": "ipython", 385 | "version": 3 386 | }, 387 | "file_extension": ".py", 388 | "mimetype": "text/x-python", 389 | "name": "python", 390 | "nbconvert_exporter": "python", 391 | "pygments_lexer": "ipython3", 392 | "version": "3.11.7" 393 | }, 394 | "toc-autonumbering": true, 395 | "vscode": { 396 | "interpreter": { 397 | "hash": "d7228b2cc0e7476a93584e7e1e912b2ec970aeeec9283636b1f7a7ce72c064cb" 398 | } 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 5 403 | } 404 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # Workshops 2024 | Third Edition 2 | 3 | ## NLP Special 4 | * Speaker: Ioan Ungureanu 5 | * [GenAI in Real Life Presentation](https://docs.google.com/presentation/d/1bnmyUUnOXKsOok6Exn2Ik6IGhDvX8Msm/edit?usp=drive_link&ouid=104125965785644515865&rtpof=true&sd=true) 6 | 7 | ## Python & Data Visualization 8 | * Speaker: Matei Simtinică 9 | * [Exercises, Solutions](https://github.com/Nitro-Language-Processing/Workshops-2024/tree/main/Python%20and%20Data%20Visualisation) 10 | 11 | ## Machine Learning 12 | * Speaker: Matei Simtinică 13 | * [Toy Dataset, Exercises](https://github.com/Nitro-Language-Processing/Workshops-2024/tree/main/Machine%20Learning) 14 | 15 | ## Natural Language Processing 16 | * Speaker: Miruna-Andreea Zăvelcă 17 | * [Slides, Exercises, Solutions](https://github.com/Nitro-Language-Processing/Workshops-2024/tree/main/Natural%20Language%20Processing) 18 | 19 | ## Deep Learning 20 | * Speaker: Antonio Bărbălău 21 | * [Code Example](https://github.com/Nitro-Language-Processing/Workshops-2024/tree/main/Deep%20Learning) 22 | 23 | ## Transformers 24 | * Speaker: Dana Dăscălescu 25 | * [Theory, Code Example](https://github.com/Nitro-Language-Processing/Workshops-2024/tree/main/Transformers) 26 | --------------------------------------------------------------------------------