├── .gitignore ├── Chapter_10 ├── Final - agglomerative.ipynb ├── Final - dbscan.ipynb ├── Final - kmeans.ipynb ├── Final - mds.ipynb ├── Final - pca.ipynb └── cps.csv ├── Chapter_11 ├── Sample_code_GAN.ipynb └── distance_by_vgg.ipynb ├── Chapter_12 ├── DataModel0P100T50Run1234.csv ├── Fit.R ├── hawkes_eg.R └── hawkes_functions.R ├── Chapter_13 └── SNA.R ├── Chapter_14 └── text_mining_chapter.ipynb ├── Chapter_7 ├── Illustrating_SMMH_in_R.html └── R_scripts_for_vignette.R ├── Chapter_8 ├── bonus-plotly_express_ipywidgets.ipynb ├── html_example.html ├── stream_data.txt ├── stream_data_generation.py ├── structured_data.ipynb ├── structured_example_log.json ├── structured_example_log.xml ├── unstructured_data.ipynb ├── unstructured_data_microbatch.ipynb ├── unstructured_data_stream.ipynb └── unstructured_example_log.txt ├── Chapter_9 ├── chat_bigram_feature.csv.gz ├── chat_label.csv └── supervised_learning.ipynb ├── Erratas └── Chapter_14_erratas.md ├── README.md ├── book_cover.png └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | .DS_Store -------------------------------------------------------------------------------- /Chapter_10/Final - agglomerative.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "image/png": "\n", 11 | "text/plain": [ 12 | "
" 13 | ] 14 | }, 15 | "metadata": {}, 16 | "output_type": "display_data" 17 | } 18 | ], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "from scipy.cluster.hierarchy import dendrogram, linkage\n", 22 | "import matplotlib.pyplot as plt\n", 23 | "from sklearn.cluster import AgglomerativeClustering\n", 24 | "\n", 25 | "%matplotlib inline\n", 26 | "plt.style.use('seaborn-whitegrid')\n", 27 | "plt.figure(figsize=(8, 4))\n", 28 | "\n", 29 | "cps = pd.read_csv('cps.csv', header=None)\n", 30 | "X = cps.iloc[:,:].values\n", 31 | "Z = linkage(X, 'ward')\n", 32 | "\n", 33 | "dendrogram(Z, truncate_mode='lastp', p=12, leaf_rotation=-45.,\n", 34 | " leaf_font_size=12., show_contracted=True)\n", 35 | "\n", 36 | "plt.xlabel('Cluster Size', fontsize=12)\n", 37 | "plt.ylabel('Distance', fontsize=12)\n", 38 | "plt.axhline(y=250, linestyle='dashed')\n", 39 | "plt.axhline(y=150, linestyle='dotted')\n", 40 | "plt.show()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [] 56 | } 57 | ], 58 | "metadata": { 59 | "kernelspec": { 60 | "display_name": "Python 3", 61 | "language": "python", 62 | "name": "python3" 63 | }, 64 | "language_info": { 65 | "codemirror_mode": { 66 | "name": "ipython", 67 | "version": 3 68 | }, 69 | "file_extension": ".py", 70 | "mimetype": "text/x-python", 71 | "name": "python", 72 | "nbconvert_exporter": "python", 73 | "pygments_lexer": "ipython3", 74 | "version": "3.8.8" 75 | } 76 | }, 77 | "nbformat": 4, 78 | "nbformat_minor": 4 79 | } 80 | -------------------------------------------------------------------------------- /Chapter_10/Final - dbscan.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 
| { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Counter({-1: 67, 0: 61, 1: 31})\n" 13 | ] 14 | }, 15 | { 16 | "data": { 17 | "image/png": "\n", 18 | "text/plain": [ 19 | "
" 20 | ] 21 | }, 22 | "metadata": { 23 | "needs_background": "light" 24 | }, 25 | "output_type": "display_data" 26 | } 27 | ], 28 | "source": [ 29 | "import numpy as np\n", 30 | "import pandas as pd\n", 31 | "import matplotlib.pyplot as plt\n", 32 | "import matplotlib.cm as cm\n", 33 | "from sklearn.cluster import DBSCAN\n", 34 | "from collections import Counter\n", 35 | "\n", 36 | "%matplotlib inline\n", 37 | "plt.figure(figsize=(4, 4))\n", 38 | "\n", 39 | "cps = pd.read_csv('cps.csv', header=None)\n", 40 | "X = cps.iloc[:,:].values\n", 41 | "\n", 42 | "model = DBSCAN(eps=8.0, min_samples=8).fit(X)\n", 43 | "print(Counter(model.labels_))\n", 44 | "\n", 45 | "cps_df = pd.DataFrame(cps)\n", 46 | "cps_df.columns = ['FI','MU','EM', 'EV', 'S']\n", 47 | "color_theme = cm.rainbow(np.linspace(0,1,len(Counter(model.labels_))))\n", 48 | "plt.scatter(x=cps_df.FI,y=cps_df.EM,c=color_theme[model.labels_],s=30)\n", 49 | "plt.xlabel(cps_df.columns[0], fontsize=12)\n", 50 | "plt.ylabel(cps_df.columns[2], fontsize=12)\n", 51 | "plt.show()" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [] 67 | } 68 | ], 69 | "metadata": { 70 | "kernelspec": { 71 | "display_name": "Python 3", 72 | "language": "python", 73 | "name": "python3" 74 | }, 75 | "language_info": { 76 | "codemirror_mode": { 77 | "name": "ipython", 78 | "version": 3 79 | }, 80 | "file_extension": ".py", 81 | "mimetype": "text/x-python", 82 | "name": "python", 83 | "nbconvert_exporter": "python", 84 | "pygments_lexer": "ipython3", 85 | "version": "3.8.5" 86 | } 87 | }, 88 | "nbformat": 4, 89 | "nbformat_minor": 2 90 | } 91 | -------------------------------------------------------------------------------- /Chapter_10/Final - kmeans.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "scrolled": true 8 | }, 9 | "outputs": [ 10 | { 11 | "data": { 12 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAARIAAAEJCAYAAABYJqh7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAABFKElEQVR4nO2dd3hU1daHf3t6SyH0HppU6Uqz0JuI4EUFQUCxcD97veq1KyAoAnpFpdgRQYqoKAhBFBWQIkjvnRBaQuokmcz6/liZZCZzZjKZnmS/z3OeyZyyz5qBs2bvVQURQSKRSAJBFWkBJBJJ+UcqEolEEjBSkUgkkoCRikQikQSMVCQSiSRgNJEWoKxUq1aNEhMTIy2GRFLp2LZt20Uiqq50rNwpksTERGzdujXSYkgklQ4hxAlPx+TSRiKRBIxUJBKJJGCkIpFIJAETNkUihHhcCLFHCLFbCLFQCGEQQiQIIdYIIQ4VvlYJlzwSiSR4hEWRCCHqAngEQGciagNADWAkgGcBJBFRMwBJhe8lEkk5I5xLGw0AoxBCA8AE4CyAWwB8Vnj8MwDDwiiPRBJ5fvkFaN0aUKuBhg2Br7+OtER+ERZFQkRnALwN4CSAZABXiOhnADWJKLnwnGQANZSuF0LcL4TYKoTYeuHChXCILJGEnr17gSFD+NVuB06eBCZMAH7+OdKSlZlwLW2qgGcfjQDUAWAWQozx9XoimkNEnYmoc/XqivEwEkn54733gNxc133Z2cDkyZGRJwDCtbTpC+AYEV0gonwAywB0B5AihKgNAIWv58Mkj0QSeU6fBgoK3PefOxd+WQIkXIrkJICuQgiTEEIA6ANgH4DvAIwrPGccgBVhkkciiTzDhwMmk+s+nQ4YOjQy8gRAuGwkmwEsAbAdwK7C+84B8CaAfkKIQwD6Fb6XSCLLqlVA8+aARgM0bQr88ENo7nPXXUCXLoDFwveyWIDGjYHnnw/N/UKIKG+lFjt37kwy10YSMrZsAXr2ZFuFA5MJWLMG6N49+Pez24GkJGDbNlZeQ4YAWm3w7xMEhBDbiKiz0jEZ2SqRODN9OpCT47ovOxuYNi1097Tbi7dySrnL/pVIQsqZM4DSLP3s2eDfKy8P6N+fZyM5OYDRCNSpA2zeDMTHB/9+IUTOSCQSZ4YP5wfaGaOR9webL78Etm4FMjPZe5OZCZw4Id2/EklUkZ4OjBvHNg6zmYO9MjO9X/Pvf3OkqcXC0aYWC3DVVcAjjwRfvuXLgaws1325ucB33wX/XiFGLm0kFZebbwY2beIlBAAsWMCxG6tXe77GaORrVq0C/vmHlcrgwexVCTb16rGyKhlLUqtW8O8VYqTXRlIxOXQIaNfO3XBqMAD793NeS6TZtw/o3NndQ/Ttt0C/fhETyxPSayOpfJw/r+xG1emAaMnXatkSWLkSaNOmOGnv00+jUomUhlzaSComHTooh58DwNVXh1cWb/TsCezaFWkpAkbOSCQVE5MJmD+fbR56PS9pjEb+xdfrg3+/7Gxg4kQ2zppMHLWalhb8+0QpckYiqbjccQfQtSuwbBkgBDBiBBs4Q8HIkRz9arXy+8WLgcOHgY0bQ3O/KEMqEknFpmFD4PHHQ3uP5GSuIeJcEiAvj70+e/aw56eCI5c2EkmgXLzIRtySaDRASkr45YkAUpFIJIHSooVynInNBlxzTfjliQBSkUgkgaLVAp9/zsZcg4FnJ0YjMHs2EBMTaenCgrSRS
CTBYMgQDoJbsoRnIrfeCjRqFGmpwoZUJBJJsKhbF3j00UhLERHCVfy5uRBih9OWLoR4TDbIkkgqBuEqtXiAiNoTUXsAnQBkA1gO2SBLIqkQRMLY2gfAESI6AdkgS1LZ2bABaN+eDbRNm3LwXDkkEjaSkQAWFv7t0iBLCOGxQRaA+wGgQYMGYRFSIgk5+/cDAwcWZ/8eOcKh9XFxQJ8+kZWtjIR1RiKE0AEYCuCbslwnG2RJgkJeHntV3noL+P1315KKqanA3LnAzJnsfQkH776r3CDrjTfCc/8gEu4ZySAA24nIEe6XIoSoXTgbkQ2yJKHjwgVu/XDxIufD6HTAgAHAN99w5fh+/bj4ss0GPPccMGUK8NhjoZXp5EnlDOXk5NDeNwSE20YyCsXLGkA2yJKEi+ee4+poGRlAfj6XOFy9GlixArjzTt6flcUzBKuVzw/1Az1smHKDrCFDQnvfEBA2RSKEMIGbYDlbk2SDLEnoyMgA7ruPo0s//pgViDNZWZyle+aM+7UaDbB+fWjlGzsW6NiRSw8Iwa8NGpTeICsvD3jmGaBKFVZEd94JXLoUWllLIWxLGyLKBlC1xL5LYC+ORBJ8brkF+PNPdzuEA52OH1wlhABCbY/T6VhZrV7N1eSbN+dq9UoJgM7cfz8rQEcZySVLgN27gZ07We4IICNbJRWTw4e5iLMnJQJwjsy//812k4ULix9MtRqoWhXo1Sv0cqrVXFx68GB+f+oUL7c0Gg6zr1HCkXnlCvD1166fKz8fOHaM++F07Rp6mRWQSXuSism5c96PX3011xBJTAQ++IB/5c1mfrD792evjlodFlGL+OornpU8/TTw5JOcq7Nmjes5qamASuGxValK/8whRFaRl1RMLl70vDRJS+NYDSWIIrM8yMjgNhTOFeUBoFo1VhAOpWa3c05PSaWh17MXqOQMJojIKvKSigsR8Pbb/BAajbxEOH4cOHCAU/pLYjB4jxOJkI0Bmzcr1zSxWjlwzYFKBXzxBRtZDQa+xmgEpk4NqRIpDWkjkZRvpkwBJk0q/iVfvZrtBKtXKy8BAP6VjzaqVVOOKcnPZ3uNM337sg1o0SK26wwbxq0tIohUJJLyzbRprssBu53f79vHOSxbtxZ32tPrOZ9l6VKgWzfeIjUDKUm7dtwadPfuYje1wcCh8kqd92rXDn3AXBmQSxtJ+cVu5/6+JcnN5WCylSvZnarVsktVo+Flz/PPs0H19tt5jGhACDasDh3Kcur1wKhRPOsoB0hFIim/qFTc8rIkGg3QuzcQH1/sKh07lmcmmZn8mpUF/PQT8P33YRfbI1WrckxIXh4vWT7+mD1J5QCpSCSh5/ffOYLTYOBCyStXBm/sefPYA2MysWIxmYB77uGlggMh+J5Kka0rSsnKuHQJGD2ao06rVuXQ+ZLjKPHtt7xUMRi4AHRZ+tsIET1LLl8honK1derUiSTliD17iMxmIvav8GYyEf32W/DucekS0ezZRK++SrRpk/I5bdq4ygAQ6XREzz/veVy7na/TaouvMRqJJkzwLs/atfwZne9lNhMdPOj/Z4wCAGwlD89lxBVDWTepSMoZ991HpFa7P8QDB4ZXjoULlR/uY8c8X/PHH0QWi7vsBgNRerrn62680f0ajYbokUeC/anCijdFIpc2ktBy4oSyW/PUqfDKMXIkMH16cZBa69bsIk5M9HxNcrLyEkMI7319lZIAbTb+LiooUpFIQsvQoe6p8no9cNNN4Zdl4kTg/HlWbLt3Az16eD+/Rw9le0h8PEeXemLwYPfEO7MZuPnmMotcXpCKRBJaJkzgX3+Lhd9bLED9+sCzYajznZnJyiMujh/+hx7iGBNPgWolqVWLq5UZjcUuWZOJI0u9jfHSS6xonD9z+/bAmDGBfqKoRebaSEKPzcZek+3bOQJz+HB+KENN377sMXJkyhoM7BYuq9do92727phMwB13AHXqlH6N1cqBbwcPstdm0KDwJwEGGW+5NjKyV
RJ6NBquDXLLLeG755Ej7rVIrFZg3Tq2z9Sv7/tYsbE8ozEa3ZdpnjAY2G1cSQhnhbR4IcQSIcR+IcQ+IUQ32SBLEjKSkzmitSQ6HZCS4r7fE3PnFqf2P/IIF0IqS0xIJSGcNpJZAFYRUQsA7QDsg2yQJQkVHTrwkqokRECbNr6NcfEiKw+rlSNNs7I43X/UKNcK9JKwteyMBXADgPkAQER5RJQG2SBLEirMZm4t4WyXUKuB//1PubyAEr/9plz28MQJXuIMHBi+1hVRTrhmJI0BXADwiRDibyHEPCGEGSUaZAHw2CBLCLFVCLH1woULYRJZUu5ZvNhVkWg0vM9XqlXznNRntXKFtW7dlBMHKxnhUiQaAB0BfEBEHQBkoQzLGJINsiRl5cgR4I8/iksIAGx4TUpSDoZLTwc++wx4/33OEAaA664Datb07G0hYoWilKFbUAD8+CMwaxbLEY1LofPn2QY0Zw7/HQieQl6DuQGoBeC40/vrAawEcABA7cJ9tQEcKG0sGSIv8YkNGzgvpmSoutFI9Ndfrudu2UIUG8sh80Yjb++/z8fOnOFwfqUwf4BICKKXX3Yd78oVztGxWIj0eh73ppuI8vPD8tF9YuVKThlw3r7/3usliHSIPBGdA3BKCNG8cFcfAHshG2RJQkXLlsVV4Z3JyeEMZAdE3BcmPZ2NqTk5vD35JHt36tThcgN5eRyDUjJk3mTi/c5Mnsy2k8xMngVlZXHbiWipLZKXxwbj7GzXbfRo71X3vRBOr83DABYIIf4B0B7AZMgGWRJ/SEnhwLCYGI4+nTzZ3Zaxf7/nmq0HDxa/v3iRiyaXRKvlmBMHKhUwezbHkzhiScxmjo25/nrXa5ctc38gs7K41kg4IWLjcv36HF178828bNuxw/M1f//t163C2SBrBwClqDjZIEviOwUFQPfu/PDbbPyrP2kSJ9FNm1Z8XtWqymHsQgAJCcXvvQWYlazt2rw5cPQo98BJTuaZyI03us9SqlVz9+ao1colE0PJjBnAiy8Wl6L88Ufgr7/YTqSUQ6RUH9ZHZK6NpHyxdi03BHeOEcnOZiOp88PRogX3rnEOStPpgGuv5X4xDsxmzgw2Gov3qdXcDrPkkgXg86pX561aNVclcukSMH8+x7A4jwdwSsCDD/r3mf1lyhTlerY7dgCdOrm6tnU6lrtZM//u5cl4Eq2bNLZWcj7+2L1QEsDFh1JTXc+9dIno1lu5FohWS3T77URpae5jWq1EEydynRG1mqhPH6ITJ9zPO3GCqHZtopgYPtdoJHrsMT62di3LZTaz4VKrJYqLY2NskyZEq1cH+5soHZVKuS7K1Kn8Xd12G7/XaIhGjCC6fNnrcJCFjSQVhsOH+SEu+YA0bcoVzZQoKOCtNOx2IpvN8/EhQ9wfTpOJ6M8/iapWdZfJbFZWXOGiSxd3mUwmos2bi8/x9buhKPDaSCRBo0kT4PHHi2u0Ggy8PPn0U891TlUqd3vJ8eNsfLRYOH/mgw94v7cM3aQkd6Ou1Qp8+aWyt0Ot5oznSDF3LiccOpZZZjN7a669tvgcpe/GD2T2r6T8MXkyN9j+/nv2oowaVTZDZnY20KULe2zsdvaoPPUU210eftjzdbGx7i5lvZ5dxEp5PQUFrobdcHP11cXG4XPngAEDOMguFHiaqkTrJpc2koD57DPlWqw1ani/bupU97qvsbFEFy8S9evHxaSdbRHt23tebpVDIJc2EokTZ84oB6tduuT9uqeeAv7zH56ZCMFejvXr2WX68MOutWntds4cLm9tJfxEVkiTVD42beLqaVlZrvu7d+e8mNIgYqXhaPpdUMBLq4sXXc8zmzmHxddiSFGOtwppckYiKX+cP89h7bGxbJ+YOrVsrTe7dAFGjCjuYmc0cl3Xjz7y7XohipUIwH2GrVb389RqYPNm3+UqycKF3KvYYgH69eP7RCnS2CopXxQUcHX348fZwJmRAbz2GpCaCrzpY4aFEMAnnwB3380tK
WrXZsXkZ1Qn4uODb2xdtAi4997igLKkJC5ZcPiwe8RtFCBnJJKoJS2NM/vnzOGIdAAc2ZqS4h7Z+t57vrXSdCAEh7dPnsz2DX+VCADUq8feEOdIUY2GSxD89hsvpcpqQnjlFdeoVCJOtvv0U//lBICtWzn/5scflfsN+YmckUiikj/+4AJkAK9aHn2Uo8/vtJ5RXsbk57PNIz4+rHIWsXQp8MAD/Gq3s10kJQV45hle4vTuDSxf7nsleaW6sjk5xbVSyordztm9333Hf2u1vCzcuJHTAQJEzkgkUYfdDtx+O+fjZWbyD7PVyjP9jA43KCuSBg3YzhEpYmPZpuEoQeAoH2C18uu6dWUrI9Czp3ugmMUC9O/vn3zff8+b48vMyACOHeMePEFAKhJJyNm8mZf3jj5Ra9Z4P//YMeWOmFot8NvZprhw73MYrf4acUhDXZzBNO3zsM//JDSu1tRUbvIVH89LlddeU7aHOFCreeahVEbgm298v+/06WxfcXh8LBYuVzBkSJk/AgCWqaSXKi8P+PZb/8YrgVzaSELK/v1Anz7F/4d37gSGDWNl0r278jVxccrLd7udn+fuP72I47DDBhXSEYdXNa8jdZUKU24MsvBEvCTZu7e4ZOPUqWywcYTUK1G1qnIZgZo1fb93o0ZcLvKrr1iz9uzJkan+hrNXr86auKQdKQjLGgAyslUSWiZOVK5SOHiw8vm7dnGVw2uu4SqFzoGizZsTrVrFybclx9Prid57j+jQoSAKv3GjcgSswUCUkeH5um+/dY+ANZmI/vmn9HueO0c0bx7Rp5+Wmo1bJg4dUpZpwQKfh0A0ZP8COA5gF4AdDoEAJABYA+BQ4WuV0saRiqR80a+f+3MIELVu7Xqe3U700EPFJVMtFk6e1WpZEfXvz+VT589XriLgeL4NBqLXXw+S8N98o6y1jEblMgPOzJlDVK0aZwsnJhL9+GPp91uyhMc2m4u/gPXrg/NZiIiSkrikgUpFVKUK0axZZbo8mhRJtRL7pgF4tvDvZwFMLW0cqUjKF7Nmuf8Q6nRETz3let6GDe7nOcpk5OUVn3fokHJN55LP+b59QRD+zBnXaZFjq1nT97IEVqtv98rMdP8CHPk/3kob+IPV6lcOkDdFEmlj6y2QDbIqNPfeC1x1FdsKAQ4mrVsXeLZEM5IffnBPf7HZOF7MuchZ06bAQw+xDdKTbdVqBdq1A1q35nH9pk4d9mo4ShbodLDpTLhbfIrYeBWGDmXzhUeE8L1Z+saNrtGyDrKzgQMH/BLfI3p98A3TnjRMsDcAxwBsB7ANwP2F+9JKnJPq4dr7AWwFsLVBgwZl1qSSyJKXR7RoEc9CPv2UKDvb/Zy33lKuV1S3rvKYmzYRPfssL328zU5MJqKffw7wA2zfTvTf/9JfwyfTVYYTRWOrVETVqxOlpwc4PhHRtm3KazaDgWdGUQCiZGlTp/C1BoCd4BaePikS500ubSomycnudk2TiWjGDO/X/ec/yisC561Hj+DIWK+e+9hmM9HcuUEY3G7nXjgaTfHgej0bhwLBaiVaupTogw+IDh4MaChviiRsSxsiOlv4eh7AcgDXAkgRQtQGgMLXANt9ScortWpx9HurVryKiInh5c+jj3q/7o03gH//23uCrVJjPX9Q6hablQWcPh2EwYXgL6BfP17iaLXsJy9L7ElJjh8HEhOB8eM5SK5dOw69DwWeNEwwNwBmADFOf/8JYCCAt+BqbJ1W2lhyRlLxsVp9LiNaREEBe01Lrg40GqIJE4IjV8+eXMu55Izkl1+CM34ReXnB6crXp497jVmjkWj3br+GQxTMSGoC+F0IsRPAXwBWEtEqyAZZlZ6FC9kYGxsL3HQTx3Hp9WWPu1KpgLvu4h9dZ8Nu9erA668HR9b333ctgWqxAIMHc+5fUNFqlQ2vZWX9evd0ApuNOwcGmbBEthLRUQDtFPZfgmyQVWlZuNA1U/6nn9h5ceSIfwGXOh3w66+cl7ZxI
yuoUaOKFUugtGrFsn35JRdZ69uXVyJRWwTNbOZWpM7odMGLZnVCVkiTRIyrrnKPJDeZuKyItxrM4SY5mbPujUZg6NDgKaaQ88IL3G3PuRxBXBxw4oRfCY6yQpokKjl3zn1fdjb/P48WvvgCaNyYjb4PPMClR8rN79irr3J3P5OJc33at+cpWwiypKUikUSM6693XxZYLMqdMiPB5cvA/fcXVwLIzASuXOH+5eViIq9Wcz/k9HQW/u+/2YgUAqQikfhMbi7w3HOcxFqjBvDEE8rF2H1l5kzO5nXOlL/xxuKCRqHk1Cku2xofzzOOESM44jYhgasGXL7MP97ORc8cnD3LNpJgk5XFS7rq1dkd/sILZSv65hG1mhuJhRJP7pxo3aT7N3L861+u0acGA9GAAYGNefky0bvvEj35JNHKlWV3+/pDVhZRrVrKrXEduUBt2hCtW+c507hkm+Fg0LOna2qP0Ug0Zkzw7+MviAL3r6Scc/YssHKla7F0q5VLkh454v+4Varwr/Dbb7MrNQjdI/H338CHH3J8l1IxtWXLeKbvqfB8Xl5xRcOEBNfqiHo90LEjFzs7eDBwWR3s2wf89ZdrPaScHI5HK9nlIhqRikTiE8nJytN8rTY003x/sNu5GPx11/Gya/hw7mFV0gN66pRvS7LkZFaU113HykSr5dfdu3n89u3dkw/95dQp5dARnU7ZKB1tSEUi8YlWrTzXXG7fPuziKLJiBceQZGezosjM5MTZkgFpN9xQusnAZuOuFw0acFxXVhYwaBDPGDIyiu/x3nvAtm2By965c3ERNmeEYDd5tCMVicQnjEZg9mx+1Wj4l9lo5DCF2NhIS8csWeJeljQ3l/c70707x4M4+mM5lykAeHlVpQrQti330tqwgZc0a9a4l4C0WnnJFygJCRw/YzTy/TUa/nvuXOWZYLQha7ZKfOauu/iXc+FCfqBGjuSG99GCp5rMJX/phQAWLGAbyqpVPKPYvLnY/mO3F/fR+esv9iL9/jt7lUouifT64AWKPvool2ZdtIiV2+jR5WM2AsjIVkkEuHCBf8X1ei6KHhMTnHE7dAB27HDfbzS6Bnc6Y7XybMCbzUQI4LbbOARj0iTXsWJigKNHo6v53dmzrCBjYvj7deQGBYq3yFY5I5GElaVLeWajUvEDKgRXQevWLfCxL11S3q9ke3Bw5UrpbYOJWFksXMh2l3ffZQXUvDk3vosmJTJvHnvB1Gr+jrVatvGEeuYobSSSsJGRAYwdy7/+jkjRjAz+tS9LD3BP9O2rvL9RI8/X1KhRuiLQ63l5o1Jxh88rV3jbswe45hr/5Q02KSmsRByRuBkZHFh3552hv7dUJJKw8ccfyi7O1FSORTl6FLjlFrY5tGhRtsZ0AEfKlkyoU6l4JuEJIbgVaEmDq+O92cyKZPZszrOZMqV4vzN5ecCLL3KZV0fUr6flVKhYu9b9cwAc73L5cmjvLZc2krARH68883B4Qq69lpWK3c6d9u65h/8eNcq38WNj+Vf5hReAX35hZfTWW6wAvLF2LS8FHOHoKhWHqA8bxl0uz5wpPvbGG2yDeO891zHuvpub2TlsLR98AOzaVXpXwWBSpYpySQMhQh8hH/GQ97JuMkS+/GK3EzVr5towS68nGjiQm2Ip1V5t2jS0MuXkKLe3sFiIPvzQcz3mzMziMVJSlLtWGI1E+/eHVn5n8vLcQ/8NBqK77grO+AgkRF4I8XFpm69KSwihFkL8LYT4ofB9ghBijRDiUOFr8CuuSKIGIXimcMMNxWVJhw7lJczx48pLgVBHdXoztu7d6xoe70ClcjXsnjunHOuh0wWpnquPaLUc89KlC3+/ej276D/6KPT39sVGMh5ANwAXAJzxsPnKowD2Ob1/FkASETUDkFT4XlKBqVsXWLeuOLN98WJekvTs6W7fEIKjS0OJJ2OrzcZLKiWPj8XCn8NB8+bKY+fmcl5OOGnaFPjzT1aQGRnAJ58Ez/3rFU9TFccGb
lq1FEAygA8AdCvtGg/j1AMri94AfijcdwBA7cK/awM4UNo4cmlDtHMn1/WNjyfq0CEIfVuigIICokGDiltSGI38+YLSMa8UVq/mZZVOx8WdDQbupxMfzy0oDAber9XyeT/84D7GZ5+5dpLQaIimTQu97OEEXpY2PgekFS47RgIYC6AqgM8B/I+I0ny8fgmAKQBiADxFREOEEGlEFO90TioRuS1vhBD3g5tkoUGDBp1ORFMJrTBz4gTHBGRkFO8zGrneadCLEIcZu50D1dauBRo2ZFdxuGI0jh3jamhHjwJff+2ahavXczGjxESWqUkT9+tHj+YYGcd1Oh0v4cJpbA013gLS/JlZxIJ79toA9PLxmiEAZhf+3RPFM5K0EuelljZWZZ+RPPWUcne5Xr3Cc/8rV4i++oroyy+5lkg0kJtLtGIFNxg/fjywscaOda9TIgTR7bd7vsabsfXAgcDkiSbgZUbik/tXCKECMADAOAA3AlgJoA8R/eqjMusBYKgQYjAAA4BYIcSXKGyQRUTJskGWbxw+rFw1y1E/I5SsXw/cfHPxe7udf4XDUdHME8ePcxKeo75IQQGXKn3mGf/GO3LE3fhKhZGtnnAYW51nMQDvO3Wq/OTLBIIvXpu3AZwA8DCAbwE0IqJ7y6BEQETPEVE9IkoEL4/WEdEYAN+BlRMKX1eUTfzKx8CB7l3lNBpuixBKbDYuR5iZWbxlZ/OUv+QDFE7uuYdjRzIyOJrTauVmciWr0/vKwIHuMRd6PTBggOdrosnYGil88do8AcAKtm38H4CfhRC/OW8B3F82yCoj48ZxoJXDw2EysR0hVJ0YHeze7TlnZcuW4Nzj2DHg1ls5ia5VK/f0/5LY7VxXVWkG8eOP/snwyCNA/frFkatmM0erPvmk52v0eg5Ac5RYUKn432XKlJC0kIlKfFna3F3K8TKlDxPRegDrC/+WDbLKiMHAzZ+WLuWQ81atgDFjQl8TJC5OOU3fZuOI1UC5coUjWy9fZsWQmspKs6CAZz1KOCI2S8afqNX+yxQbC+zcybEtW7dyRvGoUd57CwNsbO3YkZtn5efzNR06+CdDucST8cSxAXi3xPsJJd4vLW2MYG4VzdiakUH09ddEX3xBdOlSpKXxTvfuroZejYaoXTuOWPUXm43dr2PGKEeYNmvm/fonnnC/zmIh+t//uCdvILJJXIEXY6sviiS9xPvL3o6HeqtIimTjRqLYWK5UbrHwA7FiRaSl8szFi0Q33cTKRKMh6teP6Nw5/8c7f54VRUyMawxGSaXgjbw8ooce4lgPtZpjPwwGjvewWIg6diRKT/dfRkkxgSqSjBLvU70dD/VWURRJQQFRnTruD47ZzO0SopnsbN4CZcwYZVe2s9u1Xz/fxsrLI1q40D03Rq8neuaZwGWVeFckvhhbS9pASnsv8YFDh9guUBK1mm0fzhBxwZpmzbh50vjxwPkIOsqNxuCEXX//vecGUEYj2ytmzfJtLK2Wx1Oq2bp4cWBySkrHF2OrRgjRC4Dw8F4hrUlSGrGx7oWEATY0lmzN+s47wEsvFRsVFyzgNgkHDijXnygvWCzuylSj4bqwt97KCrN6dd/H0+uV9ysl3kmCiy8zkvMAPgYwv3C7VOK9DCLzg9q1OSHNOWtUrWZXo3PVLSKugeHsmbDZuGnSTz+FT95Q8Pjj7t4QvZ6bQj39dNmUCOBZqWZnc9h7yf42kuBRqiIhokQiauRtC4egFZElS7hXilbLv8TXXQckJbkWp7HZlJdA+fmcd1Oeefxx3sxm/g4aN+b4j9IKEXkiLU15f3IyNwOvU4ejcyXBR1aRjwJycnhJU7J8n4NWrbilozMmE7dIqAixCvn5PGuIjVWu8OUr8+dzS4eSdhJnqlblkHalko8S73hL2pM1W6MAo9GzEgG4SZLZXLwMMps5SCualMj588U2jebNgY8/5mVZaSxfDnTqxBm1t91WnDNUUABMncpZwLVrcw1UbwoC4KCwli29f
5d5ecA///j6qSQ+48mdE61bRXH/lpWTJ4lefZXowQe5/kg0BVrl5RE1auQaC2IyEc2Y4f26b75xLa+oUhFVq0aUmko0caLrMb2e6IYbSpclN5dowQLO1tXp3F3KRiPRoUPB+NSVDwQSRxJtW2VVJNHMsmUcVFbyoU1I8H5d8+bu15hMRG+9xUFlSsd27vRNJrudo26dlZtWS3TttQF/3EqLN0UilzaSgDl5Ujmh7/JlZRe3g7Nn3fdlZ3OtVCUbhkbju4HZ0XjrxhuL68P26QP88INv10vKhlQkkoDp0UM5VqNNG+8xHF27uhtXLRaueaLkys3L48Q+X6lZk6utpaXx9tNPZXcpS3xDKhKJGwsXAq1b80M3erTyzAFgV3WXLlzDIz6+ONpVp2OD55w53u8zaxZ7ahz1PywWVi5Dh3Llc0davhDspfrvf1k5+EpuLve4ad6ct1de8d6+U+I/0v0rceHjj7ntoyMATq3mh/fwYdew+F9/5RgY5+bbRiN7Xho35oJD9euXfr/z57l/7tGjQP/+rEQcy5p9+4DPPuNiRSNHspIpC8OG8fLGai2Wb+hQDk6TlB3ZRLwCkF+Qj5+P/IzLOZfRp3Ef1ImpE5L7OIfiA2zjSE/n+idjxhTvf/llVyXiODc1lSNza9Xy7X7VqrH7t2ZNoF07V9tIy5bAm36WujpxwlWJACzvihU8w6pT4uuzWvn8zExWaHIJVDbCokiEEAYAvwHQF95zCRG9LIRIALAIQCKA4wBuJ6LUcMhUnjiWegzXfXIdMvMyYSc7bAU2TOs/DQ9f+3DQ76WUDJid7W7kVDJ65uVxZOr69bxk+eMPjgPxxMWLHM179iz7VWw2rlA2dWpAHwEA10rV610VCcDLrjNnXBXJnj1slHUse/LzOXbHWXFKvBMuG0kugN5E1A5AewADhRBdIRtk+cTdK+7GucxzSM9NR2ZeJqwFVjyz5hkcTzse9HspBbkZje6Nqnr2VDakFhRw/dTkZO6H640nn+QlTUYGzwSsVuB//+MKcIHStq2yPcRm40hhZ0aOZA9TRgZvVitw332u3fQk3gmLIil0Q2cWvtUWbgTgFgCfFe7/DNyMq1xARJi9ZTaavtsUtd+ujf9b+X9Is6YF/T42uw0bTm6AnVwLkwoI/HjoR1itbISsV49//SdNUi6J6CsffADExBRn0prNQN++/Iudnw+8/jrfZ/VqPsdTOQG7nWcm3ty/333nXkYgJwf49lv/5XcQGwtMm8ZGWpWquI7qjBmuka8XLwIHD7pH4Wq17PGR+EbYbCRCCDWAbQCaAnifiDYLIWoSUTIAELekqOHhWucGWeES2SuvrH8Fb298G9n5bFCY//d8/HHqD/z9wN9QieDpZ5VQQafWwWpznaOrVWrE6mMxYgR7TxxT+EmTuLzA55/7d7+OHYH9+9noevo0MHgwMGQIe07Gj+eQdodtxGDgFpFVqnDP2ZKo1fwAe8Jsdk+00+mCUwMWAB56COjWjeuoqtW8VGnf3vWckhXjnQl1HdyKRNjcv0RUQETtwa07rxVCtCnDtXOIqDMRda4eBVaw/IJ8TN84vUiJAEBeQR6Oph7FbycCKarvjkqocE/7e2DUuP70q4UabXXDsG6du0Fx8eLACh/VrMnxGj168IOnUnGi27JlrgZWq5W9ObfeqjyOw3XricceU26tEUzbRKdOPAt5+213JQKwy/mWW1xrmTjczX37Bk+Oik7Y40iIW3yuBzAQhQ2yAKA8NcjKyMtAvl25tNeJtODn9r8z4B3cefWd0Kv10Kq0aFW9FdaPX49LyRaXeiYO9Ho2KPpDcjJXYhsxAvi//+P4i8mTeTxP9zpyRDmAzGr1vrR54gl2NZtMPHbDhhx56ovbOJh88gm7inU6/hwdO3LhqPJcNCrchCWORAhRHUA+EaUJIYwAfgYwFdy17xIRvSmEeBZAAhF57ZEWDXEkRIT6M+rjTIbr02rUGLFj4g5cVTU0rdWsNity8
nNQxcjNUlJT2ftQ0jNhsfCMxJ9yiLfcwp4XZzuL0cgPVs+e7hm4BgOff/PN7sdat+Z+OKWRl8fG1ipVAisjECg5OSxLyQp1EiYaygjUBvCLEOIfAFsArCGiH1BOG2QJITB/6HyYtCZoVGxmMmvNmNBxQsiUCAAYNIYiJQLwgzdpkqtB0WgE3n3X/5qqq1e7G2vz89nwOHOmu/FyyhSgVy9WQA4jpiOy9aOP3MfPzgb+8x82DjdqBLz1FtsvEhIiq0QA/s6kEvEPGdkaAIcvH8bHf3+MNGsa/tXyX+jdqDdEBJ6Gv/4CvvqKH8ixYzmwy1+qVHE3gJpM/MD/3/8BO3awIddu5/B5R1lIIs5l+e47trHcc49yDEmfPsCffxbPokwmlvmDD/yXWRIevM1IpCKRuPD885wD4xzdarFwO81q1QIbe88eNuKW7Iyn17MxN1jemmCwYwcXQGrTxvf+vdnZwKpVPKMbOLDieX1kiLzEZ157jYOy5s3jWUeDBjwDCVSJAFz9TKk8gFbLjcCjQZE4mqWvWcPLN7sduOEGnml5M75u2sTKw9GHuKCAvVzemo9XJOSMRKJIbi4bQINpu0hJ4eVObq7r/rg4Ng4reYXCzdy57JZ2njWZTBy2/9BDytfY7WzzSU523R8Tw5/LW6xKeSIajK2SQk6kncAdS+5ArbdroeNHHbHy4MpIi6SIXs+FkoNp8qlZk5dOJhOPq1azgfP996NDiQDcM6jk0is7m4PaPLFvn3KrCyF4plIZkEubMJJmTUPnuZ1xOecy7GRHSlYKbvvmNnw94msMbT400uKFhZdeYoPrwoWsrO6+m+0Q0YKn5ZW3ZZfFohwvU1DAs5LKgJyRhJEvdn6B7Pxsl7yZHFsOnk96PoJShYb8fPbiLFwIXLjgeqxHD07Omz49upQIwO0sSkbbmkzcf8cTDRuyQdbZhqJW83LHV0NteUcqkjByOPWwS1i9g9PppyMgTeg4fJiNtHfcATzwAP/9ySeRlso3evXiOJwqVdi2ER/PCq80o+mKFUDv3qxMtFrO8Vm7NvKxMeFCGlvDyLJ9yzBu+Thk5mcW7RMQ6N+kP1aNWRVByYJL164c2+L8X8tgYBeyrwWPIk1BAZcRSEgoWzOtjAzl/s0VAWlsDYDpf05HzJQYqF5Voc70OlhzZI3fYw1tPhQdaneAWcshoAaNATH6GMwYMMPrdUSEL3Z+gdazW6PO9DqY+MNEXMrmYhm5tly8/MvLaDijIRrPaow3f38TNnsAdQQCJCsL2LbNPS1fo+EYi/KCWg3UqFH2jnwxMRVTiZSGnJF4YeammXh8teviWEBgx8QdaFuzrV9j5hfkY+m+pVhzZA0aV2mMCR0noJbF+8/0rM2z8HzS80XLIq1Ki8T4ROx9cC/+tehfWHN0DXJsnJZr0phwe+vb8cmwyKwlcnM5EKtkUaGYGC5NMGJERMSSBAEZ2eoncVPikJ7n7tfrmdgTv4z7xeu1J9JOYNbmWTBrzXiy+5OIN8T7JQMRoeq0qki1ulagjNHF4J0B7+CRnx4pUiIO9Go9Tj1+CtXN1XE6/TR+O/EballqoVu9bkg6loTs/Gz0a9zPJW8nmIwdC3zzjWsyYUIC1zfxNwdIEnlkZKufONsynDmaetTrdS+uexFvbHij6P2kDZPw9YivcXvr28ssg81uU6y8lm/Px85zO6FT69wViUaP0+mnMe/veXjt19egUWlgt9thLbDCpDVBQMBmt+Hz4Z9jRKvgTxE+/JBtDEuX8hKnWTP23kglUnGRNhIv1Iutp7i/d6PeHq85l3nORYkAAIEwetlo2O12D1d5RqvWolnVZm77VUKFYS2GIbcg1+1Ygb0ABVSA1399HVabFZl5mci2sds5My8TGXkZyLHlYOzysbhivVJmmUrDZOLArkuXuLDz7t3A1VcH/TaSKEIqEi8suHUBBFz9d0aNETMGzAARYe72uWj5fkvUe6ceHlv1GNJz0/Hhlg8Vx7LZb
aj+dnWMXT4WZ9LLVnVo7s1zYdaaoVVxoIJZa8aw5sPQp3EfvNbztaJZhkqoYNKaMGPgDKw5sgZ5Bd67QWlVWiQdSyqTLGXBbOboWEnFR9pISuHAxQN4fPXjOHjpIHol9sL0/tMRa4jFa7++hql/TC0ygOrUOrSu3hpj247F4z97jl5SCzWqm6vj0MOHYNFZfJbjWOoxzN0+FylZKRjeYjgGNxtcVBt246mN+PKfL6FWqTG+/Xh0rN0R725+F/9Z+x+3Wq/OxOhisOT2JejfpL/PckgqL9LYGmTyC/JRZWoVZOW7lgSz6CxYfvtyDFwwEAXkucagWWvGzIEzcW/He92ObTu7DQcuHUCHWh3QsnpLv2VMyUxBk3ebuMnoQECgpqUmTj1+qqg4k0TijYjHkQgh6gshfhFC7BNC7BFCPFq4P0EIsUYIcajwNTRuhCCTnpuuWLOViHAm4wzWjl0Lk8akcCWTlZ/lZrDNteWiz+d9cOOnN2LiDxPRaU4njF0+1q0Nha/UtNTET6N/QsO4htCr9dCr9UiMT4ROrYNOrUP7Wu3x6/hfpRKRBIVw/S+yAXiSiLYLIWIAbBNCrAEwHtwgy1Gz9VkA/wmTTH6TYExANVM1nM1w7a5tJzu61OuCFtVaIOu/WThy+QhGLxuNzWc2u5xn0VnQvX53l30zNs3AxlMbXTwwy/Ytw03NbsIdbe7wS87rG16PY48ew4XsC4jRxcCoNSIjNwN5BXmoapLGC0nwCFeDrGQi2l74dwaAfQDqIkoaZK07tg713qkH9WtqxE6JxTsb3yk6tv74elz38XWoPb02RiwegcOXD0MIgbk3z+WaraK4Zuu4duPQoloL5ORzIl7vz3ojOSO5aEbgOO/aOtdiUNNBLjIs2LXAzY2blZ+FBbsWlOmzrDmyBt3mdUOd6XUwaskonLhyAjXMNWDUsu81Rh8jlYgk6IR9XiuESATQAcBmABFvkLXn/B70/bwvCGwrysjLwJM/PwkA6FS7EwYvGFz0gC/fvxxJx5Kw/8H9GNxsMLbfvx3z/p6H1JxU3NbqtiKj5eAFg7HpzKYiQ6dBY0D7mu3RvFpzDGo6CP9q9S+oVa79LmN07vnmAgJxBt/jrVcfXo3hi4YXybt472KsOboGBx8+iARjQhm/GYnEd8JqbBVCWAD8CmASES0TQqQRUbzT8VQi8monCbaxte/nfRVdoDG6GLSv1R4bTrq2kDNoDHj+uufx4o0vKo73T8o/6Da/m1uWr0FjwIWnL3j01CzZuwTjvh3ncp1Ja0LS2CR0rdfVp8/SaU4nbE/e7rLPqDHi9V6v48nuT/o0hkTiiYgbWwuF0AJYCmABES0r3B3xBllHLh9R3K9kEAW4t8zei3s9jnc09aiiAVMt1EjJTPF43YhWI/BKz1dg0Vm47YShCj4a8pHPSgQATqaddNuXY8vBgUsHfB5DIvGHcHltBID5APYR0TtOh74DMK7w73EAVoRDHmd6NeqluL+OpQ5uaHiDWx9fs9aMvo0893K8ps41yLO5B4Jp1Vo0iPO+LHu6+9O49MwlHH3kKM4/fR5j2patd2W3+t3cAugsWgt6JvYs0zgSSVkJ14ykB4C7APQWQuwo3AYjChpkvTPgnSJDqDNf3PoFJvWe5Da7SDAmYHTb0R7HqxtbF491fayoVIBaqGHSmjD7ptnQqkvvAalT61A7prbPbtm8gjy88dsbaDyrMXamcO6Ns2G3VY1WIcmnkUicCYuxlYh+B+CpVlSfcMjgibyCPBg0BpecFYOay37P3DTTLcz8VPopbDq1CT0b9fQ45pS+U9C/SX98tesrmHQmTOgwwe+yA6Vxxzd3YPWR1UUGVqPGiJbVWqJ1jdbo27gvRrYZCZ06SiorSyoslT6yddJvk/DGhjfcQsl71O+BTac3KUaoJsYn4tijx4Img78cSz2GVrNbucluUBtw8vGTqG6uHiHJJBWRqDC2Riv7Lu5TzEc5mnrUY5j7+ayw24QVO
ZZ2THFZplPrKlwdWEl0U+kVSe9GvYvsGQ5UQoUeDXp4DHNvVyOA5rpBpF3Ndsi1KZQRoAI0r9Y8AhJJKiuVXpHcefWdaJrQtLiOqtqAOH0c3uzzJuYOnat4zbKRy0BE+OTvT9Bmdhs0nNkQT/38FDJyM0q93+n00xi7fCzqvVMP3eZ1w89HfvZb9qqmqnip50tuZQSmD5gOk9Zzro9EEmwqvY0E4NiQr3Z9haSjSWhVvRXu63Qfaphr4LcTv6H/F/1dDLFx+jgcfPgg5m2fh0kbJhUFkOnVerSp0QZb7tsC4aEHwRXrFVz1v6twKftS0bLJpDVh8YjFuOmqm/yW//eTv+OLf76ARmgwvv14XFP3Gr/Hkkg8IUstFpJmTUPS0STE6GPQu1HvIherQWNAr8ReMGvNaFylMaqb2Ej54i8vulUgs9qs+GDLB3h749suUai5Bbk4cOkAfj/5O65veL3i/b/850tk5mW62F6y87PxXNJzASmS6xpch+saXOf39RJJoFQaRbJozyLc/e3d0Kq1ICJYdBb8Ov5XNE1oiidWP4EPt30IrUoLO9nRrmY7/HzXz4pRr7kFufgn5R/F6mNEhCOpRzwqkoOXDyo2yDpx5UTgH1AiiSCVwkZyKfsSxn87Hjm2HKTnpiMjLwPnMs9h1NJRWHN0DeZunwurzYqMvAxk5WdhW/I2vPbra7iuwXVuka0mrQkDmg5AVaN7Bq2d7OhSt4tHOa5vcL2bYReA12skkvJApVAkPx/52S1SlED4J+UffL7zc7cqYrkFuVi4eyEm95mMOH1cUYCaWWtGkypNcFfbuzDn5jkwaU1QC3XRsTFtx3itajasxTC0rdm2SJno1fqithISSXmmUixtTFqT4pLCTnYU2JVjRXJtuWhcpTH2P7Qfc7bNwd4Le9G7UW+Mvno0jFojhlw1BFvu24I52+Yg1ZqK21vdjsHNBnuVQ6PSYP349Vi0exFWHV6FJglN8ECnB1A3tm5QPqdEEikqhSKJ1cd6LFnoqUdNei43xqpuqo5BTQfhqqpX4dq61xYVCAKAVtVbYebAmWWSRafWoVejXjBoDKgfVx91YuqU6XqJJBqpFIrkVPopGDVGtwpkaqFGckay4jV59jxk52dj4JcDsT15O1RChXx7Ph7o9ABmDJjh0cVbGq+sfwVTf58KrZoNu82rNse6cevKVMBIIok2KoWNxJMxM8GU4LGMQG1Lbbzx2xvYcmYLsvKzkJGXAavNinnb5/kdRLb59Ga89edbsBYUG3Z3X9iNZ9Y+49d4Ekm0UCkUSfNqzTGyzcii+qoAoBVafHjTh5g1aJZivspnwz7DV7u+grXANQ8nKz8LX+/+2i85luxbgpx811lRXkEelu5d6td4Ekm0UCkUCRFhz/k9LssRjVqD3ed3I78gH0aNa1Nag9oArVqr6KpVCRVi9O71VX3BorUo1hkxaAx+jSeRRAuVQpFsPL0Rey/udelFk2PLwZt/vIkPt37oNuuwFljx36T/4tGuj7rlrBg0hqLGVgX2AiQdTcLyfcsVG32X5K52d7kVNzJpTXjw2gf9/GQSSXQQrlKLHwshzgshdjvtC1tzrMOXDyvuzy/Ix86UnYplBI6kHsF9He/DY10eg1FjhEljQlVjVXx6y6doW7Mtjlw+gsSZiRi+aDjGrxiPOtPr4Js933iVo3GVxvjmtm9Q01wTRo0RBo0BEzpMwDPdpY1EUr4JS9KeEOIGAJkAPieiNoX7pgG47NQcqwoRldocy5+kvf0X96PjRx3dvDY1zTXxWq/X8MTqJ1yC0lRCheEthmPJ7UsAADn5ObiYfRF1YuoUtZHoMrcLtiZvdXErGzQGnH78dKl9Y+xkx5n0M0gwJsCsc18+SSTRSMQLGxHRbwAul9gd9OZYRIS52+aizew2aPpuU7z8y8uw2qxoUa0FxrQd41ZH9aMhH+GutnehXmy9oghVFTgV/82+xeVjjVoj6sfVL1Ii6bnp+Pvc326xK
VqVFquPrC5VTpVQoX5cfalEJBWGSMaR+NQcC/C9QdZzSc/hvb/eK4pinfbnNPx+6nckjU3CR0M+wtDmQ7Fo9yLEG+LxQOcH0KZGG1zIuoDzWeeLMnLt4GjXsxln0TShqeJ9NCqNW7V2ByUNtxJJZaBcGFuJaA4RdSaiztWrK9chzcrLwrub33UJhbfarNh0ehN2ntsJIQSub3A9RrQageEth6NlNc6Jmbt9rtuSJ8eWgxfWveBRHpPWhJub3+zmNtaoNBjUbJCHqySSikskZyQpQojahbORgJtjnc8675apC/DDfST1CA5fPoyxy8dCo9KAQIjVx+LX8b9i74W9isZWTwZaB5/c8gnGLB+D1YdXQwiBBnENsHjEYunKlVRKIqlIHM2x3kQQmmPVi60Hrcq9b0xeQR6aVGmCbvO7ucw8svKzMHLpSDzQ6QF8u/9bN2Nr9/rdvd4vRh+DFSNXIDUnFdn52agTU8fvsHmJpLwTLvfvQgAbATQXQpwWQkxAkJtjadVavDf4PZi0pqKZiVlrxsROE7Hv4j6gRM6enezYdnYbbmp2ExpVaVQUL+JI7Z/SZ4pP961irIK6sXWlEpFUasLVIGuUh0NBbY41pu0YtKreCnO3zUVWfhbuvPpODGgyAMv2LEOOPcftfMcS5697/8JnOz/DumPr0LJaS0zsPBG1Y2oHUzSJpEJTKYo/37HkDizes1jx2IUnL6CapZrX63ec24GjqUfRuU7nUvv3SiQVlUpf/HnnuZ0ej206uwlDrhqieCwnPweDFwzGX2f/gkalQV5BHh685kG81e8tuZSRSJwoF+7fQBnYeKDHY30b9/V4bPKGydh0ZhOy87ORnpsOq82KD7d+iLVH14ZCTImk3FIpFIldKFdHA4CUzBScSDuBscvHInFmIvp+3hd/nPwDAPDlri/dXMNZ+Vn4atdXIZVXIilvVIqljUVn8XgsOy8b1396PdKsaSigApy4cgJ/nvoTq8esVoxSVQu11/EkkspIhZuR2Ow2JB1NwsqDK5GVx7Ehz/R4RjGkvZalFlYcXIGs/CyXplU5thz8d91/8XCXh93KCOjV+qIyAhKJhKlQimTfhX2o9049DF80HHcuuxM1366JVYdXId4Qj69HfO0SsFbdVB2b792MXSm7FCNbD106hImdJuKhax7iMgJaE6oYqmDeLfPQrlZ0NBGXSKKFCuX+bfV+K+y/uB+E4s9k1pqR8lQKzDoz7HY7tpzdggRjAppVbQYAeHz145i5aabbWFclXIUDDx8AwHk857POo35cfcUKZxJJZSDiZQTCwZn0MziaetRFiQAc7v7L8V9w8spJJM5KRNf5XdH8f83R74t+KCgoQGpOquJ4mXmZAICNpzZi+KLh6P1Zb9yz4h6cSJPtNSWSklSYn1edWgeb3ea2Pzs/GzqhQ6NZjYrqhxAIa4+uRavZrXBD4g2K42lUGvx56k/0+6JfUUbxqV2n8OOhH7H/of2oZvIexCaRVCYqzIzErDO7zUYAoIAK8NWerxQbZB28fBAqD1+BVq3FC+tecClLUEAFyMrPwpxtc4InuERSAagwiiQlM0Uxhd+itWDL2S0erzuWekxxf1ZeFg5eOui232qzYtf5Xf4LKpFUQCqMIvFURsBmt2FUG085g9zYu2TbCQGBbvW7oVu9bm41TkxaE25seGNwhJZIKggVRpFo1VrMGjgLOrWuaJ9OpcM9He7BCze8gBidey+aoVcNxd0d7kYNcw2XOBODxoA3+76JyX0mI0YXU1QJzaQ1oWFcQ4xtNzb0H0giKUdUGEUCAIcuH3KzeRy6fAgAkPpMKm5tcSuMGiPi9fF4s8+bWDFqBU6nn8axtGMu9pUcWw6SjiahWdVm2PvgXjzd/WkMbzEc0/pOw5b7trgFqUkklZ0KE0eSnZ+NatOqudVfNWlN2DRhE66ueTU2HN+Af//4b1Q1VsV3d3yHOFMcBn05CKuOrHIbL04fh7Rn00L1MSSSckdUl
xEQQgwEMAuAGsA8IvKrUlpKZoqi+ze/IB+HLh/CiMUjcPBysfE0/q14vHz9y9h/cb/ieBl5Gf6IIZFUSiK6tBFCqAG8D2AQgFYARgkhWvkzVr3Yei4tOR3k2/Nx6MIhFyXi4NUNr3qszVrTXNMfMSSSSkmkbSTXAjhMREeJKA/A1+DGWWVmzdE1Ho+9uuFVj8dubn6zi4HWwfyh8/0RQyKplERakdQFcMrp/enCfS4IIe4XQmwVQmy9cOGC4kBx+jiPN1FyCztIjEvEqcdOYUTLEahlroVu9bphy31bZH8aiaQMRNpGolSv0M36S0RzAMwB2NiqNFCPBj2gFmqXcgAOfhn3CzrN7aQoQNf6XQEA39zuvQG4RCLxTKRnJKcB1Hd6Xw/AWX8H+/3u34t6+Dp4b9B76FinI+5odYfb+T/e+aO/t5JIJE5E1P0rhNAAOAhuS3EGwBYAdxLRHk/X+FJFfv2x9Ui1pmLoVUOhVhcrlvz8fEz5YwqaJDTB6Lajg/IZJJLKQtS6f4nIJoR4CMBqsPv3Y29KxFd6NuqpuF+r1eKlni8FOrxEIilBpG0kIKIfAcg1hkRSjom0jUQikVQApCKRSCQBIxWJRCIJGKlIJBJJwJS77F8hxAUAvlRgrgbgYojF8QUphyvRIEc0yACUPzkaElF1pQPlTpH4ihBiqyeft5SjcssRDTJUNDnk0kYikQSMVCQSiSRgKrIiiZaeEVIOV6JBjmiQAahAclRYG4lEIgkfFXlGIpFIwoRUJBKJJGAqpCIRQgwUQhwQQhwWQjwbxvt+LIQ4L4TY7bQvQQixRghxqPC1SohlqC+E+EUIsU8IsUcI8WiE5DAIIf4SQuwslOPVSMhReE+1EOJvIcQPkZKh8L7HhRC7hBA7hBBbIyGLECJeCLFECLG/8P9It2DIUOEUSTALSvvBpwAGltj3LIAkImoGIKnwfSixAXiSiFoC6ArgwcLPH245cgH0JqJ2ANoDGCiE6BoBOQDgUQD7nN5HQgYHvYiovVPcRrhlmQVgFRG1ANAO/L0ELgMRVagNQDcAq53ePwfguTDePxHAbqf3BwDULvy7NoADYf4+VgDoF0k5AJgAbAfQJdxygKvuJQHoDeCHSP6bADgOoFqJfWGTBUAsgGModLIEU4YKNyOBjwWlw0hNIkoGgMLXGuG6sRAiEUAHAJsjIUfhkmIHgPMA1hBRJOSYCeAZAHanfZH6NyEAPwshtgkh7o+ALI0BXADwSeFSb54QwhwMGSqiIvGpoHRFRwhhAbAUwGNElB4JGYiogIjag2cF1woh2oTz/kKIIQDOE9G2cN7XCz2IqCN42f2gEOKGMN9fA6AjgA+IqAOALARpKVURFUlQC0oHgRQhRG0AKHw9H+obCiG0YCWygIiWRUoOB0SUBmA92H4UTjl6ABgqhDgO7pnUWwjxZZhlKIKIzha+ngewHNzXKZyynAZwunBmCABLwIolYBkqoiLZAqCZEKKREEIHYCSA7yIoz3cAxhX+PQ5sswgZQggBYD6AfUT0TgTlqC6EiC/82wigL4D94ZSDiJ4jonpElAj+f7COiMaEUwYHQgizECLG8TeA/gB2h1MWIjoH4JQQonnhrj4A9gZFhnAYmcK9ARgMrk5/BMB/w3jfhQCSAeSDtf8EAFXBxr5Dha8JIZbhOvBS7h8AOwq3wRGQoy2Avwvl2A3gpcL9YZXDSZ6eKDa2hl0GsH1iZ+G2x/H/MgL/Lu0BbC38d/kWQJVgyCBD5CUSScBUxKWNRCIJM1KRSCSSgJGKRCKRBIxUJBKJJGCkIpFIJAEjFYlEIgkYqUgkIaUwdT5HCJHptHUXQpAQIuK9pyXBQSoSSTi4mYgsjg2RTVmQhACpSCQSScBIRSKRSAJGKhJJOPhWCJFWuH0baWEkwUcauyThYBgRrXW8KSy4JKlAyBmJRCIJGKlIJBJJwEhFIpFIAkbWI5FIJAEjZyQSiSRgpCKRSCQBIxWJRCIJGKlIJBJJw
EhFIpFIAkYqEolEEjBSkUgkkoCRikQikQTM/wM6M+Bp2YbvuQAAAABJRU5ErkJggg==\n", 13 | "text/plain": [ 14 | "
" 15 | ] 16 | }, 17 | "metadata": { 18 | "needs_background": "light" 19 | }, 20 | "output_type": "display_data" 21 | } 22 | ], 23 | "source": [ 24 | "import numpy as np\n", 25 | "import pandas as pd\n", 26 | "import matplotlib.pyplot as plt\n", 27 | "from sklearn.cluster import KMeans\n", 28 | "\n", 29 | "%matplotlib inline\n", 30 | "plt.figure(figsize=(4, 4))\n", 31 | "\n", 32 | "cps = pd.read_csv('cps.csv', header=None)\n", 33 | "X = cps.iloc[:,:].values\n", 34 | "\n", 35 | "clustering = KMeans(n_clusters=3, random_state=8).fit(X)\n", 36 | "\n", 37 | "cps_df = pd.DataFrame(cps)\n", 38 | "cps_df.columns = ['FI','MU','EM', 'EV', 'S']\n", 39 | "color_theme = np.array(['red','green','blue'])\n", 40 | "plt.scatter(x=cps_df.FI, y=cps_df.EM, \n", 41 | " c=color_theme[clustering.labels_], s=30)\n", 42 | "plt.xlabel(cps_df.columns[0], fontsize=12)\n", 43 | "plt.ylabel(cps_df.columns[2], fontsize=12)\n", 44 | "plt.show()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [] 53 | } 54 | ], 55 | "metadata": { 56 | "kernelspec": { 57 | "display_name": "Python 3", 58 | "language": "python", 59 | "name": "python3" 60 | }, 61 | "language_info": { 62 | "codemirror_mode": { 63 | "name": "ipython", 64 | "version": 3 65 | }, 66 | "file_extension": ".py", 67 | "mimetype": "text/x-python", 68 | "name": "python", 69 | "nbconvert_exporter": "python", 70 | "pygments_lexer": "ipython3", 71 | "version": "3.8.5" 72 | } 73 | }, 74 | "nbformat": 4, 75 | "nbformat_minor": 2 76 | } 77 | -------------------------------------------------------------------------------- /Chapter_10/Final - mds.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "image/png": "\n", 11 | "text/plain": [ 12 | "
" 13 | ] 14 | }, 15 | "metadata": { 16 | "needs_background": "light" 17 | }, 18 | "output_type": "display_data" 19 | } 20 | ], 21 | "source": [ 22 | "import pandas as pd\n", 23 | "import matplotlib.pyplot as plt\n", 24 | "from sklearn.manifold import MDS\n", 25 | "from sklearn.metrics import pairwise_distances\n", 26 | "\n", 27 | "%matplotlib inline\n", 28 | "plt.figure(figsize=(4, 4))\n", 29 | "\n", 30 | "cps = pd.read_csv('cps.csv', header=None)\n", 31 | "X = cps.iloc[:,:].values\n", 32 | "\n", 33 | "dist=pairwise_distances(X)\n", 34 | "mds=MDS(n_components=2, dissimilarity='precomputed', random_state=1)\n", 35 | "pos=mds.fit_transform(dist)\n", 36 | "xs, ys=pos[:,0], pos[:,1]\n", 37 | "\n", 38 | "plt.scatter(x=xs, y=ys, s=30, color='green')\n", 39 | "ax = plt.gca()\n", 40 | "ax.xaxis.set_visible(False)\n", 41 | "ax.yaxis.set_visible(False)\n", 42 | "plt.show()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": null, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [] 51 | } 52 | ], 53 | "metadata": { 54 | "kernelspec": { 55 | "display_name": "Python 3", 56 | "language": "python", 57 | "name": "python3" 58 | }, 59 | "language_info": { 60 | "codemirror_mode": { 61 | "name": "ipython", 62 | "version": 3 63 | }, 64 | "file_extension": ".py", 65 | "mimetype": "text/x-python", 66 | "name": "python", 67 | "nbconvert_exporter": "python", 68 | "pygments_lexer": "ipython3", 69 | "version": "3.8.5" 70 | } 71 | }, 72 | "nbformat": 4, 73 | "nbformat_minor": 2 74 | } 75 | -------------------------------------------------------------------------------- /Chapter_10/Final - pca.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "[0.91355698 0.05692013 0.01805982 0.00931031 0.00215277]\n" 13 | ] 14 | }, 15 | { 16 | "data": { 
17 | "text/html": [ 18 | "
\n", 19 | "\n", 32 | "\n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
FIMUENEVS
00.6024960.3259880.7233200.0479940.072356
10.784025-0.391908-0.478342-0.0341000.041694
20.1101880.838756-0.4928020.0484590.197850
3-0.098870-0.1903820.0566230.2671360.937770
4-0.0196660.0195670.044004-0.9606370.272892
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " FI MU EN EV S\n", 90 | "0 0.602496 0.325988 0.723320 0.047994 0.072356\n", 91 | "1 0.784025 -0.391908 -0.478342 -0.034100 0.041694\n", 92 | "2 0.110188 0.838756 -0.492802 0.048459 0.197850\n", 93 | "3 -0.098870 -0.190382 0.056623 0.267136 0.937770\n", 94 | "4 -0.019666 0.019567 0.044004 -0.960637 0.272892" 95 | ] 96 | }, 97 | "metadata": {}, 98 | "output_type": "display_data" 99 | } 100 | ], 101 | "source": [ 102 | "import pandas as pd\n", 103 | "from sklearn import decomposition\n", 104 | "from sklearn.decomposition import PCA\n", 105 | "\n", 106 | "cps = pd.read_csv('cps.csv', header=None)\n", 107 | "cps.columns = ['FI','MU','EN', 'EV', 'S']\n", 108 | "X = cps.iloc[:,:].values\n", 109 | "\n", 110 | "pca = decomposition.PCA()\n", 111 | "pca.fit_transform(X)\n", 112 | "\n", 113 | "print(pca.explained_variance_ratio_)\n", 114 | "display(pd.DataFrame(pca.components_, columns=cps.columns))" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.8.5" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 2 146 | } 147 | -------------------------------------------------------------------------------- /Chapter_10/cps.csv: -------------------------------------------------------------------------------- 1 | 47,10,26,3,7 2 | 39,21,31,4,9 3 | 43,31,77,4,3 4 | 55,41,67,2,0 5 | 2,0,0,0,0 6 | 48,32,63,4,11 7 | 28,14,30,5,0 8 | 44,35,60,7,4 9 | 38,17,35,2,0 10 | 47,43,73,6,2 11 | 44,20,46,6,9 12 | 33,28,50,7,6 
13 | 6,15,23,2,0 14 | 4,10,13,0,0 15 | 38,27,43,3,5 16 | 45,20,44,7,8 17 | 47,18,39,2,2 18 | 48,26,52,1,8 19 | 47,31,56,7,9 20 | 0,1,1,0,0 21 | 10,18,22,1,2 22 | 4,9,9,0,0 23 | 42,9,30,5,6 24 | 40,28,64,6,1 25 | 34,22,52,3,4 26 | 30,19,51,2,4 27 | 51,38,69,6,11 28 | 4,11,14,3,1 29 | 33,26,41,5,6 30 | 0,2,4,0,0 31 | 51,38,79,5,11 32 | 10,16,22,2,1 33 | 58,27,57,4,15 34 | 39,13,34,5,5 35 | 41,25,55,3,0 36 | 18,24,29,3,0 37 | 40,24,68,6,13 38 | 2,5,6,1,1 39 | 37,39,65,6,5 40 | 2,5,6,0,0 41 | 4,9,11,2,0 42 | 0,0,0,0,0 43 | 0,0,0,0,0 44 | 32,8,36,3,0 45 | 58,30,48,0,1 46 | 60,36,58,0,0 47 | 39,26,61,5,6 48 | 4,18,19,1,1 49 | 51,35,76,5,9 50 | 45,19,56,3,1 51 | 12,9,12,3,0 52 | 26,20,38,3,8 53 | 4,14,17,3,1 54 | 30,28,63,5,4 55 | 45,31,59,5,5 56 | 42,22,45,7,15 57 | 2,0,0,0,0 58 | 2,6,8,2,1 59 | 2,6,7,2,0 60 | 53,22,53,5,3 61 | 6,14,17,2,0 62 | 39,34,75,7,9 63 | 47,20,54,3,4 64 | 24,24,31,4,5 65 | 37,26,38,6,3 66 | 38,23,49,4,3 67 | 42,37,68,7,9 68 | 52,34,66,6,2 69 | 43,22,39,6,5 70 | 26,24,45,3,6 71 | 42,16,41,3,2 72 | 36,28,53,5,0 73 | 44,22,41,4,7 74 | 14,17,20,3,2 75 | 8,16,21,0,0 76 | 39,25,59,5,0 77 | 13,15,36,4,1 78 | 16,23,35,6,1 79 | 27,11,31,3,4 80 | 0,0,3,0,0 81 | 47,22,41,3,2 82 | 12,11,25,5,0 83 | 6,10,12,1,1 84 | 29,8,24,3,2 85 | 16,18,23,2,0 86 | 6,2,3,2,0 87 | 32,33,30,1,4 88 | 38,19,43,3,8 89 | 8,24,29,3,1 90 | 36,20,57,3,0 91 | 41,16,45,6,5 92 | 4,5,5,0,0 93 | 42,25,43,5,8 94 | 0,0,0,0,0 95 | 34,28,33,6,12 96 | 37,17,50,5,4 97 | 42,23,58,1,0 98 | 8,6,10,3,0 99 | 2,0,0,0,0 100 | 2,0,0,2,0 101 | 4,10,13,0,0 102 | 6,17,19,3,1 103 | 38,21,36,5,2 104 | 2,2,3,2,0 105 | 2,2,3,2,0 106 | 59,34,52,3,4 107 | 2,6,7,2,0 108 | 28,14,29,0,2 109 | 8,9,15,0,0 110 | 39,29,66,6,6 111 | 12,12,15,3,3 112 | 2,6,7,0,0 113 | 57,23,42,6,2 114 | 26,12,29,4,2 115 | 16,9,30,4,3 116 | 20,8,22,3,1 117 | 19,5,19,3,2 118 | 21,11,22,4,1 119 | 6,2,6,2,0 120 | 4,1,1,0,0 121 | 4,1,1,0,0 122 | 31,17,42,3,4 123 | 10,14,19,3,2 124 | 4,2,3,0,0 125 | 6,12,17,3,1 126 | 32,9,33,3,4 127 | 
45,20,49,3,2 128 | 18,15,23,5,12 129 | 31,12,27,3,2 130 | 4,4,4,0,0 131 | 32,31,47,5,4 132 | 39,33,58,7,12 133 | 48,21,58,3,4 134 | 18,16,17,5,3 135 | 36,22,51,3,2 136 | 33,17,37,6,4 137 | 39,28,67,6,3 138 | 38,24,44,4,5 139 | 2,10,13,0,0 140 | 11,15,21,6,1 141 | 24,19,28,5,6 142 | 44,23,51,3,0 143 | 43,16,43,5,4 144 | 6,13,15,3,1 145 | 2,0,0,0,0 146 | 44,32,63,6,1 147 | 52,29,50,4,6 148 | 6,19,23,3,2 149 | 52,22,45,4,12 150 | 2,9,11,3,1 151 | 43,15,26,3,2 152 | 4,13,19,0,0 153 | 2,2,3,2,0 154 | 28,30,33,9,11 155 | 2,13,14,1,1 156 | 17,19,39,5,1 157 | 2,4,7,2,0 158 | 6,13,17,3,1 159 | 0,5,9,2,0 160 | -------------------------------------------------------------------------------- /Chapter_11/Sample_code_GAN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The sample code of the discriminator and the generator of a GAN model" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import tensorflow as tf\n", 17 | "import numpy as np\n", 18 | "\n", 19 | "def discriminator(self, image, y=None, reuse=False):\n", 20 | " with tf.variable_scope(\"discriminator\") as scope:\n", 21 | " if reuse:\n", 22 | " tf.get_variable_scope().reuse_variables()\n", 23 | " else:\n", 24 | " assert tf.get_variable_scope().reuse == False\n", 25 | "\n", 26 | " h0 = lrelu(conv2d(image, self.df_dim, name='d_h0_conv'))\n", 27 | " h1 = lrelu(self.d_bn1(conv2d(h0, self.df_dim*2, name='d_h1_conv')))\n", 28 | " h2 = lrelu(self.d_bn2(conv2d(h1, self.df_dim*4, name='d_h2_conv')))\n", 29 | " h3 = lrelu(self.d_bn3(conv2d(h2, self.df_dim*8, d_h=1, d_w=1, name='d_h3_conv')))\n", 30 | " h4 = linear(tf.reshape(h3, [self.batch_size, -1]), 1, 'd_h3_lin')\n", 31 | " return tf.nn.sigmoid(h4), h4, h3, h2, h1, h0\n", 32 | "\n", 33 | "def generator(self, image, y=None):\n", 34 | " with 
tf.variable_scope(\"generator\") as scope:\n", 35 | " s = self.output_size\n", 36 | " s2, s4, s8, s16, s32, s64, s128 = int(s/2), int(s/4), int(s/8), int(s/16), int(s/32), int(s/64), int(s/128)\n", 37 | "\n", 38 | " e1 = conv2d(image, self.gf_dim, name='g_e1_conv')\n", 39 | " e2 = self.g_bn_e2(conv2d(lrelu(e1), self.gf_dim*2, name='g_e2_conv'))\n", 40 | " e3 = self.g_bn_e3(conv2d(lrelu(e2), self.gf_dim*4, name='g_e3_conv'))\n", 41 | " e4 = self.g_bn_e4(conv2d(lrelu(e3), self.gf_dim*8, name='g_e4_conv'))\n", 42 | " e5 = self.g_bn_e5(conv2d(lrelu(e4), self.gf_dim*8, name='g_e5_conv'))\n", 43 | " e6 = self.g_bn_e6(conv2d(lrelu(e5), self.gf_dim*8, name='g_e6_conv'))\n", 44 | " e7 = self.g_bn_e7(conv2d(lrelu(e6), self.gf_dim*8, name='g_e7_conv'))\n", 45 | " e8 = self.g_bn_e8(conv2d(lrelu(e7), self.gf_dim*8, name='g_e8_conv'))\n", 46 | "\n", 47 | " self.d1, self.d1_w, self.d1_b = deconv2d(tf.nn.relu(e8),\n", 48 | " [self.batch_size, s128, s128, self.gf_dim*8], name='g_d1', with_w=True)\n", 49 | " d1 = tf.nn.dropout(self.g_bn_d1(self.d1), 0.5)\n", 50 | " d1 = tf.concat_v2([d1, e7], 3)\n", 51 | " self.d2, self.d2_w, self.d2_b = deconv2d(tf.nn.relu(d1),\n", 52 | " [self.batch_size, s64, s64, self.gf_dim*8], name='g_d2', with_w=True)\n", 53 | " d2 = tf.nn.dropout(self.g_bn_d2(self.d2), 0.5)\n", 54 | " d2 = tf.concat_v2([d2, e6], 3)\n", 55 | " self.d3, self.d3_w, self.d3_b = deconv2d(tf.nn.relu(d2),\n", 56 | " [self.batch_size, s32, s32, self.gf_dim*8], name='g_d3', with_w=True)\n", 57 | " d3 = tf.nn.dropout(self.g_bn_d3(self.d3), 0.5)\n", 58 | " d3 = tf.concat_v2([d3, e5], 3)\n", 59 | " self.d4, self.d4_w, self.d4_b = deconv2d(tf.nn.relu(d3),\n", 60 | " [self.batch_size, s16, s16, self.gf_dim*8], name='g_d4', with_w=True)\n", 61 | " d4 = self.g_bn_d4(self.d4)\n", 62 | " d4 = tf.concat_v2([d4, e4], 3)\n", 63 | " self.d5, self.d5_w, self.d5_b = deconv2d(tf.nn.relu(d4),\n", 64 | " [self.batch_size, s8, s8, self.gf_dim*4], name='g_d5', with_w=True)\n", 65 | " d5 = 
self.g_bn_d5(self.d5)\n", 66 | " d5 = tf.concat_v2([d5, e3], 3)\n", 67 | " self.d6, self.d6_w, self.d6_b = deconv2d(tf.nn.relu(d5),\n", 68 | " [self.batch_size, s4, s4, self.gf_dim*2], name='g_d6', with_w=True)\n", 69 | " d6 = self.g_bn_d6(self.d6)\n", 70 | " d6 = tf.concat_v2([d6, e2], 3)\n", 71 | " self.d7, self.d7_w, self.d7_b = deconv2d(tf.nn.relu(d6),\n", 72 | " [self.batch_size, s2, s2, self.gf_dim], name='g_d7', with_w=True)\n", 73 | " d7 = self.g_bn_d7(self.d7)\n", 74 | " d7 = tf.concat_v2([d7, e1], 3)\n", 75 | " self.d8, self.d8_w, self.d8_b = deconv2d(tf.nn.relu(d7),\n", 76 | " [self.batch_size, s, s, self.output_c_dim], name='g_d8', with_w=True)\n", 77 | " return tf.nn.tanh(self.d8) \n" 78 | ] 79 | } 80 | ], 81 | "metadata": { 82 | "kernelspec": { 83 | "display_name": "key", 84 | "language": "python", 85 | "name": "key" 86 | }, 87 | "language_info": { 88 | "codemirror_mode": { 89 | "name": "ipython", 90 | "version": 3 91 | }, 92 | "file_extension": ".py", 93 | "mimetype": "text/x-python", 94 | "name": "python", 95 | "nbconvert_exporter": "python", 96 | "pygments_lexer": "ipython3", 97 | "version": "3.8.6" 98 | } 99 | }, 100 | "nbformat": 4, 101 | "nbformat_minor": 4 102 | } 103 | -------------------------------------------------------------------------------- /Chapter_11/distance_by_vgg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load the needed libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import torch\n", 17 | "import torch.nn as nn\n", 18 | "import torchvision.models as models\n", 19 | "from torchvision import transforms\n", 20 | "from torch.autograd import Variable\n", 21 | "from PIL import Image\n", 22 | "import numpy as np" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | 
"source": [ 29 | "# Load the vgg model" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "vgg19 = models.vgg19(pretrained=True)\n", 39 | "modules=list(vgg19.children())[:-1]\n", 40 | "vgg19 = nn.Sequential(*modules)\n", 41 | "\n", 42 | "for p in vgg19.parameters():\n", 43 | " p.requires_grad = False" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "# Preprocessing" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "#Define the normalization transform to normalize the image intensities \n", 60 | "normalize = transforms.Normalize(\n", 61 | " mean=[0.485, 0.456, 0.406],\n", 62 | " std=[0.229, 0.224, 0.225])\n", 63 | "\n", 64 | "#Define the preprocessing transform to perform the scaling of the images\n", 65 | "preprocess = transforms.Compose([\n", 66 | " transforms.Scale((224,224)),\n", 67 | " transforms.CenterCrop(224),\n", 68 | " transforms.ToTensor(),\n", 69 | " normalize])" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "# Feature extracting and distance computation" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": null, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "# import and extract a feature from image1\n", 86 | "img_pil_first = Image.open(\"first.png\")\n", 87 | "img_tensor_first = preprocess(img_pil_first)\n", 88 | "img_tensor_first.unsqueeze_(0)\n", 89 | "img_var_first = Variable(img_tensor_first) # assign it to a variable\n", 90 | "features_var_first = vgg19(img_var_first) # get the output from the last hidden layer of the pretrained model\n", 91 | "features_first = features_var_first.data # get the tensor out of the variable\n", 92 | "feat_first = torch.squeeze(features_first)\n", 93 | "feat_first = feat_first.numpy()\n", 94 | 
"#print(feat_first)\n", 95 | "\n", 96 | "# import and extract a feature from image2\n", 97 | "img_pil_first = Image.open(\"second.png\")\n", 98 | "img_tensor_last = preprocess(img_pil_last)\n", 99 | "img_tensor_last.unsqueeze_(0)\n", 100 | "img_var_last = Variable(img_tensor_last) \n", 101 | "features_var_last = vgg19(img_var_last) \n", 102 | "features_last = features_var_last.data # get the tensor out of the variable\n", 103 | "feat_last = torch.squeeze(features_last) \n", 104 | "feat_last = feat_last.numpy()\n", 105 | "print(\"the distance between two images\")\n", 106 | "print(np.sqrt(np.sum(np.power(feat_first - feat_last, 2))))" 107 | ] 108 | } 109 | ], 110 | "metadata": { 111 | "kernelspec": { 112 | "display_name": "Python 3", 113 | "language": "python", 114 | "name": "python3" 115 | }, 116 | "language_info": { 117 | "codemirror_mode": { 118 | "name": "ipython", 119 | "version": 3 120 | }, 121 | "file_extension": ".py", 122 | "mimetype": "text/x-python", 123 | "name": "python", 124 | "nbconvert_exporter": "python", 125 | "pygments_lexer": "ipython3", 126 | "version": "3.8.8" 127 | } 128 | }, 129 | "nbformat": 4, 130 | "nbformat_minor": 4 131 | } 132 | -------------------------------------------------------------------------------- /Chapter_12/Fit.R: -------------------------------------------------------------------------------- 1 | #---- Load the package ---- 2 | require(dynr) 3 | #---- Read in the data ---- 4 | data.simulate <- read.csv("DataModel0P100T50Run1234.csv") 5 | data <- dynr.data(data.simulate, id = "id", time = "time", observed = c("x", "y")) 6 | 7 | #---- Starting Values ---- 8 | startVals <- c(rho1 = .2, rho2 = .2, a12 = 0, a21 = 0, K = 3, 9 | var1 = 0.01, var2 = 0.01, var_e1 = .01, var_e2 = 0.01, 10 | muOne1 = 2, muTwo1 = 2, varOne1 = .05, varTwo1 = .05, cov_OneTwo1 = 0) 11 | 12 | #---- Prepare the recipes ---- 13 | # Dynamic Model 14 | formula =list(One ~ rho1 * One * (1 - (One + a12 * Two)/K), 15 | Two ~ rho2 * Two * (1 - (Two + a21 * 
One)/K)) 16 | dynm <- prep.formulaDynamics(formula = formula, 17 | startval = c(rho1 = unname(startVals["rho1"]), rho2 = unname(startVals["rho2"]), 18 | a12 = unname(startVals["a12"]), a21 = unname(startVals["a21"]), 19 | K = unname(startVals["K"])), 20 | isContinuousTime = TRUE) 21 | # Measurement Model 22 | meas <- prep.measurement( 23 | values.load = matrix(c(1, 0, 24 | 0, 1), ncol = 2, byrow = TRUE), 25 | obs.names = c("x", "y"), state.names = c("One", "Two")) 26 | 27 | # Initial Conditions 28 | initial <- prep.initial( 29 | values.inistate = c(startVals["muOne1"], startVals["muTwo1"]), 30 | params.inistate = c("muOne1", "muTwo1"), 31 | values.inicov = matrix(c(startVals["varOne1"], startVals["cov_OneTwo1"], 32 | startVals["cov_OneTwo1"], startVals["varTwo1"]), 2, 2, byrow = TRUE), 33 | params.inicov = matrix(c("varOne1", "cov_OneTwo1", 34 | "cov_OneTwo1", "varTwo1"), 2, 2, byrow = TRUE)) 35 | # Noise Covariance Matrix 36 | mdcov <- prep.noise( 37 | values.latent=diag(c(startVals["var1"], startVals["var2"])), 38 | params.latent=diag(c("var1", "var2")), 39 | values.observed=diag(c(startVals["var_e1"], startVals["var_e2"])), 40 | params.observed=diag(c("var_e1","var_e2"))) 41 | #---- Cooking materials ---- 42 | # Put all the recipes together in a Model Specification 43 | model <- dynr.model(dynamics = dynm, measurement = meas, 44 | noise = mdcov, initial = initial, 45 | data = data, outfile="mutualism.c") 46 | # Estimate free parameters 47 | res <- dynr.cook(dynrModel = model) 48 | #---- Examine results ---- 49 | summary(res, digits = 2) 50 | 51 | dynr.ggplot(res, model, style = 2, title = "Results of fitting the model to simulated data", 52 | numSubjDemo = 3, text = element_text(size = 16)) 53 | 54 | -------------------------------------------------------------------------------- /Chapter_12/hawkes_eg.R: -------------------------------------------------------------------------------- 1 | source("hawkes_functions.R") 2 | 3 | # Data generation: 10,000 events from 
Hawkes process with gamma kernel 4 | parms <- list("mu" = .1, "alpha" = .6, "shape" = 15, "scale" = .5) 5 | set.seed(1001) 6 | times <- hawkes_sim(10, parms) 7 | 8 | # Fit Hawkes model with gamma and exponential kernels 9 | gamma_hawkes <- optim(unlist(parms), 10 | logl, times = times, method = "L-BFGS-B", 11 | lower = rep(.00001, 4), upper = c(1, 1, 20, 20), 12 | hessian = TRUE) 13 | 14 | exp_hawkes <- optim(unlist(parms[-3]), 15 | logl, times = times, method = "L-BFGS-B", 16 | lower = rep(.00001, 3), upper = c(1, 1, 20), 17 | hessian = TRUE) 18 | 19 | # Likelihood ratio test of model fit 20 | lr <- 2 * (exp_hawkes$value - gamma_hawkes$value) 21 | cat("Likelihood ratio test of nested models.", 22 | "LR:", lr, "p-value: ", pchisq(lr, 1, lower.tail = FALSE)) 23 | 24 | # 95% confidence intervals on overall intensity parameter 25 | gamma_alpha <- gamma_hawkes$par["alpha"] 26 | gamma_alpha_se <- se(gamma_hawkes)["alpha"] 27 | cat("95% CI on overall intensity, gamma kernel.", 28 | "Lower:", gamma_alpha - gamma_alpha_se * 1.96, 29 | "Upper:", gamma_alpha + gamma_alpha_se * 1.96) 30 | 31 | exp_alpha <- exp_hawkes$par["alpha"] 32 | exp_alpha_se <- se(exp_hawkes)["alpha"] 33 | cat("95% CI on overall intensity, exponential kernel.", 34 | "Lower:", exp_alpha - exp_alpha_se * 1.96, 35 | "Upper:", exp_alpha + exp_alpha_se * 1.96) 36 | 37 | -------------------------------------------------------------------------------- /Chapter_12/hawkes_functions.R: -------------------------------------------------------------------------------- 1 | # Functions to simulate and estimate univariate hawkes process with gamma response kernel 2 | 3 | # 1. cif 4 | cif <- function(times, parms) { 5 | if(is.null(parms$shape)) {parms$shape = 1} 6 | lambda <- times*0 7 | lambda[1] <- parms$mu 8 | 9 | for (i in seq_along(times)[-1]) { # empty when there is a single event; 2:length(times) would count down to 1 10 | z <- times[i] - times[1:(i-1)] 11 | 12 | lambda[i] <- parms$mu + parms$alpha * sum( 13 | dgamma(z, parms$shape, 1/parms$scale)) 14 | } 15 | return(lambda) 16 | } 17 | 18 | # 2. 
compensator 19 | compensator <- function(t, times = NULL, parms) { 20 | if(is.null(parms$shape)) {parms$shape = 1} 21 | out <- parms$mu * t 22 | if(!is.null(times)) { 23 | z <- t - times[which(times < t)] 24 | out <- out + parms$alpha * sum( 25 | pgamma(z, parms$shape, 1/parms$scale)) 26 | } 27 | return(out) 28 | } 29 | 30 | # 3. loglikelihood 31 | logl <- function(parms, times, neglog = TRUE) { 32 | parms <- as.list(parms) 33 | t <- times[length(times)] 34 | out <- sum(log(cif(times, parms))) - compensator(t, times, parms) 35 | if (neglog) { return(-out) } else { return(out) } 36 | } 37 | 38 | # 4. standard errors (observed Fisher info, computed via finite difference) 39 | se <- function(optim.object) { 40 | sqrt(diag(solve(optim.object$hessian))) 41 | } 42 | 43 | # 5. data simulation 44 | hawkes_sim <- function(n_events, parms) { 45 | s <- cumsum(rexp(n_events)) 46 | t <- s * 0 47 | end_time <- n_events/parms$mu * 2 48 | 49 | f <- function(t, s, times, parms) { 50 | (s - compensator(t, times, parms))^2 51 | } 52 | 53 | t[1] <- optimize( 54 | f, 55 | interval = c(0, end_time), 56 | s = s[1], 57 | times = NULL, 58 | parms = parms)$minimum 59 | 60 | for (i in seq_len(n_events)[-1]) { # empty when n_events is 1; 2:n_events would count down to 1 61 | t[i] <- optimize( 62 | f, 63 | interval = c(t[i - 1], end_time), 64 | s = s[i], 65 | times = t[1:(i - 1)], 66 | parms = parms)$minimum 67 | } 68 | return(t) 69 | } 70 | 71 | # some examples 72 | #gamma_parms <- list("mu" = .2, "alpha" = .6, "shape" = 3, "scale" = 3) 73 | #exp_parms <- gamma_parms[-3] 74 | 75 | #x <- seq(1, 20, by = .5) 76 | #plot(x, dgamma(x, gamma_parms$shape, 1/gamma_parms$scale), type = "l") 77 | 78 | #n_events <- 5000 79 | #times <- hawkes_sim(n_events, gamma_parms) 80 | 81 | # rough check 82 | #n_mu <- gamma_parms$mu * times[n_events] # s/b < n_events 83 | #a <- (n_events - n_mu) / n_events # s/b ~ alpha 84 | -------------------------------------------------------------------------------- /Chapter_13/SNA.R: 
-------------------------------------------------------------------------------- 1 | #### Create the adjacency matrix for the example network 2 | net.matrix <- matrix(c(0,1,0,0,0, 0,0,4,0,0, 0,2,0,1,1, 3 | 0,1,0,0,0, 0,0,0,0,0), 4 | nrow = 5, ncol = 5, byrow = TRUE, 5 | dimnames = list(c("1", "2", "3", "4", "5"), 6 | c("1", "2", "3", "4", "5"))) 7 | 8 | #### Network Visualization 9 | library(igraph) 10 | net=graph.adjacency(net.matrix,mode="directed",weighted=TRUE, 11 | diag=FALSE) 12 | l <- layout.fruchterman.reingold(net) 13 | plot.igraph(net,layout=l, vertex.label.cex=1, 14 | vertex.label.color="black", 15 | edge.color="gray60",vertex.color="light blue", 16 | edge.width=E(net)$weight, 17 | edge.arrow.size=0.4,edge.curved=.3) 18 | 19 | #### Network Analysis 20 | library(sna) 21 | 22 | ## Create the network object 23 | net <- network(net.matrix,matrix.type="adjacency",directed=TRUE) 24 | 25 | ## Network Size 26 | network.size(net) 27 | 28 | ## Outdegrees and Indegrees for Nodes 29 | degree(net,cmode="outdegree") 30 | degree(net,cmode="indegree") 31 | 32 | ## Weighted Outdegrees and Indegrees for Nodes 33 | degree(net.matrix,cmode="outdegree") 34 | degree(net.matrix,cmode="indegree") 35 | 36 | ## Network Density 37 | gden(net) 38 | 39 | ## Weighted Density 40 | gden(net.matrix) 41 | 42 | ## Network Reciprocity 43 | grecip(net,measure="dyadic.nonnull") 44 | 45 | ## Network Centralization 46 | centralization(net,degree,cmode="outdegree") 47 | centralization(net,degree,cmode="indegree") 48 | 49 | ## Triad Census 50 | triad.census(net, mode = "digraph") 51 | 52 | -------------------------------------------------------------------------------- /Chapter_14/text_mining_chapter.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "[nltk_data] 
Downloading package punkt to /Users/jhao/nltk_data...\n", 13 | "[nltk_data] Package punkt is already up-to-date!\n", 14 | "[nltk_data] Downloading package stopwords to /Users/jhao/nltk_data...\n", 15 | "[nltk_data] Package stopwords is already up-to-date!\n", 16 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n", 17 | "[nltk_data] /Users/jhao/nltk_data...\n", 18 | "[nltk_data] Package averaged_perceptron_tagger is already up-to-\n", 19 | "[nltk_data] date!\n" 20 | ] 21 | }, 22 | { 23 | "data": { 24 | "text/plain": [ 25 | "True" 26 | ] 27 | }, 28 | "execution_count": 1, 29 | "metadata": {}, 30 | "output_type": "execute_result" 31 | } 32 | ], 33 | "source": [ 34 | "from nltk.tokenize import word_tokenize, sent_tokenize\n", 35 | "from nltk.corpus import stopwords\n", 36 | "from nltk.stem.porter import PorterStemmer\n", 37 | "from nltk import pos_tag\n", 38 | "import nltk\n", 39 | "from spellchecker import SpellChecker\n", 40 | "import string\n", 41 | "nltk.download('punkt')\n", 42 | "nltk.download('stopwords')\n", 43 | "nltk.download('averaged_perceptron_tagger')" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "text = 'The class is over. I hopep it is intersting to you. Please let me knoww if not.'" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "'the class is over. i hopep it is intersting to you. 
please let me knoww if not.'" 64 | ] 65 | }, 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "#change to lower case\n", 73 | "text.lower()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "['The', 'class', 'is', 'over', '.', 'I', 'hopep', 'it', 'is', 'intersting', 'to', 'you', '.', 'Please', 'let', 'me', 'knoww', 'if', 'not', '.']\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "# word tokenization\n", 91 | "word_tokens = word_tokenize(text)\n", 92 | "print(word_tokens)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "['class', 'hopep', 'intersting', 'please', 'let', 'knoww']\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# remove stop words and punctuations\n", 110 | "stopword_list = stopwords.words('english')\n", 111 | "punctuation_list = list(string.punctuation)\n", 112 | "cleaned_text = [txt for txt in word_tokenize(text.lower()) if txt not in stopword_list+punctuation_list]\n", 113 | "print(cleaned_text)" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "['class', 'hope', 'intersting', 'please', 'let', 'knoww']\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# typo correction\n", 131 | "spell = SpellChecker()\n", 132 | "corrected_text = [spell.correction(wd) for wd in cleaned_text]\n", 133 | "print(corrected_text)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 7, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "[('class', 'NN'),\n", 145 | " ('hope', 
'NN'),\n", 146 | " ('intersting', 'VBG'),\n", 147 | " ('please', 'JJ'),\n", 148 | " ('let', 'VB'),\n", 149 | " ('knoww', 'VB')]" 150 | ] 151 | }, 152 | "execution_count": 7, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "# part of speech tagging\n", 159 | "pos_tag(corrected_text)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 8, 165 | "metadata": {}, 166 | "outputs": [ 167 | { 168 | "data": { 169 | "text/plain": [ 170 | "[('class', 'class'),\n", 171 | " ('hope', 'hope'),\n", 172 | " ('intersting', 'interst'),\n", 173 | " ('please', 'pleas'),\n", 174 | " ('let', 'let'),\n", 175 | " ('knoww', 'knoww')]" 176 | ] 177 | }, 178 | "execution_count": 8, 179 | "metadata": {}, 180 | "output_type": "execute_result" 181 | } 182 | ], 183 | "source": [ 184 | "# Stemming the words\n", 185 | "porter = PorterStemmer()\n", 186 | "stem_words = [porter.stem(txt) for txt in corrected_text]\n", 187 | "list(zip(corrected_text,stem_words))" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 9, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "# ngram representation\n", 197 | "from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer\n", 198 | "import pandas as pd" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 10, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "['The class is over.', 'I hopep it is intersting to you.', 'Please let me knoww if not.']\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "# sentence tokenization\n", 216 | "sentence_list = sent_tokenize(text)\n", 217 | "print(sentence_list)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [ 226 | "# applying the stop words removal and typo correction\n", 227 | "correct_sentence_list 
= []\n", 228 | "for sent in sentence_list:\n", 229 | " correct_sentence_list.append(' '.join([spell.correction(wd) for wd in word_tokenize(sent.lower()) \\\n", 230 | " if wd not in stopword_list+punctuation_list]))\n", 231 | " " 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 12, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "['class', 'hope intersting', 'please let knoww']" 243 | ] 244 | }, 245 | "execution_count": 12, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "correct_sentence_list" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 13, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | "
classhopeinterstingknowwletplease
0100000
1011000
2000111
\n", 318 | "
" 319 | ], 320 | "text/plain": [ 321 | " class hope intersting knoww let please\n", 322 | "0 1 0 0 0 0 0\n", 323 | "1 0 1 1 0 0 0\n", 324 | "2 0 0 0 1 1 1" 325 | ] 326 | }, 327 | "execution_count": 13, 328 | "metadata": {}, 329 | "output_type": "execute_result" 330 | } 331 | ], 332 | "source": [ 333 | "#unigram\n", 334 | "vectorizer = CountVectorizer(ngram_range=(1,1)) \n", 335 | "X = vectorizer.fit_transform(correct_sentence_list)\n", 336 | "df = pd.DataFrame(X.toarray())\n", 337 | "df.columns = vectorizer.get_feature_names()\n", 338 | "df" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 14, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "data": { 348 | "text/html": [ 349 | "
\n", 350 | "\n", 363 | "\n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | "
classhopeinterstingknowwletplease
01.00.000.000.000.000.00
10.00.710.710.000.000.00
20.00.000.000.580.580.58
\n", 405 | "
" 406 | ], 407 | "text/plain": [ 408 | " class hope intersting knoww let please\n", 409 | "0 1.0 0.00 0.00 0.00 0.00 0.00\n", 410 | "1 0.0 0.71 0.71 0.00 0.00 0.00\n", 411 | "2 0.0 0.00 0.00 0.58 0.58 0.58" 412 | ] 413 | }, 414 | "execution_count": 14, 415 | "metadata": {}, 416 | "output_type": "execute_result" 417 | } 418 | ], 419 | "source": [ 420 | "# Tf-Idf transformation of unigram\n", 421 | "vectorizer = TfidfVectorizer(ngram_range=(1,1)) \n", 422 | "X = vectorizer.fit_transform(correct_sentence_list)\n", 423 | "df = pd.DataFrame(X.toarray())\n", 424 | "df.columns = vectorizer.get_feature_names()\n", 425 | "df.round(2)" 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": 15, 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/html": [ 436 | "
\n", 437 | "\n", 450 | "\n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | "
class ishopep itif notintersting tois interstingis overit isknoww iflet meme knowwplease letthe classto you
01000010000010
10101101000001
20010000111100
\n", 520 | "
" 521 | ], 522 | "text/plain": [ 523 | " class is hopep it if not intersting to is intersting is over it is \\\n", 524 | "0 1 0 0 0 0 1 0 \n", 525 | "1 0 1 0 1 1 0 1 \n", 526 | "2 0 0 1 0 0 0 0 \n", 527 | "\n", 528 | " knoww if let me me knoww please let the class to you \n", 529 | "0 0 0 0 0 1 0 \n", 530 | "1 0 0 0 0 0 1 \n", 531 | "2 1 1 1 1 0 0 " 532 | ] 533 | }, 534 | "execution_count": 15, 535 | "metadata": {}, 536 | "output_type": "execute_result" 537 | } 538 | ], 539 | "source": [ 540 | "#bigramI \n", 541 | "vectorizer = CountVectorizer(ngram_range=(2,2))\n", 542 | "X = vectorizer.fit_transform(sentence_list)\n", 543 | "df = pd.DataFrame(X.toarray())\n", 544 | "df.columns = vectorizer.get_feature_names()\n", 545 | "df" 546 | ] 547 | }, 548 | { 549 | "cell_type": "markdown", 550 | "metadata": {}, 551 | "source": [ 552 | "## Word embedding - note that the following are not included in the code snippets in the book and we provide them here as a bonus. " 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 17, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# word vectors - note that the following are not included in the code snippets in the book and we provide them here as a bonus. 
\n", 562 | "import gensim.downloader as api\n", 563 | "from scipy.spatial.distance import cosine" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": 18, 569 | "metadata": {}, 570 | "outputs": [ 571 | { 572 | "name": "stdout", 573 | "output_type": "stream", 574 | "text": [ 575 | "[==================================================] 100.0% 387.1/387.1MB downloaded\n" 576 | ] 577 | } 578 | ], 579 | "source": [ 580 | "#loading the 100 dimension word vector dictionary trained on twitter data\n", 581 | "model = api.load(\"glove-twitter-100\")" 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 19, 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "data": { 591 | "text/plain": [ 592 | "array([ 0.38446 , -0.45507 , 0.45351 , 0.4301 , -0.050908 ,\n", 593 | " -0.26414 , 0.43253 , -0.3166 , 0.32214 , 0.0064333,\n", 594 | " -0.47066 , 0.95335 , -3.2063 , 0.010913 , -0.27565 ,\n", 595 | " 1.1732 , 0.52033 , -0.045973 , 0.094254 , -0.53846 ,\n", 596 | " 0.0035668, 0.11934 , -0.17815 , -0.58093 , 0.65081 ,\n", 597 | " -0.48746 , -0.50961 , 0.42771 , -0.30638 , 0.32385 ,\n", 598 | " 0.33687 , -0.1717 , -0.39104 , -0.19038 , 0.37016 ,\n", 599 | " -0.50396 , 0.041969 , -0.20517 , 0.3223 , 0.41217 ,\n", 600 | " -0.42191 , -0.26359 , -0.1773 , -0.35658 , 0.52145 ,\n", 601 | " 0.57282 , 0.60204 , 0.74369 , 0.33377 , -0.45041 ,\n", 602 | " 0.015978 , -0.12575 , 0.29786 , -0.77635 , 0.23759 ,\n", 603 | " 0.63821 , 0.63726 , 1.0079 , 0.13714 , -0.031928 ,\n", 604 | " -0.21299 , 0.52348 , 0.67934 , -0.1427 , -0.64236 ,\n", 605 | " -0.47996 , -0.87915 , 0.17501 , 0.64517 , 0.3778 ,\n", 606 | " 0.53493 , -0.29723 , -0.25206 , -0.757 , 0.33647 ,\n", 607 | " 0.053759 , -0.8084 , 0.22205 , 0.10799 , -0.68982 ,\n", 608 | " 1.5073 , 0.96641 , -0.51839 , 0.32803 , 0.11878 ,\n", 609 | " -0.72009 , 0.23227 , 0.098733 , -0.096396 , 0.40295 ,\n", 610 | " -0.003925 , -0.10405 , -0.15234 , 0.17573 , 0.29694 ,\n", 611 | " 0.14938 , 0.11754 , 
0.15699 , -0.34272 , 0.2435 ],\n", 612 | " dtype=float32)" 613 | ] 614 | }, 615 | "execution_count": 19, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "# get the vector of the word cat\n", 622 | "model.get_vector('cat')" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 20, 628 | "metadata": {}, 629 | "outputs": [ 630 | { 631 | "data": { 632 | "text/plain": [ 633 | "[('dog', 0.875208854675293),\n", 634 | " ('kitty', 0.8015091419219971),\n", 635 | " ('pet', 0.7986468076705933),\n", 636 | " ('cats', 0.797942578792572),\n", 637 | " ('kitten', 0.7936834096908569),\n", 638 | " ('puppy', 0.7702749967575073),\n", 639 | " ('monkey', 0.758426308631897),\n", 640 | " ('bear', 0.7507943511009216),\n", 641 | " ('dogs', 0.7460062503814697),\n", 642 | " ('pig', 0.7117346525192261)]" 643 | ] 644 | }, 645 | "execution_count": 20, 646 | "metadata": {}, 647 | "output_type": "execute_result" 648 | } 649 | ], 650 | "source": [ 651 | "# get the most similar words as cat\n", 652 | "model.most_similar('cat')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 21, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "data": { 662 | "text/plain": [ 663 | "0.6474888920783997" 664 | ] 665 | }, 666 | "execution_count": 21, 667 | "metadata": {}, 668 | "output_type": "execute_result" 669 | } 670 | ], 671 | "source": [ 672 | "# cosine similarity between cat and tiger\n", 673 | "1-cosine(model.get_vector('cat'), model.get_vector('tiger'))" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": 22, 679 | "metadata": {}, 680 | "outputs": [ 681 | { 682 | "data": { 683 | "text/plain": [ 684 | "0.7936834692955017" 685 | ] 686 | }, 687 | "execution_count": 22, 688 | "metadata": {}, 689 | "output_type": "execute_result" 690 | } 691 | ], 692 | "source": [ 693 | "#cosine similarity between cat and kitten\n", 694 | "1-cosine(model.get_vector('cat'), model.get_vector('kitten'))" 
695 | ] 696 | }, 697 | { 698 | "cell_type": "code", 699 | "execution_count": 23, 700 | "metadata": {}, 701 | "outputs": [ 702 | { 703 | "data": { 704 | "text/plain": [ 705 | "0.5291033983230591" 706 | ] 707 | }, 708 | "execution_count": 23, 709 | "metadata": {}, 710 | "output_type": "execute_result" 711 | } 712 | ], 713 | "source": [ 714 | "#cosine similarity between cat and car\n", 715 | "1-cosine(model.get_vector('cat'), model.get_vector('car'))" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [] 724 | } 725 | ], 726 | "metadata": { 727 | "kernelspec": { 728 | "display_name": "Python 3", 729 | "language": "python", 730 | "name": "python3" 731 | }, 732 | "language_info": { 733 | "codemirror_mode": { 734 | "name": "ipython", 735 | "version": 3 736 | }, 737 | "file_extension": ".py", 738 | "mimetype": "text/x-python", 739 | "name": "python", 740 | "nbconvert_exporter": "python", 741 | "pygments_lexer": "ipython3", 742 | "version": "3.8.8" 743 | } 744 | }, 745 | "nbformat": 4, 746 | "nbformat_minor": 4 747 | } 748 | -------------------------------------------------------------------------------- /Chapter_7/R_scripts_for_vignette.R: -------------------------------------------------------------------------------- 1 | library(extraDistr) 2 | 3 | ############## BASE FUNCTIONS for SM-MH 4 | rPrior <- function(n, mu, sigma, prior) 5 | { 6 | if (prior == "normal") out = rnorm(n,mu,sigma) 7 | if (prior == "exp") out = rexp(n,rate=sigma) 8 | if (prior == "Laplace") out = rlaplace(n, mu, sigma) 9 | return(out) 10 | } 11 | 12 | Prior_MH <- function(z, j, y, current, mu, sigma, prior) 13 | { 14 | n = length(z)-1 15 | if (prior=="normal") 16 | { 17 | c_4=dnorm(z[j], mean=mu,sd=sigma,log = TRUE) 18 | c_5=dnorm(current,mean=mu,sd=sigma,log = TRUE) 19 | c_6=pnorm(current,mean=mu,sd=sigma,lower.tail=y[n],log.p = TRUE) 20 | c_7=pnorm(z[j],mean=mu,sd=sigma,lower.tail=y[n],log.p = TRUE) 21 | } 
22 | if (prior=="exp") 23 | { 24 | c_4=dexp(z[j], rate=sigma,log = TRUE) 25 | c_5=dexp(current,rate=sigma,log = TRUE) 26 | c_6=pexp(current,rate=sigma,lower.tail=y[n],log.p = TRUE) 27 | c_7=pexp(z[j],rate=sigma,lower.tail=y[n],log.p = TRUE) 28 | } 29 | if (prior=="Laplace") 30 | { 31 | c_4=dlaplace(z[j], mu=0, sigma, log=TRUE) 32 | c_5=dlaplace(current, mu=0, sigma, log = TRUE) 33 | c_6=plaplace(current, mu=0, sigma, lower.tail=y[n],log.p = TRUE) 34 | c_7=plaplace(z[j], mu=0, sigma, lower.tail=y[n],log.p = TRUE) 35 | } 36 | return(c_4-c_5+c_6-c_7) 37 | } 38 | 39 | MH_R = function(x, a, b, current, mu, sigma, prior=c("normal", "exp", "Laplace"), accept_all=FALSE) 40 | { 41 | prior = match.arg(prior) 42 | if (any(a==0)){ 43 | indx = which(a==0) 44 | x = x[-indx] 45 | a = a[-indx] 46 | b = b[-indx] 47 | } 48 | if (any(a<0)){ 49 | indx = which(a<0) 50 | x[indx] = 1 - x[indx] 51 | a[indx] = -a[indx] 52 | b[indx] = -b[indx] 53 | } 54 | n=length(x) 55 | z=rlogis(n,location=-b/a,scale=1/a) 56 | z = c(z, rPrior(1,mu,sigma, prior)) 57 | j=order(z)[sum(x)+1] 58 | candidate = z[j] 59 | if (accept_all == FALSE) 60 | { 61 | y=sapply(z[-j],function(x) 1*(x<=z[j])) 62 | if (j<(n+1)){ 63 | logA = (x[j]-1)*a[j]*(z[j]-current) 64 | logA = logA + log(1+exp(a[j]*z[j]+b[j])) 65 | logA = logA - log(1+exp(a[j]*current+b[j])) 66 | logA = logA + Prior_MH(z, j, y, current, mu, sigma, prior) 67 | logalpha = logA + sum(a[-j]*(x[-j]-y[-n]))*(z[j]-current) 68 | }else{ 69 | logalpha = sum(a*(x-y))*(z[j]-current) 70 | } 71 | candidate = ifelse(log(runif(1,0,1))<=logalpha, z[j], current) 72 | } 73 | return(candidate) 74 | } 75 | 76 | ########## IRT data 77 | ## Rasch 78 | Simdat.item = function(nP, alpha, delta, theta) 79 | { 80 | return(1*(rlogis(nP,0,1)<=(alpha*theta - delta))) 81 | } 82 | 83 | 84 | ## MIRT model with factor loadings alpha 85 | rMIRT_R = function(theta, alpha, delta) 86 | { 87 | nP = nrow(theta) 88 | nI = nrow(alpha) 89 | x=matrix(NA,nP,nI) 90 | for (i in 1:nI) 91 | { 92 | x[,i] 
= 1*(rlogis(nP,0,1)<=(theta%*%alpha[i,]+delta[i])) 93 | } 94 | return(x) 95 | } 96 | 97 | ################# Gibbs sampler Rasch Model 98 | nP=nrow(y); nI=ncol(y) 99 | # priors 100 | mu.th = 0; sigma.th = 2 101 | mu.b = 0; sigma.b = 0.5 102 | # start_values 103 | delta=runif(nI,-1,1) 104 | theta=rnorm(nP,mu.th,sigma.th) 105 | 106 | for (iter in 1:50) 107 | { 108 | init = ifelse(iter<5, 1,0) 109 | theta=sapply(1:nP,function(p) MH_R(y[p,], rep(1,nI), delta, theta[p], mu.th, sigma.th, 'normal',init)) 110 | delta=sapply(1:nI,function(i) MH_R(y[,i], rep(1,nP), theta, delta[i], mu.b, sigma.b,'normal',init)) 111 | 112 | # Identify 113 | delta = delta-mean(delta) 114 | 115 | # Update Prior 116 | sigma.th = sqrt(1/rgamma(1,shape=(nP-1)/2,rate=((nP-1)/2)*var(theta))) 117 | mu.th = rnorm(1,mean(theta),sigma.th/sqrt(nP)) 118 | } 119 | 120 | 121 | 122 | ############## Gibbs sampler 2PL 123 | nP=nrow(y); nI=ncol(y) 124 | # priors 125 | mu.th = 0; sigma.th = 2 126 | mu.b = 0; sigma.b = 0.5 127 | # start_values 128 | delta=runif(nI,-1,1) 129 | theta=rnorm(nP,mu.th,sigma.th) 130 | alpha=rep(1,nI) 131 | 132 | for (iter in 1:50) 133 | { 134 | init = ifelse(iter<5,1,0) 135 | theta = sapply(1:nP,function(p) MH_R(y[p,], alpha, delta, theta[p], mu.th, sigma.th, 'normal',init)) 136 | delta = sapply(1:nI,function(i) MH_R(y[,i], rep(1,nP), alpha[i]*theta, delta[i], mu.b, sigma.b,'normal',init)) 137 | alpha = sapply(1:nI,function(i) MH_R(y[,i], theta, rep(delta,nP), alpha[i], 1, 0.5,'normal',init)) 138 | 139 | # Identify 140 | alpha=alpha/alpha[1] 141 | shift=sum(delta)/sum(alpha) 142 | delta = delta-shift*alpha 143 | 144 | # Update ability prior 145 | sigma.th = sqrt(1/rgamma(1,shape=(nP-1)/2,rate=((nP-1)/2)*var(theta))) 146 | mu.th = rnorm(1,mean(theta),sigma.th/sqrt(nP)) 147 | } 148 | 149 | 150 | ### Gibbs sampler BI-FACTOR model 151 | BIfactor = function(x, C, nIter=100) 152 | { 153 | nD = ncol(C) 154 | m=nrow(x) 155 | n=ncol(x) 156 | mu.pv=rep(0,nD) 157 | delta=runif(n,-1,1) 158 | 
lambda=rep(1,nD) 159 | pv=matrix(0,m,nD) 160 | for (j in 1:nD) pv[,j]=rnorm(m,mu.pv[j],1) 161 | 162 | ### vectors 163 | a_n=vector("numeric", n) 164 | b_n=vector("numeric", n) 165 | a_m=vector("numeric", m) 166 | b_m=vector("numeric", m) 167 | a_nm=vector("numeric", n*m) 168 | b_nm=vector("numeric", n*m) 169 | 170 | store_lambda= matrix(0,nD,nIter) 171 | store_delta = matrix(0,m,nIter) 172 | 173 | for (iter in 1:nIter) 174 | { 175 | init=(iter<2) 176 | 177 | # Update \delta_i 178 | for (i in 1:n) 179 | { 180 | for (p in 1:m) 181 | { 182 | b_m[p]=0; a_m[p]=1 183 | for (j in 1:nD) 184 | { 185 | b_m[p]=b_m[p]+C[i,j]*lambda[j]*pv[p,j] 186 | } 187 | } 188 | delta[i]=MH_R(x[,i], a_m, b_m, delta[i], 0, 1, 'normal', init) 189 | } 190 | # Update \theta_{pj} 191 | for (p in 1:m) 192 | { 193 | for (j in 1:nD) 194 | { 195 | for (i in 1:n) 196 | { 197 | a_n[i]=lambda[j]*C[i,j] 198 | b_n[i]=delta[i] 199 | for (h in 1:nD) 200 | { 201 | if (h!=j) b_n[i]=b_n[i]+lambda[h]*C[i,h]*pv[p,h] 202 | } 203 | } 204 | pv[p,j]=MH_R(x[p,],a_n, b_n, pv[p,j], mu.pv[j], 1,'normal', init) 205 | } 206 | } 207 | 208 | for (j in 1:nD) 209 | { 210 | ip=0 211 | for (i in 1:n) 212 | { 213 | for (p in 1:m) 214 | { 215 | ip=ip+1 216 | b_nm[ip]=delta[i] 217 | a_nm[ip]=C[i,j]*pv[p,j] 218 | for (h in 1:nD) 219 | { 220 | if (h!=j) b_nm[ip]=b_nm[ip]+lambda[h]*C[i,h]*pv[p,h] 221 | } 222 | } 223 | } 224 | lambda[j]=MH_R(as.vector(x),a_nm,b_nm,lambda[j],1,2,'exp', init) 225 | } 226 | 227 | delta = delta - mean(delta) 228 | 229 | ## update prior plausible values 230 | for (j in 1:nD){ 231 | pv[,j] = pv[,j]/sd(pv[,j]) 232 | mu.pv[j] = rnorm(1,mean(pv[,j]),1/sqrt(m)) 233 | } 234 | 235 | store_lambda[,iter] = lambda 236 | store_delta[,iter] = delta 237 | } 238 | out=list(delta=store_delta, lambda=store_lambda) 239 | } 240 | 241 | 242 | ##### Gibbs sampler Logistic regression 243 | GibbsLogistic_R = function(dat, out=1, covariates, lasso=FALSE, lambda=NULL, nIter = 1000, center=TRUE) 244 | { 245 | dat = cbind(dat,1) 
246 | covariates = c(covariates,ncol(dat)) 247 | nb = length(covariates) 248 | est_lambda = is.null(lambda) 249 | tr_beta = matrix(0,nb,nIter) 250 | if (est_lambda){ 251 | tr_lambda = rep(0,nIter) 252 | }else 253 | { 254 | tr_lambda = rep(lambda,nIter) 255 | } 256 | est_lambda = is.null(lambda) 257 | 258 | M=10 259 | 260 | # centre covariates 261 | if (center) 262 | { 263 | mean_x = colMeans(dat) 264 | for (i in covariates[1:(nb-1)]) dat[,i] = dat[,i]-mean_x[i] 265 | } 266 | 267 | if (lasso&est_lambda) lambda = rgamma(1,3,1) 268 | 269 | beta=runif(nb,-1,1) 270 | for (iter in 1:nIter) 271 | { 272 | for (i in 1:(nb-1)) 273 | { 274 | b = as.matrix(dat[,covariates[-i]])%*%beta[-i] 275 | beta[i] = ifelse(lasso, 276 | MH(dat[,out], dat[,covariates[i]], b, beta[i], 0, 1/lambda, 277 | prior = "Laplace", 278 | accept_all = ifelse(iter 2 | 3 | The Dormouse's story 4 | 5 | 6 | 7 |

The Dormouse's story

8 |

Once upon a time there were three little sisters; and their names were Elsie, Lacie and Tillie; and they lived at the bottom of a well.

9 | 10 | 11 | -------------------------------------------------------------------------------- /Chapter_8/stream_data.txt: -------------------------------------------------------------------------------- 1 | -- This is Line: 1 -- 2 | -- This is Line: 2 -- 3 | -- This is Line: 3 -- 4 | -- This is Line: 4 -- 5 | -- This is Line: 5 -- 6 | -- This is Line: 6 -- 7 | -- This is Line: 7 -- 8 | -- This is Line: 8 -- 9 | -- This is Line: 9 -- 10 | -- This is Line: 10 -- 11 | -- This is Line: 11 -- 12 | -- This is Line: 12 -- 13 | -- This is Line: 13 -- 14 | -- This is Line: 14 -- 15 | -- This is Line: 15 -- 16 | -- This is Line: 16 -- 17 | -- This is Line: 17 -- 18 | -- This is Line: 18 -- 19 | 20 | 21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /Chapter_8/stream_data_generation.py: -------------------------------------------------------------------------------- 1 | # ---- stream_data_generation.py --- 2 | # generating a growing data file 3 | 4 | import time 5 | 6 | if __name__=="__main__": 7 | line_number = 1 8 | while True: 9 | file_stream = open("stream_data.txt", "a") 10 | contents = '-- This is Line: '+str(line_number)+' -- \n' 11 | file_stream.write(contents) 12 | file_stream.close() 13 | time.sleep(2) 14 | line_number = line_number +1 15 | print(line_number) 16 | if line_number == 3600: 17 | break 18 | continue 19 | 20 | -------------------------------------------------------------------------------- /Chapter_8/structured_example_log.json: -------------------------------------------------------------------------------- 1 | 2 | 3 | {"data": [{"session_time": "2013-05-15 14:17:26", "event_name": "Session Start", "event_attribute": "NaN"}, {"session_time": "2013-05-15 14:17:26", "event_name": "Leaving sequence", "event_attribute": "loadXML, moving forward."}, {"session_time": "2013-05-15 14:17:30", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:17:30", 
"event_name": "Leaving sequence", "event_attribute": "InputNameScreen, moving forward."}, {"session_time": "2013-05-15 14:17:31", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:17:31", "event_name": "Leaving sequence", "event_attribute": "startScreen, moving forward."}, {"session_time": "2013-05-15 14:17:50", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:17:50", "event_name": "Leaving sequence", "event_attribute": "slide2, moving forward."}, {"session_time": "2013-05-15 14:17:55", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:17:55", "event_name": "Leaving sequence", "event_attribute": "slide2b, moving forward."}, {"session_time": "2013-05-15 14:18:34", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:18:34", "event_name": "Leaving sequence", "event_attribute": "slide2c, moving forward."}, {"session_time": "2013-05-15 14:20:09", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:20:09", "event_name": "Leaving sequence", "event_attribute": "slide3, moving forward."}, {"session_time": "2013-05-15 14:20:13", "event_name": "Player submitted name", "event_attribute": "Carl"}, {"session_time": "2013-05-15 14:20:13", "event_name": "Leaving sequence", "event_attribute": "slide4, moving forward."}]} 4 | 5 | 6 | -------------------------------------------------------------------------------- /Chapter_8/structured_example_log.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7369 5 | hao_jiangang 6 | userID1jiangang 7 | nickname1jianganguserID2hao 8 | nickname2hao 9 | 17 10 | attempt17 11 | n_player2 12 | team_assembly_moderandom 13 | P1jiangang 14 | P2hao 15 | 16 | chat2019-11-06T14:18:31Z2019-11-06T14:18:31Zjiangangothershislide1-step0 17 | 
question2019-11-06T14:18:42Z2019-11-06T14:18:42Zjiangangcbal-1-01slide2-step0attemptnullcorrectnull 18 | question2019-11-06T14:18:42Z2019-11-06T14:18:42Zjiangangcbal-1-1dfsaslide2-step0attemptnullcorrectnull 19 | chat2019-11-06T14:18:44Z2019-11-06T14:18:44Zjiangangothersokslide2-step0 20 | chat2019-11-06T14:18:46Z2019-11-06T14:18:46Zjiangangothershow are youslide2-step0 21 | chat2019-11-06T14:18:49Z2019-11-06T14:18:49Zjiangangothersthis is super greatslide2-step0 22 | chat2019-11-06T14:18:50Z2019-11-06T14:18:50Zsystemjianganggood work! Using positive language is helpful for working with a partner!- 23 | question2019-11-06T14:18:56Z2019-11-06T14:18:56Zhaocbal-1-1fgfdgslide2-step0attemptnullcorrectnull 24 | question2019-11-06T14:18:56Z2019-11-06T14:18:56Zhaocbal-1-02slide2-step0attemptnullcorrectnull 25 | chat2019-11-06T14:18:59Z2019-11-06T14:18:59Zjiangangothersshitslide3-step0 26 | chat2019-11-06T14:18:59Z2019-11-06T14:18:59Zsystemjiangangplease avoid using disrespectful language in your chats. 
That is not helpful for working with a partner!- 27 | question2019-11-06T14:19:05Z2019-11-06T14:19:05Zjiangangcbal-2-11slide3-step0attemptnullcorrectnull 28 | question2019-11-06T14:19:05Z2019-11-06T14:19:05Zjiangangcbal-2-02slide3-step0attemptnullcorrectnull 29 | question2019-11-06T14:19:14Z2019-11-06T14:19:14Zhaocbal-2-01slide3-step0attemptnullcorrectnull 30 | question2019-11-06T14:19:14Z2019-11-06T14:19:14Zhaocbal-2-11slide3-step0attemptnullcorrectnull 31 | view_animation2019-11-06T14:19:16Z2019-11-06T14:19:16Zhaocondensationanimation viewedslide-cbal-3 32 | question2019-11-06T14:19:26Z2019-11-06T14:19:26Zhaocbal-3-0water-3-bslide4-step0attemptnullcorrectnull 33 | question2019-11-06T14:19:26Z2019-11-06T14:19:26Zhaocbal-3-1speed-3-aslide4-step0attemptnullcorrectnull 34 | question2019-11-06T14:19:29Z2019-11-06T14:19:29Zjiangangcbal-3-0water-3-bslide4-step0attemptnullcorrectnull 35 | question2019-11-06T14:19:29Z2019-11-06T14:19:29Zjiangangcbal-3-1speed-3-aslide4-step0attemptnullcorrectnull 36 | question2019-11-06T14:19:36Z2019-11-06T14:19:36Zhaocbal-4-0water-4-bslide5-step0attemptnullcorrectnull 37 | question2019-11-06T14:19:36Z2019-11-06T14:19:36Zhaocbal-4-1speed-4-aslide5-step0attemptnullcorrectnull 38 | question2019-11-06T14:19:39Z2019-11-06T14:19:39Zjiangangcbal-4-0water-4-aslide5-step0attemptnullcorrectnull 39 | question2019-11-06T14:19:39Z2019-11-06T14:19:39Zjiangangcbal-4-1speed-4-aslide5-step0attemptnullcorrectnull 40 | file_saved2019-11-06T14:19:45Z2019-11-06T14:19:45Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_1.pngdrawing-widget 41 | file_saved2019-11-06T14:19:47Z2019-11-06T14:19:47Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_3.pngdrawing-widget 42 | file_saved2019-11-06T14:19:47Z2019-11-06T14:19:47Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_2.pngdrawing-widget 43 | 
file_saved2019-11-06T14:19:59Z2019-11-06T14:19:59Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_4.pngdrawing-widget 44 | file_saved2019-11-06T14:20:08Z2019-11-06T14:20:08Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_5.pngdrawing-widget 45 | file_saved2019-11-06T14:20:09Z2019-11-06T14:20:09Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_6.pngdrawing-widget 46 | file_saved2019-11-06T14:20:12Z2019-11-06T14:20:12Zjiangangdrawing-widget/uploads/games/G7369/G7369_jiangang_drawing-widget_7.pngdrawing-widget 47 | file_saved2019-11-06T14:20:20Z2019-11-06T14:20:20Zhaodrawing-widget/uploads/games/G7369/G7369_hao_drawing-widget_7.pngdrawing-widget 48 | file_saved2019-11-06T14:20:21Z2019-11-06T14:20:21Zhaodrawing-widget/uploads/games/G7369/G7369_hao_drawing-widget_8.pngdrawing-widget 49 | file_saved2019-11-06T14:20:23Z2019-11-06T14:20:23Zhaodrawing-widget/uploads/games/G7369/G7369_hao_drawing-widget_9.pngdrawing-widget 50 | file_saved2019-11-06T14:20:26Z2019-11-06T14:20:26Zhaodrawing-widget/uploads/games/G7369/G7369_hao_drawing-widget_10.pngdrawing-widget 51 | question2019-11-06T14:20:28Z2019-11-06T14:20:28Zhaoslide6-step0attemptnullcorrectnullallow_skipnullseenby_idsnullshow_answernullmax_attemptsnullrandomize_ordernull 52 | question2019-11-06T14:20:30Z2019-11-06T14:20:30Zjiangangslide6-step0attemptnullcorrectnullallow_skipnullseenby_idsnullshow_answernullmax_attemptsnullrandomize_ordernull 53 | question2019-11-06T14:20:38Z2019-11-06T14:20:38Zjiangangcbal-5-3this si coolslide7-step0attemptnullcorrectnull 54 | question2019-11-06T14:20:38Z2019-11-06T14:20:38Zjiangangcbal-5-2water-5-hslide7-step0attemptnullcorrectnull 55 | question2019-11-06T14:20:38Z2019-11-06T14:20:38Zjiangangcbal-5-0water-5-bslide7-step0attemptnullcorrectnull 56 | question2019-11-06T14:20:38Z2019-11-06T14:20:38Zjiangangcbal-5-1water-5-fslide7-step0attemptnullcorrectnull 57 | 
question2019-11-06T14:20:45Z2019-11-06T14:20:45Zhaocbal-5-1okslide7-step0attemptnullcorrectnull 58 | question2019-11-06T14:20:45Z2019-11-06T14:20:45Zhaocbal-5-0water-5-hslide7-step0attemptnullcorrectnull 59 | question2019-11-06T14:20:48Z2019-11-06T14:20:48Zjiangangslide8-step0attemptnullcorrectnullallow_skipnullseenby_idsnullshow_answernullmax_attemptsnullrandomize_ordernull 60 | question2019-11-06T14:20:51Z2019-11-06T14:20:51Zhaoslide8-step0attemptnullcorrectnullallow_skipnullseenby_idsnullshow_answernullmax_attemptsnullrandomize_ordernull 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /Chapter_8/unstructured_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "##
Parsing and restructuring unstructured process data
" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Step 1. Loading the needed packages" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import numpy as np\n", 25 | "from datetime import datetime" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Step 2. Open the unstructured data file and read it into a python list" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "with open('unstructured_example_log.txt') as f:\n", 42 | " txt = f.readlines()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [ 50 | { 51 | "data": { 52 | "text/plain": [ 53 | "['[5/15/2013 2:17:26 PM] Session Start\\n',\n", 54 | " '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.\\n',\n", 55 | " '[5/15/2013 2:17:30 PM] Player submitted name: Carl\\n',\n", 56 | " '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.\\n',\n", 57 | " '[5/15/2013 2:17:31 PM] Player submitted name: Carl\\n',\n", 58 | " '[5/15/2013 2:17:31 PM] Leaving sequence: startScreen, moving forward.\\n',\n", 59 | " '[5/15/2013 2:17:50 PM] Player submitted name: Carl\\n',\n", 60 | " '[5/15/2013 2:17:50 PM] Leaving sequence: slide2, moving forward.\\n',\n", 61 | " '[5/15/2013 2:17:55 PM] Player submitted name: Carl\\n',\n", 62 | " '[5/15/2013 2:17:55 PM] Leaving sequence: slide2b, moving forward.\\n',\n", 63 | " '[5/15/2013 2:18:34 PM] Player submitted name: Carl\\n',\n", 64 | " '[5/15/2013 2:18:34 PM] Leaving sequence: slide2c, moving forward.\\n',\n", 65 | " '[5/15/2013 2:20:09 PM] Player submitted name: Carl\\n',\n", 66 | " '[5/15/2013 2:20:09 PM] Leaving sequence: slide3, moving forward.\\n',\n", 67 | " '[5/15/2013 2:20:13 PM] 
Player submitted name: Carl\\n',\n", 68 | " '[5/15/2013 2:20:13 PM] Leaving sequence: slide4, moving forward.\\n']" 69 | ] 70 | }, 71 | "execution_count": 3, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "txt" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 4, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "'['" 89 | ] 90 | }, 91 | "execution_count": 4, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "txt[0][0]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Step 3. Clean each line to strip off the \\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 5, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "txt = [t.strip() for t in txt]" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "data": { 123 | "text/plain": [ 124 | "['[5/15/2013 2:17:26 PM] Session Start',\n", 125 | " '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.',\n", 126 | " '[5/15/2013 2:17:30 PM] Player submitted name: Carl',\n", 127 | " '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.',\n", 128 | " '[5/15/2013 2:17:31 PM] Player submitted name: Carl',\n", 129 | " '[5/15/2013 2:17:31 PM] Leaving sequence: startScreen, moving forward.',\n", 130 | " '[5/15/2013 2:17:50 PM] Player submitted name: Carl',\n", 131 | " '[5/15/2013 2:17:50 PM] Leaving sequence: slide2, moving forward.',\n", 132 | " '[5/15/2013 2:17:55 PM] Player submitted name: Carl',\n", 133 | " '[5/15/2013 2:17:55 PM] Leaving sequence: slide2b, moving forward.',\n", 134 | " '[5/15/2013 2:18:34 PM] Player submitted name: Carl',\n", 135 | " '[5/15/2013 2:18:34 PM] Leaving sequence: slide2c, moving forward.',\n", 136 | " '[5/15/2013 2:20:09 PM] Player 
submitted name: Carl',\n", 137 | " '[5/15/2013 2:20:09 PM] Leaving sequence: slide3, moving forward.',\n", 138 | " '[5/15/2013 2:20:13 PM] Player submitted name: Carl',\n", 139 | " '[5/15/2013 2:20:13 PM] Leaving sequence: slide4, moving forward.']" 140 | ] 141 | }, 142 | "execution_count": 6, 143 | "metadata": {}, 144 | "output_type": "execute_result" 145 | } 146 | ], 147 | "source": [ 148 | "txt" 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": {}, 154 | "source": [ 155 | "### Step 4. Separate the time stamp and convert it to a standard Python datetime object" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 7, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "s0 = txt[0].split(']')[0].strip('[')\n", 165 | "s10 = txt[10].split(']')[0].strip('[')" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 8, 171 | "metadata": {}, 172 | "outputs": [ 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "'5/15/2013 2:17:26 PM'" 177 | ] 178 | }, 179 | "execution_count": 8, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "s0" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 9, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "'5/15/2013 2:18:34 PM'" 197 | ] 198 | }, 199 | "execution_count": 9, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "s10" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 10, 211 | "metadata": {}, 212 | "outputs": [], 213 | "source": [ 214 | "dtfmt ='%m/%d/%Y %I:%M:%S %p' # %H -> 24 hours, %I-> 12 hours\n", 215 | "t0 = datetime.strptime(s0, dtfmt)\n", 216 | "t10 = datetime.strptime(s10, dtfmt)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 11, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "data": { 226 | "text/plain": [ 
227 | "datetime.datetime(2013, 5, 15, 14, 17, 26)" 228 | ] 229 | }, 230 | "execution_count": 11, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "t0" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 12, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "datetime.datetime(2013, 5, 15, 14, 18, 34)" 248 | ] 249 | }, 250 | "execution_count": 12, 251 | "metadata": {}, 252 | "output_type": "execute_result" 253 | } 254 | ], 255 | "source": [ 256 | "t10" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 13, 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "text/plain": [ 267 | "68" 268 | ] 269 | }, 270 | "execution_count": 13, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "(t10-t0).seconds" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "### Step 5. 
Restructure the information into a data frame" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 14, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "# define a function to combine all the above steps and turn the results into a pandas Data Frame\n", 293 | "\n", 294 | "def generate_data_frame(txt):\n", 295 | " session_time = []\n", 296 | " event_name = []\n", 297 | " event_attribute = []\n", 298 | " dtfmt ='%m/%d/%Y %I:%M:%S %p'\n", 299 | " for line in txt:\n", 300 | " s1=line.split(']')[0].strip('[')\n", 301 | " dt = datetime.strptime(s1, dtfmt)\n", 302 | " session_time.append(dt)\n", 303 | " s= line.split(']')[1].strip().split(':')\n", 304 | " event_name.append(s[0])\n", 305 | " if len(s) == 2:\n", 306 | " event_attribute.append(s[1].lstrip())\n", 307 | " else:\n", 308 | " event_attribute.append(np.nan)\n", 309 | " df = pd.DataFrame([session_time,event_name,event_attribute]).T\n", 310 | " df.columns=['session_time','event_name', 'event_attribute'] \n", 311 | " return df" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 15, 317 | "metadata": {}, 318 | "outputs": [ 319 | { 320 | "data": { 321 | "text/html": [ 322 | "
\n", 323 | "\n", 336 | "\n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | "
session_timeevent_nameevent_attribute
02013-05-15 14:17:26Session StartNaN
12013-05-15 14:17:26Leaving sequenceloadXML, moving forward.
22013-05-15 14:17:30Player submitted nameCarl
32013-05-15 14:17:30Leaving sequenceInputNameScreen, moving forward.
42013-05-15 14:17:31Player submitted nameCarl
52013-05-15 14:17:31Leaving sequencestartScreen, moving forward.
62013-05-15 14:17:50Player submitted nameCarl
72013-05-15 14:17:50Leaving sequenceslide2, moving forward.
82013-05-15 14:17:55Player submitted nameCarl
92013-05-15 14:17:55Leaving sequenceslide2b, moving forward.
102013-05-15 14:18:34Player submitted nameCarl
112013-05-15 14:18:34Leaving sequenceslide2c, moving forward.
122013-05-15 14:20:09Player submitted nameCarl
132013-05-15 14:20:09Leaving sequenceslide3, moving forward.
142013-05-15 14:20:13Player submitted nameCarl
152013-05-15 14:20:13Leaving sequenceslide4, moving forward.
\n", 444 | "
" 445 | ], 446 | "text/plain": [ 447 | " session_time event_name \\\n", 448 | "0 2013-05-15 14:17:26 Session Start \n", 449 | "1 2013-05-15 14:17:26 Leaving sequence \n", 450 | "2 2013-05-15 14:17:30 Player submitted name \n", 451 | "3 2013-05-15 14:17:30 Leaving sequence \n", 452 | "4 2013-05-15 14:17:31 Player submitted name \n", 453 | "5 2013-05-15 14:17:31 Leaving sequence \n", 454 | "6 2013-05-15 14:17:50 Player submitted name \n", 455 | "7 2013-05-15 14:17:50 Leaving sequence \n", 456 | "8 2013-05-15 14:17:55 Player submitted name \n", 457 | "9 2013-05-15 14:17:55 Leaving sequence \n", 458 | "10 2013-05-15 14:18:34 Player submitted name \n", 459 | "11 2013-05-15 14:18:34 Leaving sequence \n", 460 | "12 2013-05-15 14:20:09 Player submitted name \n", 461 | "13 2013-05-15 14:20:09 Leaving sequence \n", 462 | "14 2013-05-15 14:20:13 Player submitted name \n", 463 | "15 2013-05-15 14:20:13 Leaving sequence \n", 464 | "\n", 465 | " event_attribute \n", 466 | "0 NaN \n", 467 | "1 loadXML, moving forward. \n", 468 | "2 Carl \n", 469 | "3 InputNameScreen, moving forward. \n", 470 | "4 Carl \n", 471 | "5 startScreen, moving forward. \n", 472 | "6 Carl \n", 473 | "7 slide2, moving forward. \n", 474 | "8 Carl \n", 475 | "9 slide2b, moving forward. \n", 476 | "10 Carl \n", 477 | "11 slide2c, moving forward. \n", 478 | "12 Carl \n", 479 | "13 slide3, moving forward. \n", 480 | "14 Carl \n", 481 | "15 slide4, moving forward. 
" 482 | ] 483 | }, 484 | "execution_count": 15, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "generate_data_frame(txt)" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [] 499 | } 500 | ], 501 | "metadata": { 502 | "kernelspec": { 503 | "display_name": "Python 3", 504 | "language": "python", 505 | "name": "python3" 506 | }, 507 | "language_info": { 508 | "codemirror_mode": { 509 | "name": "ipython", 510 | "version": 3 511 | }, 512 | "file_extension": ".py", 513 | "mimetype": "text/x-python", 514 | "name": "python", 515 | "nbconvert_exporter": "python", 516 | "pygments_lexer": "ipython3", 517 | "version": "3.8.8" 518 | } 519 | }, 520 | "nbformat": 4, 521 | "nbformat_minor": 4 522 | } 523 | -------------------------------------------------------------------------------- /Chapter_8/unstructured_data_microbatch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Step 1. Loading the needed packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from datetime import datetime\n", 19 | "from itertools import islice\n", 20 | "import sys" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "### Step 2. 
 Define functions" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 2, 33 | "metadata": {}, 34 | "outputs": [], 35 | "source": [ 36 | "# define generator function to read in files in chunks\n", 37 | "def read_chunks(file_obj,chunk_size):\n", 38 | " while True:\n", 39 | " lines = list(islice(file_obj, chunk_size))\n", 40 | " if lines: \n", 41 | " yield lines\n", 42 | " else: \n", 43 | " print('end of file')\n", 44 | " break" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Step 3. Reload the previously defined function" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# this function is the same as the previous one but we reload it here\n", 61 | "def generate_data_frame(txt):\n", 62 | " session_time = []\n", 63 | " event_name = []\n", 64 | " event_attribute = []\n", 65 | " dtfmt ='%m/%d/%Y %I:%M:%S %p'\n", 66 | " for line in txt:\n", 67 | " s1=line.split(']')[0].strip('[')\n", 68 | " dt = datetime.strptime(s1, dtfmt)\n", 69 | " session_time.append(dt)\n", 70 | " s= line.split(']')[1].strip().split(':')\n", 71 | " event_name.append(s[0])\n", 72 | " if len(s) == 2:\n", 73 | " event_attribute.append(s[1].lstrip())\n", 74 | " else:\n", 75 | " event_attribute.append(np.nan)\n", 76 | " df = pd.DataFrame([session_time,event_name,event_attribute]).T\n", 77 | " df.columns=['session_time','event_name', 'event_attribute'] \n", 78 | " return df" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "### Step 4. 
 Open files and read data into a generator object" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "# open the file\n", 95 | "f = open('unstructured_example_log.txt','r')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# read file into chunks of 4 lines and create a generator object \n", 105 | "chunk_generator = read_chunks(f,4)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 6, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "128" 117 | ] 118 | }, 119 | "execution_count": 7, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "# check the memory usage of the generator object in bytes\n", 126 | "sys.getsizeof(chunk_generator)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 7, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "data": { 136 | "text/plain": [ 137 | "['[5/15/2013 2:17:26 PM] Session Start\\n',\n", 138 | " '[5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward.\\n',\n", 139 | " '[5/15/2013 2:17:30 PM] Player submitted name: Carl\\n',\n", 140 | " '[5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward.\\n']" 141 | ] 142 | }, 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "# get the first chunk using next()\n", 150 | "next(chunk_generator)" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 8, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "# create a new generator object called df \n", 160 | "df = (generate_data_frame(txt) for txt in chunk_generator)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 9, 166 | "metadata": {}, 167 | 
"outputs": [ 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "128" 172 | ] 173 | }, 174 | "execution_count": 12, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "# check the memory usage of the generator object in bytes\n", 181 | "sys.getsizeof(df)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 10, 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/html": [ 192 | "
\n", 193 | "\n", 206 | "\n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | "
session_timeevent_nameevent_attribute
02013-05-15 14:17:31Player submitted nameCarl
12013-05-15 14:17:31Leaving sequencestartScreen, moving forward.
22013-05-15 14:17:50Player submitted nameCarl
32013-05-15 14:17:50Leaving sequenceslide2, moving forward.
\n", 242 | "
" 243 | ], 244 | "text/plain": [ 245 | " session_time event_name event_attribute\n", 246 | "0 2013-05-15 14:17:31 Player submitted name Carl\n", 247 | "1 2013-05-15 14:17:31 Leaving sequence startScreen, moving forward.\n", 248 | "2 2013-05-15 14:17:50 Player submitted name Carl\n", 249 | "3 2013-05-15 14:17:50 Leaving sequence slide2, moving forward." 250 | ] 251 | }, 252 | "execution_count": 14, 253 | "metadata": {}, 254 | "output_type": "execute_result" 255 | } 256 | ], 257 | "source": [ 258 | "# get the first object in dfnext(df)\n", 259 | "next(df)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 11, 265 | "metadata": {}, 266 | "outputs": [], 267 | "source": [ 268 | "f.close()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [] 277 | } 278 | ], 279 | "metadata": { 280 | "kernelspec": { 281 | "display_name": "Python 3", 282 | "language": "python", 283 | "name": "python3" 284 | }, 285 | "language_info": { 286 | "codemirror_mode": { 287 | "name": "ipython", 288 | "version": 3 289 | }, 290 | "file_extension": ".py", 291 | "mimetype": "text/x-python", 292 | "name": "python", 293 | "nbconvert_exporter": "python", 294 | "pygments_lexer": "ipython3", 295 | "version": "3.8.8" 296 | } 297 | }, 298 | "nbformat": 4, 299 | "nbformat_minor": 4 300 | } 301 | -------------------------------------------------------------------------------- /Chapter_8/unstructured_data_stream.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Step 1. Loading the needed packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import time" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### Step 2. 
Define a generator function to get the latest log entries" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "# Example function modified from David Beazley (https://www.dabeaz.com/generators/Generators.pdf)\n", 33 | "\n", 34 | "def follow(thefile,start='head'):\n", 35 | " if start == 'head':\n", 36 | " thefile.seek(0,0) # start from the beginning of the file\n", 37 | " if start == 'tail':\n", 38 | " thefile.seek(0,2) # start from the current ending point of the file\n", 39 | " while True:\n", 40 | " line = thefile.readline()\n", 41 | " if not line:\n", 42 | " time.sleep(0.1) # Sleep briefly\n", 43 | " continue\n", 44 | " yield line" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "### Step 3. Working on the lines of the growing file" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 9, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "# open the log file\n", 61 | "logfile = open('stream_data.txt')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 10, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# follow all lines in the growing file\n", 71 | "line_all = follow(logfile,start='head')" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "data": { 81 | "text/plain": [ 82 | "'-- This is Line: 1 -- \\n'" 83 | ] 84 | }, 85 | "execution_count": 5, 86 | "metadata": {}, 87 | "output_type": "execute_result" 88 | } 89 | ], 90 | "source": [ 91 | "# get out the lines sequentially\n", 92 | "next(line_all)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 6, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# follow the new lines starting from the execution of the function\n", 102 | "line_latest = follow(logfile,start='tail')" 103 | ] 104 | }, 105 | { 106 | 
"cell_type": "code", 107 | "execution_count": 7, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "'-- This is Line: 8 -- \\n'" 114 | ] 115 | }, 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "next(line_latest)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 8, 128 | "metadata": {}, 129 | "outputs": [], 130 | "source": [ 131 | "# close the log file\n", 132 | "logfile.close()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.8.8" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /Chapter_8/unstructured_example_log.txt: -------------------------------------------------------------------------------- 1 | [5/15/2013 2:17:26 PM] Session Start 2 | [5/15/2013 2:17:26 PM] Leaving sequence: loadXML, moving forward. 3 | [5/15/2013 2:17:30 PM] Player submitted name: Carl 4 | [5/15/2013 2:17:30 PM] Leaving sequence: InputNameScreen, moving forward. 5 | [5/15/2013 2:17:31 PM] Player submitted name: Carl 6 | [5/15/2013 2:17:31 PM] Leaving sequence: startScreen, moving forward. 7 | [5/15/2013 2:17:50 PM] Player submitted name: Carl 8 | [5/15/2013 2:17:50 PM] Leaving sequence: slide2, moving forward. 
9 | [5/15/2013 2:17:55 PM] Player submitted name: Carl 10 | [5/15/2013 2:17:55 PM] Leaving sequence: slide2b, moving forward. 11 | [5/15/2013 2:18:34 PM] Player submitted name: Carl 12 | [5/15/2013 2:18:34 PM] Leaving sequence: slide2c, moving forward. 13 | [5/15/2013 2:20:09 PM] Player submitted name: Carl 14 | [5/15/2013 2:20:09 PM] Leaving sequence: slide3, moving forward. 15 | [5/15/2013 2:20:13 PM] Player submitted name: Carl 16 | [5/15/2013 2:20:13 PM] Leaving sequence: slide4, moving forward. 17 | -------------------------------------------------------------------------------- /Chapter_9/chat_bigram_feature.csv.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgbrainstorm/computational_psychometrics/e905a98ec89a00bff57eb6a0e2d58061a685e28f/Chapter_9/chat_bigram_feature.csv.gz -------------------------------------------------------------------------------- /Chapter_9/supervised_learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "

Supervised Learning Examples

\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1. Load the needed libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "from sklearn.svm import LinearSVC # for Support Vector Machine\n", 24 | "from sklearn.ensemble import RandomForestClassifier # for Random Forest\n", 25 | "from sklearn.ensemble import GradientBoostingClassifier # for Gradient Boosting Machine\n", 26 | "\n", 27 | "from sklearn.model_selection import train_test_split\n", 28 | "from sklearn.metrics import accuracy_score\n", 29 | "import pandas as pd\n", 30 | "import warnings \n", 31 | "warnings.simplefilter('ignore')" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### 2. Instantiate the models and check the details" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 2, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "model_SVM = LinearSVC()\n", 48 | "model_RF = RandomForestClassifier()\n", 49 | "model_GBM = GradientBoostingClassifier()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "LinearSVC()" 61 | ] 62 | }, 63 | "execution_count": 3, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "model_SVM" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "RandomForestClassifier()" 81 | ] 82 | }, 83 | "execution_count": 4, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "model_RF" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "data": { 99 | "text/plain": [ 100 | 
"GradientBoostingClassifier()" 101 | ] 102 | }, 103 | "execution_count": 5, 104 | "metadata": {}, 105 | "output_type": "execute_result" 106 | } 107 | ], 108 | "source": [ 109 | "model_GBM" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "### 3. Read in the data file" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 8, 122 | "metadata": {}, 123 | "outputs": [], 124 | "source": [ 125 | "X=pd.read_csv('chat_bigram_feature.csv.gz',compression='gzip').values\n", 126 | "y=pd.read_csv('chat_label.csv').values.ravel()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### 4. Create the training and validation sets" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 9, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "### 5. Train the models and generate predicted labels" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 10, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# train the models with training set\n", 159 | "model_SVM.fit(X_train,y_train)\n", 160 | "model_RF.fit(X_train,y_train)\n", 161 | "model_GBM.fit(X_train,y_train)\n", 162 | "\n", 163 | "# -- get the predicted labels on the test dataset.\n", 164 | "y_pred_SVM = model_SVM.predict(X_test)\n", 165 | "y_pred_RF = model_RF.predict(X_test)\n", 166 | "y_pred_GBM = model_GBM.predict(X_test)" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "### 6. 
Check the predictive performance" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 13, 179 | "metadata": {}, 180 | "outputs": [ 181 | { 182 | "data": { 183 | "text/plain": [ 184 | "0.692" 185 | ] 186 | }, 187 | "execution_count": 13, 188 | "metadata": {}, 189 | "output_type": "execute_result" 190 | } 191 | ], 192 | "source": [ 193 | "# calculate the accuracy of the predicted labels from SVM\n", 194 | "accuracy_score(y_pred_SVM, y_test).round(3)" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 14, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "data": { 204 | "text/plain": [ 205 | "0.664" 206 | ] 207 | }, 208 | "execution_count": 14, 209 | "metadata": {}, 210 | "output_type": "execute_result" 211 | } 212 | ], 213 | "source": [ 214 | "# calculate the accuracy of the predicted labels from Random Forest\n", 215 | "accuracy_score(y_pred_RF, y_test).round(3)" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 15, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "0.644" 227 | ] 228 | }, 229 | "execution_count": 15, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "# calculate the accuracy of the predicted labels from Gradient Boosting Machine\n", 236 | "accuracy_score(y_pred_GBM, y_test).round(3)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 16, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "data": { 246 | "text/plain": [ 247 | "3 0.300999\n", 248 | "1 0.271304\n", 249 | "2 0.256546\n", 250 | "4 0.171151\n", 251 | "dtype: float64" 252 | ] 253 | }, 254 | "execution_count": 16, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "pd.value_counts(y)/pd.value_counts(y).sum()" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 
| "source": [] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "***" 275 | ] 276 | } 277 | ], 278 | "metadata": { 279 | "kernelspec": { 280 | "display_name": "Python 3", 281 | "language": "python", 282 | "name": "python3" 283 | }, 284 | "language_info": { 285 | "codemirror_mode": { 286 | "name": "ipython", 287 | "version": 3 288 | }, 289 | "file_extension": ".py", 290 | "mimetype": "text/x-python", 291 | "name": "python", 292 | "nbconvert_exporter": "python", 293 | "pygments_lexer": "ipython3", 294 | "version": "3.8.8" 295 | } 296 | }, 297 | "nbformat": 4, 298 | "nbformat_minor": 4 299 | } 300 | -------------------------------------------------------------------------------- /Erratas/Chapter_14_erratas.md: -------------------------------------------------------------------------------- 1 | # Erratas for Chapter 14 Text Mining and Automated Scoring 2 | 3 | 1. Page 254, second paragraph, "For a term-document matrix X with d rows (document) and t columns (terms)..." -> "For a term-document matrix X with t rows (terms) and d columns (documents)..." 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/jgbrainstorm/computational_psychometrics/HEAD) 2 | # Computational Psychometrics 3 | This is the code repo for the forthcoming book, Computational Psychometrics, edited by Alina A. von Davier, Robert J. Mislevy and Jiangang Hao. Please note that only chapter 7 to chapter 14 contain code examples and the other chapters do not have any code examples. Click [here to visit the Springer page for the book](https://link.springer.com/book/9783030743932). 
4 | 5 | Computational Psychometrics 6 | 7 | ## Python and R Tutorials 8 | * Python tutorials: https://docs.python.org/3/tutorial/ 9 | * R tutorials: https://cran.r-project.org/doc/manuals/r-release/R-intro.html 10 | 11 | ## Preface 12 | 13 | “The future is already here—it’s just not very evenly distributed.”—attributed to W. Gibson 14 | 15 | Digitally based learning and assessment systems generate complex data that record learners' interaction with the tasks or items at finer time granularity than in traditional settings. These rich process data can provide new opportunities for validating assessments, improving measurement precision, revealing response patterns/styles, uncovering group differences, detecting unintended behaviors, identifying new constructs, and providing more actionable feedback to learners and other stakeholders. But these benefits do not come for free. The significantly increased volume, velocity, and variety of data pose new challenges to researchers in psychometrics to handle, analyze, and interpret them in order to materialize the value of these rich data. 16 | 17 | The techniques needed to handle these complex data are not well-covered in most existing treatments of psychometric methods. It has become necessary to extend psychometric methodology to new techniques, such as those for creating quantitative representations of the complex responses (such as video, audio, and keystroke) and those for relating the complex evidentiary representations to the targeted constructs. 18 | 19 | A concise new term would be beneficial to facilitate the exchanges among researchers on this new front of psychometrics. However, coming up with a concise name to encompass a discipline with such a vast intension is challenging. After years of back and forth, we noted that despite their widespread differences the new methodologies share one common feature: computational models. We thus arrived at the term Computational Psychometrics (von Davier, 2015). 
 We want to capture the essential feature of this new discipline, encompassing data-driven computational algorithms, while at the same time, establishing its alignment to fundamental concepts of psychometrics. This may not be a perfect name, as always in the history of science, but it is a concise one that highlights the key features of this new discipline. 20 | 21 | Computational psychometrics thus aims at integrating techniques from data science and machine learning into psychometrics, guided by well-established psychometric principles in measurement science. Just as psychometrics is not a simple collection of statistical methods for educational measurement, computational psychometrics is more than a simple aggregation of data science and machine learning methods for use in measurement contexts. Insights into educational measurement are essential for realizing the potential of applying the new techniques in those contexts. 22 | 23 | By drawing examples from real-world use cases, this edited volume is intended to further define the scope of this new discipline and to serve as a steppingstone for students and researchers in psychometrics to prepare for the design, development, and analysis of the learning and assessment systems with their increasingly big and complex data. A strength of the volume resides in its GitHub-site companion, which provides a repository for the code—in R or Python—for all of the methodological chapters. 24 | 25 | This volume mirrors the societal changes, advances in technology and computational power, and the wide-spread adoption of digital learning and assessments. It is our goal that the methods provided in this book will enable researchers and developers to create systems and tools for better access, more affordability, broader inclusion, and higher quality education for everyone, everywhere. 26 | 27 |

28 | Alina A. von Davier, Newton, MA 29 |
30 | Robert Mislevy, Severna Park, MD 31 |
32 | Jiangang Hao, Princeton, NJ 33 |

 34 | 35 | 36 | ## Table of Contents 37 | 38 | Chapter 1. Introduction to Computational Psychometrics: towards a principled integration of data science and machine learning techniques into psychometrics 39 | > **Authors:** Alina A. von Davier, Robert J. Mislevy and Jiangang Hao 40 | 41 | > **Abstract:** *In this chapter we articulate what computational psychometrics is, why 42 | we need a volume focused on it, and how this book contributes to the expansion of the 43 | psychometric toolbox to include methodologies from machine learning and data 44 | science in order to address the complexities of big data collected from virtual 45 | learning and assessment systems. We also discuss here the structure of the edited 46 | volume, how each chapter contributes to enhancing the psychometrics science and 47 | our recommendations for further readings.* 48 | 49 | ### Part I Conceptualization 50 | 51 | Chapter 2. Next generation learning and assessment: what, why and how 52 | > **Author:** Robert J. Mislevy 53 | 54 | > **Abstract:** *Computational psychometrics is a blend of stochastic processes theory, 55 | computer science-based methods, and theory-based psychometric approaches that 56 | may aid the analyses of complex data from performance assessments. This chapter 57 | discusses the grounds for using complex performance assessments, the design of 58 | such assessments so that useful evidence about targeted abilities will be present 59 | in the data to be analysed, and roles that computational psychometric ideas and 60 | methods can play. It first provides background on a situative, sociocognitive, 61 | perspective on human capabilities and how we develop them and use them—a 62 | perspective we believe is necessary to synthesize the methodologies. Next it reviews 63 | the form of evidentiary argument that underlies the evidence-centered approach 64 | to design, interpretation, and use of educational assessments. 
It then points out 65 | junctures in extensions of the argument form where computational psychometric 66 | methods can carry out vital roles in assessment of more advanced constructs, from 67 | more complex data, in new forms and contexts of assessment. It concludes by 68 | reflecting on how one reconceives and extends the notions of validity, reliability, 69 | comparability, fairness, and generalizability to more complex assessments and 70 | analytic methods.* 71 | 72 | Chapter 3. Computational psychometrics 73 | > **Authors:** Alina A. von Davier, Kristen DiCerbo and Josine Verhagen 74 | 75 | > **Abstract:** *In recent years the advances in technology provided affordances for 76 | learning and assessments opportunities. In this chapter we first describe computational 77 | psychometrics as a framework for the measurement of learners’ skills, 78 | knowledge, and abilities. We discuss the changes in educational measurement 79 | that led to the need for expanding the psychometrics toolbox and describe the 80 | properties of psychometric data. We then give an example of a class of models, 81 | the Dynamic Bayesian Models that encompass many traditional psychometric 82 | models and machine-learning algorithms. We conclude by emphasizing that model 83 | complexity and power need to be balanced with the responsibility for transparency 84 | and fairness towards stakeholders.* 85 | 86 | 87 | Chapter 4. Virtual performance-based assessments 88 | > **Authors:** Jessica Andrews Todd, Robert J. Mislevy, Michelle LaMar and Sebastiaan de Klerk 89 | 90 | > **Abstract:** *Virtual performance-based assessments (VPBAs) are environments for 91 | test takers to interact with systems, sometimes including other persons or agents, 92 | in order to provide evidence about their knowledge, skills, or other attributes. 93 | Examples include tasks based on interactive simulations, games, branching scenarios, 94 | and collaboration among students communicating through digital chats. 
They 95 | may be used for summative purposes, as in certification examinations, or for other 96 | purposes, as in intelligent tutoring systems and exploratory learning environments. 97 | They afford opportunities to obtain direct evidence about capabilities that inherently 98 | involve interaction, such as inquiry and collaboration. Our focus here is digital, 99 | usually with regard to the environment but always with regard to the form of 100 | data. Digital data capture makes it possible to acquire rich details about students’ 101 | actions and the evolving situations in which they occur. The challenges they pose to 102 | psychometrics lie in designing VPBAs to optimally evoke the targeted capabilities, 103 | providing students with affordances that evidence that cognition, capturing the relevant 104 | aspects of the performances, identifying meaningful patterns in performances 105 | that constitute evidence about the targeted capabilities, and providing an inferential 106 | framework for synthesizing the evidence and characterizing its properties. This 107 | chapter provides an introduction to VPBAs and psychometric considerations in 108 | VPBA design and analysis.* 109 | 110 | Chapter 5. Knowledge Inference Models Used in Adaptive Learning 111 | > **Authors:** Maria Ofelia, M.O.Z. San Pedro and Ryan S. Baker 112 | 113 | > **Abstract:** *This chapter provides an overview of adaptive learning and examines 114 | the student model component used in adaptive learning systems. Established and 115 | more recent approaches to student modeling that infer student knowledge (i.e. what 116 | students know at any given moment during the learning experience) are discussed, 117 | as student knowledge is the most common learner characteristic widely assessed 118 | in large-scale adaptive systems. 
This chapter concludes with a discussion of the 119 | limitations of the current generation of adaptive learning systems, and areas of 120 | potential for future progress.* 121 | 122 | ### Part II Methodology 123 | 124 | Chapter 6. Concepts and models from Psychometrics 125 | > **Authors:** Robert J. Mislevy and Maria Bolsinova 126 | 127 | > **Abstract:** *The concepts and methods of psychometrics originated under trait and 128 | behavioral psychology, with relatively simple data, used mainly for purposes of 129 | prediction and selection. Ideas emerged over time that nevertheless hold value for 130 | the new psychological perspectives, contexts of use, and forms of data and analytic 131 | tools we are now seeing. In this chapter we review some fundamental models 132 | and ideas from psychometrics that can be profitably reconceived, extended, and 133 | augmented in the new world of assessment. Methods we address include classical 134 | test theory, generalizability theory, item response theory, latent class models, 135 | cognitive diagnosis models, factor analysis, hierarchical models, and Bayesian 136 | networks. Key concepts are these: (1) The essential nature of psychometric models 137 | (observations, constructs, latent variables, and probability-based reasoning). (2) The 138 | interplay of design and discovery in assessment. (3) Understanding the measurement 139 | issues of validity, reliability, comparability, generalizability, and fairness as social 140 | values that pertain even as forms of data, analysis, context, and purpose evolve.* 141 | 142 | Chapter 7. Bayesian Inference in Large-Scale Computational Psychometrics 143 | > **Authors:** Gunter Maris, Timo Bechger and Maarten Marsman 144 | 145 | > **Abstract:** *This chapter provides an introduction to Bayesian inference using 146 | Markov Chain Monte Carlo (MCMC) methods. We focus on two popular MCMC 147 | methods: Metropolis-Hastings and the Gibbs sampler. 
A Metropolis-Hastings 148 | algorithm developed by Marsman et al. (Sci Rep 5:9050, 1–7, 2015) will be used 149 | to illustrate how MCMC can be done for a wide range of models in computational 150 | statistics.* 151 | 152 | Chapter 8. Data science perspectives 153 | > **Authors:** Jiangang Hao and Robert J. Mislevy 154 | 155 | > **Abstract:** *Digitally based learning and assessment systems generate large volumes 156 | of complex process data. The next generation psychometricians need to acquire new 157 | data science skills to meet the data challenge. In this chapter, we summarize data 158 | science skills and identify the subset that psychometricians need to prioritize. We 159 | introduce an evidence identification centered data design (EICDD) process during 160 | the task design, as an important way to address the data challenges from digitally 161 | based assessments. We describe some specific data techniques to parse and process 162 | complex process data with example codes in Python programming language. We 163 | also outline the general methodological strategies when dealing with process data 164 | from digitally based assessments.* 165 | 166 | Chapter 9. Supervised machine learning 167 | > **Author:** Jiangang Hao 168 | 169 | > **Abstract:** *Machine learning refers to a set of methodologies that allow computers 170 | to “learn” the relationship among numerical representations of data. In this Chapter, 171 | we focus on an important branch of machine learning, supervised machine learning, 172 | and introduce three widely used supervised learning methods, the Support Vector 173 | Machine, Random forest, and Gradient Boosting Machine. Python codes examples 174 | are included to show how to use these methods in practice.* 175 | 176 | Chapter 10. 
Unsupervised machine learning 177 | > **Author:** Pak Chung Wong 178 | 179 | > **Abstract:** *The chapter introduces the concept of machine learning with an emphasis 180 | on unsupervised learning algorithms and applications. The discussion starts with a 181 | brief background on machine learning and then a high-level discussion on the differences 182 | between supervised and unsupervised learning algorithms. We present three 183 | categories of unsupervised machine learning techniques that include clustering, 184 | outlier detection, and dimension reduction; five prevailing unsupervised learning 185 | algorithms that include K-means, agglomerative clustering, DBSCAN, principal 186 | component analysis, and multidimensional scaling; and five Python programming 187 | examples that demonstrate the learning concepts and results using psychometric 188 | assessment data collected from an online collaborative problem-solving environment. 189 | This chapter demonstrates the potential of machine learning and highlights 190 | the opportunities it presents in psychometric research and development.* 191 | 192 | Chapter 11. AI and deep learning for educational research 193 | > **Authors:** Yuchi Huang and Saad M. Khan 194 | 195 | > **Abstract:** *There is a growing need for assessment and learning tools that capture 196 | a broad range of learner behavior necessary for the evaluation of skills such as 197 | problem solving, communication and collaboration. In these real-world applications 198 | student data is captured with a high degree of granularity, variety of temporal 199 | scales and in a multitude of modalities. Unfortunately such complex, noisy and 200 | unstructured data limit the applicability of traditional models of measurement and 201 | psychometrics designed to extract evidence of competency from item response 202 | data. 
In this chapter, we present recent advances in AI and Machine Learning 203 | that can be utilized for measurement of a variety of complex constructs and 204 | competencies. These models include frameworks such as deep neural networks and 205 | adversarial generative networks that enable us to harness concept hierarchies and 206 | the latent structure within data to learn increasingly complex representations and 207 | make powerful predictions.* 208 | 209 | Chapter 12. Time series and stochastic processes 210 | > **Authors:** Peter Halpin, Lu Ou and Michelle LaMar 211 | 212 | > **Abstract:** *This chapter addresses some statistical modeling approaches for time 213 | series data and discusses their potential for psychometric applications. We adopt a 214 | broad conceptualization of time series, including under this rubric any type of data 215 | that involves serial statistical dependence. Such dependence may be represented in 216 | continuous time, discrete time, or in a purely sequential manner. This chapter begins 217 | by discussing the relationships among these three representations and offers some 218 | general advice on when each might prove useful. We then provide an overview of 219 | three modeling frameworks that exemplify the different representations of statistical 220 | dependence: Markov decision processes, state-space modeling, and temporal point 221 | processes. For each modeling framework, we discuss its specification, its psychometric 222 | interpretation, and provide a brief numeric example including R code.* 223 | 224 | Chapter 13. Social network analysis 225 | > **Author:** Mengxiao Zhu 226 | 227 | > **Abstract:** *Supported by advances in technology, simulation-, scenario- and game-based assessments (DiCerbo & Behrens, 2012; Mislevy et al., 2014) provide opportunities for the students to interact with complex tasks. 
Rich process data can be collected during the assessment, such as log data of student response actions (e.g., Zhu, Shu, & von Davier, 2016), keystroke data (e.g., Almond, Deane, Quinlan, Wagner, & Sydorenko, 2012), and eye-tracking data (e.g., Tai, Loehr, & Brigham, 2006). Process data record the series of activities conducted by students during problem-solving processes and contain information not represented in the final answers. One useful direction in which to study process data is to explore how students transit from one action to other actions, or from one state to other states. In this chapter, we introduce the basic concepts and methods of Social Network Analysis (SNA) and discuss related applications in visualizing and analyzing process data using SNA to understand the transitions in response process data.* 228 | 229 | Chapter 14. Text mining and automated scoring 230 | > **Authors:** Michael Flor and Jiangang Hao 231 | 232 | > **Abstract:** *Natural Language Processing (NLP) is playing an increasingly important 233 | role in learning and assessments. Some typical applications of NLP in education 234 | include automated scoring, automated item generation, conversation-based assessments, 235 | writing assistants, text mining for education, and so on. In this chapter, 236 | we aim at introducing some basics of NLP through two typical applications in 237 | educational contexts, text mining and automated scoring. We hope readers can get 238 | an overall picture of NLP and get familiarized with some basic tools for handling 239 | natural language data, which may serve as stepping stones for their future work with 240 | NLP.* 241 | 242 | ## Afterwords 243 | > After the book was published, we received much feedback from the learning and assessment community. We want to thank all the readers for their commendations, suggestions, and comments on the book. 
Meanwhile, we realized that it might be more beneficial to share additional thoughts on how we embarked on creating this edited volume to introduce computational psychometrics. Three main reasons pushed us to this effort. 244 | 245 | > First of all, as we briefly mentioned in the preface and Chapter 1, there is an intrinsic need to expand the existing psychometric methodologies to include new methods from, e.g., data science and machine learning, to address the new challenges of learning and assessment in the digital age. When many new methods are included, introducing a new term to encompass these new features will be more convenient and effective for communication in the community. Historically, this is the general process of how new disciplines emerge, such as how psychometrics has emerged from statistics. 246 | 247 | > Second, we intended to help address the practical challenge of preparing the workforce. Over the past few years, we all went through some painstaking efforts to hire people with the right combination of skills to meet the challenge of digital learning and assessment. We observed that many applicants from psychometrics programs do not have the needed data science/machine learning skills (and mindsets) to process and model complex data from digital tasks. In contrast, applicants with data science/machine learning skills from other disciplines, such as computer science, generally know very little about the core values of psychometrics. In practice, hiring people who do not know the core values of the substantive area poses a big retention challenge for organizations, as they may quickly move on if they find they are not interested in the area at all after a few months. Therefore, we feel it is imperative to prioritize a set of new methodologies and integrate them with the core values of psychometrics in a principled manner to help prepare a stable workforce for digital learning and assessment in the future. 
248 | 249 | > Finally, we noticed a lack of a bridge for people from other quantitative disciplines (such as computer science, applied mathematics, physics, and others) to digital learning and assessment. There are many talented people with superb technical skills who know little about the values and principles of learning and assessment. We believe that providing concise coverage of psychometrics' established values and methods could help them better understand how to apply their skills to join forces to promote learning and assessment in a digital age. 250 | 251 | > As such, we decided to create an edited volume with carefully selected topics contributed by experts we can reach. We hope the book could help readers who have been or will be working in the exciting areas of digital learning and assessments to better understand the methods and principles, and communicate them efficiently under the name of computational psychometrics. 252 | 253 | -------------------------------------------------------------------------------- /book_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jgbrainstorm/computational_psychometrics/e905a98ec89a00bff57eb6a0e2d58061a685e28f/book_cover.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.2.2 2 | plotly==4.14.3 3 | ipywidgets==7.6.3 4 | scikit-learn==1.1.0 5 | nltk==3.6.7 6 | pyspellchecker==0.6.2 7 | numpy==1.21.6 8 | scipy==1.7.3 9 | matplotlib==3.5.0 10 | #tensorflow==2.6.1 11 | #torch==1.10.1 12 | #torchvision==0.11.2 13 | #Pillow 14 | beautifulsoup4==4.9.3 15 | gensim==4.1.2 16 | 17 | --------------------------------------------------------------------------------