├── Pickling.ipynb ├── .gitignore ├── Unpickling.ipynb └── README.md /Pickling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 9, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "R-Squared: 96.03\n", 13 | "Coefficient: 9267.24\n", 14 | "Intercept: 27178.6\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "import pickle\n", 20 | "import pandas as pd\n", 21 | "import numpy as np\n", 22 | "from sklearn.linear_model import LinearRegression\n", 23 | "\n", 24 | "# Create data\n", 25 | "d = {'years_experience': [1.1,1.3,1.5,2,2.2,2.9,3.2,3.7,3.9,4.5,4.9,\n", 26 | " 5.1,5.3,5.9,6,7.9,8.2,8.7,9,9.6,10.3,10.5],\n", 27 | " 'salary': [39343,46205,37731,43525,39891,56642,64445,57189,63218,61111,\n", 28 | " 67938,66029,83088,81363,93940,101302,113812,109431,105582,\n", 29 | " 112635,122391,121872]}\n", 30 | "df = pd.DataFrame(data=d)\n", 31 | "\n", 32 | "# Split data\n", 33 | "X = pd.DataFrame(df.years_experience)\n", 34 | "y = pd.DataFrame(df.salary)\n", 35 | "\n", 36 | "# Fit and score regression\n", 37 | "reg = LinearRegression().fit(X,y)\n", 38 | "r_squared = reg.score(X,y)\n", 39 | "\n", 40 | "# Get coefficient to check after unpickling\n", 41 | "print(\"R-Squared:\", round(r_squared*100,2))\n", 42 | "print(f\"Coefficient: {round(reg.coef_[0][0],2)}\")\n", 43 | "print(f\"Intercept: {round(reg.intercept_[0],2)}\")\n", 44 | "\n", 45 | "\n", 46 | "# Pickle the regression model object\n", 47 | "with open(\"pickled_model.p\", \"wb\") as p:\n", 48 | " pickle.dump(reg, p)" 49 | ] 50 | } 51 | ], 52 | "metadata": { 53 | "kernelspec": { 54 | "display_name": "Python 3", 55 | "language": "python", 56 | "name": "python3" 57 | }, 58 | "language_info": { 59 | "codemirror_mode": { 60 | "name": "ipython", 61 | "version": 3 62 | }, 63 | "file_extension": ".py", 64 | "mimetype": "text/x-python", 65 | "name": 
"python", 66 | "nbconvert_exporter": "python", 67 | "pygments_lexer": "ipython3", 68 | "version": "3.7.6" 69 | } 70 | }, 71 | "nbformat": 4, 72 | "nbformat_minor": 4 73 | } 74 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Pickle files 2 | *.p 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | -------------------------------------------------------------------------------- /Unpickling.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Coefficient of unpickled object: 9267.24\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "import pickle\n", 18 | "import pandas as pd\n", 19 | "\n", 20 | "# Unpickle the regression model object\n", 21 | "with open(\"pickled_model.p\", \"rb\") as p:\n", 22 | " new_reg = pickle.load(p)\n", 23 | "\n", 24 | "# The coefficient is the same!\n", 25 | "print(f\"Coefficient of unpickled object: {round(new_reg.coef_[0][0],2)}\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 13, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "MAPE: 7.96%\n", 38 | "R-Squared: 93.95\n", 39 | "Coefficient: 9267.24\n", 40 | "Intercept: 27178.6\n" 41 | ] 42 | } 43 | 
], 44 | "source": [ 45 | "# New data\n", 46 | "d = {'years_experience': [3,3.2,4,4,4.1,6.8,7.1,9.5],\n", 47 | " 'salary': [60150,54445,55794,56957,57081,91738,98273,116969]}\n", 48 | "test = pd.DataFrame(data=d)\n", 49 | "\n", 50 | "# Separate data into X and y\n", 51 | "X_test = pd.DataFrame(test.years_experience)\n", 52 | "y_test = pd.DataFrame(test.salary)\n", 53 | "\n", 54 | "# R-Squared\n", 55 | "r_squared = new_reg.score(X_test, y_test)\n", 56 | "\n", 57 | "# Predictions\n", 58 | "preds = new_reg.predict(X_test)\n", 59 | "\n", 60 | "# MAPE\n", 61 | "mape = abs((y_test - preds)/y_test).mean()\n", 62 | "print(f\"MAPE: {round(mape[0]*100, 2)}%\")\n", 63 | "\n", 64 | "print(f\"R-Squared: {round(r_squared*100, 2)}\")\n", 65 | "print(f\"Coefficient: {round(new_reg.coef_[0][0],2)}\")\n", 66 | "print(f\"Intercept: {round(new_reg.intercept_[0],2)}\")" 67 | ] 68 | } 69 | ], 70 | "metadata": { 71 | "kernelspec": { 72 | "display_name": "Python 3", 73 | "language": "python", 74 | "name": "python3" 75 | }, 76 | "language_info": { 77 | "codemirror_mode": { 78 | "name": "ipython", 79 | "version": 3 80 | }, 81 | "file_extension": ".py", 82 | "mimetype": "text/x-python", 83 | "name": "python", 84 | "nbconvert_exporter": "python", 85 | "pygments_lexer": "ipython3", 86 | "version": "3.7.6" 87 | } 88 | }, 89 | "nbformat": 4, 90 | "nbformat_minor": 4 91 | } 92 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Pickling in Python 2 | 3 | >_Well, save my model and call me pickled!_ :cucumber: 4 | 5 | ## Why pickle? What do I pickle? 6 | Pickling does exactly what it sounds like: it preserves something for later.* If you train and score a model and want to save it for later or deploy it for use on new data, you can pickle it so you don't have to retrain the model every time you want to use it. 
The `pickle` module is built into Python and uses one line of code to save your model to a separate file that can be called and used later, even in a completely separate script or notebook. 7 | 8 | * Pickling is a process called "serialization," which basically means it breaks your object down into a single-file stream of bytes and saves them in order. In this tutorial, the object is a LinearRegression sklearn model. 9 | 10 | ## Example: 11 | This is an example of a simple linear regression predicting salary using years of experience. The data came from [this Kaggle data set](https://www.kaggle.com/rohankayan/years-of-experience-and-salary-dataset). 12 | 13 | ### Create & pickle a model 14 | Start by importing packages and data: 15 | 16 | ```python 17 | import pickle 18 | import pandas as pd 19 | from sklearn.linear_model import LinearRegression 20 | 21 | # Create data 22 | d = {'years_experience': [1.1,1.3,1.5,2,2.2,2.9,3.2,3.7,3.9,4.5,4.9, 23 | 5.1,5.3,5.9,6,7.9,8.2,8.7,9,9.6,10.3,10.5], 24 | 'salary': [39343,46205,37731,43525,39891,56642,64445,57189,63218,61111, 25 | 67938,66029,83088,81363,93940,101302,113812,109431,105582, 26 | 112635,122391,121872]} 27 | df = pd.DataFrame(data=d) 28 | ``` 29 | 30 | Then, separate the predictor and target and fit a linear regression: 31 | 32 | ```python 33 | # Split data 34 | X = pd.DataFrame(df.years_experience) 35 | y = pd.DataFrame(df.salary) 36 | 37 | # Fit regression 38 | reg = LinearRegression().fit(X,y) #<-- We're going to pickle this in a minute 39 | ``` 40 | 41 | We can print the single coefficient and the intercept to compare with the new object when we unpickle it: 42 | 43 | ```python 44 | print(f"Coefficient: {round(reg.coef_[0][0],2)}") # 9267.24 45 | print(f"Intercept: {round(reg.intercept_[0],2)}") # 27178.6 46 | ``` 47 | 48 | Lastly, to save the regression model object (`reg`), we use the `.dump()` method to save the model in a file called `pickled_model.p` in the working directory. 
The `wb` argument tells `open()` to open the file for writing in binary mode, creating it if it doesn't exist and overwriting it if it does. 49 | 50 | I like to think of it like storing something in a box. We are going to `open` a box and `dump` our `reg` model into it. Then we'll label it `pickled_model.p` so we can find it easily later. 51 | 52 | ```python 53 | # Pickle the regression model object 54 | with open("pickled_model.p", "wb") as p: 55 | pickle.dump(reg, p) 56 | ``` 57 | 58 | ### Unpickle the model 59 | Now, we will create a totally new file and unpickle the model we just created. 60 | 61 | Start by importing packages and creating some new data: 62 | ```python 63 | import pickle 64 | import pandas as pd 65 | 66 | # New data 67 | d = {'years_experience': [3,3.2,4,4,4.1,6.8,7.1,9.5], 68 | 'salary': [60150,54445,55794,56957,57081,91738,98273,116969]} 69 | test = pd.DataFrame(data=d) 70 | 71 | # Separate data into X and y 72 | X_test = pd.DataFrame(test.years_experience) 73 | y_test = pd.DataFrame(test.salary) 74 | ``` 75 | 76 | Now, load the pickled model using `pickle.load()`, opening the file we saved earlier (`pickled_model.p`) in binary read mode with the `rb` argument. 77 | 78 | ```python 79 | # Unpickle the regression model object 80 | with open("pickled_model.p", "rb") as p: 81 | new_reg = pickle.load(p) 82 | ``` 83 | 84 | We can print the coefficient and intercept to verify that it's the same model: 85 | 86 | ```python 87 | print(f"Coefficient: {round(new_reg.coef_[0][0],2)}") # 9267.24 88 | print(f"Intercept: {round(new_reg.intercept_[0],2)}") # 27178.6 89 | ``` 90 | 91 | They're the same! 
:tada: 92 | 93 | Now we can make predictions on new data and evaluate model quality with R-squared (R²) and the Mean Absolute Percentage Error (MAPE): 94 | 95 | ```python 96 | # R-Squared 97 | r_squared = new_reg.score(X_test, y_test) 98 | print(f"R-Squared: {round(r_squared*100, 2)}") # 93.95 99 | 100 | # Predictions 101 | preds = new_reg.predict(X_test) 102 | 103 | # MAPE 104 | mape = abs((y_test - preds)/y_test).mean() 105 | print(f"MAPE: {round(mape[0]*100, 2)}%") # 7.96% 106 | ``` 107 | 108 | The actual results in this example aren't that important, but I wanted to go through the whole example. I also wanted to demonstrate that you don't have to import the `LinearRegression` class yourself when you unpickle the file: `pickle` stores a reference to the class along with the fitted attributes and imports it for you when loading, so scikit-learn still needs to be installed in the environment where you unpickle. Nifty! :smiley: 109 | 110 | 111 | ## Considerations & Conclusion 112 | A few things to keep in mind: 113 | - When you unpickle something, you'll need a compatible environment: the libraries that define the object (here, scikit-learn) must be installed, ideally at the same version used for pickling, and your version of Python must support the pickle protocol the file was written with. 114 | - Only unpickle files that you trust completely. You can pickle just about any object, and a maliciously crafted pickle file can execute arbitrary code when it is loaded, so be extra cautious. 115 | - The `pickle` module is exclusive to Python. If you are planning to use your object in a different language or want the pickled object to be readable by humans, consider JSON serialization instead. Wouldn't it be cool if there were a library that could pickle to JSON?? Oh look, it's [jsonpickle](https://github.com/jsonpickle/jsonpickle)! 116 | 117 | 118 | Now, go forth and pickle! 
119 | 120 | 121 | ## Other Resources 122 | - [Official pickle module docs](https://docs.python.org/3/library/pickle.html) 123 | - [Save and Load Machine Learning Models in Python with scikit-learn](https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/) 124 | 125 | 126 | ## Thanks 127 | Thanks to [Mark Freeman II](https://www.linkedin.com/in/mafreeman2) and [Timo Voipio](https://www.linkedin.com/in/t-voipio) for their suggestions on improving this repo :+1: 128 | --------------------------------------------------------------------------------