├── Pickling.ipynb
├── .gitignore
├── Unpickling.ipynb
└── README.md
/Pickling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 9,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "R-Squared: 96.03\n",
13 | "Coefficient: 9267.24\n",
14 | "Intercept: 27178.6\n"
15 | ]
16 | }
17 | ],
18 | "source": [
19 | "import pickle\n",
20 | "import pandas as pd\n",
21 | "import numpy as np\n",
22 | "from sklearn.linear_model import LinearRegression\n",
23 | "\n",
24 | "# Create data\n",
25 | "d = {'years_experience': [1.1,1.3,1.5,2,2.2,2.9,3.2,3.7,3.9,4.5,4.9,\n",
26 | " 5.1,5.3,5.9,6,7.9,8.2,8.7,9,9.6,10.3,10.5],\n",
27 | " 'salary': [39343,46205,37731,43525,39891,56642,64445,57189,63218,61111,\n",
28 | " 67938,66029,83088,81363,93940,101302,113812,109431,105582,\n",
29 | " 112635,122391,121872]}\n",
30 | "df = pd.DataFrame(data=d)\n",
31 | "\n",
32 | "# Split data\n",
33 | "X = pd.DataFrame(df.years_experience)\n",
34 | "y = pd.DataFrame(df.salary)\n",
35 | "\n",
36 | "# Fit and score regression\n",
37 | "reg = LinearRegression().fit(X,y)\n",
38 | "r_squared = reg.score(X,y)\n",
39 | "\n",
40 | "# Get coefficient to check after unpickling\n",
41 | "print(\"R-Squared:\", round(r_squared*100,2))\n",
42 | "print(f\"Coefficient: {round(reg.coef_[0][0],2)}\")\n",
43 | "print(f\"Intercept: {round(reg.intercept_[0],2)}\")\n",
44 | "\n",
45 | "\n",
46 | "# Pickle the regression model object\n",
47 | "with open(\"pickled_model.p\", \"wb\") as p:\n",
48 | " pickle.dump(reg, p)"
49 | ]
50 | }
51 | ],
52 | "metadata": {
53 | "kernelspec": {
54 | "display_name": "Python 3",
55 | "language": "python",
56 | "name": "python3"
57 | },
58 | "language_info": {
59 | "codemirror_mode": {
60 | "name": "ipython",
61 | "version": 3
62 | },
63 | "file_extension": ".py",
64 | "mimetype": "text/x-python",
65 | "name": "python",
66 | "nbconvert_exporter": "python",
67 | "pygments_lexer": "ipython3",
68 | "version": "3.7.6"
69 | }
70 | },
71 | "nbformat": 4,
72 | "nbformat_minor": 4
73 | }
74 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Pickle files
2 | *.p
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 |
57 | # Translations
58 | *.mo
59 | *.pot
60 |
61 | # Django stuff:
62 | *.log
63 | local_settings.py
64 | db.sqlite3
65 | db.sqlite3-journal
66 |
67 | # Flask stuff:
68 | instance/
69 | .webassets-cache
70 |
71 | # Scrapy stuff:
72 | .scrapy
73 |
74 | # Sphinx documentation
75 | docs/_build/
76 |
77 | # PyBuilder
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
--------------------------------------------------------------------------------
/Unpickling.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Coefficient of unpickled object: 9267.24\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "import pickle\n",
18 | "import pandas as pd\n",
19 | "\n",
20 | "# Unpickle the regression model object\n",
21 | "with open(\"pickled_model.p\", \"rb\") as p:\n",
22 | " new_reg = pickle.load(p)\n",
23 | "\n",
24 | "# The coefficient is the same!\n",
25 | "print(f\"Coefficient of unpickled object: {round(new_reg.coef_[0][0],2)}\")"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 13,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "name": "stdout",
35 | "output_type": "stream",
36 | "text": [
37 | "MAPE: 7.96%\n",
38 | "R-Squared: 93.95\n",
39 | "Coefficient: 9267.24\n",
40 | "Intercept: 27178.6\n"
41 | ]
42 | }
43 | ],
44 | "source": [
45 | "# New data\n",
46 | "d = {'years_experience': [3,3.2,4,4,4.1,6.8,7.1,9.5],\n",
47 | " 'salary': [60150,54445,55794,56957,57081,91738,98273,116969]}\n",
48 | "test = pd.DataFrame(data=d)\n",
49 | "\n",
50 | "# Separate data into X and y\n",
51 | "X_test = pd.DataFrame(test.years_experience)\n",
52 | "y_test = pd.DataFrame(test.salary)\n",
53 | "\n",
54 | "# R-Squared\n",
55 | "r_squared = new_reg.score(X_test, y_test)\n",
56 | "\n",
57 | "# Predictions\n",
58 | "preds = new_reg.predict(X_test)\n",
59 | "\n",
60 | "# MAPE\n",
61 | "mape = abs((y_test - preds)/y_test).mean()\n",
62 | "print(f\"MAPE: {round(mape[0]*100, 2)}%\")\n",
63 | "\n",
64 | "print(f\"R-Squared: {round(r_squared*100, 2)}\")\n",
65 | "print(f\"Coefficient: {round(new_reg.coef_[0][0],2)}\")\n",
66 | "print(f\"Intercept: {round(new_reg.intercept_[0],2)}\")"
67 | ]
68 | }
69 | ],
70 | "metadata": {
71 | "kernelspec": {
72 | "display_name": "Python 3",
73 | "language": "python",
74 | "name": "python3"
75 | },
76 | "language_info": {
77 | "codemirror_mode": {
78 | "name": "ipython",
79 | "version": 3
80 | },
81 | "file_extension": ".py",
82 | "mimetype": "text/x-python",
83 | "name": "python",
84 | "nbconvert_exporter": "python",
85 | "pygments_lexer": "ipython3",
86 | "version": "3.7.6"
87 | }
88 | },
89 | "nbformat": 4,
90 | "nbformat_minor": 4
91 | }
92 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Pickling in Python
2 |
3 | >_Well, save my model and call me pickled!_ :cucumber:
4 |
5 | ## Why pickle? What do I pickle?
6 | Pickling does exactly what it sounds like: it preserves something for later.* If you train and score a model and want to save it for later or deploy it for use on new data, you can pickle it so you don't have to retrain the model every time you want to use it. The `pickle` module is built into Python and uses one line of code to save your model to a separate file that can be called and used later, even in a completely separate script or notebook.
7 |
8 | \* Pickling is a form of "serialization," which means it converts your object into a stream of bytes that can be written to a file and later read back to rebuild the object. In this tutorial, the object is a scikit-learn `LinearRegression` model.
9 |
10 | ## Example:
11 | This is an example of a simple linear regression predicting salary using years of experience. The data came from [this Kaggle data set](https://www.kaggle.com/rohankayan/years-of-experience-and-salary-dataset).
12 |
13 | ### Create & pickle a model
14 | Start by importing packages and data:
15 |
16 | ```python
17 | import pickle
18 | import pandas as pd
19 | from sklearn.linear_model import LinearRegression
20 |
21 | # Create data
22 | d = {'years_experience': [1.1,1.3,1.5,2,2.2,2.9,3.2,3.7,3.9,4.5,4.9,
23 | 5.1,5.3,5.9,6,7.9,8.2,8.7,9,9.6,10.3,10.5],
24 | 'salary': [39343,46205,37731,43525,39891,56642,64445,57189,63218,61111,
25 | 67938,66029,83088,81363,93940,101302,113812,109431,105582,
26 | 112635,122391,121872]}
27 | df = pd.DataFrame(data=d)
28 | ```
29 |
30 | Then, separate the predictor and target and fit a linear regression:
31 |
32 | ```python
33 | # Split data
34 | X = pd.DataFrame(df.years_experience)
35 | y = pd.DataFrame(df.salary)
36 |
37 | # Fit regression
38 | reg = LinearRegression().fit(X,y) #<-- We're going to pickle this in a minute
39 | ```
40 |
41 | We can print the single coefficient and the intercept to compare with the new object when we unpickle it:
42 |
43 | ```python
44 | print(f"Coefficient: {round(reg.coef_[0][0],2)}") # 9267.24
45 | print(f"Intercept: {round(reg.intercept_[0],2)}") # 27178.6
46 | ```
47 |
48 | Lastly, to save the regression model object (`reg`), we use the `pickle.dump()` function to write the model to a file called `pickled_model.p` in the working directory. The `wb` argument tells `open()` to open the file for writing in binary mode, creating the file if it doesn't exist (or overwriting it if it does).
49 |
50 | I like to think of it like storing something in a box. We are going to `open` a box and `dump` our `reg` model into it. Then we'll label it `pickled_model.p` so we can find it easily later.
51 |
52 | ```python
53 | # Pickle the regression model object
54 | with open("pickled_model.p", "wb") as p:
55 | pickle.dump(reg, p)
56 | ```
57 |
58 | ### Unpickle the model
59 | Now, we will create a totally new file and unpickle the model we just created.
60 |
61 | Start by importing packages and creating some new data:
62 | ```python
63 | import pickle
64 | import pandas as pd
65 |
66 | # New data
67 | d = {'years_experience': [3,3.2,4,4,4.1,6.8,7.1,9.5],
68 | 'salary': [60150,54445,55794,56957,57081,91738,98273,116969]}
69 | test = pd.DataFrame(data=d)
70 |
71 | # Separate data into X and y
72 | X_test = pd.DataFrame(test.years_experience)
73 | y_test = pd.DataFrame(test.salary)
74 | ```
75 |
76 | Now, load the pickled model using the `pickle.load()` function, opening the file we saved earlier (`pickled_model.p`) for reading in binary mode with the `rb` argument.
77 |
78 | ```python
79 | # Unpickle the regression model object
80 | with open("pickled_model.p", "rb") as p:
81 | new_reg = pickle.load(p)
82 | ```
83 |
84 | We can print the coefficient and intercept to verify that it's the same model:
85 |
86 | ```python
87 | print(f"Coefficient: {round(new_reg.coef_[0][0],2)}") # 9267.24
88 | print(f"Intercept: {round(new_reg.intercept_[0],2)}") # 27178.6
89 | ```
90 |
91 | They're the same! :tada:
92 |
93 | Now we can make predictions on new data and evaluate model quality with R-squared and the Mean Absolute Percentage Error (MAPE):
94 |
95 | ```python
96 | # R-Squared
97 | r_squared = new_reg.score(X_test, y_test)
98 | print(f"R-Squared: {round(r_squared*100, 2)}") # 93.95
99 |
100 | # Predictions
101 | preds = new_reg.predict(X_test)
102 |
103 | # MAPE
104 | mape = abs((y_test - preds)/y_test).mean()
105 | print(f"MAPE: {round(mape[0]*100, 2)}%") # 7.96%
106 | ```
107 |
108 | The actual results in this example aren't that important, but I wanted to go through the whole example. I also wanted to demonstrate that you don't have to explicitly import `LinearRegression` when you unpickle the file: the pickle records which class the object belongs to, and `pickle.load()` imports that class for you. (scikit-learn still has to be installed in the environment, because the pickle stores the model's state, not the code for its methods.) Nifty! :smiley:
109 |
110 |
111 | ## Considerations & Conclusion
112 | A couple of things to keep in mind:
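113 | - When you unpickle something, you'll need to be running on the same version of Python, and of the libraries that define the object (here, scikit-learn), that were used to pickle it. A pickle created with one version isn't guaranteed to load or behave correctly with another.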
114 | - Only unpickle files that you trust completely. You can pickle just about any object, and a maliciously crafted pickle file can execute arbitrary code when it is loaded, so be extra cautious.
115 | - The `pickle` module is exclusive to Python. If you are planning to use your object in a different language or want the pickled object to be readable by humans, consider JSON serialization instead. Wouldn't it be cool if there was a library that could pickle to JSON?? Oh look, it's [jsonpickle](https://github.com/jsonpickle/jsonpickle)!
116 |
117 |
118 | Now, go forth and pickle!
119 |
120 |
121 | ## Other Resources
122 | - [Official pickle module docs](https://docs.python.org/3/library/pickle.html)
123 | - [Save and Load Machine Learning Models in Python with scikit-learn](https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/)
124 |
125 |
126 | ## Thanks
127 | Thanks to [Mark Freeman II](https://www.linkedin.com/in/mafreeman2) and [Timo Voipio](https://www.linkedin.com/in/t-voipio) for their suggestions on improving this repo :+1:
128 |
--------------------------------------------------------------------------------