├── Pickling.ipynb
├── .gitignore
├── Unpickling.ipynb
└── README.md


/Pickling.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 9,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "R-Squared: 96.03\n",
13 |       "Coefficient: 9267.24\n",
14 |       "Intercept: 27178.6\n"
15 |      ]
16 |     }
17 |    ],
18 |    "source": [
19 |     "import pickle\n",
20 |     "import pandas as pd\n",
21 |     "import numpy as np\n",
22 |     "from sklearn.linear_model import LinearRegression\n",
23 |     "\n",
24 |     "# Create data\n",
25 |     "d = {'years_experience': [1.1,1.3,1.5,2,2.2,2.9,3.2,3.7,3.9,4.5,4.9,\n",
26 |     "                          5.1,5.3,5.9,6,7.9,8.2,8.7,9,9.6,10.3,10.5],\n",
27 |     "     'salary': [39343,46205,37731,43525,39891,56642,64445,57189,63218,61111,\n",
28 |     "                67938,66029,83088,81363,93940,101302,113812,109431,105582,\n",
29 |     "                112635,122391,121872]}\n",
30 |     "df = pd.DataFrame(data=d)\n",
31 |     "\n",
32 |     "# Split data\n",
33 |     "X = pd.DataFrame(df.years_experience)\n",
34 |     "y = pd.DataFrame(df.salary)\n",
35 |     "\n",
36 |     "# Fit and score regression\n",
37 |     "reg = LinearRegression().fit(X,y)\n",
38 |     "r_squared = reg.score(X,y)\n",
39 |     "\n",
40 |     "# Get coefficient to check after unpickling\n",
41 |     "print(\"R-Squared:\", round(r_squared*100,2))\n",
42 |     "print(f\"Coefficient: {round(reg.coef_[0][0],2)}\")\n",
43 |     "print(f\"Intercept: {round(reg.intercept_[0],2)}\")\n",
44 |     "\n",
45 |     "\n",
46 |     "# Pickle the regression model object\n",
47 |     "with open(\"pickled_model.p\", \"wb\") as p:\n",
48 |     "    pickle.dump(reg, p)"
49 |    ]
50 |   }
51 |  ],
52 |  "metadata": {
53 |   "kernelspec": {
54 |    "display_name": "Python 3",
55 |    "language": "python",
56 |    "name": "python3"
57 |   },
58 |   "language_info": {
59 |    "codemirror_mode": {
60 |     "name": "ipython",
61 |     "version": 3
62 |    },
63 |    "file_extension": ".py",
64 |    "mimetype": "text/x-python",
65 |    "name": "python",
66 |    "nbconvert_exporter": "python",
67 |    "pygments_lexer": "ipython3",
68 |    "version": "3.7.6"
69 |   }
70 |  },
71 |  "nbformat": 4,
72 |  "nbformat_minor": 4
73 | }
74 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Pickle files
  2 | *.p
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib/
 21 | lib64/
 22 | parts/
 23 | sdist/
 24 | var/
 25 | wheels/
 26 | pip-wheel-metadata/
 27 | share/python-wheels/
 28 | *.egg-info/
 29 | .installed.cfg
 30 | *.egg
 31 | MANIFEST
 32 | 
 33 | # PyInstaller
 34 | #  Usually these files are written by a python script from a template
 35 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 36 | *.manifest
 37 | *.spec
 38 | 
 39 | # Installer logs
 40 | pip-log.txt
 41 | pip-delete-this-directory.txt
 42 | 
 43 | # Unit test / coverage reports
 44 | htmlcov/
 45 | .tox/
 46 | .nox/
 47 | .coverage
 48 | .coverage.*
 49 | .cache
 50 | nosetests.xml
 51 | coverage.xml
 52 | *.cover
 53 | *.py,cover
 54 | .hypothesis/
 55 | .pytest_cache/
 56 | 
 57 | # Translations
 58 | *.mo
 59 | *.pot
 60 | 
 61 | # Django stuff:
 62 | *.log
 63 | local_settings.py
 64 | db.sqlite3
 65 | db.sqlite3-journal
 66 | 
 67 | # Flask stuff:
 68 | instance/
 69 | .webassets-cache
 70 | 
 71 | # Scrapy stuff:
 72 | .scrapy
 73 | 
 74 | # Sphinx documentation
 75 | docs/_build/
 76 | 
 77 | # PyBuilder
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 98 | __pypackages__/
 99 | 
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 | 
104 | # SageMath parsed files
105 | *.sage.py
106 | 
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/Unpickling.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Coefficient of unpickled object: 9267.24\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "import pickle\n",
18 |     "import pandas as pd\n",
19 |     "\n",
20 |     "# Unpickle the regression model object\n",
21 |     "with open(\"pickled_model.p\", \"rb\") as p:\n",
22 |     "    new_reg = pickle.load(p)\n",
23 |     "\n",
24 |     "# The coefficient is the same!\n",
25 |     "print(f\"Coefficient of unpickled object: {round(new_reg.coef_[0][0],2)}\")"
26 |    ]
27 |   },
28 |   {
29 |    "cell_type": "code",
30 |    "execution_count": 13,
31 |    "metadata": {},
32 |    "outputs": [
33 |     {
34 |      "name": "stdout",
35 |      "output_type": "stream",
36 |      "text": [
37 |       "MAPE: 7.96%\n",
38 |       "R-Squared: 93.95\n",
39 |       "Coefficient: 9267.24\n",
40 |       "Intercept: 27178.6\n"
41 |      ]
42 |     }
43 |    ],
44 |    "source": [
45 |     "# New data\n",
46 |     "d = {'years_experience': [3,3.2,4,4,4.1,6.8,7.1,9.5],\n",
47 |     "     'salary': [60150,54445,55794,56957,57081,91738,98273,116969]}\n",
48 |     "test = pd.DataFrame(data=d)\n",
49 |     "\n",
50 |     "# Separate data into X and y\n",
51 |     "X_test = pd.DataFrame(test.years_experience)\n",
52 |     "y_test = pd.DataFrame(test.salary)\n",
53 |     "\n",
54 |     "# R-Squared\n",
55 |     "r_squared = new_reg.score(X_test, y_test)\n",
56 |     "\n",
57 |     "# Predictions\n",
58 |     "preds = new_reg.predict(X_test)\n",
59 |     "\n",
60 |     "# MAPE\n",
61 |     "mape = abs((y_test - preds)/y_test).mean()\n",
62 |     "print(f\"MAPE: {round(mape[0]*100, 2)}%\")\n",
63 |     "\n",
64 |     "print(f\"R-Squared: {round(r_squared*100, 2)}\")\n",
65 |     "print(f\"Coefficient: {round(new_reg.coef_[0][0],2)}\")\n",
66 |     "print(f\"Intercept: {round(new_reg.intercept_[0],2)}\")"
67 |    ]
68 |   }
69 |  ],
70 |  "metadata": {
71 |   "kernelspec": {
72 |    "display_name": "Python 3",
73 |    "language": "python",
74 |    "name": "python3"
75 |   },
76 |   "language_info": {
77 |    "codemirror_mode": {
78 |     "name": "ipython",
79 |     "version": 3
80 |    },
81 |    "file_extension": ".py",
82 |    "mimetype": "text/x-python",
83 |    "name": "python",
84 |    "nbconvert_exporter": "python",
85 |    "pygments_lexer": "ipython3",
86 |    "version": "3.7.6"
87 |   }
88 |  },
89 |  "nbformat": 4,
90 |  "nbformat_minor": 4
91 | }
92 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Pickling in Python
  2 | 
  3 | >_Well, save my model and call me pickled!_ :cucumber:
  4 | 
  5 | ## Why pickle?  What do I pickle?
  6 | Pickling does exactly what it sounds like: it preserves something for later.*  If you train and score a model and want to save it for later or deploy it for use on new data, you can pickle it so you don't have to retrain the model every time you want to use it.  The `pickle` module is built into Python and uses one line of code to save your model to a separate file that can be called and used later, even in a completely separate script or notebook.
  7 | 
  8 | <sub>* Pickling is a form of "serialization," which basically means it converts your object into a stream of bytes that can be saved to a file and later turned back into the original object.  In this tutorial, the object is a LinearRegression sklearn model.</sub>
  9 | 
 10 | ## Example:
 11 | This is an example of a simple linear regression predicting salary using years of experience.  The data came from [this Kaggle data set](https://www.kaggle.com/rohankayan/years-of-experience-and-salary-dataset).
 12 | 
 13 | ### Create & pickle a model
 14 | Start by importing packages and creating the data:
 15 | 
 16 | ```python
 17 | import pickle
 18 | import pandas as pd
 19 | from sklearn.linear_model import LinearRegression
 20 | 
 21 | # Create data
 22 | d = {'years_experience': [1.1,1.3,1.5,2,2.2,2.9,3.2,3.7,3.9,4.5,4.9,
 23 |                           5.1,5.3,5.9,6,7.9,8.2,8.7,9,9.6,10.3,10.5],
 24 |      'salary': [39343,46205,37731,43525,39891,56642,64445,57189,63218,61111,
 25 |                 67938,66029,83088,81363,93940,101302,113812,109431,105582,
 26 |                 112635,122391,121872]}
 27 | df = pd.DataFrame(data=d)
 28 | ```
 29 | 
 30 | Then, separate the predictor and target and fit a linear regression:
 31 | 
 32 | ```python
 33 | # Split data
 34 | X = pd.DataFrame(df.years_experience)
 35 | y = pd.DataFrame(df.salary)
 36 | 
 37 | # Fit regression
 38 | reg = LinearRegression().fit(X,y)  #<-- We're going to pickle this in a minute
 39 | ```
 40 | 
 41 | We can print the single coefficient and the intercept to compare with the new object when we unpickle it:
 42 | 
 43 | ```python
 44 | print(f"Coefficient: {round(reg.coef_[0][0],2)}")  # 9267.24
 45 | print(f"Intercept: {round(reg.intercept_[0],2)}")  # 27178.6
 46 | ```
 47 | 
 48 | Lastly, to save the regression model object (`reg`), we use the `pickle.dump()` function to save the model in a file called `pickled_model.p` in the working directory.  The `wb` argument tells `open` to open the file for writing in binary mode, creating it if it doesn't exist and overwriting it if it does.
 49 | 
 50 | I like to think of it like storing something in a box. We are going to `open` a box and `dump` our `reg` model into it.  Then we'll label it `pickled_model.p` so we can find it easily later.
 51 | 
 52 | ```python
 53 | # Pickle the regression model object
 54 | with open("pickled_model.p", "wb") as p:
 55 |     pickle.dump(reg, p)
 56 | ```
 57 | 
 58 | ### Unpickle the model
 59 | Now, we will create a totally new file and unpickle the model we just created.
 60 | 
 61 | Start by importing packages and creating some new data:
 62 | ```python
 63 | import pickle
 64 | import pandas as pd
 65 | 
 66 | # New data
 67 | d = {'years_experience': [3,3.2,4,4,4.1,6.8,7.1,9.5],
 68 |      'salary': [60150,54445,55794,56957,57081,91738,98273,116969]}
 69 | test = pd.DataFrame(data=d)
 70 | 
 71 | # Separate data into X and y
 72 | X_test = pd.DataFrame(test.years_experience)
 73 | y_test = pd.DataFrame(test.salary)
 74 | ```
 75 | 
 76 | Now, load the pickled model using the `pickle.load()` function, opening the file we saved earlier (`pickled_model.p`) in read-binary mode with the `rb` argument.
 77 | 
 78 | ```python
 79 | # Unpickle the regression model object
 80 | with open("pickled_model.p", "rb") as p:
 81 |     new_reg = pickle.load(p)
 82 | ```
 83 | 
 84 | We can print the coefficient and intercept to verify that it's the same model:
 85 | 
 86 | ```python
 87 | print(f"Coefficient: {round(new_reg.coef_[0][0],2)}")  # 9267.24
 88 | print(f"Intercept: {round(new_reg.intercept_[0],2)}")  # 27178.6
 89 | ```
 90 | 
 91 | They're the same! :tada:
 92 | 
 93 | Now we can make predictions on new data and evaluate model quality with R<sup>2</sup> and the Mean Absolute Percentage Error (MAPE):
 94 | 
 95 | ```python
 96 | # R-Squared
 97 | r_squared = new_reg.score(X_test, y_test)
 98 | print(f"R-Squared: {round(r_squared*100, 2)}")  # 93.95
 99 | 
100 | # Predictions
101 | preds = new_reg.predict(X_test)
102 | 
103 | # MAPE
104 | mape = abs((y_test - preds)/y_test).mean()
105 | print(f"MAPE: {round(mape[0]*100, 2)}%")  # 7.96%
106 | ```
107 | 
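108 | The actual results in this example aren't that important, but I wanted to go through the whole example.  I also wanted to demonstrate that you don't have to explicitly import `LinearRegression` when you unpickle the file: the pickle stores a reference to the object's class, and `pickle` imports that class for you when loading.  Note that the class's code itself is not stored in the pickle file, so scikit-learn still has to be installed in the environment where you unpickle.  Nifty! :smiley: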
109 | 
110 | 
111 | ## Considerations & Conclusion
112 | A couple of things to keep in mind:
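113 | - When you unpickle something, you should be running the same version of Python and, just as importantly, the same versions of the libraries that define the object (here, scikit-learn) as when it was pickled.  A model pickled with one scikit-learn version is not guaranteed to load or behave correctly with another.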
114 | - Only unpickle files that you trust completely.  Unpickling can execute arbitrary code, so a maliciously crafted pickle file can do real damage as soon as it is loaded.  Be extra cautious.
115 | - The `pickle` module is exclusive to Python.  If you are planning to use your object in a different language or want the pickled object to be readable by humans, consider JSON serialization instead.  Wouldn't it be cool if there was a library that could pickle to JSON??  Oh look, it's [jsonpickle](https://github.com/jsonpickle/jsonpickle)!
116 | 
117 | 
118 | Now, go forth and pickle!
119 | 
120 | 
121 | ## Other Resources
122 | [Official pickle module docs](https://docs.python.org/3/library/pickle.html)  
123 | [Save and Load Machine Learning Models in Python with scikit-learn](https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/)
124 | 
125 | 
126 | ## Thanks
127 | Thanks to [Mark Freeman II](https://www.linkedin.com/in/mafreeman2) and [Timo Voipio](https://www.linkedin.com/in/t-voipio) for their suggestions on improving this repo :+1:
128 | 


--------------------------------------------------------------------------------