├── .gitignore
├── LICENSE
├── README.md
├── session-1
│   ├── data
│   │   ├── emails.csv
│   │   └── survey.csv
│   ├── example_1-email_broadcast.ipynb
│   ├── example_2-email_broadcast.py
│   ├── example_3-report_builder_emailer.ipynb
│   ├── requirements.txt
│   ├── slides.pdf
│   └── top_secret.py
├── session-2
│   ├── data
│   │   └── transformed_data.csv
│   ├── example_1-gmail_ETL.ipynb
│   ├── example_2-email_classifier.ipynb
│   ├── requirements.txt
│   ├── slides.pdf
│   └── top_secret.py
├── session-3
│   ├── data
│   │   └── transformed_data.csv
│   ├── example_1-paper_summarizer.ipynb
│   ├── example_2-text-classifier.ipynb
│   ├── example_3-local_visual_QA.ipynb
│   ├── papers
│   │   ├── attention-is-all-you-need.pdf
│   │   └── attention-is-all-you-need.png
│   ├── requirements.txt
│   ├── slides.pdf
│   └── top_secret.py
├── session-4
│   ├── articles
│   │   ├── 2023-07-13_A-Practical-Introduction-to-LLMs-65194dda1148.html
│   │   ├── 2023-07-21_Cracking-Open-the-OpenAI--Python--API-230e4cae7971.html
│   │   ├── 2023-08-05_Cracking-Open-the-Hugging-Face-Transformers-Library-350aa0ef0161.html
│   │   ├── 2023-08-25_Prompt-Engineering---How-to-trick-AI-into-solving-your-problems-7ce1ed3b553f.html
│   │   ├── 2023-09-11_Fine-Tuning-Large-Language-Models--LLMs--23473d763b91.html
│   │   ├── 2023-09-21_How-to-Build-an-LLM-from-Scratch-8c477768f1f9.html
│   │   ├── 2024-02-22_QLoRA---How-to-Fine-Tune-an-LLM-on-a-Single-GPU-4e44d6b5be32.html
│   │   ├── 2024-03-09_How-to-Improve-LLMs-with-RAG-abdc132f76ac.html
│   │   ├── 2024-03-27_Text-Embeddings--Classification--and-Semantic-Search-8291746220be.html
│   │   ├── 2024-08-30_Compressing-Large-Language-Models--LLMs--9f406eea5b5e.html
│   │   ├── 2024-09-26_LLM-Fine-tuning---FAQs-200442827c99.html
│   │   ├── 2024-10-17_Fine-Tuning-BERT-for-Text-Classification-a01f89b179fc.html
│   │   ├── 2024-11-19_Multimodal-Models---LLMs-that-can-see-and-hear-5c6737c981d3.html
│   │   └── 2024-11-29_Multimodal-Embeddings--An-Introduction-5dc36975966f.html
│   ├── data
│   │   ├── chunk_embeddings.pt
│   │   ├── chunk_list.json
│   │   └── survey.csv
│   ├── example_1-unstructured_survey_analysis.ipynb
│   ├── example_2-blog_semantic_search.ipynb
│   ├── example_3-blog_QA_RAG.ipynb
│   ├── functions.py
│   ├── requirements.txt
│   ├── slides.pdf
│   └── top_secret.py
├── session-5
│   ├── README.md
│   ├── data
│   │   ├── LI_posts.csv
│   │   ├── LI_posts.numbers
│   │   ├── train-data.jsonl
│   │   └── valid-data.jsonl
│   ├── example_1-finetune_bert_classifier.ipynb
│   ├── example_2-linkedin_post_writer.ipynb
│   ├── requirements.txt
│   ├── slides.pdf
│   └── top_secret.py
└── session-6
    └── ABB-1-session_6.pdf
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .DS_Store
165 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # AI-Builders-Bootcamp-1
2 | Code repository for AI Builders Bootcamp #1.
3 |
4 | Course homepage on Maven: https://maven.com/shaw-talebi/ai-builders-bootcamp
5 |
6 | ## Session 1: Introduction, Software 1.0
7 | Getting started with AI and building basic automation scripts with Python.
8 |
9 | Examples:
10 |
11 | - [Automated email broadcaster](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-1/example_1-email_broadcast.ipynb)
12 | - [Automated report builder and emailer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-1/example_3-report_builder_emailer.ipynb)
13 |
14 | ## Session 2: Software 2.0, Data Engineering, Machine Learning
15 | Building ETL pipelines for machine learning projects. A minimal transform sketch follows the examples below.
16 |
17 | Examples:
18 | - [ETL of Gmail inbox](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_1-gmail_ETL.ipynb)
19 | - [Training email classifier](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_2-email_classifier.ipynb)
20 |
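To make the "transform" step concrete, here is a minimal sketch of how the feature columns in `session-2/data/transformed_data.csv` could be derived. The input shape (`body`/`sender` keys) and the keyword rules are assumptions for illustration; the session notebook extracts these features from a real Gmail inbox.

```python
# Hypothetical transform step: raw email dict -> one feature row
# (column names mirror data/transformed_data.csv; input shape is assumed)
COMMON_DOMAINS = {"gmail.com", "yahoo.com", "outlook.com", "hotmail.com"}

def transform_email(email: dict) -> dict:
    body = email["body"].lower()
    domain = email["sender"].split("@")[-1].lower()
    return {
        "body_length": len(email["body"]),
        "contains_lol": "lol" in body,
        "contains_omg": "omg" in body,
        "contains_attached": "attached" in body,
        "contains_attachment": "attachment" in body,
        "contains_order_confirmation": "order confirmation" in body,
        "contains_payment_summary": "payment summary" in body,
        "sender_has_common_domain": domain in COMMON_DOMAINS,
    }

print(transform_email({"sender": "friend@gmail.com", "body": "lol, see the attachment"}))
```
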
21 | ## Session 3: Software 3.0, Foundation Models, Prompt Engineering
22 | Here, we begin building AI systems with LLMs. Unlike with machine learning, we don't need a dataset to get started; a minimal prompting sketch follows the examples below.
23 |
24 | Examples:
25 | - [Research paper summarizer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-3/example_1-paper_summarizer.ipynb)
26 | - [Text classifier](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-3/example_2-text-classifier.ipynb)
27 | - [Local document QA with LLaMA 3.2 Vision](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-3/example_3-local_visual_QA.ipynb)
28 |
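As a minimal illustration of building without a dataset, here is a sketch of prompt-based text classification using the OpenAI Python SDK. The model name and labels are placeholders, and an `OPENAI_API_KEY` environment variable is assumed.

```python
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def classify(text: str) -> str:
    """Zero-shot classification via prompting; no training data required."""
    response = client.chat.completions.create(
        model="gpt-4o-mini",  # placeholder model choice
        messages=[
            {"role": "system",
             "content": "Classify the user's text as 'spam' or 'not spam'. Reply with the label only."},
            {"role": "user", "content": text},
        ],
    )
    return response.choices[0].message.content.strip()

print(classify("You won a free cruise! Click here to claim your prize."))
```
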
29 | ## Session 4: Software 3.0, RAG, Text Embeddings
30 | Prompting LLMs ChatGPT-style only scratches the surface of what modern language models can do. We can also leverage RAG to improve model performance and text embeddings to make text computable; a semantic search sketch follows the examples below.
31 |
32 | Examples:
33 | - [Analyzing unstructured survey results](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-4/example_1-unstructured_survey_analysis.ipynb)
34 | - [Semantic search over blog posts](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-4/example_2-blog_semantic_search.ipynb)
35 | - [RAG chatbot over blog series](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-4/example_3-blog_QA_RAG.ipynb)
36 |
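To show what "making text computable" looks like, here is a minimal semantic search sketch using the `sentence-transformers` library. The model and documents are illustrative choices, not necessarily what the session notebooks use.

```python
from sentence_transformers import SentenceTransformer
import torch

model = SentenceTransformer("all-MiniLM-L6-v2")  # small general-purpose embedding model

docs = ["How to fine-tune an LLM", "Text embeddings explained", "Improving LLMs with RAG"]
query = "boosting LLM answers with retrieval"

# normalized embeddings => dot product equals cosine similarity
doc_emb = model.encode(docs, convert_to_tensor=True, normalize_embeddings=True)
query_emb = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

scores = query_emb @ doc_emb.T
best = int(torch.argmax(scores))
print(f"Best match: {docs[best]} (score={float(scores[best]):.2f})")
```
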
37 | ## Session 5: Software 3.0, Fine-tuning
38 | Although LLMs can solve a wide range of problems out-of-the-box, there are situations where more model customization is required. This can be achieved through model fine-tuning, which involves adapting a model to a particular use case through additional training. An illustrative training record follows the examples below.
39 |
40 | Examples:
41 | - [Fine-tuning BERT for Text Classification](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_1-finetune_bert_classifier.ipynb)
42 | - [Fine-tuning a LinkedIn Post Writer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_2-linkedin_post_writer.ipynb)
43 |
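For reference, here is what one training record might look like in the OpenAI chat fine-tuning JSONL format. The content is invented, and whether `session-5/data/train-data.jsonl` uses exactly this schema is an assumption.

```python
import json

# One illustrative training record in the OpenAI chat fine-tuning format
record = {
    "messages": [
        {"role": "system", "content": "You are a LinkedIn post writer."},
        {"role": "user", "content": "Write a post about RAG."},
        {"role": "assistant", "content": "RAG gives LLMs a memory upgrade..."},
    ]
}

# each line of the .jsonl training file holds one such record
with open("train-data.jsonl", "w") as f:
    f.write(json.dumps(record) + "\n")
```
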
44 | ## Session 6: AI Project Management
45 | Technical skills are necessary but not sufficient for generating value with AI. In the final session, we review a project management framework and best practices for getting an ROI on AI projects.
46 |
--------------------------------------------------------------------------------
/session-1/data/emails.csv:
--------------------------------------------------------------------------------
1 | Name,Email
2 | Shaw Talebi,shaw@example.com
3 | Luca Romano,luca@example.com
4 | Elena Petrova,elena@example.com
--------------------------------------------------------------------------------
/session-1/data/survey.csv:
--------------------------------------------------------------------------------
1 | What is your current level of experience with AI/ML?,How many years of programming experience do you have?,Which programming language(s) are you most comfortable with?
2 | 1-3 years,4-6 years,Python
3 | 1-3 years,4-6 years,Python
4 | <1 year,<1 year,Python
5 | 4-6 years,7+ years,Python
6 | <1 year,7+ years,"PHP, Javascript"
7 | <1 year,7+ years,Python
8 | <1 year,1-3 years,I used to code in a previous life and started learning Python this year (intermediate level) and comfortable in BASH
9 | 1-3 years,7+ years,"python, javascript, java, c++, typescript, goLang"
10 | <1 year,7+ years,"C#, C++. Java"
11 | 7+ years,7+ years,"Python, C++ (including CUDA C), R, Javascript, PHP"
12 | <1 year,7+ years,"PHP, some JavaScript"
13 | <1 year,<1 year,Python
14 | <1 year,1-3 years,Python
15 | 1-3 years,7+ years,c# python
16 | <1 year,<1 year,Python
17 | 1-3 years,7+ years,Python
18 | 1-3 years,1-3 years,"Python, bash"
19 | <1 year,<1 year,Python
20 | <1 year,1-3 years,Fortran / Algol / Basic -- 30+ years ago :) (but I was pretty good at it)
21 | <1 year,1-3 years,T-SQL
22 | <1 year,1-3 years,"HTML, CSS, Python"
23 | 1-3 years,1-3 years,Python
24 | <1 year,1-3 years,"Python, C"
--------------------------------------------------------------------------------
/session-1/example_1-email_broadcast.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Replicating Maven Broadcast Function\n",
8 | "### ABB #1 - Session 1\n",
9 | "\n",
10 | "Code authored by: Shaw Talebi"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "### imports"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {
24 | "jupyter": {
25 | "source_hidden": true
26 | }
27 | },
28 | "outputs": [],
29 | "source": [
30 | "import csv\n",
31 | "\n",
32 | "import smtplib\n",
33 | "from email.mime.multipart import MIMEMultipart\n",
34 | "from email.mime.text import MIMEText\n",
35 | "\n",
36 | "from top_secret import app_password"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### define variables"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {
50 | "jupyter": {
51 | "source_hidden": true
52 | }
53 | },
54 | "outputs": [],
55 | "source": [
56 | "course_name = \"AI Builders Bootcamp\"\n",
57 | "start_date = \"Nov 8th, 2024\"\n",
58 | "end_date = \"Dec 20th, 2024\"\n",
59 | "portal_open_date = \"Nov 4th, 2024\""
60 | ]
61 | },
62 | {
63 | "cell_type": "markdown",
64 | "metadata": {},
65 | "source": [
66 | "### construct message"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 3,
72 | "metadata": {
73 | "jupyter": {
74 | "source_hidden": true
75 | }
76 | },
77 | "outputs": [],
78 | "source": [
79 | "# construct message template\n",
80 | "subject = f\"Welcome to {course_name}!\""
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 4,
86 | "metadata": {
87 | "jupyter": {
88 | "source_hidden": true
89 | }
90 | },
91 | "outputs": [],
92 | "source": [
93 | "body = lambda student_name : f\"\"\"Hey {student_name},\n",
94 | "\n",
95 | "I'm super excited to have you here!\n",
96 | "\n",
97 | "We'll be getting started on {start_date} and finishing up on {end_date}.\n",
98 | "\n",
99 | "Keep your eye out for the course portal, which will open up on {portal_open_date}.\n",
100 | "\n",
101 | "Let me know if you have any questions in the meantime :)\n",
102 | "\n",
103 | "-Shaw\"\"\""
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": 5,
109 | "metadata": {
110 | "jupyter": {
111 | "source_hidden": true
112 | }
113 | },
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "Welcome to AI Builders Bootcamp!\n",
120 | "Hey Shaw,\n",
121 | "\n",
122 | "I'm super excited to have you here!\n",
123 | "\n",
124 | "We'll be getting started on Nov 8th, 2024 and finishing up on Dec 20th, 2024.\n",
125 | "\n",
126 | "Keep your eye out for the course portal, which will open up on Nov 4th, 2024.\n",
127 | "\n",
128 | "Let me know if you have any questions in the meantime :)\n",
129 | "\n",
130 | "-Shaw\n"
131 | ]
132 | }
133 | ],
134 | "source": [
135 | "print(subject)\n",
136 | "print(body(\"Shaw\"))"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "metadata": {},
142 | "source": [
143 | "### read names and emails from .csv file"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 6,
149 | "metadata": {
150 | "jupyter": {
151 | "source_hidden": true
152 | }
153 | },
154 | "outputs": [
155 | {
156 | "name": "stdout",
157 | "output_type": "stream",
158 | "text": [
159 | "[('Shaw', 'shaw@example.com'), ('Luca', 'luca@example.com'), ('Elena', 'elena@example.com')]\n"
160 | ]
161 | }
162 | ],
163 | "source": [
164 | "# path to CSV file\n",
165 | "filename = \"data/emails.csv\"\n",
166 | "\n",
167 | "# initialize an empty list to store recipient tuples\n",
168 | "recipients_list = []\n",
169 | "\n",
170 | "# read CSV file and populate the recipients list\n",
171 | "with open(filename, mode='r') as csvfile:\n",
172 | " csv_reader = csv.DictReader(csvfile)\n",
173 | " for row in csv_reader:\n",
174 | " name = row[\"Name\"].split(\" \")[0]\n",
175 | " email = row[\"Email\"]\n",
176 | " recipients_list.append((name, email))\n",
177 | "\n",
178 | "print(recipients_list)"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "### send email"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 7,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "Email sent successfully to Shaw at shaw@example.com!\n",
198 | "Email sent successfully to Luca at luca@example.com!\n",
199 | "Email sent successfully to Elena at elena@example.com!\n"
200 | ]
201 | }
202 | ],
203 | "source": [
204 | "# Define your email credentials and recipient\n",
205 | "sender_email = \"your email here\"\n",
206 | "password = app_password\n",
207 | "\n",
208 | "# Connect to the server\n",
209 | "with smtplib.SMTP(\"smtp.gmail.com\", 587) as server:\n",
210 | " server.starttls() # Start TLS encryption\n",
211 | " server.login(sender_email, password) # Log into email account\n",
212 | "\n",
213 | " # Send the email to each recipient with personalized content\n",
214 | " for name, email in recipients_list:\n",
215 | " # Set up the MIME\n",
216 | " message = MIMEMultipart()\n",
217 | " message[\"From\"] = sender_email\n",
218 | " message[\"To\"] = email\n",
219 | " message[\"Subject\"] = subject\n",
220 | " \n",
221 | " # Email body with personalization\n",
222 | " personalized_body = body(name)\n",
223 | " message.attach(MIMEText(personalized_body, \"plain\"))\n",
224 | "\n",
225 | " # Send the email\n",
226 | " server.sendmail(sender_email, email, message.as_string())\n",
227 | " print(f\"Email sent successfully to {name} at {email}!\")"
228 | ]
229 | },
230 | {
231 | "cell_type": "markdown",
232 | "metadata": {},
233 | "source": [
234 |         "Note: to create an app password for Gmail, follow these steps.\n",
235 |         "\n",
236 |         "1. Navigate to [Google Account](https://myaccount.google.com/)\n",
237 |         "2. Click on \"2-Step Verification\" under \"How you sign in to Google\"\n",
238 |         "3. Click on the right arrow under \"App passwords\"\n",
239 |         "4. Type in a new app name and hit \"Create\"\n",
240 | "\n",
241 | "Additional support [here](https://support.google.com/mail/answer/185833?hl=en)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": null,
247 | "metadata": {},
248 | "outputs": [],
249 | "source": []
250 | }
251 | ],
252 | "metadata": {
253 | "kernelspec": {
254 | "display_name": "Python 3 (ipykernel)",
255 | "language": "python",
256 | "name": "python3"
257 | },
258 | "language_info": {
259 | "codemirror_mode": {
260 | "name": "ipython",
261 | "version": 3
262 | },
263 | "file_extension": ".py",
264 | "mimetype": "text/x-python",
265 | "name": "python",
266 | "nbconvert_exporter": "python",
267 | "pygments_lexer": "ipython3",
268 | "version": "3.12.2"
269 | }
270 | },
271 | "nbformat": 4,
272 | "nbformat_minor": 4
273 | }
274 |
--------------------------------------------------------------------------------
/session-1/example_2-email_broadcast.py:
--------------------------------------------------------------------------------
1 | # Replicating Maven Broadcast Function (Script version)
2 | # ABB #1 - Session 1
3 |
4 | # Code authored by: Shaw Talebi
5 |
6 | # IMPORTS
7 | import csv
8 |
9 | import smtplib
10 | from email.mime.multipart import MIMEMultipart
11 | from email.mime.text import MIMEText
12 |
13 | from top_secret import app_password
14 |
15 | # DEFINE VARIABLES
16 | course_name = "AI Builders Bootcamp"
17 | start_date = "Nov 8th, 2024"
18 | end_date = "Dec 20th, 2024"
19 | portal_open_date = "Nov 4th, 2024"
20 |
21 | # CONSTRUCT EMAIL
22 | subject = f"Welcome to {course_name}!"
23 | print(subject)
24 |
25 | body = lambda student_name : f"""Hey {student_name},
26 |
27 | I'm super excited to have you here!
28 |
29 | We'll be getting started on {start_date} and finishing up on {end_date}.
30 |
31 | Keep your eye out for the course portal, which will open up on {portal_open_date}.
32 |
33 | Let me know if you have any questions in the meantime :)
34 |
35 | -Shaw"""
36 | print(body("Test"))
37 |
38 | # READ NAMES/EMAILS FROM CSV
39 | filename = "data/emails.csv" # path to CSV file
40 |
41 | recipients_list = [] # initialize an empty list to store recipient tuples
42 |
43 | # read CSV file and populate the recipients list
44 | with open(filename, mode='r') as csvfile:
45 | csv_reader = csv.DictReader(csvfile)
46 | for row in csv_reader:
47 | name = row["Name"].split(" ")[0]
48 | email = row["Email"]
49 | recipients_list.append((name, email))
50 |
51 | print('\n')
52 | print("Recipients List:", recipients_list)
53 |
54 | # SEND EMAILS
55 | # define email credentials and recipient
56 | sender_email = "your email here"
57 | password = app_password
58 |
59 | # connect to the server
60 | with smtplib.SMTP("smtp.gmail.com", 587) as server:
61 | server.starttls() # start TLS encryption
62 | server.login(sender_email, password) # log into email account
63 |
64 | # send the email to each recipient with personalized content
65 | for name, email in recipients_list:
66 | # set up the MIME
67 | message = MIMEMultipart()
68 | message["From"] = sender_email
69 | message["To"] = email
70 | message["Subject"] = subject
71 |
72 | # email body with personalization
73 | personalized_body = body(name)
74 | message.attach(MIMEText(personalized_body, "plain"))
75 |
76 | # send the email
77 | server.sendmail(sender_email, email, message.as_string())
78 | print(f"Email sent successfully to {name} at {email}!")
--------------------------------------------------------------------------------
/session-1/example_3-report_builder_emailer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Automated Report Builder and Emailer\n",
8 | "### ABB #1 - Session 1\n",
9 | "\n",
10 | "Code authored by: Shaw Talebi"
11 | ]
12 | },
13 | {
14 | "cell_type": "markdown",
15 | "metadata": {},
16 | "source": [
17 | "### imports"
18 | ]
19 | },
20 | {
21 | "cell_type": "code",
22 | "execution_count": 1,
23 | "metadata": {
24 | "jupyter": {
25 | "source_hidden": true
26 | }
27 | },
28 | "outputs": [],
29 | "source": [
30 | "import csv\n",
31 | "import pandas as pd\n",
32 | "\n",
33 | "import smtplib\n",
34 | "from email.mime.multipart import MIMEMultipart\n",
35 | "from email.mime.text import MIMEText\n",
36 | "\n",
37 | "from top_secret import app_password"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "metadata": {},
43 | "source": [
44 | "### load data"
45 | ]
46 | },
47 | {
48 | "cell_type": "markdown",
49 | "metadata": {},
50 | "source": [
51 | "#### read in emails"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "metadata": {
58 | "jupyter": {
59 | "source_hidden": true
60 | }
61 | },
62 | "outputs": [
63 | {
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "[('Shaw', 'shaw@example.com'), ('Luca', 'luca@example.com'), ('Elena', 'elena@example.com')]\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "# load contact info\n",
73 | "filename = \"data/emails.csv\"\n",
74 | "recipients_list = []\n",
75 | "\n",
76 | "# read CSV file and populate the recipients list\n",
77 | "with open(filename, mode='r') as csvfile:\n",
78 | " csv_reader = csv.DictReader(csvfile)\n",
79 | " for row in csv_reader:\n",
80 | " name = row[\"Name\"].split(\" \")[0]\n",
81 | " email = row[\"Email\"]\n",
82 | " recipients_list.append((name, email))\n",
83 | "\n",
84 | "print(recipients_list)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 3,
90 | "metadata": {
91 | "jupyter": {
92 | "source_hidden": true
93 | }
94 | },
95 | "outputs": [],
96 | "source": [
97 | "# recipients_list = [('Shaw', 'shawhintalebi@gmail.com')]"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "#### read survey results"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {
111 | "jupyter": {
112 | "source_hidden": true
113 | }
114 | },
115 | "outputs": [],
116 | "source": [
117 | "filename = \"data/survey.csv\"\n",
118 | "df_survey = pd.read_csv(filename)"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 5,
124 | "metadata": {
125 | "jupyter": {
126 | "source_hidden": true
127 | }
128 | },
129 | "outputs": [
130 | {
131 | "data": {
191 | "text/plain": [
192 | " What is your current level of experience with AI/ML? \\\n",
193 | "0 1-3 years \n",
194 | "1 1-3 years \n",
195 | "2 <1 year \n",
196 | "3 4-6 years \n",
197 | "4 <1 year \n",
198 | "\n",
199 | " How many years of programming experience do you have? \\\n",
200 | "0 4-6 years \n",
201 | "1 4-6 years \n",
202 | "2 <1 year \n",
203 | "3 7+ years \n",
204 | "4 7+ years \n",
205 | "\n",
206 | " Which programming language(s) are you most comfortable with? \n",
207 | "0 Python \n",
208 | "1 Python \n",
209 | "2 Python \n",
210 | "3 Python \n",
211 | "4 PHP, Javascript "
212 | ]
213 | },
214 | "execution_count": 5,
215 | "metadata": {},
216 | "output_type": "execute_result"
217 | }
218 | ],
219 | "source": [
220 | "df_survey.head()"
221 | ]
222 | },
223 | {
224 | "cell_type": "code",
225 | "execution_count": 6,
226 | "metadata": {
227 | "jupyter": {
228 | "source_hidden": true
229 | }
230 | },
231 | "outputs": [
232 | {
233 | "name": "stdout",
234 | "output_type": "stream",
235 | "text": [
236 | "23\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "# response count\n",
242 | "class_size = 43\n",
243 | "num_responses = len(df_survey)\n",
244 | "print(num_responses)"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 7,
250 | "metadata": {
251 | "jupyter": {
252 | "source_hidden": true
253 | }
254 | },
255 | "outputs": [
256 | {
257 | "name": "stdout",
258 | "output_type": "stream",
259 | "text": [
260 | "What is your current level of experience with AI/ML?\n",
261 | "<1 year 14\n",
262 | "1-3 years 7\n",
263 | "4-6 years 1\n",
264 | "7+ years 1\n",
265 | "Name: count, dtype: int64\n"
266 | ]
267 | }
268 | ],
269 | "source": [
270 | "# AI experience distribution\n",
271 | "ai_experience = df_survey.iloc[:,0].value_counts()\n",
272 | "print(ai_experience)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 8,
278 | "metadata": {
279 | "jupyter": {
280 | "source_hidden": true
281 | }
282 | },
283 | "outputs": [
284 | {
285 | "name": "stdout",
286 | "output_type": "stream",
287 | "text": [
288 | "How many years of programming experience do you have?\n",
289 | "7+ years 9\n",
290 | "1-3 years 8\n",
291 | "<1 year 4\n",
292 | "4-6 years 2\n",
293 | "Name: count, dtype: int64\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "# coding experience distribution\n",
299 | "coding_experience = df_survey.iloc[:,1].value_counts()\n",
300 | "print(coding_experience)"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 9,
306 | "metadata": {
307 | "jupyter": {
308 | "source_hidden": true
309 | }
310 | },
311 | "outputs": [
312 | {
313 | "name": "stdout",
314 | "output_type": "stream",
315 | "text": [
316 | " Count\n",
317 | "Language \n",
318 | "Python 18\n",
319 | "C++ 7\n",
320 | "JavaScript 4\n",
321 | "PHP 3\n",
322 | "Bash 2\n",
323 | "Java 2\n",
324 | "TypeScript 1\n",
325 | "goLang 1\n",
326 | "R 1\n",
327 | "Fortran 1\n",
328 | "Algol 1\n",
329 | "Basic 1\n",
330 | "SQL 1\n",
331 | "HTML 1\n",
332 | "CSS 1\n",
333 | "C# 0\n"
334 | ]
335 | }
336 | ],
337 | "source": [
338 | "# coding language distribution\n",
339 | "\n",
340 | "# define coding languages\n",
341 | "lang_list = ['Python', 'PHP', 'JavaScript', 'Bash', 'C++', 'Java', 'TypeScript', 'goLang', 'C#', 'R', 'Fortran', 'Algol', 'Basic', 'SQL', 'HTML', 'CSS']\n",
342 | "\n",
343 | "# create a dictionary to store the count for each language\n",
344 | "lang_dict = {}\n",
345 | "\n",
346 | "# count occurrences of each language in the Series\n",
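    "# NOTE: regex metacharacters in 'C++' and 'C#' are not escaped below, so those two counts are unreliable (re.escape would fix this)\n",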
347 | "for lang in lang_list:\n",
348 | " lang_dict[lang] = df_survey.iloc[:,2].str.lower().str.count(r'\\b' + lang.lower() + r'\\b').sum()\n",
349 | "\n",
350 | "# convert the counts to a DataFrame for easier readability\n",
351 | "df_lang = pd.DataFrame(list(lang_dict.items()), columns=['Language', 'Count']).set_index('Language').sort_values(by='Count', ascending=False)\n",
352 | "print(df_lang)"
353 | ]
354 | },
355 | {
356 | "cell_type": "markdown",
357 | "metadata": {},
358 | "source": [
359 | "### construct message"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 10,
365 | "metadata": {
366 | "jupyter": {
367 | "source_hidden": true
368 | }
369 | },
370 | "outputs": [],
371 | "source": [
372 | "def remove_first_last_lines(text):\n",
373 | " lines = text.splitlines()\n",
374 | " return \"\\n\".join(lines[1:-1])\n",
375 | "\n",
376 | "def remove_first_two_lines(text):\n",
377 | " lines = text.splitlines()\n",
378 | " return \"\\n\".join(lines[2:])"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 11,
384 | "metadata": {
385 | "jupyter": {
386 | "source_hidden": true
387 | }
388 | },
389 | "outputs": [],
390 | "source": [
391 | "subject = \"The results are in...\"\n",
392 | "body = lambda student_name : f\"\"\"Hey {student_name},\n",
393 | "\n",
394 | "I wanted to share the pre-course survey results before our first session tomorrow.\n",
395 | "\n",
396 | "We had {num_responses} responses out of the {class_size} enrolled ({int(100*round(num_responses/class_size,2))}% response rate).\n",
397 | "\n",
398 | "Here's a breakdown of this cohort's incoming experience 👇\n",
399 | "\n",
400 | "AI/ML Experience:\n",
401 | "{remove_first_last_lines(str(ai_experience))}\n",
402 | "\n",
403 | "Programming Experience:\n",
404 | "{remove_first_last_lines(str(coding_experience))}\n",
405 | "\n",
406 | "Top Languages:\n",
407 | "{remove_first_two_lines(str(df_lang[:4]))}\n",
408 | "\n",
409 | "Looking forward to meeting you all tomorrow!\n",
410 | "\n",
411 | "-Shaw\n",
412 | "\n",
413 | "P.S. If you haven't completed the pre-course survey yet, you can access it here: https://forms.gle/YQ5DbbovRUf5Utvc8\"\"\""
414 | ]
415 | },
416 | {
417 | "cell_type": "code",
418 | "execution_count": 12,
419 | "metadata": {
420 | "jupyter": {
421 | "source_hidden": true
422 | }
423 | },
424 | "outputs": [
425 | {
426 | "name": "stdout",
427 | "output_type": "stream",
428 | "text": [
429 | "Hey Test,\n",
430 | "\n",
431 | "I wanted to share the pre-course survey results before our first session tomorrow.\n",
432 | "\n",
433 | "We had 23 responses out of the 43 enrolled (53% response rate).\n",
434 | "\n",
435 | "Here's a breakdown of this cohort's incoming experience 👇\n",
436 | "\n",
437 | "AI/ML Experience:\n",
438 | "<1 year 14\n",
439 | "1-3 years 7\n",
440 | "4-6 years 1\n",
441 | "7+ years 1\n",
442 | "\n",
443 | "Programming Experience:\n",
444 | "7+ years 9\n",
445 | "1-3 years 8\n",
446 | "<1 year 4\n",
447 | "4-6 years 2\n",
448 | "\n",
449 | "Top Languages:\n",
450 | "Python 18\n",
451 | "C++ 7\n",
452 | "JavaScript 4\n",
453 | "PHP 3\n",
454 | "\n",
455 | "Looking forward to meeting you all tomorrow!\n",
456 | "\n",
457 | "-Shaw\n",
458 | "\n",
459 | "P.S. If you haven't completed the pre-course survey yet, you can access it here: https://forms.gle/YQ5DbbovRUf5Utvc8\n"
460 | ]
461 | }
462 | ],
463 | "source": [
464 | "subject = \"The results are in...\"\n",
465 | "print(body(\"Test\"))"
466 | ]
467 | },
468 | {
469 | "cell_type": "markdown",
470 | "metadata": {},
471 | "source": [
472 | "### send emails"
473 | ]
474 | },
475 | {
476 | "cell_type": "code",
477 | "execution_count": 13,
478 | "metadata": {},
479 | "outputs": [
480 | {
481 | "name": "stdout",
482 | "output_type": "stream",
483 | "text": [
484 | "Email sent successfully to Shaw at shaw@example.com!\n",
485 | "Email sent successfully to Luca at luca@example.com!\n",
486 | "Email sent successfully to Elena at elena@example.com!\n"
487 | ]
488 | }
489 | ],
490 | "source": [
491 | "# Define your email credentials and recipient\n",
492 | "sender_email = \"your email here\"\n",
493 | "password = app_password\n",
494 | "\n",
495 | "# Connect to the server\n",
496 | "with smtplib.SMTP(\"smtp.gmail.com\", 587) as server:\n",
497 | " server.starttls() # Start TLS encryption\n",
498 | " server.login(sender_email, password) # Log into email account\n",
499 | "\n",
500 | " # Send the email to each recipient with personalized content\n",
501 | " for name, email in recipients_list:\n",
502 | " # Set up the MIME\n",
503 | " message = MIMEMultipart()\n",
504 | " message[\"From\"] = sender_email\n",
505 | " message[\"To\"] = email\n",
506 | " message[\"Subject\"] = subject\n",
507 | " \n",
508 | " # Email body with personalization\n",
509 | " personalized_body = body(name)\n",
510 | " message.attach(MIMEText(personalized_body, \"plain\"))\n",
511 | "\n",
512 | " # Send the email\n",
513 | " server.sendmail(sender_email, email, message.as_string())\n",
514 | " print(f\"Email sent successfully to {name} at {email}!\")"
515 | ]
516 | },
517 | {
518 | "cell_type": "code",
519 | "execution_count": null,
520 | "metadata": {},
521 | "outputs": [],
522 | "source": []
523 | }
524 | ],
525 | "metadata": {
526 | "kernelspec": {
527 | "display_name": "Python 3 (ipykernel)",
528 | "language": "python",
529 | "name": "python3"
530 | },
531 | "language_info": {
532 | "codemirror_mode": {
533 | "name": "ipython",
534 | "version": 3
535 | },
536 | "file_extension": ".py",
537 | "mimetype": "text/x-python",
538 | "name": "python",
539 | "nbconvert_exporter": "python",
540 | "pygments_lexer": "ipython3",
541 | "version": "3.12.2"
542 | }
543 | },
544 | "nbformat": 4,
545 | "nbformat_minor": 4
546 | }
547 |
--------------------------------------------------------------------------------
/session-1/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipykernel
3 | pandas
4 |
--------------------------------------------------------------------------------
/session-1/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-1/slides.pdf
--------------------------------------------------------------------------------
/session-1/top_secret.py:
--------------------------------------------------------------------------------
1 | app_password = "paste app password here"
--------------------------------------------------------------------------------
/session-2/data/transformed_data.csv:
--------------------------------------------------------------------------------
1 | body_length,contains_lol,contains_omg,contains_attached,contains_attachment,contains_order_confirmation,contains_payment_summary,sender_has_common_domain,is_personal
2 | 1830,False,False,False,False,False,False,False,True
3 | 1522,False,False,False,False,False,False,True,True
4 | 1838,False,False,False,False,False,False,False,True
5 | 251,False,False,False,True,False,False,False,True
6 | 3170,False,False,False,False,False,False,False,True
7 | 2782,False,False,False,False,False,False,False,True
8 | 11026,False,False,False,False,False,False,False,True
9 | 7025,False,False,False,False,False,False,False,True
10 | 3265,False,False,False,True,False,False,True,True
11 | 3007,False,False,False,True,False,False,False,True
12 | 1954,False,False,False,False,False,False,True,True
13 | 1600,False,False,False,False,False,False,False,True
14 | 1841,False,False,False,False,False,False,False,True
15 | 1841,False,False,False,False,False,False,False,True
16 | 201,False,False,False,False,False,False,False,True
17 | 8520,False,False,False,False,False,False,False,True
18 | 7454,False,False,False,False,False,False,False,True
19 | 8520,False,False,False,False,False,False,False,True
20 | 8520,False,False,False,False,False,False,False,True
21 | 201,False,False,False,False,False,False,False,True
22 | 201,False,False,False,False,False,False,False,True
23 | 3801,False,False,False,False,False,False,False,True
24 | 1068,False,False,False,False,False,False,False,True
25 | 1593,False,False,False,False,False,False,False,True
26 | 1955,False,False,False,False,False,False,False,True
27 | 4499,False,False,False,False,False,False,False,True
28 | 5088,False,False,False,False,False,False,False,True
29 | 57838,False,False,False,False,False,False,False,True
30 | 4420,False,False,False,False,False,False,False,True
31 | 384,True,False,True,False,False,False,True,True
32 | 298,False,False,True,False,False,False,True,True
33 | 845,False,False,False,False,False,False,True,True
34 | 115,False,False,False,False,False,False,True,True
35 | 2,False,False,False,False,False,False,True,True
36 | 1376,False,False,False,False,False,False,True,True
37 | 1146,False,False,False,False,False,False,True,True
38 | 822,True,False,False,True,False,False,True,True
39 | 6847,False,False,True,False,False,False,True,True
40 | 6323,False,False,False,False,False,False,True,True
41 | 4230,False,False,False,False,False,False,True,True
42 | 2020,False,False,False,True,False,False,True,True
43 | 1608,False,False,False,False,False,False,True,True
44 | 1119,False,False,False,False,False,False,False,True
45 | 773,False,False,False,False,False,False,True,True
46 | 640,False,False,False,False,False,False,False,True
47 | 2246,False,False,False,False,False,False,False,True
48 | 1865,False,False,True,False,False,False,False,True
49 | 4720,False,False,False,False,False,False,False,True
50 | 16327,False,False,True,False,False,False,False,True
51 | 6165,False,False,False,True,False,False,True,True
52 | 1931,False,False,False,False,False,False,False,True
53 | 325,False,False,False,False,False,False,True,True
54 | 926,False,False,False,False,False,False,True,True
55 | 776,False,False,False,False,False,False,True,True
56 | 260,False,False,False,True,False,False,True,True
57 | 1405,True,False,False,False,False,False,True,True
58 | 1338,True,False,False,False,False,False,False,True
59 | 13259,False,False,False,False,False,False,True,True
60 | 0,False,False,False,False,False,False,False,True
61 | 53,False,False,False,False,False,False,True,True
62 | 2040,False,False,False,False,False,False,True,True
63 | 14690,False,False,False,False,False,False,True,True
64 | 4830,False,False,True,False,False,False,True,True
65 | 3466,False,False,False,False,False,False,True,True
66 | 3181,False,False,False,False,False,False,True,True
67 | 10438,False,False,False,False,False,False,True,True
68 | 1470,False,False,False,True,True,False,True,True
69 | 1321,False,False,False,False,False,False,True,True
70 | 580,False,False,False,False,False,False,True,True
71 | 0,False,False,False,False,False,False,False,True
72 | 17408,False,False,False,False,False,False,True,True
73 | 16971,False,False,False,False,False,False,True,True
74 | 22849,False,False,False,False,False,False,True,True
75 | 728,False,False,False,False,False,False,True,True
76 | 599,False,False,False,False,False,False,True,True
77 | 480,False,False,False,False,False,False,True,True
78 | 367,False,False,False,False,False,False,True,True
79 | 4,False,False,False,False,False,False,True,True
80 | 1552,False,False,False,False,False,False,True,True
81 | 331,False,False,False,False,False,False,False,True
82 | 85,False,False,False,False,False,False,True,True
83 | 34,False,False,False,False,False,False,True,True
84 | 1247,False,False,False,False,False,False,True,True
85 | 3027,False,False,False,False,False,False,True,True
86 | 23190,True,False,False,False,False,False,True,True
87 | 14691,False,False,False,False,False,False,True,True
88 | 23705,False,False,False,False,False,False,True,True
89 | 1371,False,False,False,False,False,False,True,True
90 | 1373,False,False,False,False,False,False,True,True
91 | 21076,False,False,False,False,True,False,True,True
92 | 2640,False,False,False,False,True,False,False,True
93 | 3511,False,False,False,False,False,False,False,True
94 | 27,False,False,False,False,False,False,True,True
95 | 2534,False,False,False,False,False,False,True,True
96 | 4220,False,False,False,False,False,False,True,True
97 | 4041,False,False,False,False,False,False,True,True
98 | 1073,False,False,False,False,False,False,False,True
99 | 11,False,False,False,False,False,False,True,True
100 | 85,False,False,False,False,False,False,True,True
101 | 471,False,False,True,False,False,False,True,True
102 | 381,False,False,True,False,False,False,True,True
103 | 211,False,False,False,False,False,False,True,True
104 | 5258,False,False,False,False,False,False,True,True
105 | 4017,False,False,False,False,False,False,True,True
106 | 17,False,False,False,False,False,False,True,True
107 | 4884,False,False,False,False,True,False,True,True
108 | 5083,False,False,False,False,False,False,True,True
109 | 5301,False,False,False,False,False,False,True,True
110 | 4791,False,False,False,False,False,False,True,True
111 | 4577,False,False,False,False,False,False,True,True
112 | 7449,False,False,False,False,False,False,False,True
113 | 861,False,False,False,False,False,False,True,True
114 | 501,False,False,False,False,False,False,True,True
115 | 390,False,False,False,False,False,False,True,True
116 | 284,False,False,False,True,False,False,False,True
117 | 258,False,False,False,True,False,False,False,True
118 | 532,False,False,False,False,False,False,True,True
119 | 360,False,False,False,False,False,False,True,True
120 | 654,False,False,False,False,False,False,True,True
121 | 127,False,False,False,False,False,False,True,True
122 | 31,False,False,False,False,False,False,True,True
123 | 1323,False,False,False,False,False,False,True,True
124 | 1289,False,False,False,False,False,False,True,True
125 | 4,False,False,False,False,False,False,True,True
126 | 3313,False,False,False,False,False,False,True,True
127 | 305,False,False,False,False,False,False,True,True
128 | 201,False,False,False,False,False,False,True,True
129 | 409,False,False,False,False,False,False,True,True
130 | 3035,True,False,False,False,False,False,True,True
131 | 2857,False,False,False,False,False,False,True,True
132 | 2624,False,False,False,False,False,False,True,True
133 | 2375,False,False,False,False,False,False,True,True
134 | 17911,False,False,False,False,False,False,True,True
135 | 257,False,False,False,True,False,False,False,True
136 | 492,False,False,False,False,False,False,True,True
137 | 8092,False,False,False,False,False,False,True,True
138 | 1772,False,False,False,False,False,False,True,True
139 | 8239,False,False,False,False,False,False,True,True
140 | 11113,False,False,False,False,False,False,True,True
141 | 4437,False,False,True,False,False,False,True,True
142 | 3844,False,False,True,False,False,False,True,True
143 | 2931,False,False,True,False,False,False,True,True
144 | 314,False,False,False,True,False,False,False,True
145 | 3603,False,False,True,False,False,False,True,True
146 | 6514,False,False,True,False,False,False,True,True
147 | 3337,False,False,False,False,False,False,True,True
148 | 3244,False,False,False,False,False,False,True,True
149 | 1529,False,False,False,False,False,False,False,True
150 | 23901,False,True,False,False,False,False,True,True
151 | 23428,False,True,False,False,False,False,True,True
152 | 1616,False,False,False,False,False,False,False,False
153 | 1164,False,False,False,False,False,False,True,False
154 | 16654,False,False,False,False,False,False,False,False
155 | 78,False,False,False,False,False,False,True,False
156 | 2550,False,False,False,False,False,False,False,False
157 | 2452,False,False,False,False,False,False,False,False
158 | 1737,False,False,False,False,False,False,False,False
159 | 1542,False,False,False,False,False,False,False,False
160 | 1107,False,False,False,False,False,False,False,False
161 | 1641,False,False,False,False,False,False,False,False
162 | 1713,False,False,False,False,False,False,False,False
163 | 1737,False,False,False,False,False,False,False,False
164 | 1074,False,False,False,False,False,False,False,False
165 | 4451,True,False,False,False,False,False,False,False
166 | 21592,False,False,False,False,False,False,False,False
167 | 9581,False,False,False,False,False,False,False,False
168 | 4086,False,False,False,False,False,False,False,False
169 | 11073,False,False,False,False,False,False,False,False
170 | 1456,False,False,False,False,False,False,False,False
171 | 720,False,False,False,False,False,False,False,False
172 | 2422,False,False,False,False,False,False,False,False
173 | 3766,False,True,False,False,False,False,False,False
174 | 1477,False,False,False,False,False,False,False,False
175 | 2712,False,False,False,False,False,False,False,False
176 | 2295,False,False,False,False,False,False,False,False
177 | 2841,False,False,False,False,False,False,False,False
178 | 3427,False,False,False,False,False,False,False,False
179 | 2872,False,False,False,False,False,False,True,False
180 | 2490,False,False,False,False,False,False,False,False
181 | 677,False,False,False,False,False,False,False,False
182 | 2283,False,False,False,False,False,False,False,False
183 | 528,False,False,False,False,False,False,False,False
184 | 3338,False,False,False,False,False,False,True,False
185 | 2983,False,False,False,False,False,False,True,False
186 | 0,False,False,False,False,False,False,False,False
187 | 4790,True,True,False,False,False,False,True,False
188 | 0,False,False,False,False,False,False,True,False
189 | 25289,False,False,False,False,False,False,False,False
190 | 58,False,False,False,False,False,False,True,False
191 | 1947,False,False,True,False,False,False,True,False
192 | 883,False,False,False,False,False,False,False,False
193 | 855,False,False,False,False,False,False,False,False
194 | 2035,False,False,False,False,False,False,False,False
195 | 2424,False,False,False,False,False,False,False,False
196 | 605,False,False,False,False,False,True,False,False
197 | 612,False,False,False,False,False,True,False,False
198 | 617,False,False,False,False,False,True,False,False
199 | 615,False,False,False,False,False,True,False,False
200 | 617,False,False,False,False,False,True,False,False
201 | 3975,False,False,False,False,False,False,False,False
202 | 616,False,False,False,False,False,True,False,False
203 | 6167,False,False,False,False,False,False,False,False
204 | 615,False,False,False,False,False,True,False,False
205 | 613,False,False,False,False,False,True,False,False
206 | 607,False,False,False,False,False,True,False,False
207 | 636,False,False,False,False,False,True,False,False
208 | 633,False,False,False,False,False,True,False,False
209 | 635,False,False,False,False,False,True,False,False
210 | 3017,False,False,False,False,False,False,False,False
211 | 3017,False,False,False,False,False,False,False,False
212 | 637,False,False,False,False,False,True,False,False
213 | 644,False,False,False,False,False,True,False,False
214 | 18824,False,False,False,False,False,False,False,False
215 | 647,False,False,False,False,False,True,False,False
216 | 9982,False,False,False,False,False,False,False,False
217 | 1333,False,False,False,False,False,False,False,False
218 | 9158,False,False,False,False,False,False,False,False
219 | 645,False,False,False,False,False,True,False,False
220 | 9573,False,False,False,False,False,False,False,False
221 | 20297,False,False,False,False,False,False,False,False
222 | 10222,False,False,False,False,False,False,False,False
223 | 647,False,False,False,False,False,True,False,False
224 | 7078,False,False,False,False,False,False,False,False
225 | 646,False,False,False,False,False,True,False,False
226 | 3926,False,False,False,False,False,False,False,False
227 | 1276,False,False,False,False,False,False,False,False
228 | 4223,False,False,False,False,False,False,False,False
229 | 645,False,False,False,False,False,True,False,False
230 | 3627,False,False,False,False,False,False,False,False
231 | 1148,False,False,False,False,False,False,False,False
232 | 3825,False,False,False,False,False,False,False,False
233 | 644,False,False,False,False,False,True,False,False
234 | 18682,False,False,False,False,False,False,False,False
235 | 638,False,False,False,False,False,True,False,False
236 | 5736,False,False,False,False,False,False,False,False
237 | 9269,False,False,False,False,False,False,False,False
238 | 9326,False,False,False,False,False,False,False,False
239 | 11111,False,False,False,False,False,False,False,False
240 | 636,False,False,False,False,False,True,False,False
241 | 3200,False,False,False,False,False,False,False,False
242 | 11024,False,False,False,False,False,False,False,False
243 | 633,False,False,False,False,False,True,False,False
244 | 14645,False,False,False,False,False,False,False,False
245 | 14231,False,False,False,False,False,False,False,False
246 | 635,False,False,False,False,False,True,False,False
247 | 2439,False,False,False,False,False,False,False,False
248 | 2439,False,False,False,False,False,False,False,False
249 | 14950,False,False,False,False,False,False,False,False
250 | 25518,False,False,False,False,False,False,False,False
251 | 17292,False,False,False,False,False,False,False,False
252 | 22354,False,False,False,False,False,False,False,False
253 | 637,False,False,False,False,False,True,False,False
254 | 30169,False,False,False,False,False,False,False,False
255 | 2464,False,False,False,False,False,False,False,False
256 | 535,False,False,False,False,False,False,False,False
257 | 31044,False,False,False,False,False,False,False,False
258 | 37546,False,False,False,False,False,False,False,False
259 | 2463,False,False,False,False,False,False,False,False
260 | 22601,False,False,False,False,False,False,False,False
261 | 15373,False,False,False,False,False,False,False,False
262 | 11570,False,False,False,False,False,False,False,False
263 | 2812,False,False,False,False,False,False,False,False
264 | 21623,False,False,False,False,False,False,False,False
265 | 18536,False,False,False,False,False,False,False,False
266 | 23076,False,False,False,False,False,False,False,False
267 | 2464,False,False,False,False,False,False,False,False
268 | 13046,False,False,False,False,False,False,False,False
269 | 12586,False,False,False,False,False,False,False,False
270 | 2459,False,False,False,False,False,False,False,False
271 | 15562,False,False,False,False,False,False,False,False
272 | 2460,False,False,False,False,False,False,False,False
273 | 28681,False,False,False,False,False,False,False,False
274 | 3185,False,False,False,False,False,False,False,False
275 | 33824,False,False,False,False,False,False,False,False
276 | 2455,False,False,False,False,False,False,False,False
277 | 27622,False,False,False,False,False,False,False,False
278 | 25305,False,False,False,False,False,False,False,False
279 | 2452,False,False,False,False,False,False,False,False
280 | 3248,False,False,False,False,False,False,False,False
281 | 501,False,False,False,False,False,False,False,False
282 | 14414,False,False,False,False,False,False,False,False
283 | 14563,False,False,False,False,False,False,False,False
284 | 20847,False,False,False,False,False,False,False,False
285 | 2450,False,False,False,False,False,False,False,False
286 | 20246,False,False,False,False,False,False,False,False
287 | 21494,False,False,False,False,False,False,False,False
288 | 15083,False,False,False,False,False,False,False,False
289 | 2448,False,False,False,False,False,False,False,False
290 | 13933,False,False,False,False,False,False,False,False
291 | 11642,False,False,False,False,False,False,False,False
292 | 2456,False,False,False,False,False,False,False,False
293 | 2450,False,False,False,False,False,False,False,False
294 | 2456,False,False,False,False,False,False,False,False
295 | 19151,False,False,False,False,False,False,False,False
296 | 20385,False,False,False,False,False,False,False,False
297 | 2453,False,False,False,False,False,False,False,False
298 | 12549,False,False,False,False,False,False,False,False
299 | 27430,False,False,False,False,False,False,False,False
300 | 11894,False,False,False,False,False,False,False,False
301 | 25131,False,False,False,False,False,False,False,False
302 |
--------------------------------------------------------------------------------
/session-2/example_1-gmail_ETL.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "773f2b77-a3c6-47b8-8246-1b958555f4af",
6 | "metadata": {},
7 | "source": [
8 | "# ETL of Gmail Data\n",
9 | "## ABB #1 - Session 2\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "62058d9d-adc2-4a6f-a60e-ff2abca7d93b",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "aa6c91d1-058e-49a7-8296-75bb925c2e8b",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import imaplib\n",
30 | "import email\n",
31 | "from email.policy import default\n",
32 | "import pandas as pd\n",
33 | "\n",
34 | "from top_secret import app_password"
35 | ]
36 | },
37 | {
38 | "cell_type": "markdown",
39 | "id": "7e99a8de",
40 | "metadata": {},
41 | "source": [
42 | "### functions"
43 | ]
44 | },
45 | {
46 | "cell_type": "code",
47 | "execution_count": 2,
48 | "id": "eb2bc87b-3641-4953-88de-261aaabd48ca",
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "def parse_emails(mail, mailbox_name):\n",
53 | " \"\"\"\n",
54 |     " Parse emails from a given mailbox and return a list of dicts with each email's subject, sender, and body.\n",
55 | " \"\"\"\n",
56 | "\n",
57 | " # Step 2: Select the folder\n",
58 | " mail.select(mailbox_name) # 'inbox' or 'spam'\n",
59 | "\n",
60 | " # Step 3: Search for emails\n",
61 | " _, messages = mail.search(None, \"ALL\")\n",
62 | "\n",
63 | " # Step 4: Extract emails\n",
64 | " # Fetch and parse the emails\n",
65 | " email_data = []\n",
66 | " for msg_id in messages[0].split():\n",
67 | " _, msg_data = mail.fetch(msg_id, \"(RFC822)\")\n",
68 | " \n",
69 | " for response_part in msg_data:\n",
70 | " if isinstance(response_part, tuple):\n",
71 | " msg = email.message_from_bytes(response_part[1], policy=default)\n",
72 | " subject = msg[\"subject\"]\n",
73 | " from_ = msg[\"from\"]\n",
74 | " if msg.is_multipart():\n",
75 | " body = \"\"\n",
76 | " for part in msg.walk():\n",
77 | " if part.get_content_type() == \"text/plain\":\n",
78 | " body += part.get_payload(decode=True).decode('utf-8', errors='ignore')\n",
79 | " else:\n",
80 | " body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')\n",
81 | "\n",
82 | " email_data.append({\n",
83 | " \"subject\": subject,\n",
84 | " \"from\": from_,\n",
85 | " \"body\": body\n",
86 | " })\n",
87 | "\n",
88 | " return email_data"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 3,
94 | "id": "4300d448",
95 | "metadata": {},
96 | "outputs": [],
97 | "source": [
98 | "def extract_folder_names(mail, keyword):\n",
99 | " \"\"\"\n",
100 | " Extract folder names containing a given keyword.\n",
101 | " \"\"\"\n",
102 | " # Get list of folders\n",
103 | " _, folders = mail.list()\n",
104 | "\n",
105 |     " # Extract folder names containing the given keyword\n",
106 | " folder_list = []\n",
107 | " for folder in folders:\n",
108 | " # Decode from bytes to string and split by quotes\n",
109 | " folder_str = folder.decode('utf-8')\n",
110 | " # Extract folder name (it's between the last quote marks)\n",
111 | " folder_name = folder_str.split('\"')[-2]\n",
112 | " \n",
113 | " if keyword in folder_name:\n",
114 | " folder_list.append(folder_name)\n",
115 | "\n",
116 | " return folder_list"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 4,
122 | "id": "dd1396b8",
123 | "metadata": {},
124 | "outputs": [],
125 | "source": [
126 | "def parse_emails_from_keyword(mail, num_emails, keyword, label):\n",
127 | " \"\"\"\n",
128 |     " Parse emails from all folders whose names contain a given keyword; return a dataframe with the emails and a label column.\n",
129 | " \"\"\"\n",
130 | "\n",
131 | " folder_list = extract_folder_names(mail, keyword)\n",
132 | "\n",
133 | " email_data = []\n",
134 | " for mailbox_name in folder_list:\n",
135 | " try:\n",
136 | " email_data = email_data + parse_emails(mail, mailbox_name)\n",
137 |     " except Exception:\n",
138 | " print(f\"Could not parse {mailbox_name}\")\n",
139 | "\n",
140 | " if len(email_data) > num_emails:\n",
141 | " break\n",
142 | "\n",
143 | " df = pd.DataFrame(email_data)\n",
144 | " df[\"label\"] = label\n",
145 | " \n",
146 | " return df"
147 | ]
148 | },
149 | {
150 | "cell_type": "markdown",
151 | "id": "6f0e9770-2ec3-47ea-bd42-884c53a40918",
152 | "metadata": {},
153 | "source": [
154 | "### Extract"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 5,
160 | "id": "6b44d835",
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "data": {
165 | "text/plain": [
166 | "('OK', [b'swcballa@gmail.com authenticated (Success)'])"
167 | ]
168 | },
169 | "execution_count": 5,
170 | "metadata": {},
171 | "output_type": "execute_result"
172 | }
173 | ],
174 | "source": [
175 | "# define email details\n",
176 | "imap_server = \"imap.gmail.com\"\n",
177 | "email_address = \"your email here\"\n",
178 | "\n",
179 | "# establish connection to the server\n",
180 | "mail = imaplib.IMAP4_SSL(imap_server)\n",
181 | "mail.login(email_address, app_password)"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 6,
187 | "id": "39722d2a",
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "Could not parse Personal/Family/Amoo Ali\n",
195 | "254\n"
196 | ]
197 | }
198 | ],
199 | "source": [
200 | "# extract personal emails\n",
201 | "num_emails = 150\n",
202 | "keyword = \"Personal\"\n",
203 | "label = \"personal\"\n",
204 | "\n",
205 | "df_personal = parse_emails_from_keyword(mail, num_emails, keyword, label)\n",
206 | "print(len(df_personal))"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 7,
212 | "id": "dcb6e0c1",
213 | "metadata": {},
214 | "outputs": [
215 | {
216 | "name": "stdout",
217 | "output_type": "stream",
218 | "text": [
219 | "Could not parse STV/Amazon Assoc\n",
220 | "160\n"
221 | ]
222 | }
223 | ],
224 | "source": [
225 | "# extract non-personal emails\n",
226 | "keyword = \"STV\"\n",
227 | "label = \"not personal\"\n",
228 | "\n",
229 | "df_not_personal = parse_emails_from_keyword(mail, num_emails, keyword, label)\n",
230 | "print(len(df_not_personal))"
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 8,
236 | "id": "2aec83d4",
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "data": {
241 | "text/plain": [
242 | "('BYE', [b'LOGOUT Requested'])"
243 | ]
244 | },
245 | "execution_count": 8,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "# log out of the server\n",
252 | "mail.logout()"
253 | ]
254 | },
255 | {
256 | "cell_type": "markdown",
257 | "id": "4ed3e5b4",
258 | "metadata": {},
259 | "source": [
260 | "### Transform"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 9,
266 | "id": "35ea97fc",
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "name": "stdout",
271 | "output_type": "stream",
272 | "text": [
273 | "300\n"
274 | ]
275 | }
276 | ],
277 | "source": [
278 | "# merge emails into a single dataframe\n",
279 | "df = pd.concat([df_personal[:num_emails], df_not_personal[:num_emails]])\n",
280 | "print(len(df))"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 10,
286 | "id": "9534cb4a",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "# feature engineering\n",
291 | "\n",
292 | "# length of the body\n",
293 | "df[\"body_length\"] = df[\"body\"].apply(len)\n",
294 | "\n",
295 | "# contains \"lol\"\n",
296 | "df[\"contains_lol\"] = df[\"body\"].apply(lambda x: \"lol\" in x.lower())\n",
297 | "\n",
298 | "# contains \"OMG\"\n",
299 | "df[\"contains_omg\"] = df[\"body\"].apply(lambda x: \"omg\" in x.lower())\n",
300 | "\n",
301 | "# contains \"attached\" or \"see attached\" or \"see attachment\"\n",
302 | "df[\"contains_attached\"] = df[\"body\"].apply(lambda x: \"attached\" in x.lower())\n",
303 | "df[\"contains_attachment\"] = df[\"body\"].apply(lambda x: \"attachment\" in x.lower())\n",
304 | "\n",
305 | "# contains \"Order Confirmation\"\n",
306 | "df[\"contains_order_confirmation\"] = df[\"body\"].apply(lambda x: \"order confirmation\" in x.lower())\n",
307 | "\n",
308 | "# contains \"payment summary\"\n",
309 | "df[\"contains_payment_summary\"] = df[\"body\"].apply(lambda x: \"payment summary\" in x.lower())\n",
310 | "\n",
311 |     "# sender has a common personal email domain (gmail, yahoo, hotmail)\n",
312 | "df[\"sender_has_common_domain\"] = df[\"from\"].apply(lambda x: any(domain in x.lower() for domain in [\"gmail\", \"yahoo\", \"hotmail\"]))\n",
313 | "\n",
314 | "# is personal email\n",
315 | "df[\"is_personal\"] = df[\"label\"] == \"personal\""
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 11,
321 | "id": "e7c1e97f",
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
421 | "text/plain": [
422 | " body_length contains_lol contains_omg contains_attached \\\n",
423 | "0 1830 False False False \n",
424 | "1 1522 False False False \n",
425 | "2 1838 False False False \n",
426 | "3 251 False False False \n",
427 | "4 3170 False False False \n",
428 | "\n",
429 | " contains_attachment contains_order_confirmation contains_payment_summary \\\n",
430 | "0 False False False \n",
431 | "1 False False False \n",
432 | "2 False False False \n",
433 | "3 True False False \n",
434 | "4 False False False \n",
435 | "\n",
436 | " sender_has_common_domain is_personal \n",
437 | "0 False True \n",
438 | "1 True True \n",
439 | "2 False True \n",
440 | "3 False True \n",
441 | "4 False True "
442 | ]
443 | },
444 | "execution_count": 11,
445 | "metadata": {},
446 | "output_type": "execute_result"
447 | }
448 | ],
449 | "source": [
450 |     "# drop the raw columns (subject, from, body, label), keeping only the engineered features\n",
451 | "df_transformed = df.iloc[:, 4:]\n",
452 | "df_transformed.head()"
453 | ]
454 | },
455 | {
456 | "cell_type": "markdown",
457 | "id": "3658bda0",
458 | "metadata": {},
459 | "source": [
460 | "### Load"
461 | ]
462 | },
463 | {
464 | "cell_type": "code",
465 | "execution_count": 12,
466 | "id": "55644e43",
467 | "metadata": {},
468 | "outputs": [],
469 | "source": [
470 | "# save to csv\n",
471 | "df_transformed.to_csv(\"data/transformed_data.csv\", index=False)"
472 | ]
473 | },
474 | {
475 | "cell_type": "code",
476 | "execution_count": null,
477 | "id": "e36b6ea9",
478 | "metadata": {},
479 | "outputs": [],
480 | "source": []
481 | }
482 | ],
483 | "metadata": {
484 | "kernelspec": {
485 | "display_name": "Python 3 (ipykernel)",
486 | "language": "python",
487 | "name": "python3"
488 | },
489 | "language_info": {
490 | "codemirror_mode": {
491 | "name": "ipython",
492 | "version": 3
493 | },
494 | "file_extension": ".py",
495 | "mimetype": "text/x-python",
496 | "name": "python",
497 | "nbconvert_exporter": "python",
498 | "pygments_lexer": "ipython3",
499 | "version": "3.9.12"
500 | }
501 | },
502 | "nbformat": 4,
503 | "nbformat_minor": 5
504 | }
505 |
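The notebook above ends by writing data/transformed_data.csv. A minimal sketch of a downstream baseline on those saved features (assuming scikit-learn, which the requirements.txt below already lists; the CSV columns match the Load step above):

# quick baseline on the engineered features
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

df = pd.read_csv("data/transformed_data.csv")
X = df.drop(columns=["is_personal"])  # boolean/numeric feature columns
y = df["is_personal"]                 # target: personal vs. not personal

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print(f"holdout accuracy: {accuracy_score(y_test, clf.predict(X_test)):.3f}")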
--------------------------------------------------------------------------------
/session-2/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipykernel
3 | pandas
4 | scikit-learn
5 | matplotlib
6 | numpy
7 |
--------------------------------------------------------------------------------
/session-2/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-2/slides.pdf
--------------------------------------------------------------------------------
/session-2/top_secret.py:
--------------------------------------------------------------------------------
1 | app_password = "your app password here"
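2 | # note: for Gmail this should be an app password (typically created under Google Account > Security, with 2-step verification enabled), not the regular account password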
--------------------------------------------------------------------------------
/session-3/data/transformed_data.csv:
--------------------------------------------------------------------------------
1 | body_length,contains_lol,contains_omg,contains_attached,contains_attachment,contains_order_confirmation,contains_payment_summary,sender_has_common_domain,is_personal,label_0shot,correct_0shot,label_fewshot,correct_fewshot
2 | 1830,False,False,False,False,False,False,False,True,False,False,False,False
3 | 1522,False,False,False,False,False,False,True,True,True,True,True,True
4 | 1838,False,False,False,False,False,False,False,True,False,False,False,False
5 | 251,False,False,False,True,False,False,False,True,False,False,False,False
6 | 3170,False,False,False,False,False,False,False,True,False,False,False,False
7 | 2782,False,False,False,False,False,False,False,True,False,False,False,False
8 | 11026,False,False,False,False,False,False,False,True,False,False,False,False
9 | 7025,False,False,False,False,False,False,False,True,False,False,False,False
10 | 3265,False,False,False,True,False,False,True,True,False,False,True,True
11 | 3007,False,False,False,True,False,False,False,True,False,False,False,False
12 | 1954,False,False,False,False,False,False,True,True,False,False,True,True
13 | 1600,False,False,False,False,False,False,False,True,False,False,False,False
14 | 1841,False,False,False,False,False,False,False,True,False,False,True,True
15 | 1841,False,False,False,False,False,False,False,True,False,False,True,True
16 | 2562,False,False,False,False,False,False,True,True,False,False,False,False
17 | 201,False,False,False,False,False,False,False,True,False,False,False,False
18 | 8520,False,False,False,False,False,False,False,True,False,False,False,False
19 | 7454,False,False,False,False,False,False,False,True,False,False,False,False
20 | 8520,False,False,False,False,False,False,False,True,False,False,False,False
21 | 8520,False,False,False,False,False,False,False,True,False,False,False,False
22 | 201,False,False,False,False,False,False,False,True,False,False,False,False
23 | 201,False,False,False,False,False,False,False,True,False,False,False,False
24 | 3801,False,False,False,False,False,False,False,True,False,False,False,False
25 | 1068,False,False,False,False,False,False,False,True,True,True,True,True
26 | 1593,False,False,False,False,False,False,False,True,True,True,True,True
27 | 1955,False,False,False,False,False,False,False,True,True,True,True,True
28 | 4499,False,False,False,False,False,False,False,True,True,True,True,True
29 | 5088,False,False,False,False,False,False,False,True,True,True,True,True
30 | 57838,False,False,False,False,False,False,False,True,False,False,True,True
31 | 4420,False,False,False,False,False,False,False,True,True,True,True,True
32 | 384,True,False,True,False,False,False,True,True,False,False,True,True
33 | 298,False,False,True,False,False,False,True,True,False,False,False,False
34 | 845,False,False,False,False,False,False,True,True,False,False,True,True
35 | 115,False,False,False,False,False,False,True,True,False,False,True,True
36 | 2,False,False,False,False,False,False,True,True,False,False,True,True
37 | 1376,False,False,False,False,False,False,True,True,False,False,False,False
38 | 1146,False,False,False,False,False,False,True,True,False,False,False,False
39 | 822,True,False,False,True,False,False,True,True,False,False,False,False
40 | 6847,False,False,True,False,False,False,True,True,False,False,False,False
41 | 6323,False,False,False,False,False,False,True,True,False,False,False,False
42 | 4230,False,False,False,False,False,False,True,True,False,False,False,False
43 | 2020,False,False,False,True,False,False,True,True,False,False,False,False
44 | 1608,False,False,False,False,False,False,True,True,False,False,False,False
45 | 1119,False,False,False,False,False,False,False,True,False,False,False,False
46 | 773,False,False,False,False,False,False,True,True,False,False,False,False
47 | 640,False,False,False,False,False,False,False,True,False,False,False,False
48 | 2246,False,False,False,False,False,False,False,True,False,False,False,False
49 | 1865,False,False,True,False,False,False,False,True,False,False,False,False
50 | 4720,False,False,False,False,False,False,False,True,False,False,False,False
51 | 16327,False,False,True,False,False,False,False,True,False,False,False,False
52 | 6165,False,False,False,True,False,False,True,True,False,False,False,False
53 | 1931,False,False,False,False,False,False,False,True,False,False,True,True
54 | 325,False,False,False,False,False,False,True,True,False,False,False,False
55 | 926,False,False,False,False,False,False,True,True,True,True,True,True
56 | 776,False,False,False,False,False,False,True,True,False,False,False,False
57 | 260,False,False,False,True,False,False,True,True,False,False,False,False
58 | 1405,True,False,False,False,False,False,True,True,False,False,False,False
59 | 1338,True,False,False,False,False,False,False,True,False,False,False,False
60 | 13259,False,False,False,False,False,False,True,True,False,False,False,False
61 | 3,False,False,False,False,False,False,False,True,False,False,False,False
62 | 53,False,False,False,False,False,False,True,True,True,True,True,True
63 | 2040,False,False,False,False,False,False,True,True,False,False,True,True
64 | 14690,False,False,False,False,False,False,True,True,False,False,False,False
65 | 4830,False,False,True,False,False,False,True,True,False,False,True,True
66 | 3466,False,False,False,False,False,False,True,True,False,False,False,False
67 | 3181,False,False,False,False,False,False,True,True,False,False,False,False
68 | 10438,False,False,False,False,False,False,True,True,False,False,False,False
69 | 1470,False,False,False,True,True,False,True,True,False,False,False,False
70 | 1321,False,False,False,False,False,False,True,True,False,False,True,True
71 | 580,False,False,False,False,False,False,True,True,False,False,True,True
72 | 3,False,False,False,False,False,False,False,True,False,False,False,False
73 | 17408,False,False,False,False,False,False,True,True,True,True,True,True
74 | 16971,False,False,False,False,False,False,True,True,True,True,True,True
75 | 22849,False,False,False,False,False,False,True,True,False,False,True,True
76 | 728,False,False,False,False,False,False,True,True,False,False,True,True
77 | 599,False,False,False,False,False,False,True,True,False,False,True,True
78 | 480,False,False,False,False,False,False,True,True,False,False,True,True
79 | 367,False,False,False,False,False,False,True,True,False,False,True,True
80 | 4,False,False,False,False,False,False,True,True,True,True,True,True
81 | 1552,False,False,False,False,False,False,True,True,False,False,False,False
82 | 331,False,False,False,False,False,False,False,True,False,False,False,False
83 | 85,False,False,False,False,False,False,True,True,True,True,True,True
84 | 34,False,False,False,False,False,False,True,True,True,True,True,True
85 | 1247,False,False,False,False,False,False,True,True,False,False,True,True
86 | 3027,False,False,False,False,False,False,True,True,False,False,True,True
87 | 23190,True,False,False,False,False,False,True,True,False,False,False,False
88 | 14691,False,False,False,False,False,False,True,True,False,False,True,True
89 | 23705,False,False,False,False,False,False,True,True,True,True,True,True
90 | 1371,False,False,False,False,False,False,True,True,False,False,True,True
91 | 1373,False,False,False,False,False,False,True,True,False,False,True,True
92 | 21076,False,False,False,False,True,False,True,True,False,False,False,False
93 | 2640,False,False,False,False,True,False,False,True,False,False,False,False
94 | 3511,False,False,False,False,False,False,False,True,False,False,False,False
95 | 27,False,False,False,False,False,False,True,True,True,True,True,True
96 | 2534,False,False,False,False,False,False,True,True,False,False,False,False
97 | 4220,False,False,False,False,False,False,True,True,False,False,False,False
98 | 4041,False,False,False,False,False,False,True,True,False,False,False,False
99 | 1073,False,False,False,False,False,False,False,True,True,True,True,True
100 | 11,False,False,False,False,False,False,True,True,True,True,True,True
101 | 85,False,False,False,False,False,False,True,True,True,True,True,True
102 | 471,False,False,True,False,False,False,True,True,False,False,False,False
103 | 381,False,False,True,False,False,False,True,True,False,False,True,True
104 | 211,False,False,False,False,False,False,True,True,False,False,True,True
105 | 5258,False,False,False,False,False,False,True,True,False,False,True,True
106 | 4017,False,False,False,False,False,False,True,True,True,True,True,True
107 | 17,False,False,False,False,False,False,True,True,True,True,True,True
108 | 4884,False,False,False,False,True,False,True,True,False,False,False,False
109 | 5083,False,False,False,False,False,False,True,True,False,False,False,False
110 | 5301,False,False,False,False,False,False,True,True,False,False,False,False
111 | 4791,False,False,False,False,False,False,True,True,False,False,False,False
112 | 4577,False,False,False,False,False,False,True,True,False,False,False,False
113 | 7449,False,False,False,False,False,False,False,True,False,False,False,False
114 | 861,False,False,False,False,False,False,True,True,True,True,True,True
115 | 501,False,False,False,False,False,False,True,True,True,True,True,True
116 | 390,False,False,False,False,False,False,True,True,True,True,True,True
117 | 284,False,False,False,True,False,False,False,True,False,False,False,False
118 | 258,False,False,False,True,False,False,False,True,False,False,False,False
119 | 532,False,False,False,False,False,False,True,True,True,True,True,True
120 | 360,False,False,False,False,False,False,True,True,True,True,True,True
121 | 654,False,False,False,False,False,False,True,True,True,True,True,True
122 | 127,False,False,False,False,False,False,True,True,False,False,True,True
123 | 31,False,False,False,False,False,False,True,True,True,True,True,True
124 | 1323,False,False,False,False,False,False,True,True,False,False,True,True
125 | 1289,False,False,False,False,False,False,True,True,False,False,True,True
126 | 4,False,False,False,False,False,False,True,True,False,False,True,True
127 | 3313,False,False,False,False,False,False,True,True,True,True,True,True
128 | 305,False,False,False,False,False,False,True,True,False,False,True,True
129 | 201,False,False,False,False,False,False,True,True,False,False,False,False
130 | 409,False,False,False,False,False,False,True,True,True,True,True,True
131 | 3035,True,False,False,False,False,False,True,True,True,True,True,True
132 | 2857,False,False,False,False,False,False,True,True,True,True,True,True
133 | 2624,False,False,False,False,False,False,True,True,True,True,True,True
134 | 2375,False,False,False,False,False,False,True,True,True,True,True,True
135 | 17911,False,False,False,False,False,False,True,True,False,False,False,False
136 | 257,False,False,False,True,False,False,False,True,True,True,True,True
137 | 492,False,False,False,False,False,False,True,True,False,False,False,False
138 | 8092,False,False,False,False,False,False,True,True,False,False,False,False
139 | 1772,False,False,False,False,False,False,True,True,False,False,False,False
140 | 8239,False,False,False,False,False,False,True,True,False,False,False,False
141 | 11113,False,False,False,False,False,False,True,True,False,False,False,False
142 | 4437,False,False,True,False,False,False,True,True,False,False,False,False
143 | 3844,False,False,True,False,False,False,True,True,False,False,False,False
144 | 2931,False,False,True,False,False,False,True,True,False,False,False,False
145 | 314,False,False,False,True,False,False,False,True,False,False,False,False
146 | 3603,False,False,True,False,False,False,True,True,False,False,False,False
147 | 6514,False,False,True,False,False,False,True,True,False,False,False,False
148 | 3337,False,False,False,False,False,False,True,True,False,False,True,True
149 | 3244,False,False,False,False,False,False,True,True,False,False,True,True
150 | 1529,False,False,False,False,False,False,False,True,False,False,False,False
151 | 23901,False,True,False,False,False,False,True,True,False,False,True,True
152 | 1616,False,False,False,False,False,False,False,False,False,True,False,True
153 | 1164,False,False,False,False,False,False,True,False,False,True,False,True
154 | 16654,False,False,False,False,False,False,False,False,False,True,False,True
155 | 78,False,False,False,False,False,False,True,False,False,True,False,True
156 | 2550,False,False,False,False,False,False,False,False,False,True,False,True
157 | 2452,False,False,False,False,False,False,False,False,False,True,False,True
158 | 1737,False,False,False,False,False,False,False,False,False,True,False,True
159 | 1542,False,False,False,False,False,False,False,False,False,True,False,True
160 | 1107,False,False,False,False,False,False,False,False,False,True,False,True
161 | 1641,False,False,False,False,False,False,False,False,False,True,False,True
162 | 1713,False,False,False,False,False,False,False,False,False,True,False,True
163 | 1737,False,False,False,False,False,False,False,False,False,True,False,True
164 | 1074,False,False,False,False,False,False,False,False,False,True,False,True
165 | 4451,True,False,False,False,False,False,False,False,False,True,False,True
166 | 21592,False,False,False,False,False,False,False,False,False,True,False,True
167 | 9581,False,False,False,False,False,False,False,False,False,True,False,True
168 | 4086,False,False,False,False,False,False,False,False,False,True,False,True
169 | 11073,False,False,False,False,False,False,False,False,False,True,False,True
170 | 1456,False,False,False,False,False,False,False,False,False,True,False,True
171 | 720,False,False,False,False,False,False,False,False,False,True,False,True
172 | 2422,False,False,False,False,False,False,False,False,False,True,False,True
173 | 3766,False,True,False,False,False,False,False,False,False,True,False,True
174 | 1477,False,False,False,False,False,False,False,False,False,True,False,True
175 | 2712,False,False,False,False,False,False,False,False,False,True,False,True
176 | 2295,False,False,False,False,False,False,False,False,False,True,False,True
177 | 2841,False,False,False,False,False,False,False,False,False,True,False,True
178 | 3427,False,False,False,False,False,False,False,False,False,True,False,True
179 | 2872,False,False,False,False,False,False,True,False,False,True,False,True
180 | 2490,False,False,False,False,False,False,False,False,False,True,False,True
181 | 677,False,False,False,False,False,False,False,False,False,True,False,True
182 | 2283,False,False,False,False,False,False,False,False,False,True,False,True
183 | 528,False,False,False,False,False,False,False,False,False,True,False,True
184 | 3338,False,False,False,False,False,False,True,False,False,True,False,True
185 | 2983,False,False,False,False,False,False,True,False,False,True,False,True
186 | 3,False,False,False,False,False,False,False,False,False,True,False,True
187 | 4790,True,True,False,False,False,False,True,False,False,True,True,False
188 | 3,False,False,False,False,False,False,True,False,False,True,False,True
189 | 25289,False,False,False,False,False,False,False,False,False,True,False,True
190 | 58,False,False,False,False,False,False,True,False,False,True,True,False
191 | 1947,False,False,True,False,False,False,True,False,False,True,False,True
192 | 883,False,False,False,False,False,False,False,False,False,True,False,True
193 | 855,False,False,False,False,False,False,False,False,False,True,False,True
194 | 2035,False,False,False,False,False,False,False,False,False,True,False,True
195 | 2424,False,False,False,False,False,False,False,False,False,True,False,True
196 | 605,False,False,False,False,False,True,False,False,False,True,False,True
197 | 612,False,False,False,False,False,True,False,False,False,True,False,True
198 | 617,False,False,False,False,False,True,False,False,False,True,False,True
199 | 615,False,False,False,False,False,True,False,False,False,True,False,True
200 | 617,False,False,False,False,False,True,False,False,False,True,False,True
201 | 3975,False,False,False,False,False,False,False,False,False,True,False,True
202 | 616,False,False,False,False,False,True,False,False,False,True,False,True
203 | 6167,False,False,False,False,False,False,False,False,False,True,False,True
204 | 615,False,False,False,False,False,True,False,False,False,True,False,True
205 | 613,False,False,False,False,False,True,False,False,False,True,False,True
206 | 607,False,False,False,False,False,True,False,False,False,True,False,True
207 | 636,False,False,False,False,False,True,False,False,False,True,False,True
208 | 633,False,False,False,False,False,True,False,False,False,True,False,True
209 | 635,False,False,False,False,False,True,False,False,False,True,False,True
210 | 3017,False,False,False,False,False,False,False,False,False,True,False,True
211 | 3017,False,False,False,False,False,False,False,False,False,True,False,True
212 | 637,False,False,False,False,False,True,False,False,False,True,False,True
213 | 644,False,False,False,False,False,True,False,False,False,True,False,True
214 | 18824,False,False,False,False,False,False,False,False,False,True,False,True
215 | 647,False,False,False,False,False,True,False,False,False,True,False,True
216 | 9982,False,False,False,False,False,False,False,False,False,True,False,True
217 | 1333,False,False,False,False,False,False,False,False,False,True,False,True
218 | 9158,False,False,False,False,False,False,False,False,False,True,False,True
219 | 645,False,False,False,False,False,True,False,False,False,True,False,True
220 | 9573,False,False,False,False,False,False,False,False,False,True,False,True
221 | 20297,False,False,False,False,False,False,False,False,False,True,False,True
222 | 10222,False,False,False,False,False,False,False,False,False,True,False,True
223 | 647,False,False,False,False,False,True,False,False,False,True,False,True
224 | 7078,False,False,False,False,False,False,False,False,False,True,False,True
225 | 646,False,False,False,False,False,True,False,False,False,True,False,True
226 | 3926,False,False,False,False,False,False,False,False,False,True,False,True
227 | 1276,False,False,False,False,False,False,False,False,False,True,False,True
228 | 4223,False,False,False,False,False,False,False,False,False,True,False,True
229 | 645,False,False,False,False,False,True,False,False,False,True,False,True
230 | 3627,False,False,False,False,False,False,False,False,False,True,False,True
231 | 1148,False,False,False,False,False,False,False,False,False,True,False,True
232 | 3825,False,False,False,False,False,False,False,False,False,True,False,True
233 | 644,False,False,False,False,False,True,False,False,False,True,False,True
234 | 18682,False,False,False,False,False,False,False,False,False,True,False,True
235 | 638,False,False,False,False,False,True,False,False,False,True,False,True
236 | 5736,False,False,False,False,False,False,False,False,False,True,False,True
237 | 9269,False,False,False,False,False,False,False,False,False,True,False,True
238 | 9326,False,False,False,False,False,False,False,False,False,True,False,True
239 | 11111,False,False,False,False,False,False,False,False,False,True,False,True
240 | 636,False,False,False,False,False,True,False,False,False,True,False,True
241 | 3200,False,False,False,False,False,False,False,False,False,True,False,True
242 | 11024,False,False,False,False,False,False,False,False,False,True,False,True
243 | 633,False,False,False,False,False,True,False,False,False,True,False,True
244 | 14645,False,False,False,False,False,False,False,False,False,True,False,True
245 | 14231,False,False,False,False,False,False,False,False,False,True,False,True
246 | 635,False,False,False,False,False,True,False,False,False,True,False,True
247 | 2439,False,False,False,False,False,False,False,False,False,True,False,True
248 | 2439,False,False,False,False,False,False,False,False,False,True,False,True
249 | 14950,False,False,False,False,False,False,False,False,False,True,False,True
250 | 25518,False,False,False,False,False,False,False,False,False,True,False,True
251 | 17292,False,False,False,False,False,False,False,False,False,True,False,True
252 | 22354,False,False,False,False,False,False,False,False,False,True,False,True
253 | 637,False,False,False,False,False,True,False,False,False,True,False,True
254 | 30169,False,False,False,False,False,False,False,False,False,True,False,True
255 | 2464,False,False,False,False,False,False,False,False,False,True,False,True
256 | 535,False,False,False,False,False,False,False,False,False,True,False,True
257 | 31044,False,False,False,False,False,False,False,False,False,True,False,True
258 | 37546,False,False,False,False,False,False,False,False,False,True,False,True
259 | 2463,False,False,False,False,False,False,False,False,False,True,False,True
260 | 22601,False,False,False,False,False,False,False,False,False,True,False,True
261 | 15373,False,False,False,False,False,False,False,False,False,True,False,True
262 | 11570,False,False,False,False,False,False,False,False,False,True,False,True
263 | 2812,False,False,False,False,False,False,False,False,False,True,False,True
264 | 21623,False,False,False,False,False,False,False,False,False,True,False,True
265 | 18536,False,False,False,False,False,False,False,False,False,True,False,True
266 | 23076,False,False,False,False,False,False,False,False,False,True,False,True
267 | 2464,False,False,False,False,False,False,False,False,False,True,False,True
268 | 13046,False,False,False,False,False,False,False,False,False,True,False,True
269 | 12586,False,False,False,False,False,False,False,False,False,True,False,True
270 | 2459,False,False,False,False,False,False,False,False,False,True,False,True
271 | 15562,False,False,False,False,False,False,False,False,False,True,False,True
272 | 2460,False,False,False,False,False,False,False,False,False,True,False,True
273 | 28681,False,False,False,False,False,False,False,False,False,True,False,True
274 | 3185,False,False,False,False,False,False,False,False,False,True,False,True
275 | 33824,False,False,False,False,False,False,False,False,False,True,False,True
276 | 2455,False,False,False,False,False,False,False,False,False,True,False,True
277 | 27622,False,False,False,False,False,False,False,False,False,True,False,True
278 | 25305,False,False,False,False,False,False,False,False,False,True,False,True
279 | 2452,False,False,False,False,False,False,False,False,False,True,False,True
280 | 3248,False,False,False,False,False,False,False,False,False,True,False,True
281 | 501,False,False,False,False,False,False,False,False,False,True,False,True
282 | 14414,False,False,False,False,False,False,False,False,False,True,False,True
283 | 14563,False,False,False,False,False,False,False,False,False,True,False,True
284 | 20847,False,False,False,False,False,False,False,False,False,True,False,True
285 | 2450,False,False,False,False,False,False,False,False,False,True,False,True
286 | 20246,False,False,False,False,False,False,False,False,False,True,False,True
287 | 21494,False,False,False,False,False,False,False,False,False,True,False,True
288 | 15083,False,False,False,False,False,False,False,False,False,True,False,True
289 | 2448,False,False,False,False,False,False,False,False,False,True,False,True
290 | 13933,False,False,False,False,False,False,False,False,False,True,False,True
291 | 11642,False,False,False,False,False,False,False,False,False,True,False,True
292 | 2456,False,False,False,False,False,False,False,False,False,True,False,True
293 | 2450,False,False,False,False,False,False,False,False,False,True,False,True
294 | 2456,False,False,False,False,False,False,False,False,False,True,False,True
295 | 19151,False,False,False,False,False,False,False,False,False,True,False,True
296 | 20385,False,False,False,False,False,False,False,False,False,True,False,True
297 | 2453,False,False,False,False,False,False,False,False,False,True,False,True
298 | 12549,False,False,False,False,False,False,False,False,False,True,False,True
299 | 27430,False,False,False,False,False,False,False,False,False,True,False,True
300 | 11894,False,False,False,False,False,False,False,False,False,True,False,True
301 | 25131,False,False,False,False,False,False,False,False,False,True,False,True
302 |
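This file records zero-shot and few-shot predictions side by side (label_0shot/correct_0shot vs. label_fewshot/correct_fewshot). A quick sketch for comparing the two prompting strategies (assumes pandas, which reads the True/False values as booleans):

# compare prompting strategies from the saved results
import pandas as pd

df = pd.read_csv("data/transformed_data.csv")
print(f"zero-shot accuracy: {df['correct_0shot'].mean():.3f}")
print(f"few-shot accuracy:  {df['correct_fewshot'].mean():.3f}")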
--------------------------------------------------------------------------------
/session-3/example_1-paper_summarizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "5a4ef775-778d-4c0f-b901-65df2ba5caf5",
6 | "metadata": {},
7 | "source": [
8 | "# Summarizing Research Papers with GPT-4o\n",
9 | "## ABB #1 - Session 3\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "ddf11ab2-418e-4ea3-a3b8-2d2a09232c88",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "c2b91857-1b75-4b5e-ac03-0bb05bf9e75f",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import fitz # PyMuPDF\n",
30 | "from IPython.display import Markdown, display\n",
31 | "from openai import OpenAI\n",
32 | "from top_secret import my_sk"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": null,
38 | "id": "4db523c7-0b98-402d-91c4-d5e4c21c2d1a",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# setup api client\n",
43 | "client = OpenAI(api_key=my_sk)"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "84d88996-64c0-43b0-af2a-0a4af4f46152",
49 | "metadata": {},
50 | "source": [
51 | "### 1) Extract text"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 2,
57 | "id": "c06f8b84-23cf-49c3-a024-ebf98b1b8666",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "filepath = \"papers/attention-is-all-you-need.pdf\""
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 3,
67 | "id": "e42881f0-32bf-4733-809a-b290118c9760",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "pdf = fitz.open(filepath)\n",
72 | "text = \"\".join([page.get_text() for page in pdf])"
73 | ]
74 | },
75 | {
76 | "cell_type": "markdown",
77 | "id": "eb264250-d62c-472b-ad3e-21bbd263551e",
78 | "metadata": {},
79 | "source": [
80 | "### 2) Write prompt"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 4,
86 | "id": "4aaab4a9-cfe5-4488-87b3-c173fc96320c",
87 | "metadata": {},
88 | "outputs": [],
89 | "source": [
90 | "prompt = f\"\"\"# System Role: Research Paper Summarizer\n",
91 | "\n",
92 | "Your task is to act as an academic summarizer, condensing research papers into concise, clear, and accessible summaries. Follow these guidelines:\n",
93 | "\n",
94 | "## Structure\n",
95 | "\n",
96 | "Always use the following structure for your summary:\n",
97 | "\n",
98 | "- **Title**: The paper's title.\n",
99 | "- **Authors**: Key authors and their affiliations (if available).\n",
100 | "- **Abstract**: A paraphrased version of the paper's abstract.\n",
101 | "- **Objective**: The research goal or question being addressed.\n",
102 | "- **Methodology**: Key methods or experiments used.\n",
103 | "- **Findings**: Main results or insights.\n",
104 | "- **Conclusion**: The authors' conclusion and implications.\n",
105 | "- **Relevance**: Why this research is significant or what problem it addresses.\n",
106 | "\n",
107 | "## Style\n",
108 | "\n",
109 | "- Write in a neutral and academic tone.\n",
110 | "- Use simple, precise language to ensure clarity for a broad audience.\n",
111 | "\n",
112 | "## Length\n",
113 | "\n",
114 | "- Keep summaries concise (150-300 words) unless otherwise specified.\n",
115 | "\n",
116 | "## Audience\n",
117 | "\n",
118 | "- Assume the audience has general technical knowledge but may not be familiar with the specific field of the paper.\n",
119 | "\n",
120 | "## Special Instructions\n",
121 | "\n",
122 | "- If specific sections of the paper are missing, skip them and note it clearly (e.g., \"Authors section not specified\").\n",
123 | "- Avoid subjective opinions or interpretations beyond the paper's content.\n",
124 | "\n",
125 | "## Citations\n",
126 | "\n",
127 | "- Provide clear citations in APA format (if needed).\n",
128 | "\n",
129 | "---\n",
130 | "\n",
131 | "You are summarizing the following research paper:\n",
132 | "\n",
133 | "{text}\n",
134 | "\"\"\""
135 | ]
136 | },
137 | {
138 | "cell_type": "markdown",
139 | "id": "9bc17b34-940f-4ff3-ad67-3b2fd7ee5a48",
140 | "metadata": {},
141 | "source": [
142 | "### 3) Summarize Paper with GPT-4o"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 7,
148 | "id": "2a27209d-5711-4eab-9828-97d301d17bdf",
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "- **Title**: Attention Is All You Need\n",
156 | "- **Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin (Google Brain, Google Research, University of Toronto)\n",
157 | "- **Abstract**: This paper introduces the Transformer, a new architecture for sequence transduction that relies entirely on attention mechanisms, eliminating the need for recurrence and convolutions. The Transformer demonstrates superior performance in machine translation tasks, achieving state-of-the-art BLEU scores for English-to-German and English-to-French translations, while also being more efficient in training time and parallelization.\n",
158 | "\n",
159 | "- **Objective**: The primary goal of this research is to propose a novel model architecture for sequence transduction that improves upon existing recurrent and convolutional models by leveraging attention mechanisms exclusively.\n",
160 | "\n",
161 | "- **Methodology**: The Transformer model is structured with an encoder-decoder architecture, where both components utilize multi-head self-attention and feed-forward neural networks. The authors conducted experiments on the WMT 2014 English-to-German and English-to-French translation tasks, comparing the performance and training efficiency of the Transformer against existing state-of-the-art models.\n",
162 | "\n",
163 | "- **Findings**: The Transformer achieved a BLEU score of 28.4 for English-to-German and 41.8 for English-to-French translations, surpassing previous models by significant margins and requiring less training time. The model also showed strong generalization to other tasks, such as English constituency parsing.\n",
164 | "\n",
165 | "- **Conclusion**: The authors conclude that the Transformer model, by relying solely on attention mechanisms, not only achieves state-of-the-art performance in translation tasks but also offers advantages in training speed and efficiency. They express optimism about the potential applications of attention-based models in various domains beyond text.\n",
166 | "\n",
167 | "- **Relevance**: This research is significant as it introduces a transformative approach to sequence modeling, addressing limitations of traditional recurrent and convolutional networks, and lays the groundwork for further advancements in natural language processing and other fields requiring sequence transduction.\n",
168 | "\n",
169 | "**Citation**: Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, Ł., & Polosukhin, I. (2023). Attention Is All You Need. 31st Conference on Neural Information Processing Systems (NIPS 2017). arXiv:1706.03762v7 [cs.CL].\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "# make api call\n",
175 | "response = client.chat.completions.create(\n",
176 | " model=\"gpt-4o-mini\",\n",
177 | " messages=[\n",
178 | " {\"role\": \"system\", \"content\": \"Research paper summarizer.\"},\n",
179 | " {\"role\": \"user\", \"content\": prompt}\n",
180 | " ], \n",
181 | " temperature = 0.5\n",
182 | ")\n",
183 | "\n",
184 | "# extract response\n",
185 | "summary = response.choices[0].message.content\n",
186 | "print(summary)"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "id": "d8e89e07-f2d5-4eec-8f83-ba8da7fb2772",
192 | "metadata": {},
193 | "source": [
194 | "### 4) Display Markdown"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 9,
200 | "id": "a058e3ea-480e-4f61-96cf-ddfced60c94b",
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/markdown": [
206 | "- **Title**: Attention Is All You Need\n",
207 | "- **Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin (Google Brain, Google Research, University of Toronto)\n",
208 | "- **Abstract**: This paper introduces the Transformer, a new architecture for sequence transduction that relies entirely on attention mechanisms, eliminating the need for recurrence and convolutions. The Transformer demonstrates superior performance in machine translation tasks, achieving state-of-the-art BLEU scores for English-to-German and English-to-French translations, while also being more efficient in training time and parallelization.\n",
209 | "\n",
210 | "- **Objective**: The primary goal of this research is to propose a novel model architecture for sequence transduction that improves upon existing recurrent and convolutional models by leveraging attention mechanisms exclusively.\n",
211 | "\n",
212 | "- **Methodology**: The Transformer model is structured with an encoder-decoder architecture, where both components utilize multi-head self-attention and feed-forward neural networks. The authors conducted experiments on the WMT 2014 English-to-German and English-to-French translation tasks, comparing the performance and training efficiency of the Transformer against existing state-of-the-art models.\n",
213 | "\n",
214 | "- **Findings**: The Transformer achieved a BLEU score of 28.4 for English-to-German and 41.8 for English-to-French translations, surpassing previous models by significant margins and requiring less training time. The model also showed strong generalization to other tasks, such as English constituency parsing.\n",
215 | "\n",
216 | "- **Conclusion**: The authors conclude that the Transformer model, by relying solely on attention mechanisms, not only achieves state-of-the-art performance in translation tasks but also offers advantages in training speed and efficiency. They express optimism about the potential applications of attention-based models in various domains beyond text.\n",
217 | "\n",
218 | "- **Relevance**: This research is significant as it introduces a transformative approach to sequence modeling, addressing limitations of traditional recurrent and convolutional networks, and lays the groundwork for further advancements in natural language processing and other fields requiring sequence transduction.\n",
219 | "\n",
220 | "**Citation**: Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, Ł., & Polosukhin, I. (2023). Attention Is All You Need. 31st Conference on Neural Information Processing Systems (NIPS 2017). arXiv:1706.03762v7 [cs.CL]."
221 | ],
222 | "text/plain": [
223 | ""
224 | ]
225 | },
226 | "metadata": {},
227 | "output_type": "display_data"
228 | }
229 | ],
230 | "source": [
231 | "display(Markdown(summary))"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "id": "d0aad5dc-bbc8-4ac9-8421-1e9ae83fcdc4",
238 | "metadata": {},
239 | "outputs": [],
240 | "source": []
241 | }
242 | ],
243 | "metadata": {
244 | "kernelspec": {
245 | "display_name": "Python 3 (ipykernel)",
246 | "language": "python",
247 | "name": "python3"
248 | },
249 | "language_info": {
250 | "codemirror_mode": {
251 | "name": "ipython",
252 | "version": 3
253 | },
254 | "file_extension": ".py",
255 | "mimetype": "text/x-python",
256 | "name": "python",
257 | "nbconvert_exporter": "python",
258 | "pygments_lexer": "ipython3",
259 | "version": "3.12.2"
260 | }
261 | },
262 | "nbformat": 4,
263 | "nbformat_minor": 5
264 | }
265 |
--------------------------------------------------------------------------------
/session-3/example_2-text-classifier.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "773f2b77-a3c6-47b8-8246-1b958555f4af",
6 | "metadata": {},
7 | "source": [
8 | "# Text Classification with GPT-4o\n",
9 | "## ABB #1 - Session 3\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "62058d9d-adc2-4a6f-a60e-ff2abca7d93b",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "aa6c91d1-058e-49a7-8296-75bb925c2e8b",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import pandas as pd\n",
30 | "\n",
31 | "from openai import OpenAI\n",
32 | "from top_secret import my_sk"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "id": "28da37e3-47a3-4fc5-9722-e638d8fdc79e",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "# setup api client\n",
43 | "client = OpenAI(api_key=my_sk)"
44 | ]
45 | },
46 | {
47 | "cell_type": "markdown",
48 | "id": "7e99a8de",
49 | "metadata": {},
50 | "source": [
51 | "### functions"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "id": "7822e88b-1968-4458-ad40-326b7c93a06e",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "def manual_feature_engineering(df):\n",
62 | " \"\"\"\n",
63 | " Generate a suite of manually defined features\n",
64 | " \"\"\"\n",
65 | " \n",
66 | " # length of the body\n",
67 | " df[\"body_length\"] = df[\"body\"].apply(len)\n",
68 | " \n",
69 | " # contains \"lol\"\n",
70 | " df[\"contains_lol\"] = df[\"body\"].apply(lambda x: \"lol\" in x.lower())\n",
71 | " \n",
72 | " # contains \"OMG\"\n",
73 | " df[\"contains_omg\"] = df[\"body\"].apply(lambda x: \"omg\" in x.lower())\n",
74 | " \n",
75 | " # contains \"attached\" or \"see attached\" or \"see attachment\"\n",
76 | " df[\"contains_attached\"] = df[\"body\"].apply(lambda x: \"attached\" in x.lower())\n",
77 | " df[\"contains_attachment\"] = df[\"body\"].apply(lambda x: \"attachment\" in x.lower())\n",
78 | " \n",
79 | " # contains \"Order Confirmation\"\n",
80 | " df[\"contains_order_confirmation\"] = df[\"body\"].apply(lambda x: \"order confirmation\" in x.lower())\n",
81 | " \n",
82 | " # contains \"payment summary\"\n",
83 | " df[\"contains_payment_summary\"] = df[\"body\"].apply(lambda x: \"payment summary\" in x.lower())\n",
84 | " \n",
85 | " # sender is common person domain (gmail, yahoo, hotmail) \n",
86 | " df[\"sender_has_common_domain\"] = df[\"from\"].apply(lambda x: any(domain in x.lower() for domain in [\"gmail\", \"yahoo\", \"hotmail\"]))\n",
87 | " \n",
88 | " # is personal email\n",
89 | " df[\"is_personal\"] = df[\"label\"] == \"personal\"\n",
90 | "\n",
91 | " return df"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 4,
97 | "id": "7c73701a-ae49-4b74-8339-94776fb3d850",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "def generate_label(prompt_template, subject, sender, body):\n",
102 | " \"\"\"\n",
103 | " Function to generate 0-shot label for email based on subject, sender, and body\n",
104 | " \"\"\"\n",
105 | " prompt = prompt_template(subject, sender, body)\n",
106 | " \n",
107 | " response = client.chat.completions.create(\n",
108 | " model=\"gpt-4o-mini\",\n",
109 | " messages=[\n",
110 | " {\"role\": \"system\", \"content\": \"Email Classifer\"},\n",
111 | " {\"role\": \"user\", \"content\": prompt}\n",
112 | " ], \n",
113 | " temperature = 0.25,\n",
114 | " max_completion_tokens=25,\n",
115 | " )\n",
116 | " \n",
117 | " # extract response\n",
118 | " return response.choices[0].message.content"
119 | ]
120 | },
121 | {
122 | "cell_type": "markdown",
123 | "id": "6f0e9770-2ec3-47ea-bd42-884c53a40918",
124 | "metadata": {},
125 | "source": [
126 | "### Load Data"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 5,
132 | "id": "3a26e03d-eb6a-4ea3-89b2-335803ea3514",
133 | "metadata": {},
134 | "outputs": [],
135 | "source": [
136 | "df = pd.read_csv(\"data/emails.csv\").astype(str)"
137 | ]
138 | },
139 | {
140 | "cell_type": "markdown",
141 | "id": "44ad81b2-2a15-4165-a47d-42307b87aad3",
142 | "metadata": {},
143 | "source": [
144 | "### Feature Engineering (Manual)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 6,
150 | "id": "9534cb4a",
151 | "metadata": {},
152 | "outputs": [
153 | {
154 | "data": {
155 | "text/html": [
156 | "\n",
157 | "\n",
170 | "
\n",
171 | " \n",
172 | " \n",
173 | " | \n",
174 | " subject | \n",
175 | " from | \n",
176 | " body | \n",
177 | " label | \n",
178 | " body_length | \n",
179 | " contains_lol | \n",
180 | " contains_omg | \n",
181 | " contains_attached | \n",
182 | " contains_attachment | \n",
183 | " contains_order_confirmation | \n",
184 | " contains_payment_summary | \n",
185 | " sender_has_common_domain | \n",
186 | " is_personal | \n",
187 | "
\n",
188 | " \n",
189 | " \n",
190 | " \n",
191 | " 0 | \n",
192 | " Thank you Shawhin for your RSVP | \n",
193 | " Evite <info@mailva.evite.com> | \n",
194 | " Thank you for your RSVP\\n\\n\\nYou replied Yes f... | \n",
195 | " personal | \n",
196 | " 1830 | \n",
197 | " False | \n",
198 | " False | \n",
199 | " False | \n",
200 | " False | \n",
201 | " False | \n",
202 | " False | \n",
203 | " False | \n",
204 | " True | \n",
205 | "
\n",
206 | " \n",
207 | " 1 | \n",
208 | " Find Date for Knocking at Door | \n",
209 | " ifyahuna@gmail.com | \n",
210 | " So you're gonna talk to your dad about this ri... | \n",
211 | " personal | \n",
212 | " 1522 | \n",
213 | " False | \n",
214 | " False | \n",
215 | " False | \n",
216 | " False | \n",
217 | " False | \n",
218 | " False | \n",
219 | " True | \n",
220 | " True | \n",
221 | "
\n",
222 | " \n",
223 | " 2 | \n",
224 | " Thank you Shawhin for your RSVP | \n",
225 | " Evite <info@mailva.evite.com> | \n",
226 | " Thank you for your RSVP\\n\\n\\nYou replied Yes f... | \n",
227 | " personal | \n",
228 | " 1838 | \n",
229 | " False | \n",
230 | " False | \n",
231 | " False | \n",
232 | " False | \n",
233 | " False | \n",
234 | " False | \n",
235 | " False | \n",
236 | " True | \n",
237 | "
\n",
238 | " \n",
239 | " 3 | \n",
240 | " Folder shared with you: \"Knocking at The Door\" | \n",
241 | " \"Ifeoma Ahuna (via Google Drive)\" <drive-share... | \n",
242 | " I've shared an item with you:\\r\\n\\r\\nKnocking ... | \n",
243 | " personal | \n",
244 | " 251 | \n",
245 | " False | \n",
246 | " False | \n",
247 | " False | \n",
248 | " True | \n",
249 | " False | \n",
250 | " False | \n",
251 | " False | \n",
252 | " True | \n",
253 | "
\n",
254 | " \n",
255 | " 4 | \n",
256 | " The Colony Shoreline Trail 5K and 15K Registra... | \n",
257 | " RunSignup <info+auto@runsignup.com> | \n",
258 | " [1]The Colony Shoreline Trail 5K and 15K\\r\\n\\r... | \n",
259 | " personal | \n",
260 | " 3170 | \n",
261 | " False | \n",
262 | " False | \n",
263 | " False | \n",
264 | " False | \n",
265 | " False | \n",
266 | " False | \n",
267 | " False | \n",
268 | " True | \n",
269 | "
\n",
270 | " \n",
271 | "
\n",
272 | "
"
273 | ],
274 | "text/plain": [
275 | " subject \\\n",
276 | "0 Thank you Shawhin for your RSVP \n",
277 | "1 Find Date for Knocking at Door \n",
278 | "2 Thank you Shawhin for your RSVP \n",
279 | "3 Folder shared with you: \"Knocking at The Door\" \n",
280 | "4 The Colony Shoreline Trail 5K and 15K Registra... \n",
281 | "\n",
282 | " from \\\n",
283 | "0 Evite \n",
284 | "1 ifyahuna@gmail.com \n",
285 | "2 Evite \n",
286 | "3 \"Ifeoma Ahuna (via Google Drive)\" \n",
288 | "\n",
289 | " body label body_length \\\n",
290 | "0 Thank you for your RSVP\\n\\n\\nYou replied Yes f... personal 1830 \n",
291 | "1 So you're gonna talk to your dad about this ri... personal 1522 \n",
292 | "2 Thank you for your RSVP\\n\\n\\nYou replied Yes f... personal 1838 \n",
293 | "3 I've shared an item with you:\\r\\n\\r\\nKnocking ... personal 251 \n",
294 | "4 [1]The Colony Shoreline Trail 5K and 15K\\r\\n\\r... personal 3170 \n",
295 | "\n",
296 | " contains_lol contains_omg contains_attached contains_attachment \\\n",
297 | "0 False False False False \n",
298 | "1 False False False False \n",
299 | "2 False False False False \n",
300 | "3 False False False True \n",
301 | "4 False False False False \n",
302 | "\n",
303 | " contains_order_confirmation contains_payment_summary \\\n",
304 | "0 False False \n",
305 | "1 False False \n",
306 | "2 False False \n",
307 | "3 False False \n",
308 | "4 False False \n",
309 | "\n",
310 | " sender_has_common_domain is_personal \n",
311 | "0 False True \n",
312 | "1 True True \n",
313 | "2 False True \n",
314 | "3 False True \n",
315 | "4 False True "
316 | ]
317 | },
318 | "execution_count": 6,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "df = manual_feature_engineering(df)\n",
325 | "df.head()"
326 | ]
327 | },
328 | {
329 | "cell_type": "markdown",
330 | "id": "eb8e8fff-cee3-4bed-a875-ff63aa37bf90",
331 | "metadata": {},
332 | "source": [
333 | "### Text Classification with GPT-4o"
334 | ]
335 | },
336 | {
337 | "cell_type": "markdown",
338 | "id": "198a79f1-f15d-4ab5-a529-ba7b959fbfae",
339 | "metadata": {},
340 | "source": [
341 | "#### 0-shot"
342 | ]
343 | },
344 | {
345 | "cell_type": "code",
346 | "execution_count": 7,
347 | "id": "5f5a8ec7-3c7b-4051-a77b-b3cf82709b0d",
348 | "metadata": {},
349 | "outputs": [],
350 | "source": [
351 | "# prompt\n",
352 | "prompt_template = lambda subject, sender, body : f\"\"\"You are an intelligent assistant that classifies emails based on whether they are personal or not. \\\n",
353 | "Given an email's subject, sender, and body, determine if the email is personal (indicated by 1) or not personal (indicated by 0). A personal email typically \\\n",
354 | "includes messages from friends, family, or individuals addressing personal topics. Non-personal emails include promotional content, work-related messages, \\\n",
355 | "newsletters, or automated notifications.\n",
356 | "\n",
357 | "Input Email:\n",
358 | "Subject: {subject}\n",
359 | "Sender: {sender}\n",
360 | "Body: {body}\n",
361 | "\n",
362 | "Instructions:\n",
363 | "Carefully analyze the subject, sender, and body to understand the context and tone of the email.\n",
364 | "Return:\n",
365 | "1 if the email is personal.\n",
366 | "0 if the email is not personal.\n",
367 | "\n",
368 | "Output: [Your classification: 1 or 0]\n",
369 | "\"\"\""
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": 8,
375 | "id": "c9fd5623-1ccb-4ecd-8abd-2f4a4f3fda12",
376 | "metadata": {},
377 | "outputs": [
378 | {
379 | "name": "stdout",
380 | "output_type": "stream",
381 | "text": [
382 | "CPU times: user 3.56 s, sys: 176 ms, total: 3.74 s\n",
383 | "Wall time: 2min 38s\n"
384 | ]
385 | }
386 | ],
387 | "source": [
388 | "%%time\n",
389 | "# intialize list to store labels\n",
390 | "label_0shot_list = []\n",
391 | "\n",
392 | "# generate labels for each row\n",
393 | "for index, row in df.iterrows():\n",
394 | " label_0shot_list.append(generate_label(prompt_template, row['subject'], row['from'], row['body']))"
395 | ]
396 | },
397 | {
398 | "cell_type": "code",
399 | "execution_count": 9,
400 | "id": "27b40dde-2b69-4630-8a8a-840410ea8dc1",
401 | "metadata": {},
402 | "outputs": [
403 | {
404 | "name": "stdout",
405 | "output_type": "stream",
406 | "text": [
407 | "['0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '[Your classification: 0]', '1', '0', '0', '0', '[Your classification: 1]', '1', '1', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '1', 'Output: 1', '1', '0', '0', '1', '1', '1', '1', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '[Your classification: 0]', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '[Your classification: 0]', '0', '0', '0', '0', '0', '0', '0']\n"
408 | ]
409 | }
410 | ],
411 | "source": [
412 | "print(label_0shot_list)"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 10,
418 | "id": "21a66448-37a3-40d7-aae7-895869fd68a9",
419 | "metadata": {},
420 | "outputs": [],
421 | "source": [
422 | "# add label to df\n",
423 | "df['label_0shot'] = [\"1\" in label for label in label_0shot_list]\n",
424 | "df['correct_0shot'] = df['is_personal']==df['label_0shot']"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 11,
430 | "id": "369f4883-5fc0-449b-a3e0-22e0aca7666a",
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "name": "stdout",
435 | "output_type": "stream",
436 | "text": [
437 | "0.63\n"
438 | ]
439 | }
440 | ],
441 | "source": [
442 | "# compare ground truth to 0-shot label\n",
443 | "print(df['correct_0shot'].sum()/len(df))"
444 | ]
445 | },
446 | {
447 | "cell_type": "markdown",
448 | "id": "9b155e45-72e4-454e-871b-e2d037b3452f",
449 | "metadata": {},
450 | "source": [
451 | "#### few-shot"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 12,
457 | "id": "5c4d4c93-7c5e-46fe-b8d3-7c659fe40d95",
458 | "metadata": {},
459 | "outputs": [],
460 | "source": [
461 | "# prompt\n",
462 | "prompt_template_fewshot = lambda subject, sender, body :f\"\"\"You are an intelligent assistant that classifies emails as personal or not personal. Given an email's subject, sender, and body, determine if the email is personal (indicated by 1) or not personal (indicated by 0). A personal email typically includes messages from friends, family, or individuals addressing personal topics. Non-personal emails include promotional content, work-related messages, newsletters, or automated notifications.\n",
463 | "\n",
464 | "##Instructions:\n",
465 | "Carefully analyze the subject, sender, and body to understand the context and tone of the email.\n",
466 | "Return:\n",
467 | "1 if the email is personal.\n",
468 | "0 if the email is not personal.\n",
469 | "\n",
470 | "##Examples:\n",
471 | "**Example 1:**\n",
472 | "\n",
473 | "Subject: {df['subject'][0]}\n",
474 | "Sender: {df['from'][0]}\n",
475 | "Body: {df['body'][0]}\n",
476 | "Output: {int(df['is_personal'][0])}\n",
477 | "\n",
478 | "**Example 2:**\n",
479 | "\n",
480 | "Subject: {df['subject'][1]}\n",
481 | "Sender: {df['from'][1]}\n",
482 | "Body: {df['body'][1]}\n",
483 | "Output: {int(df['is_personal'][1])}\n",
484 | "\n",
485 | "**Example 3:**\n",
486 | "\n",
487 | "Subject: {df['subject'][150]}\n",
488 | "Sender: {df['from'][150]}\n",
489 | "Body: {df['body'][150]}\n",
490 | "Output: {int(df['is_personal'][150])}\n",
491 | "\n",
492 | "**Input Email:**\n",
493 | "\n",
494 | "Subject: {subject}\n",
495 | "Sender: {sender}\n",
496 | "Body: {body}\n",
497 | "Output: [Your classification: 1 or 0]\n",
498 | "\"\"\""
499 | ]
500 | },
501 | {
502 | "cell_type": "code",
503 | "execution_count": 13,
504 | "id": "d31ad359-a0b7-4503-9e8c-4a8b1d902730",
505 | "metadata": {},
506 | "outputs": [
507 | {
508 | "name": "stdout",
509 | "output_type": "stream",
510 | "text": [
511 | "CPU times: user 4.87 s, sys: 251 ms, total: 5.12 s\n",
512 | "Wall time: 4min 21s\n"
513 | ]
514 | }
515 | ],
516 | "source": [
517 | "%%time\n",
518 | "# intialize list to store labels\n",
519 | "label_fewshot_list = []\n",
520 | "\n",
521 | "# generate labels for each row\n",
522 | "for index, row in df.iterrows():\n",
523 | " label_fewshot_list.append(generate_label(prompt_template_fewshot, row['subject'], row['from'], row['body']))"
524 | ]
525 | },
526 | {
527 | "cell_type": "code",
528 | "execution_count": 14,
529 | "id": "237d3648-d041-481c-872d-a1eab08fa363",
530 | "metadata": {},
531 | "outputs": [
532 | {
533 | "name": "stdout",
534 | "output_type": "stream",
535 | "text": [
536 | "['0', '1', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '0', '0', '1', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']\n"
537 | ]
538 | }
539 | ],
540 | "source": [
541 | "print(label_fewshot_list)"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": 15,
547 | "id": "3b614bce-c2f0-4cd1-bfba-c16e7b243287",
548 | "metadata": {},
549 | "outputs": [],
550 | "source": [
551 | "# add label to df\n",
552 | "df['label_fewshot'] = [\"1\" in label for label in label_fewshot_list]\n",
553 | "df['correct_fewshot'] = df['is_personal']==df['label_fewshot']"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 16,
559 | "id": "aba0703e-4c20-443b-ae09-19abd9ed7747",
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "name": "stdout",
564 | "output_type": "stream",
565 | "text": [
566 | "0.7433333333333333\n"
567 | ]
568 | }
569 | ],
570 | "source": [
571 | "# compare ground truth to 0-shot label\n",
572 | "print(df['correct_fewshot'].sum()/len(df))"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 17,
578 | "id": "e7c1e97f",
579 | "metadata": {},
580 | "outputs": [],
581 | "source": [
582 | "# save data to file\n",
583 | "df_transformed = df.iloc[:, 4:]\n",
584 | "df_transformed.to_csv(\"data/transformed_data.csv\", index=False)"
585 | ]
586 | },
587 | {
588 | "cell_type": "markdown",
589 | "id": "4303b0ae-a9f3-4877-98c0-07c81ae1537c",
590 | "metadata": {},
591 | "source": [
592 | "**Bonus:** train a classifer using noisy labels from GPT-4o-mini using [Example 2](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_2-email_classifier.ipynb) from Session 2"
593 | ]
594 | },
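    | {
    | "cell_type": "markdown",
    | "id": "b7c2a910-3e4f-4d2a-9c1b-2f8e6a5d4c3b",
    | "metadata": {},
    | "source": [
    | "**Added sketch (not in the original session code):** one minimal way to do the bonus, assuming scikit-learn is installed. It fits a logistic regression on the manually engineered features, using the noisy `label_fewshot` column as the training target and the ground-truth `is_personal` column only for evaluation."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "id": "c8d3b021-4f5a-4e3b-8d2c-3a9f7b6e5d4c",
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# minimal sketch of the bonus exercise (assumes scikit-learn is installed)\n",
    | "from sklearn.linear_model import LogisticRegression\n",
    | "from sklearn.model_selection import train_test_split\n",
    | "\n",
    | "# manually engineered features computed earlier in this notebook\n",
    | "feature_cols = [\n",
    | "    \"body_length\", \"contains_lol\", \"contains_omg\", \"contains_attached\",\n",
    | "    \"contains_attachment\", \"contains_order_confirmation\",\n",
    | "    \"contains_payment_summary\", \"sender_has_common_domain\",\n",
    | "]\n",
    | "X = df[feature_cols].astype(float)\n",
    | "y_noisy = df[\"label_fewshot\"]  # noisy LLM-generated labels\n",
    | "\n",
    | "# hold out rows for evaluation\n",
    | "X_train, X_test, y_train, _ = train_test_split(X, y_noisy, test_size=0.2, random_state=42)\n",
    | "\n",
    | "# fit on the noisy labels\n",
    | "clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)\n",
    | "\n",
    | "# evaluate against the ground truth on the held-out rows\n",
    | "accuracy = (clf.predict(X_test) == df.loc[X_test.index, \"is_personal\"]).mean()\n",
    | "print(f\"Accuracy vs ground truth: {accuracy:.3f}\")"
    | ]
    | },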
595 | {
596 | "cell_type": "code",
597 | "execution_count": null,
598 | "id": "b033bcb8-d81c-43c2-af65-6979b5a88e1d",
599 | "metadata": {},
600 | "outputs": [],
601 | "source": []
602 | }
603 | ],
604 | "metadata": {
605 | "kernelspec": {
606 | "display_name": "Python 3 (ipykernel)",
607 | "language": "python",
608 | "name": "python3"
609 | },
610 | "language_info": {
611 | "codemirror_mode": {
612 | "name": "ipython",
613 | "version": 3
614 | },
615 | "file_extension": ".py",
616 | "mimetype": "text/x-python",
617 | "name": "python",
618 | "nbconvert_exporter": "python",
619 | "pygments_lexer": "ipython3",
620 | "version": "3.12.2"
621 | }
622 | },
623 | "nbformat": 4,
624 | "nbformat_minor": 5
625 | }
626 |
--------------------------------------------------------------------------------
/session-3/example_3-local_visual_QA.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "f785df67-ce40-44b5-8375-9af6d3c5420a",
6 | "metadata": {},
7 | "source": [
8 | "# Local Visual QA with LLaMA 3.2 Vision\n",
9 | "## ABB #1 - Session 3\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "ccab25a6-d20f-450e-9981-d43bd29a3181",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "fcc60723-d912-40ed-9b22-260b88d2182e",
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stderr",
30 | "output_type": "stream",
31 | "text": [
32 | "/Users/shawhin/Documents/_code/_stv/ABB/AI-Builders-Bootcamp-1/session-3/s3-env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
33 | " from .autonotebook import tqdm as notebook_tqdm\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "import ollama\n",
39 | "import gradio as gr\n",
40 | "import time"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "id": "cebe3d9d-0759-4832-820f-0d555357bc1f",
46 | "metadata": {},
47 | "source": [
48 | "### basic usage"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 2,
54 | "id": "c72e1dd0-b473-49fe-a1eb-0f3fa8a98340",
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "{'status': 'success'}"
61 | ]
62 | },
63 | "execution_count": 2,
64 | "metadata": {},
65 | "output_type": "execute_result"
66 | }
67 | ],
68 | "source": [
69 | "# pull model\n",
70 | "ollama.pull('llama3.2-vision')"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "id": "53fd54ff-0e11-4a92-aa29-3735be934fb4",
77 | "metadata": {},
78 | "outputs": [
79 | {
80 | "name": "stdout",
81 | "output_type": "stream",
82 | "text": [
83 | "The abstract of the paper states that it proposes a new system for translating text from German to English. The system, known as BLEU (Bilingual Evaluation Understudy), uses deep learning techniques to generate more accurate and natural-sounding translations than existing methods.\n",
84 | "\n",
85 | "**Key Features:**\n",
86 | "\n",
87 | "* **Parallelization**: The model achieves significant speedup by dividing the translation task into smaller sub-tasks that can be processed in parallel.\n",
88 | "* **Improved Accuracy**: The system produces more accurate and natural-sounding translations than existing methods.\n",
89 | "* **Efficient Training**: The model is trained using a combination of supervised and unsupervised learning techniques, which allows it to learn from large amounts of data quickly and efficiently.\n",
90 | "\n",
91 | "**Impact:**\n",
92 | "\n",
93 | "The paper presents a new approach to machine translation that has the potential to significantly improve the accuracy and efficiency of language translation systems. By leveraging deep learning techniques and parallelization, the system can handle complex tasks such as translating text from one language to another while maintaining the nuances and context of the original text."
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "# interact with model (locally)\n",
99 | "stream = ollama.chat(\n",
100 | " model='llama3.2-vision',\n",
101 | " messages=[{\n",
102 | " 'role': 'user',\n",
103 | " 'content': 'What is this paper about?',\n",
104 | " 'images': ['papers/attention-is-all-you-need.png']\n",
105 | " }],\n",
106 | " stream=True,\n",
107 | ")\n",
108 | "\n",
109 | "for chunk in stream:\n",
110 | " print(chunk['message']['content'], end='', flush=True)"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "id": "b402a518-7d2d-4158-93c3-dcddbbf5ce1e",
116 | "metadata": {},
117 | "source": [
118 | "### gradio UI"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "id": "0e3ee81f-4a84-4fca-a2a0-01a8e7783b2e",
125 | "metadata": {},
126 | "outputs": [],
127 | "source": [
128 | "# Function to interact with the Ollama model\n",
129 | "def stream_chat(message, history):\n",
130 | " \"\"\"\n",
131 | " Streams the response from the Ollama model and sends it to the Gradio UI.\n",
132 | " \n",
133 | " Args:\n",
134 | " message (str): The user input message.\n",
135 | " history (list): A list of previous conversation messages.\n",
136 | " \n",
137 | " Yields:\n",
138 | " str: The chatbot's response chunk by chunk.\n",
139 | " \"\"\"\n",
140 | " # Append the user message to the conversation history\n",
141 | " history.append({\"role\": \"user\", \"content\": message[\"text\"], \"images\":message[\"files\"]})\n",
142 | " \n",
143 | " # Initialize streaming from Ollama\n",
144 | " stream = ollama.chat(\n",
145 | " model='llama3.2-vision',\n",
146 | " messages=history, # Full chat history including the current user message\n",
147 | " stream=True,\n",
148 | " )\n",
149 | " \n",
150 | " response_text = \"\"\n",
151 | " for chunk in stream:\n",
152 | " content = chunk['message']['content']\n",
153 | " response_text += content\n",
154 | " yield response_text # Send the response incrementally to the UI\n",
155 | "\n",
156 | " # Append the assistant's full response to the history\n",
157 | " history.append({\"role\": \"assistant\", \"content\": response_text})"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 5,
163 | "id": "2072e9ee-c43b-4702-a79e-29e7ac8f3a91",
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "* Running on local URL: http://127.0.0.1:7860\n",
171 | "\n",
172 | "To create a public link, set `share=True` in `launch()`.\n"
173 | ]
174 | },
175 | {
176 | "data": {
177 | "text/html": [
178 | ""
179 | ],
180 | "text/plain": [
181 | ""
182 | ]
183 | },
184 | "metadata": {},
185 | "output_type": "display_data"
186 | },
187 | {
188 | "data": {
189 | "text/plain": []
190 | },
191 | "execution_count": 5,
192 | "metadata": {},
193 | "output_type": "execute_result"
194 | }
195 | ],
196 | "source": [
197 | "# Create a Gradio ChatInterface\n",
198 | "gr.ChatInterface(\n",
199 | " fn=stream_chat, # The function handling the chat\n",
200 | " type=\"messages\", # Using \"messages\" to enable chat-style conversation\n",
201 | " examples=[{\"text\": \"What is this paper about?\", \"files\": ['papers/attention-is-all-you-need.png']}], # Example inputs\n",
202 | " multimodal=True,\n",
203 | ").launch()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": null,
209 | "id": "d18e4b9a-ebeb-458e-bce5-b4812937febb",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": []
213 | }
214 | ],
215 | "metadata": {
216 | "kernelspec": {
217 | "display_name": "Python 3 (ipykernel)",
218 | "language": "python",
219 | "name": "python3"
220 | },
221 | "language_info": {
222 | "codemirror_mode": {
223 | "name": "ipython",
224 | "version": 3
225 | },
226 | "file_extension": ".py",
227 | "mimetype": "text/x-python",
228 | "name": "python",
229 | "nbconvert_exporter": "python",
230 | "pygments_lexer": "ipython3",
231 | "version": "3.12.2"
232 | }
233 | },
234 | "nbformat": 4,
235 | "nbformat_minor": 5
236 | }
237 |
--------------------------------------------------------------------------------
/session-3/papers/attention-is-all-you-need.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-3/papers/attention-is-all-you-need.pdf
--------------------------------------------------------------------------------
/session-3/papers/attention-is-all-you-need.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-3/papers/attention-is-all-you-need.png
--------------------------------------------------------------------------------
/session-3/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipykernel
3 | PyMuPDF
4 | openai
5 | ollama
6 | gradio
7 |
--------------------------------------------------------------------------------
/session-3/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-3/slides.pdf
--------------------------------------------------------------------------------
/session-3/top_secret.py:
--------------------------------------------------------------------------------
1 | my_sk = "your sk here"
--------------------------------------------------------------------------------
/session-4/data/chunk_embeddings.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-4/data/chunk_embeddings.pt
--------------------------------------------------------------------------------
/session-4/data/survey.csv:
--------------------------------------------------------------------------------
1 | "Timestamp","What is your current level of experience with AI/ML?","How many years of programming experience do you have?","Which programming language(s) are you most comfortable with?","What is your dream outcome for this course?","How many hours per week can you dedicate to this project outside of live sessions?","Is there anything else you'd like to share?"
2 | "2024/10/23 3:50:16 PM CST","1-3 years","4-6 years","Python","Hands on projects, and switch to Data science career path from research","6-10","Really excited about the course"
3 | "2024/10/23 11:10:31 PM CST","1-3 years","4-6 years","Python","Get to know Shaw better! Plus, have a clear path with resources to guide you on where to go and what to do for the future","5",""
4 | "2024/10/24 2:28:16 AM CST","<1 year","<1 year","Python","Fundamentals understanding, Hands on skills, small projects running in diff frameworks, create a small GH portfolio.","It depends if I find it interesting","Answers correspond to specific Ai hands on skills, industry experience around 12 years."
5 | "2024/10/24 3:18:45 AM CST","4-6 years","7+ years","Python","Build multiple AI applications","2",""
6 | "2024/10/24 6:26:56 AM CST","<1 year","7+ years","PHP, Javascript","I would like to learn how to practically setup LLM application in a production environment so that I can start creating some AI web apps for internal and external use. ","10",""
7 | "2024/10/24 2:49:01 PM CST","<1 year","7+ years","Python","Ability to train model based on a data set, and how to do predictive analysis.","20 hours",""
8 | "2024/10/25 3:55:59 AM CST","<1 year","1-3 years","I used to code in a previous life and started learning Python this year (intermediate level) and comfortable in BASH","Develop a good foundation in AI/ML. ","10","Great idea, glad to see it growing! "
9 | "2024/10/26 9:16:00 AM CST","1-3 years","7+ years","python, javascript, java, c++, typescript, goLang","I love to figure out good strategies to use in my software solutions using AI","10",""
10 | "2024/10/30 8:07:45 AM CST","<1 year","7+ years","C#, C++. Java","Learn to decide and advice between different data science archictures and options","4",""
11 | "2024/10/30 8:27:13 AM CST","7+ years","7+ years","Python, C++ (including CUDA C), R, Javascript, PHP","Implement practical AI cases","10",""
12 | "2024/10/30 9:57:49 AM CST","<1 year","7+ years","PHP, some JavaScript","Foundation in AI/ML","8 hrs",""
13 | "2024/10/30 10:10:27 AM CST","<1 year","<1 year","Python","Land a program manager job working with AI product/program","4","do I get help outside of classroom if I'm stuck (coding, etc)?"
14 | "2024/10/31 9:05:28 AM CST","<1 year","1-3 years","Python","Build an app","4",""
15 | "2024/10/31 5:11:27 PM CST","1-3 years","7+ years","c# python","able to create a LLM from scratch","12","no"
16 | "2024/11/04 10:22:23 AM CST","<1 year","<1 year","Python","Get a new job","5",""
17 | "2024/11/04 1:46:30 PM CST","1-3 years","7+ years","Python","At my former job I had a coworker who had the tedious task of making a weekly report summarizing local news for the boss. At the time I was completely sure that was something AI could do, but I didn't have the time nor the knowledge to develop such a thing. I constantly find myself having ideas like this, so a dream outcome would be finishing the course with at least a roadmap for making such an app.","6 to 8 are already on my calendar, but I can do more if I get too excited. ",""
18 | "2024/11/04 2:59:01 PM CST","1-3 years","1-3 years","Python, bash","Build cool products","10",""
19 | "2024/11/05 12:16:31 AM CST","<1 year","<1 year","Python","Be able to be comfortable with GenAi","20",""
20 | "2024/11/05 12:56:29 AM CST","<1 year","1-3 years","Fortran / Algol / Basic -- 30+ years ago :) (but I was pretty good at it)","A certificate to showcase my new skills, A jump start to do my own programing and be able to communicate with programmers","8-10 hours, flexible for more","On a journey from power generation (Caterpillar) sales professional to AI 'something'"
21 | "2024/11/05 5:48:09 AM CST","<1 year","1-3 years","T-SQL","Setup my own environment to compare Machine Learning statistics created by my companies Data Scientists against my own environment.","3",""
22 | "2024/11/05 9:42:43 AM CST","<1 year","1-3 years","HTML, CSS, Python","Be able to create and deploy my own Ai powered apps","4",""
23 | "2024/11/06 10:53:18 AM CST","1-3 years","1-3 years","Python","Build and launch a product","5",""
24 | "2024/11/07 9:30:00 AM CST","<1 year","1-3 years","Python, C","Getting skills to build AI technologies for many projects","8",""
25 | "2024/11/07 11:17:28 AM CST","<1 year","<1 year","Python","Being able to implement llms into projects ","10","No"
26 | "2024/11/07 12:35:01 PM CST","1-3 years","7+ years","Python SQL R","Landing a new job","5-10",""
27 | "2024/11/07 2:08:57 PM CST","1-3 years","7+ years","SAS, SQL, Python, Scala","to deepen my understanding of advanced AI techniques.And I also aim to expand my professional network and collaborate with like-minded individuals to explore new opportunities in the AI landscape.","10 - 15 Hrs per week",""
28 | "2024/11/07 2:35:51 PM CST","<1 year","4-6 years","Python, java, c#","Be able to train a model and have some practical usage of llm.","2",""
29 | "2024/11/07 2:52:33 PM CST","<1 year","7+ years","C#, C++, Java","Learn to identify the ML solution and lead projects based on AI","3",""
30 | "2024/11/07 5:01:38 PM CST","<1 year","7+ years","SQL","Able to put into practice AI/ML for real bunsiess solutions","4",""
31 | "2024/11/07 8:04:05 PM CST","1-3 years","1-3 years","Python","Master AI powered productivity tools to streamline regulatory compliance work","7","Looking for useful practical sample projects implementations that can be adapted in my work - Financial management and regulatory compliance"
32 | "2024/11/07 8:36:26 PM CST","<1 year","7+ years","PHP, SQL","Learn Python related to AI. Implement prototypes.","10",""
33 | "2024/11/08 12:00:22 AM CST","7+ years","7+ years","Python, R but I'm very rusty","1) build GPT that accesses functions and APIs, 2) hands-on fine-tune a model using LoRA 3) write and debug Python code (with AI assistance) that accesses ChatGPT, Perplexity and Google search, 4) build working agent(s) that can output results in less than 10 seconds 5) possibly use a RAG efficiently for booklength PDF texts ","5-10","I probably understand already the concepts you are going to teach, but I want more current hands-on experience as I am an executive --I was a programmer a long time ago. I am very rusty with Python programming and need to relearn it with professional tools (including Github, VSCode or PyCharm, Canvas or Projects) and best practices like using Docker and practicing best security. I need to expertly know how to protect our data and conversations from the foundational models training on it."
34 | "2024/11/08 12:25:02 AM CST","1-3 years","4-6 years","Python, C#, C++","Fully understand AI and LLM, and how to build one using Python","2",""
35 | "2024/11/08 2:03:37 AM CST","<1 year","1-3 years","JavaScript ","Be able to comfortably add AI into daily or business use cases ","10",""
36 | "2024/11/08 9:02:27 AM CST","4-6 years","1-3 years","Pascal","Build my own AI product with the AI services available these days","3","I'd love to network with classmates, and it would be super awesome if we have a slack community to stay in touch!"
37 | "2024/11/08 9:48:30 AM CST","<1 year","<1 year","Python, NextJS","I want to build my own MicroSaaS products","4 - 5","Looking forward to building my own stuff"
38 | "2024/11/15 8:34:32 AM CST","1-3 years","7+ years","python, GoLang, Java, c++, f#","I love to apply ai solution to existing and future back-end projects","10","looking forward to learning deep ai concepts"
--------------------------------------------------------------------------------
/session-4/example_2-blog_semantic_search.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "44353ae1-41a5-4e8e-bdfb-fec1f445f239",
6 | "metadata": {},
7 | "source": [
8 | "# Semantic Search with Text Embeddings\n",
9 | "## ABB #1 - Session 4\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "9e22ee10-b6c8-4e59-babd-366b41f4a357",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "18c3df73-038b-44b9-9bf2-526dc485c311",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import os\n",
30 | "from bs4 import BeautifulSoup\n",
31 | "import json\n",
32 | "from sentence_transformers import SentenceTransformer\n",
33 | "import torch\n",
34 | "from IPython.display import display, Markdown\n",
35 | "from functions import *"
36 | ]
37 | },
38 | {
39 | "cell_type": "markdown",
40 | "id": "3667ac64-c7f7-4a29-9bf4-9eda692145c8",
41 | "metadata": {},
42 | "source": [
43 | "### 1) chunk articles"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "id": "a53d05bc-7ace-455a-a6bd-bdf45116d916",
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Get all HTML files from raw directory\n",
54 | "filename_list = [\"articles/\"+f for f in os.listdir('articles')]\n",
55 | "\n",
56 | "chunk_list = []\n",
57 | "for filename in filename_list:\n",
58 | "\n",
59 | " # read html file\n",
60 | " with open(filename, 'r', encoding='utf-8') as file:\n",
61 | " html_content = file.read()\n",
62 | "\n",
63 | " # Parse HTML\n",
64 | " soup = BeautifulSoup(html_content, 'html.parser')\n",
65 | " \n",
66 | " # Get article title\n",
67 | " article_title = soup.find('title').get_text().strip() if soup.find('title') else \"Untitled\"\n",
68 | " \n",
69 | " # Initialize variables\n",
70 | " article_content = []\n",
71 | " current_section = \"Main\" # Default section if no headers found\n",
72 | " \n",
73 | " # Find all headers and text content\n",
74 | " content_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol'])\n",
75 | "\n",
76 | " # iterate through elements and extract text with metadata\n",
77 | " for element in content_elements:\n",
78 | " if element.name in ['h1', 'h2', 'h3']:\n",
79 | " current_section = element.get_text().strip()\n",
80 | " elif element.name in ['p', 'ul', 'ol']:\n",
81 | " text = element.get_text().strip()\n",
82 | " # Only add non-empty content that's at least 30 characters long\n",
83 | " if text and len(text) >= 30:\n",
84 | " article_content.append({\n",
85 | " 'article_title': article_title,\n",
86 | " 'section': current_section,\n",
87 | " 'text': text\n",
88 | " })\n",
89 | "\n",
90 | " # add article content to list\n",
91 | " chunk_list.extend(article_content)"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 3,
97 | "id": "b70b7fbf-c363-4a5a-84b8-be342a450dd4",
98 | "metadata": {},
99 | "outputs": [],
100 | "source": [
101 | "# save chunk list to file\n",
102 | "filename='data/chunk_list.json'\n",
103 | "with open(filename, 'w', encoding='utf-8') as f:\n",
104 | " json.dump(chunk_list, f, indent=4, ensure_ascii=False)"
105 | ]
106 | },
107 | {
108 | "cell_type": "markdown",
109 | "id": "6e50b033-8519-4b93-b8ac-b05eed841604",
110 | "metadata": {},
111 | "source": [
112 | "### 2) embed chunks"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 4,
118 | "id": "52e1ddfa-8691-4cb0-8eef-74d7dc7db933",
119 | "metadata": {},
120 | "outputs": [
121 | {
122 | "name": "stdout",
123 | "output_type": "stream",
124 | "text": [
125 | "Num chunks: 778\n"
126 | ]
127 | }
128 | ],
129 | "source": [
130 | "# define text to embed\n",
131 | "text_list = []\n",
132 | "for content in chunk_list:\n",
133 | " # concatenate title and section header\n",
134 | " context = content['article_title'] + \" - \" + content['section'] + \": \"\n",
135 | " # append text from paragraph to fill CLIP's 256 sequence limit\n",
136 | " text = context + content['text'][:512-len(context)]\n",
137 | " \n",
138 | " text_list.append(text)\n",
139 | "print(\"Num chunks:\",len(text_list))"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 5,
145 | "id": "d03fe32a-230a-416a-ac7b-a25d85a41b05",
146 | "metadata": {},
147 | "outputs": [],
148 | "source": [
149 | "# load model\n",
150 | "model = SentenceTransformer(\"multi-qa-MiniLM-L6-cos-v1\")\n",
151 | "\n",
152 | "# compute embeddings\n",
153 | "chunk_embeddings = model.encode(text_list)\n",
154 | "chunk_embeddings.shape\n",
155 | "\n",
156 | "# save chunk embeddings to file\n",
157 | "torch.save(chunk_embeddings, 'data/chunk_embeddings.pt')"
158 | ]
159 | },
160 | {
161 | "cell_type": "markdown",
162 | "id": "c9ac9ac6-134e-421c-904e-ec6bf81834db",
163 | "metadata": {},
164 | "source": [
165 | "### 3) semantic search"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 6,
171 | "id": "43e73516-fdfd-4963-ad19-2c3e413ab5a3",
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "torch.Size([1, 778])\n",
179 | "tensor([0.0781, 0.1124, 0.0879, 0.1028, 0.0970])\n"
180 | ]
181 | }
182 | ],
183 | "source": [
184 | "# define query\n",
185 | "query = \"What is a token?\"\n",
186 | "query_embedding = model.encode(query)\n",
187 | "\n",
188 | "# compute similarity between query and all chunks\n",
189 | "similarities = model.similarity(query_embedding, chunk_embeddings)\n",
190 | "print(similarities.shape)\n",
191 | "print(similarities[0][:5])"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 7,
197 | "id": "c3aaf616-c522-45c1-a89b-3af6ae4166bb",
198 | "metadata": {},
199 | "outputs": [],
200 | "source": [
201 | "# define search parameters\n",
202 | "temp = 0.1\n",
203 | "k=3\n",
204 | "threshold = 0.05\n",
205 | "\n",
206 | "# Rescale similarities via softmax\n",
207 | "scores = torch.nn.functional.softmax(similarities/temp, dim=1)\n",
208 | "\n",
209 | "# Get sorted indices and scores\n",
210 | "sorted_indices = scores.argsort(descending=True)[0]\n",
211 | "sorted_scores = scores[0][sorted_indices]\n",
212 | "\n",
213 | "# Filter by threshold and get top k\n",
214 | "filtered_indices = [\n",
215 | " idx.item() for idx, score in zip(sorted_indices, sorted_scores) \n",
216 | " if score.item() >= threshold\n",
217 | "][:k]\n",
218 | "\n",
219 | "# Get corresponding content items and scores\n",
220 | "top_results = [chunk_list[i] for i in filtered_indices]\n",
221 | "result_scores = [scores[0][i].item() for i in filtered_indices]"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "id": "5b9d785f-a1df-40aa-b762-2b79f3748df5",
227 | "metadata": {},
228 | "source": [
229 | "### 4) display results"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 8,
235 | "id": "246b8d0f-beea-406b-81b0-484171fb6231",
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "results_markdown = \"\"\n",
240 | "for i, result in enumerate(top_results, start=1):\n",
241 | " results_markdown += f\"{i}. **Article title:** {result['article_title']} \\n\"\n",
242 | " results_markdown += f\" **Section:** {result['section']} \\n\"\n",
243 | " results_markdown += f\" **Snippet:** {result['text']} \\n\\n\""
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 9,
249 | "id": "97df4231-af86-4dd3-b806-cab95bb4b5d2",
250 | "metadata": {},
251 | "outputs": [
252 | {
253 | "data": {
254 | "text/markdown": [
255 | "1. **Article title:** Cracking Open the OpenAI (Python) API \n",
256 | " **Section:** 2) OpenAI’s (Python) API \n",
257 | " **Snippet:** Tokens, in the context of LLMs, are essentially a set of numbers representing a set of words and characters. For example, “The” could be a token, “ end” (with the space) could be another, and “.” another. \n",
258 | "\n"
259 | ],
260 | "text/plain": [
261 | ""
262 | ]
263 | },
264 | "metadata": {},
265 | "output_type": "display_data"
266 | }
267 | ],
268 | "source": [
269 | "display(Markdown(results_markdown))"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 10,
275 | "id": "b101df6f-93a0-4a32-a362-e067397a9140",
276 | "metadata": {},
277 | "outputs": [
278 | {
279 | "data": {
280 | "text/markdown": [
281 | "1. **Article title:** LLM Fine-tuning — FAQs \n",
282 | " **Section:** RAG vs Fine-tuning? \n",
283 | " **Snippet:** We’ve already mentioned situations where RAG and fine-tuning perform well. However, since this is such a common question, it’s worth reemphasizing when each approach works best. \n",
284 | "\n",
285 | "2. **Article title:** LLM Fine-tuning — FAQs \n",
286 | " **Section:** RAG vs Fine-tuning? \n",
287 | " **Snippet:** RAG is when we inject relevant context into an LLM’s input prompt so that it can generate more helpful responses. For example, if we have a domain-specific knowledge base (e.g., internal company documents and emails), we might identify the items most relevant to the user’s query so that an LLM can synthesize information in an accurate and digestible way. \n",
288 | "\n",
289 | "3. **Article title:** LLM Fine-tuning — FAQs \n",
290 | " **Section:** RAG vs Fine-tuning? \n",
291 | " **Snippet:** Here’s high-level guidance on when to use each. \n",
292 | "\n"
293 | ],
294 | "text/plain": [
295 | ""
296 | ]
297 | },
298 | "metadata": {},
299 | "output_type": "display_data"
300 | }
301 | ],
302 | "source": [
303 | "# bringing it all together\n",
304 | "query = \"What's the difference between RAG and Fine-tuning?\"\n",
305 | "results_markdown = semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=3, threshold=0)\n",
306 | "display(Markdown(results_markdown))"
307 | ]
308 | },
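    | {
    | "cell_type": "markdown",
    | "id": "9d4e8f21-6a7b-4c3d-8e9f-1a2b3c4d5e6f",
    | "metadata": {},
    | "source": [
    | "**Added note:** `semantic_search` is imported from `functions.py`, whose contents are not shown in this listing. A sketch consistent with how the helper is called here, and with the step-by-step cells above, might look like the following."
    | ]
    | },
    | {
    | "cell_type": "code",
    | "execution_count": null,
    | "id": "0a1b2c3d-7e8f-4a5b-9c0d-2e3f4a5b6c7d",
    | "metadata": {},
    | "outputs": [],
    | "source": [
    | "# assumed sketch of the semantic_search helper from functions.py\n",
    | "import torch\n",
    | "\n",
    | "def semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=3, threshold=0.05):\n",
    | "    # embed the query and score it against every chunk\n",
    | "    query_embedding = model.encode(query)\n",
    | "    similarities = model.similarity(query_embedding, chunk_embeddings)\n",
    | "\n",
    | "    # rescale similarities into a probability-like distribution\n",
    | "    scores = torch.nn.functional.softmax(similarities / temp, dim=1)\n",
    | "\n",
    | "    # rank chunks and keep up to k results above the threshold\n",
    | "    sorted_indices = scores.argsort(descending=True)[0]\n",
    | "    results_markdown = \"\"\n",
    | "    for rank, idx in enumerate(sorted_indices[:k], start=1):\n",
    | "        if scores[0][idx].item() < threshold:\n",
    | "            break\n",
    | "        result = chunk_list[idx.item()]\n",
    | "        results_markdown += f\"{rank}. **Article title:** {result['article_title']} \\n\"\n",
    | "        results_markdown += f\"   **Section:** {result['section']} \\n\"\n",
    | "        results_markdown += f\"   **Snippet:** {result['text']} \\n\\n\"\n",
    | "    return results_markdown"
    | ]
    | },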
309 | {
310 | "cell_type": "code",
311 | "execution_count": null,
312 | "id": "390287f1-7ea2-4627-a646-e2ff9101b1e3",
313 | "metadata": {},
314 | "outputs": [],
315 | "source": []
316 | }
317 | ],
318 | "metadata": {
319 | "kernelspec": {
320 | "display_name": "Python 3 (ipykernel)",
321 | "language": "python",
322 | "name": "python3"
323 | },
324 | "language_info": {
325 | "codemirror_mode": {
326 | "name": "ipython",
327 | "version": 3
328 | },
329 | "file_extension": ".py",
330 | "mimetype": "text/x-python",
331 | "name": "python",
332 | "nbconvert_exporter": "python",
333 | "pygments_lexer": "ipython3",
334 | "version": "3.12.2"
335 | }
336 | },
337 | "nbformat": 4,
338 | "nbformat_minor": 5
339 | }
340 |
--------------------------------------------------------------------------------
/session-4/example_3-blog_QA_RAG.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "22c6a7a2-26c9-4e14-aafa-5cf069d02e46",
6 | "metadata": {},
7 | "source": [
8 | "# Article Series QA Assistant with RAG\n",
9 | "## ABB #1 - Session 4\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "bdff7bfa-ebdb-4323-b16a-d69aa948eb5b",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "dd3ebb26-94ac-4c47-a5fc-170c8caa0cd9",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import json\n",
30 | "from sentence_transformers import SentenceTransformer\n",
31 | "import torch\n",
32 | "from IPython.display import display, Markdown\n",
33 | "from functions import *\n",
34 | "\n",
35 | "from openai import OpenAI\n",
36 | "from top_secret import my_sk\n",
37 | "\n",
38 | "import os \n",
39 | "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\""
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "id": "953a58cc-0b6b-4529-a2af-c1f58facd0ae",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "# setup api client\n",
50 | "client = OpenAI(api_key=my_sk)"
51 | ]
52 | },
53 | {
54 | "cell_type": "markdown",
55 | "id": "7dbe890f-5c07-44c9-8afb-dbc819e82660",
56 | "metadata": {},
57 | "source": [
58 | "### load data & model"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 3,
64 | "id": "029c926b-6f42-4af6-816a-e34dc20aab8c",
65 | "metadata": {},
66 | "outputs": [],
67 | "source": [
68 | "# load chunks\n",
69 | "filename = 'data/chunk_list.json'\n",
70 | "with open(filename, 'r', encoding='utf-8') as f:\n",
71 | " chunk_list = json.load(f)\n",
72 | "\n",
73 | "# load embeddings\n",
74 | "chunk_embeddings = torch.load('data/chunk_embeddings.pt', weights_only=False)"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 4,
80 | "id": "07c9b8c5-105a-492e-a14e-fb711e735f29",
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "name": "stdout",
85 | "output_type": "stream",
86 | "text": [
87 | "Num chunks: 778\n",
88 | "(778, 384)\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "print(\"Num chunks:\",len(chunk_list))\n",
94 | "print(chunk_embeddings.shape)"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 5,
100 | "id": "5fff575e-faed-4d42-9c19-c4325f1e8e4e",
101 | "metadata": {},
102 | "outputs": [],
103 | "source": [
104 | "# load model\n",
105 | "model = SentenceTransformer(\"multi-qa-MiniLM-L6-cos-v1\")"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "id": "f109f511-8897-4e69-aa99-30a2ff79b0c9",
111 | "metadata": {},
112 | "source": [
113 | "### 1) define query"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 6,
119 | "id": "6da59e64-ba7b-4150-8615-e23fe9b71a40",
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "# define query\n",
124 | "query = \"When does it make sense to use RAG vs fine-tuning?\""
125 | ]
126 | },
127 | {
128 | "cell_type": "markdown",
129 | "id": "bba55143-e771-4603-98a6-80b9f758d26d",
130 | "metadata": {},
131 | "source": [
132 | "### 2) context retreival"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 7,
138 | "id": "ce377b19-c0ec-4519-9cd1-e119911f109a",
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "results_markdown = semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=10, threshold=0.01)"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 8,
148 | "id": "d16694e7-6dc2-4630-80cb-e1d087e38f7f",
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/markdown": [
154 | "1. **Article title:** LLM Fine-tuning — FAQs \n",
155 | " **Section:** RAG vs Fine-tuning? \n",
156 | " **Snippet:** We’ve already mentioned situations where RAG and fine-tuning perform well. However, since this is such a common question, it’s worth reemphasizing when each approach works best. \n",
157 | "\n",
158 | "2. **Article title:** LLM Fine-tuning — FAQs \n",
159 | " **Section:** RAG vs Fine-tuning? \n",
160 | " **Snippet:** Here’s high-level guidance on when to use each. \n",
161 | "\n",
162 | "3. **Article title:** How to Improve LLMs with RAG \n",
163 | " **Section:** Why we care \n",
164 | " **Snippet:** Previous articles in this series discussed fine-tuning, which adapts an existing model for a particular use case. While this is an alternative way to endow an LLM with specialized knowledge, empirically, fine-tuning seems to be less effective than RAG at doing this [1]. \n",
165 | "\n",
166 | "4. **Article title:** LLM Fine-tuning — FAQs \n",
167 | " **Section:** RAG vs Fine-tuning? \n",
168 | " **Snippet:** RAG is when we inject relevant context into an LLM’s input prompt so that it can generate more helpful responses. For example, if we have a domain-specific knowledge base (e.g., internal company documents and emails), we might identify the items most relevant to the user’s query so that an LLM can synthesize information in an accurate and digestible way. \n",
169 | "\n",
170 | "5. **Article title:** LLM Fine-tuning — FAQs \n",
171 | " **Section:** RAG vs Fine-tuning? \n",
172 | " **Snippet:** Notice that these approaches are not mutually exclusive. In fact, the original RAG system proposed by Facebook researchers used fine-tuning to better use retrieved information for generating responses [4]. \n",
173 | "\n",
174 | "6. **Article title:** How to Improve LLMs with RAG \n",
175 | " **Section:** Some Nuances \n",
176 | " **Snippet:** Document preparation—The quality of a RAG system is driven by how well useful information can be extracted from source documents. For example, if a document is unformatted and full of images and tables, it will be more difficult to parse than a well-formatted text file. \n",
177 | "\n",
178 | "7. **Article title:** How to Improve LLMs with RAG \n",
179 | " **Section:** Some Nuances \n",
180 | " **Snippet:** While the steps for building a RAG system are conceptually simple, several nuances can make building one (in the real world) more complicated. \n",
181 | "\n",
182 | "8. **Article title:** LLM Fine-tuning — FAQs \n",
183 | " **Section:** When NOT to Fine-tune \n",
184 | " **Snippet:** The effectiveness of any approach will depend on the details of the use case. For example, fine-tuning is less effective than retrieval augmented generation (RAG) to provide LLMs with specialized knowledge [1]. \n",
185 | "\n",
186 | "9. **Article title:** How to Improve LLMs with RAG \n",
187 | " **Section:** How it works \n",
188 | " **Snippet:** There are 2 key elements of a RAG system: a retriever and a knowledge base. \n",
189 | "\n",
190 | "10. **Article title:** How to Improve LLMs with RAG \n",
191 | " **Section:** Why we care \n",
192 | " **Snippet:** Notice that RAG does not fundamentally change how we use an LLM; it's still prompt-in and response-out. RAG simply augments this process (hence the name). \n",
193 | "\n"
194 | ],
195 | "text/plain": [
196 | ""
197 | ]
198 | },
199 | "metadata": {},
200 | "output_type": "display_data"
201 | }
202 | ],
203 | "source": [
204 | "display(Markdown(results_markdown))"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "id": "f6442014-564f-4413-9c5f-d2e3b2e55d11",
210 | "metadata": {},
211 | "source": [
212 | "### 3) prompt engineering"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 9,
218 | "id": "48874ba8-97a1-40ab-9364-90c2c7d96203",
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "prompt_template = lambda query, results_markdown : f\"\"\" You are an AI assistant tasked with answering user questions based on excerpts from blog posts. Use the following snippets to \\\n",
223 | "provide accurate, concise, and synthesized answers. If the snippets don’t provide enough information, let the user know and suggest further exploration.\n",
224 | "\n",
225 | "## Question:\n",
226 | "{query}\n",
227 | "\n",
228 | "## Relevant Snippets:\n",
229 | "{results_markdown}\n",
230 | "\n",
231 | "---\n",
232 | "\n",
233 | "## Response:\n",
234 | "Provide a clear and concise response below, synthesizing information from the snippets and referencing them directly. If additional information is \\\n",
235 | "required, suggest further follow-ups or note what’s missing.\n",
236 | "\"\"\""
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 10,
242 | "id": "6022b678-3522-471f-b8b9-0f896e4eedb2",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "prompt = prompt_template(query, results_markdown)\n",
247 | "# print(prompt)"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "d0551c2a-fde3-4d48-a5d7-d7dfb6f362c4",
253 | "metadata": {},
254 | "source": [
255 | "### 4) prompt GPT-4o-mini"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 11,
261 | "id": "ab389a06-8359-49d1-96db-1663c2fdfc0d",
262 | "metadata": {},
263 | "outputs": [],
264 | "source": [
265 | "# make api call\n",
266 | "response = client.chat.completions.create(\n",
267 | " model=\"gpt-4o-mini\",\n",
268 | " messages=[\n",
269 | " {\"role\": \"user\", \"content\": prompt}\n",
270 | " ], \n",
271 | " temperature = 0.5\n",
272 | ")\n",
273 | "\n",
274 | "# extract response\n",
275 | "answer = response.choices[0].message.content"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "id": "18a8da58-78a8-4136-8823-40e576d2115b",
281 | "metadata": {},
282 | "source": [
283 | "### 5) display results"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 12,
289 | "id": "3a4dbc7d-88c6-4a4d-9406-a96f16918f16",
290 | "metadata": {},
291 | "outputs": [
292 | {
293 | "name": "stdout",
294 | "output_type": "stream",
295 | "text": [
296 | "\n",
297 | "When does it make sense to use RAG vs fine-tuning?\n",
298 | "\n"
299 | ]
300 | },
301 | {
302 | "data": {
303 | "text/markdown": [
304 | "When deciding between Retrieval-Augmented Generation (RAG) and fine-tuning for enhancing large language models (LLMs), consider the following:\n",
305 | "\n",
306 | "1. **RAG** is ideal when you need to inject relevant context into the model's input to improve response quality. It works well with domain-specific knowledge bases, allowing the model to synthesize information from relevant documents effectively (Snippet 4). This method is particularly useful when fine-tuning is less effective at providing specialized knowledge (Snippet 8).\n",
307 | "\n",
308 | "2. **Fine-tuning** adapts an existing model for a specific use case but is generally considered less effective than RAG for embedding specialized knowledge (Snippet 3). It can be beneficial when you have a well-defined dataset and the goal is to customize the model's behavior more fundamentally.\n",
309 | "\n",
310 | "3. Both approaches can be used together; for instance, the original RAG system utilized fine-tuning to enhance how retrieved information is employed in generating responses (Snippet 5).\n",
311 | "\n",
312 | "In summary, use RAG when you need to leverage external knowledge sources for better context in responses. Opt for fine-tuning when you want to fundamentally adjust the model's capabilities for a specific task, keeping in mind that it may be less effective for specialized knowledge compared to RAG. If you need more detailed guidance on specific use cases, further exploration of the topic may be beneficial."
313 | ],
314 | "text/plain": [
315 | ""
316 | ]
317 | },
318 | "metadata": {},
319 | "output_type": "display_data"
320 | }
321 | ],
322 | "source": [
323 | "print()\n",
324 | "print(query)\n",
325 | "print()\n",
326 | "display(Markdown(answer))"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 13,
332 | "id": "9c238da9-3bf7-4333-965c-285be39218b5",
333 | "metadata": {},
334 | "outputs": [
335 | {
336 | "data": {
337 | "text/markdown": [
338 | "The benefits of fine-tuning large language models (LLMs) include:\n",
339 | "\n",
340 | "1. **Improved Performance for Specific Tasks**: Fine-tuned models can outperform larger pre-trained models for particular use cases, even when clever prompt engineering is applied (Snippet 6).\n",
341 | "\n",
342 | "2. **Lower Inference Costs**: Fine-tuning can lead to reduced inference costs, making it a practical choice for deploying AI assistants (Snippet 9).\n",
343 | "\n",
344 | "3. **Customization**: Fine-tuning allows for the adaptation of a model to specialized knowledge or tasks, enhancing its relevance and effectiveness (Snippet 2).\n",
345 | "\n",
346 | "4. **Quality of Training Data**: The performance of a fine-tuned model is heavily influenced by the quality of the training dataset used, emphasizing the importance of data preparation (Snippet 7).\n",
347 | "\n",
348 | "However, it is important to note that fine-tuning is not a one-size-fits-all solution. It may not be as effective as other techniques like retrieval augmented generation (RAG) for certain applications (Snippet 1), and it can incur an \"alignment tax,\" where performance may drop in some tasks (Snippet 5). \n",
349 | "\n",
350 | "For further exploration, consider looking into specific use cases where fine-tuning has shown significant benefits or challenges."
351 | ],
352 | "text/plain": [
353 | ""
354 | ]
355 | },
356 | "metadata": {},
357 | "output_type": "display_data"
358 | }
359 | ],
360 | "source": [
361 | "# bringing it all together\n",
362 | "query = \"What are the benefits of LLM fine-tuning?\"\n",
363 | "results_markdown = semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=10, threshold=0.01)\n",
364 | "answer = answer_query(query, results_markdown, prompt_template, client)\n",
365 | "display(Markdown(answer))"
366 | ]
367 | },
368 | {
369 | "cell_type": "code",
370 | "execution_count": null,
371 | "id": "1b3a2383-fdc7-40ab-a5f6-f4e44cf6d287",
372 | "metadata": {},
373 | "outputs": [],
374 | "source": []
375 | }
376 | ],
377 | "metadata": {
378 | "kernelspec": {
379 | "display_name": "Python 3 (ipykernel)",
380 | "language": "python",
381 | "name": "python3"
382 | },
383 | "language_info": {
384 | "codemirror_mode": {
385 | "name": "ipython",
386 | "version": 3
387 | },
388 | "file_extension": ".py",
389 | "mimetype": "text/x-python",
390 | "name": "python",
391 | "nbconvert_exporter": "python",
392 | "pygments_lexer": "ipython3",
393 | "version": "3.12.2"
394 | }
395 | },
396 | "nbformat": 4,
397 | "nbformat_minor": 5
398 | }
399 |
--------------------------------------------------------------------------------
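A note on the retrieval parameters used in the notebook's final cell: `semantic_search` rescales raw cosine similarities through a softmax before applying `threshold`, so `temp` controls how sharply the scores concentrate on the best matches. A minimal sketch with made-up similarity values (illustrative numbers only, not taken from the notebook):

```python
import torch

# hypothetical cosine similarities between one query and five chunks
similarities = torch.tensor([[0.62, 0.58, 0.40, 0.15, 0.10]])

for temp in (1.0, 0.1):
    scores = torch.nn.functional.softmax(similarities / temp, dim=1)
    print(temp, scores)

# temp=1.0 -> scores are near-uniform (~0.15-0.25), so every chunk clears a 0.05 threshold
# temp=0.1 -> mass concentrates on the strongest chunks (~0.56, 0.37, 0.06, ...),
#             and the weak matches fall below the threshold
```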
/session-4/functions.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import torch
3 |
4 | def parse_html_content(html_content):
5 | """
6 | Parse HTML content and extract structured content with sections and paragraphs.
7 |
8 | Args:
9 | html_content (str): Raw HTML content to parse
10 |
11 | Returns:
12 | list: List of dictionaries containing structured content
13 | """
14 | # Parse HTML
15 | soup = BeautifulSoup(html_content, 'html.parser')
16 |
17 | # Get article title
18 | article_title = soup.find('title').get_text().strip() if soup.find('title') else "Untitled"
19 |
20 | # Initialize variables
21 | structured_content = []
22 | current_section = "Main" # Default section if no headers found
23 |
24 | # Find all headers and text content
25 | content_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol'])
26 |
27 | for element in content_elements:
28 | if element.name in ['h1', 'h2', 'h3']:
29 | current_section = element.get_text().strip()
30 | elif element.name in ['p', 'ul', 'ol']:
31 | text = element.get_text().strip()
32 | # Only add non-empty content that's at least 30 characters long
33 | if text and len(text) >= 30:
34 | structured_content.append({
35 | 'article_title': article_title,
36 | 'section': current_section,
37 | 'text': text
38 | })
39 |
40 | return structured_content
41 |
42 |
43 | def get_top_k_items(similarities, chunk_list, temp=0.1, k=3, threshold=0.05):
44 | """
45 | Retrieves the top-k most similar items from a list based on a similarity matrix.
46 |
47 | Args:
48 | similarities (torch.Tensor): A 2D tensor where each row represents the similarity scores of an item.
49 | chunk_list (list): A list of content items corresponding to the columns of the similarity tensor.
50 | temp (float, optional): Temperature for softmax rescaling. Defaults to 0.1.
51 | k (int, optional): Number of top items to return. Defaults to 3.
52 | threshold (float, optional): Minimum similarity score for inclusion. Defaults to 0.05.
53 |
54 | Returns:
55 | list: The top-k most similar content items.
56 | list: The scores corresponding to the top-k items.
57 | """
58 | # Rescale similarities via softmax
59 | scores = torch.nn.functional.softmax(similarities / temp, dim=1)
60 |
61 | # Get sorted indices and scores
62 | sorted_indices = scores.argsort(descending=True)[0]
63 | sorted_scores = scores[0][sorted_indices]
64 |
65 | # Filter by threshold and get top k
66 | filtered_indices = [
67 | idx.item() for idx, score in zip(sorted_indices, sorted_scores)
68 | if score.item() >= threshold
69 | ][:k]
70 |
71 | # Get corresponding content items and scores
72 | top_results = [chunk_list[i] for i in filtered_indices]
73 | result_scores = [scores[0][i].item() for i in filtered_indices]
74 |
75 | return top_results, result_scores
76 |
77 | def compute_query_similarities(query, model, chunk_embeddings):
78 | """
79 | Computes similarity scores between a query and a list of chunk embeddings.
80 |
81 | Args:
82 | query (str): The query string to compute similarities for.
83 | model: The embedding model that provides methods for encoding and similarity computation.
84 | chunk_embeddings (torch.Tensor): A tensor containing embeddings for the content chunks.
85 |
86 | Returns:
87 | torch.Tensor: A tensor of similarity scores between the query and the content chunks.
88 | """
89 | # Encode the query into an embedding
90 | query_embedding = model.encode(query)
91 |
92 | # Compute similarity between query embedding and all chunk embeddings
93 | similarities = model.similarity(query_embedding, chunk_embeddings)
94 |
95 | return similarities
96 |
97 | def format_results_to_markdown(top_results):
98 | """
99 | Formats a list of result dictionaries into a Markdown string for display.
100 |
101 | Args:
102 | top_results (list): A list of dictionaries, each containing 'article_title', 'section', and 'text' keys.
103 |
104 | Returns:
105 | str: A formatted Markdown string containing the titles, sections, and snippets from the results.
106 | """
107 | results_markdown = ""
108 | for i, result in enumerate(top_results, start=1):
109 | results_markdown += f"{i}. **Article title:** {result['article_title']} \n"
110 | results_markdown += f" **Section:** {result['section']} \n"
111 | results_markdown += f" **Snippet:** {result['text']} \n\n"
112 |
113 | return results_markdown
114 |
115 | def semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=3, threshold=0.05):
116 | """
117 | Executes a semantic search pipeline: computes similarities, retrieves top results, and formats them into Markdown.
118 |
119 | Args:
120 | query (str): The search query string.
121 | model: The embedding model with methods for encoding and similarity computation.
122 | chunk_embeddings (torch.Tensor): A tensor of precomputed chunk embeddings.
123 | chunk_list (list): A list of content chunks corresponding to the embeddings.
124 | temp (float, optional): Temperature for softmax rescaling. Defaults to 0.1.
125 | k (int, optional): Number of top items to return. Defaults to 3.
126 | threshold (float, optional): Minimum similarity score for inclusion. Defaults to 0.05.
127 |
128 | Returns:
129 | str: Formatted Markdown string of the top results.
130 | """
131 | # Compute similarities between the query and chunk embeddings
132 | similarities = compute_query_similarities(query, model, chunk_embeddings)
133 |
134 | # Retrieve the top-k results and their scores
135 | top_results, result_scores = get_top_k_items(similarities, chunk_list, temp=temp, k=k, threshold=threshold)
136 |
137 | # Format the top results into a Markdown string
138 | results_markdown = format_results_to_markdown(top_results)
139 |
140 | return results_markdown
141 |
142 | def answer_query(query, results_markdown, prompt_template, client):
143 | """
144 |     Answers a user query based on semantic search results, using the OpenAI chat completions API.
145 | """
146 | prompt = prompt_template(query, results_markdown)
147 |
148 | response = client.chat.completions.create(
149 | model="gpt-4o-mini",
150 | messages=[
151 | {"role": "user", "content": prompt}
152 | ],
153 | temperature = 0.5,
154 | )
155 |
156 | # extract response
157 | return response.choices[0].message.content
--------------------------------------------------------------------------------
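Taken together, these helpers implement the notebook's retrieve-then-generate loop. A minimal usage sketch, assuming the precomputed chunks and embeddings shipped in `session-4/data`, an API key in `top_secret.py`, and a sentence-transformers version (>= 3.0) that provides `model.similarity`; the embedding model name and the toy `prompt_template` below are assumptions, and the model must match whatever produced `chunk_embeddings.pt`:

```python
import json

import torch
from openai import OpenAI
from sentence_transformers import SentenceTransformer

from functions import semantic_search, answer_query
from top_secret import my_sk

# load the precomputed chunks and their embeddings
with open("data/chunk_list.json") as f:
    chunk_list = json.load(f)
chunk_embeddings = torch.load("data/chunk_embeddings.pt")

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: must match the model used to embed the chunks
client = OpenAI(api_key=my_sk)

# any template accepting (query, results_markdown) works here
prompt_template = lambda query, results_markdown: f"Answer the question using these snippets:\n{results_markdown}\nQuestion: {query}"

query = "When does it make sense to use RAG vs fine-tuning?"
results_markdown = semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=10, threshold=0.01)
print(answer_query(query, results_markdown, prompt_template, client))
```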
/session-4/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipykernel
3 | matplotlib
4 | beautifulsoup4
5 | pandas
6 | scikit-learn
7 | sentence-transformers
8 | openai
9 | gradio
10 | numpy<2
11 |
--------------------------------------------------------------------------------
/session-4/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-4/slides.pdf
--------------------------------------------------------------------------------
/session-4/top_secret.py:
--------------------------------------------------------------------------------
1 | my_sk = "your sk here"
--------------------------------------------------------------------------------
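`top_secret.py` is a placeholder for your OpenAI API key. A sketch of an alternative that keeps the key out of source control entirely, using only the standard library:

```python
import os

# read the key from an environment variable instead of hardcoding it in the repo
my_sk = os.environ["OPENAI_API_KEY"]
```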
/session-5/README.md:
--------------------------------------------------------------------------------
1 | # Session 5: Software 3.0, Fine-tuning
2 |
3 | Links:
4 | - [Example 1: Fine-tuning BERT for Text Classification](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_1-finetune_bert_classifier.ipynb)
5 | - [Example 2: Fine-tuning a LinkedIn Post Writer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_2-linkedin_post_writer.ipynb)
6 | - [Slides](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/slides.pdf)
7 |
8 | ## How to run the examples
9 |
10 | 1. Clone this repo
11 | 2. Navigate to the downloaded folder and create a new venv
12 | ```
13 | python -m venv s5-env
14 | ```
15 | 3. Activate venv
16 | ```
17 | # mac/linux
18 | source s5-env/bin/activate
19 |
20 | # windows
21 | .\s5-env\Scripts\activate.bat
22 | ```
23 | 4. Install dependencies
24 | ```
25 | pip install -r requirements.txt
26 | ```
27 | 5. Launch Jupyter Lab
28 | ```
29 | jupyter lab
30 | ```
31 |
--------------------------------------------------------------------------------
/session-5/data/LI_posts.numbers:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-5/data/LI_posts.numbers
--------------------------------------------------------------------------------
/session-5/data/valid-data.jsonl:
--------------------------------------------------------------------------------
1 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nSharing 5 AI project ideas blog\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Within 24 hours, my latest blog was read by 10k people\u2026 \n\nIt fleshes out the 5 AI project ideas (you can build this weekend) that I shared here on Monday. \n\nI break each idea down step-by-step and share some code for getting started. \n\nHere\u2019s the link if you want to check it out \ud83d\udc47 \n\nhttps://lnkd.in/gDvxCHx5\n\n--\nMedia: Article screenshot"}]}
2 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\n3 ways to make LLMs multimodal and sharing blog post\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Over the past year, we\u2019ve seen LLMs gain the ability to see (i.e. process images) and hear (i.e. process audio directly), but how does that work under the hood? \n\nHere are 3 general ways to do that \ud83d\udc47 \n\n1. LLM + Tools: Augment LLMs with pre-built components \n\n2. LLM + Adapters: Augment LLMs with multi-modal encoders or decoders, which are aligned via adapter fine-tuning \n\n3. Unified Models: Expand LLM architecture to fuse modalities at pre-training \n\nIn my latest blog post in Towards Data Science, I discuss these three approaches and share example code for (locally) using LLaMA 3.2 Vision to perform various image-to-text tasks. \n\nCheck it out here: https://lnkd.in/gGcvVk9u\n\n--\nMedia: Article screenshot"}]}
3 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nHitting 1M views on YouTube\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "This has been a month of milestones for my YouTube channel. \n\nLast week, I crossed 1,000,000 views \ud83e\udd2f\u2026 a number too big to fully comprehend.\n\nI started this journey by making content that would have been helpful to a past version of myself. \n\nAnd as a Physics PhD student, I didn\u2019t think that would be relevant to a lot of people \ud83d\ude05 \n\nBut, to my surprise, it seems there are many people (not just PhD students) who have an appetite for learning about data science and AI. \n\nWhich makes me pretty lucky \ud83d\ude01 \n\nThank you to everyone who watched \ud83d\ude4f.. \n\n.. Next stop 10M??\n\n--\nMedia: Image"}]}
4 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nExplaining multimodal embeddings and sharing blog post\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Although text and images may look very different to us, to a neural network, these are represented via the same mathematical object, i.e., a vector. \n\nTherefore, we can image vector spaces where text and images look similar, given they represent the same underlying information. \n\nThis is the key idea behind multimodal embeddings. \n\nIn my latest blog post, I share how we can generate such representations along with two practical use cases for them: 1) 0-shot Image Classification and 2) Image search. \n\nCheck it out here \ud83d\udc47 https://lnkd.in/g-Dvy9dF\n\n--\nMedia: Article screenshot"}]}
5 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nRecap of past 12 months on my 29th bday\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "I turned 29 today. \n\n28 was wild\u2026 here are some things that happened. \n\n>> 1.2M people watched my YouTube videos \n>> My YT channel went from 2k \u2014> 31k subs \n>> 600 people joined The Data Entrepreneurs Discord \n>> I started a podcast with Ifeoma Ahuna \n\nBut most importantly\u2026 \n>> I survived my 1st year of entrepreneurship! \n\nWhile this past year has been harder than I expected (emotionally), going down this path is the best decision I\u2019ve made in my 29 years of life. \n\nEntrepreneurship allows me to live a life better aligned with my values (relationships, growth, and freedom), which I feel is key to a fulfilling life. \n\nExcited to see where Year 2 takes me (other than turning 30 \ud83d\ude05)\n\n--\nMedia: Image"}]}
6 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nMultimodal RAG and its value. Share blog post\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Two major topics in AI this year were multimodal models and RAG. \n\nBut\u2026 what if we brought these ideas together? Multimodal RAG systems are capable of processing text and non-text data. \n\nThis enables more powerful solutions beyond what is conveyed by text alone. \n\nFor example, it could analyze someone\u2019s facial expressions and speech tonality to give a richer context to a meeting\u2019s transcription. \n\nIn my latest blog post, I give a beginner-friendly guide to Multimodal RAG and share an example implementation using LLaMA 3.2 Vision and CLIP. \n\nCheck it out here \ud83d\udc47 \n\nhttps://lnkd.in/grY5bGDz\n\n--\nMedia: Article screenshot"}]}
7 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nQuitting consoling to focus on content creation\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "I\u2019m quitting consulting to focus on YouTube. \n\nHere\u2019s why. \ud83d\udc47 \n\n1) YouTube is working! (80% of my traffic comes from YT) \n\n2) It allows me to focus on one thing (i.e. avoid distractions) \n\n3) Making 1 video a week gives me a clear, quantifiable goal that I can use to frame my learning and product development. \n\nWhile committing to one thing is scary (especially something unpredictable like YouTube \ud83d\ude05), it is also necessary for success. \n\nThe reason (IMO) is that commitment backs you into a corner, forcing you to innovate and learn new things. \n\nYou can watch this experiment risk-free on my YouTube channel \ud83d\ude09 \n\n(\ud83d\udd17 link in comments)\n\n--\nMedia: Image"}]}
8 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nGetting a data science job was hard until I learned these 5 things\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Getting a Data Science Job was hard until I learned these 5 things \ud83d\udc47 \n\n1. Adapt your resume to the specific role \n\nYou have about 7 seconds of a hiring manager's attention when reviewing your resume; use it to call out all the skills they listed in the job description. \n\n2. Do real-world projects \n\nWhile toy projects from class or Kaggle are a great way to learn, hiring managers care about real-world impact. \n\n3. Make a portfolio \n\nDespite the free no-code tools for making a portfolio website, most people don't make one. That makes this a super easy way to stand out. \n\n4. Stay out of the weeds (unless they ask) \n\nThis was one of my greatest struggles. When someone asked about a project, I would dive right into the technical details. \n\nThis is the best way to lose someone's attention (even data scientists). \n\n5. Be yourself (not who you think they want to hire) \n\nWhen I interviewed for my full-time data science role, I dove into a rant about my love for optimization. \n\nMost people get confused and awkward when I discuss such topics, but not the interviewers. They got pumped up... that's when I knew I found my people \ud83d\ude01 \n\nWhat was an unlock for you when applying to data science roles?\n\n--\nMedia: Image"}]}
9 |
--------------------------------------------------------------------------------
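Each line above is one chat-format training example: a `messages` list with one `system`, one `user`, and one `assistant` turn. A quick sanity check one might run before uploading such a file (a sketch, not part of the course code):

```python
import json

# verify every line parses and has the expected three-turn chat structure
with open("data/valid-data.jsonl") as f:
    for n, line in enumerate(f, start=1):
        example = json.loads(line)
        roles = [m["role"] for m in example["messages"]]
        assert roles == ["system", "user", "assistant"], f"line {n}: unexpected roles {roles}"

print("all examples look well-formed")
```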
/session-5/example_2-linkedin_post_writer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "11c076dd-4e32-431c-9793-e0784197e3d1",
6 | "metadata": {},
7 | "source": [
8 | "# Fine-tuning GPT-4o to Write LinkedIn Posts\n",
9 | "## ABB #1 - Session 5\n",
10 | "\n",
11 | "Code authored by: Shaw Talebi"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "id": "7c7812b7-9bb9-47da-8116-f6af553e18c5",
17 | "metadata": {},
18 | "source": [
19 | "### imports"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "id": "8369ae91-d925-4ed1-9a85-32fb8e64a153",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "import csv\n",
30 | "import json\n",
31 | "import random\n",
32 | "\n",
33 | "from openai import OpenAI\n",
34 | "from top_secret import my_sk \n",
35 | "\n",
36 | "# connect to openai API\n",
37 | "client = OpenAI(api_key=my_sk)"
38 | ]
39 | },
40 | {
41 | "cell_type": "markdown",
42 | "id": "b67b2a2d-75c7-4913-b603-93127ece77fc",
43 | "metadata": {},
44 | "source": [
45 | "### Read data"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 2,
51 | "id": "4bc4f088-1331-4de4-a252-c60210479042",
52 | "metadata": {},
53 | "outputs": [],
54 | "source": [
55 | "# load csv of YouTube comments\n",
56 | "idea_list = []\n",
57 | "copy_list = []\n",
58 | "media_list = []\n",
59 | "\n",
60 | "with open('data/LI_posts.csv', mode ='r') as file:\n",
61 | " file = csv.reader(file)\n",
62 | " \n",
63 | " # read file line by line\n",
64 | " for line in file:\n",
65 | " # skip first line\n",
66 | " if line[0]=='Idea':\n",
67 | " continue\n",
68 | " \n",
69 | " # append comments and responses to respective lists\n",
70 | " idea_list.append(line[0])\n",
71 | " copy_list.append(line[1])\n",
72 | " media_list.append(line[2])"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 3,
78 | "id": "5bb3cc63-84cb-40ff-bc75-ad40447d46a3",
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "50\n",
86 | "50\n",
87 | "50\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "print(len(idea_list))\n",
93 | "print(len(copy_list))\n",
94 | "print(len(media_list))"
95 | ]
96 | },
97 | {
98 | "cell_type": "markdown",
99 | "id": "9afd558b-b292-4582-a965-04ebeb46c403",
100 | "metadata": {},
101 | "source": [
102 | "### Create training examples"
103 | ]
104 | },
105 | {
106 | "cell_type": "code",
107 | "execution_count": 4,
108 | "id": "f6ed7112-b939-41d4-9fc2-077c9c85229a",
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "# construct training examples\n",
113 | "example_list = []\n",
114 | "\n",
115 | "system_prompt = \"LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur\"\n",
116 | "\n",
117 | "prompt_template = lambda idea_string : f\"\"\"Write a LinkedIn post based on the following idea:\n",
118 | "{idea_string}\n",
119 | "\n",
120 | "Include:\n",
121 | "- A compelling opening line that hooks the reader\n",
122 | "- Copy that expands upon the idea in valuable way\n",
123 | "- A call to action or share relevant content\n",
124 | "\n",
125 | "Output:\n",
126 | "\"\"\"\n",
127 | "\n",
128 | "for i in range(len(idea_list)): \n",
129 | " system_dict = {\"role\": \"system\", \"content\": system_prompt}\n",
130 | " user_dict = {\"role\": \"user\", \"content\": prompt_template(idea_list[i])}\n",
131 | " assistant_dict = {\"role\": \"assistant\", \"content\": copy_list[i] + \"\\n\\n--\\nMedia: \" + media_list[i]}\n",
132 | " \n",
133 | " messages_list = [system_dict, user_dict, assistant_dict]\n",
134 | " \n",
135 | " example_list.append({\"messages\": messages_list})"
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 5,
141 | "id": "1c847ac6-89de-413f-a7c4-854e9a514658",
142 | "metadata": {},
143 | "outputs": [
144 | {
145 | "name": "stdout",
146 | "output_type": "stream",
147 | "text": [
148 | "Write a LinkedIn post based on the following idea:\n",
149 | "3 types of AI Tik Tok\n",
150 | "\n",
151 | "Include:\n",
152 | "- A compelling opening line that hooks the reader\n",
153 | "- Copy that expands upon the idea in valuable way\n",
154 | "- A call to action or share relevant content\n",
155 | "\n",
156 | "Output:\n",
157 | "\n",
158 | "A problem with AI today is that it means different things to different people. \n",
159 | "\n",
160 | "This framework from Andrej Karpathy helped give me much more clarity 👇 \n",
161 | "\n",
162 | "Software 1.0 = Rule-based software systems. Humans program computers to solve problems step-by-step. \n",
163 | "\n",
164 | "Software 2.0 = Computers program themselves by seeing examples (i.e. machine learning) \n",
165 | "\n",
166 | "Software 3.0 = Repurposing general-purpose ML models for specific use cases (i.e. GenAI + Foundation Models) \n",
167 | "\n",
168 | "But… what’s Software 4.0 going to be? 🤔\n",
169 | "\n",
170 | "--\n",
171 | "Media: Video\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "print(example_list[0]['messages'][1]['content'])\n",
177 | "print(example_list[0]['messages'][2]['content'])"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 6,
183 | "id": "daa80954-4589-4850-8b69-b2427828247b",
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "data": {
188 | "text/plain": [
189 | "50"
190 | ]
191 | },
192 | "execution_count": 6,
193 | "metadata": {},
194 | "output_type": "execute_result"
195 | }
196 | ],
197 | "source": [
198 | "len(example_list)"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "id": "39301b03-69be-4f2c-b7fd-44e17a22be2a",
204 | "metadata": {},
205 | "source": [
206 | "### Create train/validation split"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 7,
212 | "id": "6d43a2f6-b5b8-40c6-92c9-f4ac00c35643",
213 | "metadata": {},
214 | "outputs": [],
215 | "source": [
216 | "# randomly pick out validation examples\n",
217 | "num_examples = 10\n",
218 | "validation_index_list = random.sample(range(0, len(example_list)-1), num_examples)\n",
219 | "validation_data_list = [example_list[index] for index in validation_index_list]\n",
220 | "\n",
221 | "for example in validation_data_list:\n",
222 | " example_list.remove(example)"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 8,
228 | "id": "7bf38b30-6c4e-4956-96e6-b7bc5f92423f",
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "40\n",
236 | "10\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "print(len(example_list))\n",
242 | "print(len(validation_data_list))"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 9,
248 | "id": "09999da1-e0a4-4bee-949d-c31006dad700",
249 | "metadata": {},
250 | "outputs": [],
251 | "source": [
252 | "# write examples to file\n",
253 | "with open('data/train-data.jsonl', 'w') as train_file:\n",
254 | " for example in example_list:\n",
255 | " json.dump(example, train_file)\n",
256 | " train_file.write('\\n')\n",
257 | "\n",
258 | "with open('data/valid-data.jsonl', 'w') as valid_file:\n",
259 | " for example in validation_data_list:\n",
260 | " json.dump(example, valid_file)\n",
261 | " valid_file.write('\\n')"
262 | ]
263 | },
264 | {
265 | "cell_type": "markdown",
266 | "id": "ce1eb41d-f61b-493e-b2dc-1abc458ba9a4",
267 | "metadata": {},
268 | "source": [
269 | "### Upload data to OpenAI"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": 10,
275 | "id": "4f3ff2b6-7530-48f9-837b-86000d7f72c6",
276 | "metadata": {},
277 | "outputs": [],
278 | "source": [
279 | "train_file = client.files.create(\n",
280 | " file = open(\"data/train-data.jsonl\", \"rb\"),\n",
281 | " purpose = \"fine-tune\"\n",
282 | ")\n",
283 | "\n",
284 | "valid_file = client.files.create(\n",
285 | " file = open(\"data/valid-data.jsonl\", \"rb\"),\n",
286 | " purpose = \"fine-tune\"\n",
287 | ")"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "id": "d753629d-45e5-4d0f-b0bd-0bef1c51062c",
293 | "metadata": {},
294 | "source": [
295 | "### Fine-tune model"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 11,
301 | "id": "88425464-a1ec-4230-a025-19f9a848a73a",
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "FineTuningJob(id='ftjob-eCS6EchA0sb7hWMrOQlZITRQ', created_at=1734050118, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-KjWERyZ9WLUqIdrdMeJh4zC0', result_files=[], seed=616771098, status='validating_files', trained_tokens=None, training_file='file-2qUUvaBrn3qmzK8UjBZwdD', validation_file='file-61iDsGKpr4LM9ssQknQiC5', estimated_finish=None, integrations=[], user_provided_suffix='LI-post-writer')"
308 | ]
309 | },
310 | "execution_count": 11,
311 | "metadata": {},
312 | "output_type": "execute_result"
313 | }
314 | ],
315 | "source": [
316 | "client.fine_tuning.jobs.create(\n",
317 | " training_file = train_file.id,\n",
318 | " validation_file = valid_file.id,\n",
319 | " suffix = \"LI-post-writer\",\n",
320 | " model = \"gpt-4o-mini-2024-07-18\"\n",
321 | ")"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "id": "3c650e56-77a0-4a76-a52e-6d9194fb6278",
327 | "metadata": {},
328 | "source": [
329 | "### Evaluate fine-tuned model"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 12,
335 | "id": "dd26bc6e-7bee-45f2-88a2-e419de0fb762",
336 | "metadata": {},
337 | "outputs": [],
338 | "source": [
339 | "def generate_post(system_prompt, model_name, idea):\n",
340 | " response = client.chat.completions.create(\n",
341 | " model=model_name,\n",
342 | " messages=[\n",
343 | " {\"role\": \"system\", \"content\": system_prompt},\n",
344 | " {\"role\": \"user\", \"content\": prompt_template(idea)}\n",
345 | " ],\n",
346 | " temperature=0.7,\n",
347 | " )\n",
348 | " return response.choices[0].message.content"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 13,
354 | "id": "7f5b4455-b219-4fee-95ec-d99ab052acbd",
355 | "metadata": {},
356 | "outputs": [],
357 | "source": [
358 | "idea = \"Python was hard until I learned these 5 things\""
359 | ]
360 | },
361 | {
362 | "cell_type": "code",
363 | "execution_count": 14,
364 | "id": "17be2632-d9ee-44b9-9569-19e274f0709f",
365 | "metadata": {},
366 | "outputs": [
367 | {
368 | "name": "stdout",
369 | "output_type": "stream",
370 | "text": [
371 | "🚀 Struggling with Python? You're not alone. It was a tough nut to crack until I discovered these 5 game-changing strategies.\n",
372 | "\n",
373 | "1. **Think in Data Structures:** Understanding lists, dictionaries, and sets is crucial. They're the backbone of efficient Python coding, allowing you to solve complex problems with ease.\n",
374 | "\n",
375 | "2. **Master List Comprehensions:** Transform your loops into concise, readable expressions. This not only saves time but also boosts your code’s performance.\n",
376 | "\n",
377 | "3. **Leverage Libraries:** Python’s strength lies in its vast ecosystem of libraries. Familiarize yourself with pandas for data manipulation, NumPy for numerical computations, and requests for HTTP requests.\n",
378 | "\n",
379 | "4. **Embrace the Zen of Python:** \"Readability counts.\" Keep your code clean and simple. Follow PEP 8 guidelines to ensure your code is not just functional but also elegant.\n",
380 | "\n",
381 | "5. **Practice, Practice, Practice:** There's no substitute for hands-on experience. Challenge yourself with real-world problems and projects to solidify your understanding and confidence.\n",
382 | "\n",
383 | "Ready to level up your Python skills? Start by implementing these strategies today. Share your favorite Python tips in the comments or connect with me for more insights into mastering this versatile language! 🐍🔧\n",
384 | "\n",
385 | "#Python #CodingTips #TechEducation #AI #DataScience\n"
386 | ]
387 | }
388 | ],
389 | "source": [
390 | "# GPT-4o (no fine-tuning)\n",
391 | "model_name = \"gpt-4o\"\n",
392 | "system_prompt_long = \"You are an AI assistant helping Shaw Talebi, an AI educator and entrepreneur, craft LinkedIn posts. Your goal is to generate posts \\\n",
393 | "that reflect Shaw Talebi's voice: authoritative yet approachable, insightful yet concise. Shaw Talebi's posts aim to educate and inspire professionals \\\n",
394 | "in the tech and AI space. Focus on providing value, discussing new trends, or offering actionable advice, while keeping the tone professional but \\\n",
395 | "conversational. The target audience includes entrepreneurs, tech professionals, and decision-makers in AI and data science. Always ensure the post is \\\n",
396 | "relevant, engaging, and on-brand for Shaw Talebi's public persona.\"\n",
397 | "\n",
398 | "# print(system_prompt_long, \"\\n--\")\n",
399 | "print(generate_post(system_prompt_long, model_name, idea))"
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "execution_count": 15,
405 | "id": "5cc2e3fb-a692-47eb-87e8-4e9ae788f0ac",
406 | "metadata": {},
407 | "outputs": [
408 | {
409 | "name": "stdout",
410 | "output_type": "stream",
411 | "text": [
412 | "Python was hard until I learned these 5 things 👇 \n",
413 | "\n",
414 | "1) Use a good IDE \n",
415 | "\n",
416 | "2) Learn by building projects \n",
417 | "\n",
418 | "3) Use ChatGPT \n",
419 | "\n",
420 | "4) Break things down into smaller problems \n",
421 | "\n",
422 | "5) Use the Python documentation \n",
423 | "\n",
424 | "I share my full Python learning journey here 👇 \n",
425 | "\n",
426 | "https://lnkd.in/gZy68cZC \n",
427 | "\n",
428 | "#Python #Programming #AI \n",
429 | "\n",
430 | "--\n",
431 | "Media: Meme\n"
432 | ]
433 | }
434 | ],
435 | "source": [
436 | "# GPT-4o-mini (fine-tuned)\n",
437 | "model_name = \"ft:gpt-4o-mini-2024-07-18:shawhin-talebi-ventures-llc:li-post-writer:Adk6A5Pd\"\n",
438 | "\n",
439 | "# print(system_prompt, \"\\n--\")\n",
440 | "print(generate_post(system_prompt, model_name, idea))"
441 | ]
442 | },
443 | {
444 | "cell_type": "code",
445 | "execution_count": 16,
446 | "id": "55c2cd55-477e-464f-8480-f9e354fb4341",
447 | "metadata": {},
448 | "outputs": [],
449 | "source": [
450 | "# # delete files (after fine-tuning is done)\n",
451 | "# client.files.delete(train_file.id)\n",
452 | "# client.files.delete(valid_file.id)"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": null,
458 | "id": "683d94f8-03f1-4418-927c-5b00c45e683a",
459 | "metadata": {},
460 | "outputs": [],
461 | "source": []
462 | }
463 | ],
464 | "metadata": {
465 | "kernelspec": {
466 | "display_name": "Python 3 (ipykernel)",
467 | "language": "python",
468 | "name": "python3"
469 | },
470 | "language_info": {
471 | "codemirror_mode": {
472 | "name": "ipython",
473 | "version": 3
474 | },
475 | "file_extension": ".py",
476 | "mimetype": "text/x-python",
477 | "name": "python",
478 | "nbconvert_exporter": "python",
479 | "pygments_lexer": "ipython3",
480 | "version": "3.12.2"
481 | }
482 | },
483 | "nbformat": 4,
484 | "nbformat_minor": 5
485 | }
486 |
--------------------------------------------------------------------------------
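One practical note on the fine-tuning cell above: `client.fine_tuning.jobs.create()` returns immediately while the job runs server-side, and the `ft:...` model name used in the evaluation cells only exists once the job succeeds. A polling sketch against the openai v1 client (`client` is the OpenAI client from the notebook's imports cell; the job id is the one shown in the notebook output):

```python
import time

job_id = "ftjob-eCS6EchA0sb7hWMrOQlZITRQ"  # id returned by client.fine_tuning.jobs.create above

while True:
    job = client.fine_tuning.jobs.retrieve(job_id)
    if job.status in ("succeeded", "failed", "cancelled"):
        break
    time.sleep(60)  # fine-tuning jobs typically take minutes to hours

print(job.status, job.fine_tuned_model)
```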
/session-5/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyterlab
2 | ipykernel
3 | ipywidgets
4 | datasets
5 | transformers[torch]
6 | evaluate
7 | numpy
8 | openai
--------------------------------------------------------------------------------
/session-5/slides.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-5/slides.pdf
--------------------------------------------------------------------------------
/session-5/top_secret.py:
--------------------------------------------------------------------------------
1 | my_sk = "your sk here"
--------------------------------------------------------------------------------
/session-6/ABB-1-session_6.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-6/ABB-1-session_6.pdf
--------------------------------------------------------------------------------