├── .gitignore ├── LICENSE ├── README.md ├── session-1 ├── data │ ├── emails.csv │ └── survey.csv ├── example_1-email_broadcast.ipynb ├── example_2-email_broadcast.py ├── example_3-report_builder_emailer.ipynb ├── requirements.txt ├── slides.pdf └── top_secret.py ├── session-2 ├── data │ └── transformed_data.csv ├── example_1-gmail_ETL.ipynb ├── example_2-email_classifier.ipynb ├── requirements.txt ├── slides.pdf └── top_secret.py ├── session-3 ├── data │ └── transformed_data.csv ├── example_1-paper_summarizer.ipynb ├── example_2-text-classifier.ipynb ├── example_3-local_visual_QA.ipynb ├── papers │ ├── attention-is-all-you-need.pdf │ └── attention-is-all-you-need.png ├── requirements.txt ├── slides.pdf └── top_secret.py ├── session-4 ├── articles │ ├── 2023-07-13_A-Practical-Introduction-to-LLMs-65194dda1148.html │ ├── 2023-07-21_Cracking-Open-the-OpenAI--Python--API-230e4cae7971.html │ ├── 2023-08-05_Cracking-Open-the-Hugging-Face-Transformers-Library-350aa0ef0161.html │ ├── 2023-08-25_Prompt-Engineering---How-to-trick-AI-into-solving-your-problems-7ce1ed3b553f.html │ ├── 2023-09-11_Fine-Tuning-Large-Language-Models--LLMs--23473d763b91.html │ ├── 2023-09-21_How-to-Build-an-LLM-from-Scratch-8c477768f1f9.html │ ├── 2024-02-22_QLoRA---How-to-Fine-Tune-an-LLM-on-a-Single-GPU-4e44d6b5be32.html │ ├── 2024-03-09_How-to-Improve-LLMs-with-RAG-abdc132f76ac.html │ ├── 2024-03-27_Text-Embeddings--Classification--and-Semantic-Search-8291746220be.html │ ├── 2024-08-30_Compressing-Large-Language-Models--LLMs--9f406eea5b5e.html │ ├── 2024-09-26_LLM-Fine-tuning---FAQs-200442827c99.html │ ├── 2024-10-17_Fine-Tuning-BERT-for-Text-Classification-a01f89b179fc.html │ ├── 2024-11-19_Multimodal-Models---LLMs-that-can-see-and-hear-5c6737c981d3.html │ └── 2024-11-29_Multimodal-Embeddings--An-Introduction-5dc36975966f.html ├── data │ ├── chunk_embeddings.pt │ ├── chunk_list.json │ └── survey.csv ├── example_1-unstructured_survey_analysis.ipynb ├── example_2-blog_semantic_search.ipynb ├── 
example_3-blog_QA_RAG.ipynb ├── functions.py ├── requirements.txt ├── slides.pdf └── top_secret.py ├── session-5 ├── README.md ├── data │ ├── LI_posts.csv │ ├── LI_posts.numbers │ ├── train-data.jsonl │ └── valid-data.jsonl ├── example_1-finetune_bert_classifier.ipynb ├── example_2-linkedin_post_writer.ipynb ├── requirements.txt ├── slides.pdf └── top_secret.py └── session-6 └── ABB-1-session_6.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .DS_Store 165 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AI-Builders-Bootcamp-1 2 | Code repository for AI Builders Bootcamp #1. 3 | 4 | Course homepage on Maven: https://maven.com/shaw-talebi/ai-builders-bootcamp 5 | 6 | ## Session 1: Introduction, Software 1.0 7 | Getting started with AI and building basic automation scripts with Python. 8 | 9 | Examples: 10 | 11 | - [Automated email broadcaster](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-1/example_1-email_broadcast.ipynb) 12 | - [Automated report builder and emailer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-1/example_3-report_builder_emailer.ipynb) 13 | 14 | ## Session 2: Software 2.0, Data Engineering, Machine Learning 15 | Building ETL pipelines for machine learning projects. 
16 | 17 | Examples: 18 | - [ETL of Gmail inbox](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_1-gmail_ETL.ipynb) 19 | - [Training email classifier](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_2-email_classifier.ipynb) 20 | 21 | ## Session 3: Software 3.0, Foundation Models, Prompt Engineering 22 | Here, we begin building AI systems with LLMs. Unlike machine learning, we don't need datasets to get started. 23 | 24 | Examples: 25 | - [Research paper summarizer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-3/example_1-paper_summarizer.ipynb) 26 | - [Text classifier](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-3/example_2-text-classifier.ipynb) 27 | - [Local document QA with LLaMA 3.2 Vision](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-3/example_3-local_visual_QA.ipynb) 28 | 29 | ## Session 4: Software 3.0, RAG, Text Embeddings 30 | Prompting LLMs ChatGPT-style only scratches the surface of what we can use modern language models for. We can also leverage RAG to improve model performance and text embeddings to make text computable. 31 | 32 | Examples: 33 | - [Analyzing unstructured survey results](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-4/example_1-unstructured_survey_analysis.ipynb) 34 | - [Semantic search over blog posts](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-4/example_2-blog_semantic_search.ipynb) 35 | - [RAG chatbot over blog series](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-4/example_3-blog_QA_RAG.ipynb) 36 | 37 | ## Session 5: Software 3.0, Fine-tuning 38 | Although LLMs can solve a wide range of problems out-of-the-box, there are situations where more model customization is required. This can be achieved through model fine-tuning, which involves adapting a model to a particular use case through additional training. 
39 | 40 | Examples 41 | - [Fine-tuning BERT for Text Classification](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_1-finetune_bert_classifier.ipynb) 42 | - [Fine-tuning a LinkedIn Post Writer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_2-linkedin_post_writer.ipynb) 43 | 44 | ## Session 6: AI Project Management 45 | Technical skills are necessary but not sufficient for generating value with AI. In the final session, we review a project management framework for getting an ROI on AI projects and best practices. 46 | -------------------------------------------------------------------------------- /session-1/data/emails.csv: -------------------------------------------------------------------------------- 1 | Name,Email 2 | Shaw Talebi,shaw@example.com 3 | Luca Romano,luca@example.com 4 | Elena Petrova,elena@example.com -------------------------------------------------------------------------------- /session-1/data/survey.csv: -------------------------------------------------------------------------------- 1 | What is your current level of experience with AI/ML?,How many years of programming experience do you have?,Which programming language(s) are you most comfortable with? 2 | 1-3 years,4-6 years,Python 3 | 1-3 years,4-6 years,Python 4 | <1 year,<1 year,Python 5 | 4-6 years,7+ years,Python 6 | <1 year,7+ years,"PHP, Javascript" 7 | <1 year,7+ years,Python 8 | <1 year,1-3 years,I used to code in a previous life and started learning Python this year (intermediate level) and comfortable in BASH 9 | 1-3 years,7+ years,"python, javascript, java, c++, typescript, goLang" 10 | <1 year,7+ years,"C#, C++. 
Java" 11 | 7+ years,7+ years,"Python, C++ (including CUDA C), R, Javascript, PHP" 12 | <1 year,7+ years,"PHP, some JavaScript" 13 | <1 year,<1 year,Python 14 | <1 year,1-3 years,Python 15 | 1-3 years,7+ years,c# python 16 | <1 year,<1 year,Python 17 | 1-3 years,7+ years,Python 18 | 1-3 years,1-3 years,"Python, bash" 19 | <1 year,<1 year,Python 20 | <1 year,1-3 years,Fortran / Algol / Basic -- 30+ years ago :) (but I was pretty good at it) 21 | <1 year,1-3 years,T-SQL 22 | <1 year,1-3 years,"HTML, CSS, Python" 23 | 1-3 years,1-3 years,Python 24 | <1 year,1-3 years,"Python, C" -------------------------------------------------------------------------------- /session-1/example_1-email_broadcast.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Replicating Maven Broadcast Function\n", 8 | "### ABB #1 - Session 1\n", 9 | "\n", 10 | "Code authored by: Shaw Talebi" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### imports" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": { 24 | "jupyter": { 25 | "source_hidden": true 26 | } 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import csv\n", 31 | "\n", 32 | "import smtplib\n", 33 | "from email.mime.multipart import MIMEMultipart\n", 34 | "from email.mime.text import MIMEText\n", 35 | "\n", 36 | "from top_secret import app_password" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### define variables" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "metadata": { 50 | "jupyter": { 51 | "source_hidden": true 52 | } 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "course_name = \"AI Builders Bootcamp\"\n", 57 | "start_date = \"Nov 8th, 2024\"\n", 58 | "end_date = \"Dec 20th, 2024\"\n", 59 | "portal_open_date = \"Nov 4th, 
2024\"" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "### construct message" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 3, 72 | "metadata": { 73 | "jupyter": { 74 | "source_hidden": true 75 | } 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "# construct message template\n", 80 | "subject = f\"Welcome to {course_name}!\"" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 4, 86 | "metadata": { 87 | "jupyter": { 88 | "source_hidden": true 89 | } 90 | }, 91 | "outputs": [], 92 | "source": [ 93 | "body = lambda student_name : f\"\"\"Hey {student_name},\n", 94 | "\n", 95 | "I'm super excited to have you here!\n", 96 | "\n", 97 | "We'll be getting started on {start_date} and finishing up on {end_date}.\n", 98 | "\n", 99 | "Keep your eye out for the course portal, which will open up on {portal_open_date}.\n", 100 | "\n", 101 | "Let me know if you have any questions in the meantime :)\n", 102 | "\n", 103 | "-Shaw\"\"\"" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 5, 109 | "metadata": { 110 | "jupyter": { 111 | "source_hidden": true 112 | } 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "Welcome to AI Builders Bootcamp!\n", 120 | "Hey Shaw,\n", 121 | "\n", 122 | "I'm super excited to have you here!\n", 123 | "\n", 124 | "We'll be getting started on Nov 8th, 2024 and finishing up on Dec 20th, 2024.\n", 125 | "\n", 126 | "Keep your eye out for the course portal, which will open up on Nov 4th, 2024.\n", 127 | "\n", 128 | "Let me know if you have any questions in the meantime :)\n", 129 | "\n", 130 | "-Shaw\n" 131 | ] 132 | } 133 | ], 134 | "source": [ 135 | "print(subject)\n", 136 | "print(body(\"Shaw\"))" 137 | ] 138 | }, 139 | { 140 | "cell_type": "markdown", 141 | "metadata": {}, 142 | "source": [ 143 | "### read names and emails from .csv file" 144 | ] 145 | }, 146 | { 147 | 
"cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": { 150 | "jupyter": { 151 | "source_hidden": true 152 | } 153 | }, 154 | "outputs": [ 155 | { 156 | "name": "stdout", 157 | "output_type": "stream", 158 | "text": [ 159 | "[('Shaw', 'shaw@example.com'), ('Luca', 'luca@example.com'), ('Elena', 'elena@example.com')]\n" 160 | ] 161 | } 162 | ], 163 | "source": [ 164 | "# path to CSV file\n", 165 | "filename = \"data/emails.csv\"\n", 166 | "\n", 167 | "# initialize an empty list to store recipient tuples\n", 168 | "recipients_list = []\n", 169 | "\n", 170 | "# read CSV file and populate the recipients list\n", 171 | "with open(filename, mode='r') as csvfile:\n", 172 | " csv_reader = csv.DictReader(csvfile)\n", 173 | " for row in csv_reader:\n", 174 | " name = row[\"Name\"].split(\" \")[0]\n", 175 | " email = row[\"Email\"]\n", 176 | " recipients_list.append((name, email))\n", 177 | "\n", 178 | "print(recipients_list)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "### send email" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 7, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "Email sent successfully to Shaw at shaw@example.com!\n", 198 | "Email sent successfully to Luca at luca@example.com!\n", 199 | "Email sent successfully to Elena at elena@example.com!\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# Define your email credentials and recipient\n", 205 | "sender_email = \"your email here\"\n", 206 | "password = app_password\n", 207 | "\n", 208 | "# Connect to the server\n", 209 | "with smtplib.SMTP(\"smtp.gmail.com\", 587) as server:\n", 210 | " server.starttls() # Start TLS encryption\n", 211 | " server.login(sender_email, password) # Log into email account\n", 212 | "\n", 213 | " # Send the email to each recipient with personalized content\n", 214 | " for name, email in 
recipients_list:\n", 215 | " # Set up the MIME\n", 216 | " message = MIMEMultipart()\n", 217 | " message[\"From\"] = sender_email\n", 218 | " message[\"To\"] = email\n", 219 | " message[\"Subject\"] = subject\n", 220 | " \n", 221 | " # Email body with personalization\n", 222 | " personalized_body = body(name)\n", 223 | " message.attach(MIMEText(personalized_body, \"plain\"))\n", 224 | "\n", 225 | " # Send the email\n", 226 | " server.sendmail(sender_email, email, message.as_string())\n", 227 | " print(f\"Email sent successfully to {name} at {email}!\")" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "Note: to create app password for Gmail follow the following steps.\n", 235 | "\n", 236 | "1. Navigate to [Google Account](https://myaccount.google.com/)\n", 237 | "3. Click on \"2-Step Verification\" under \"How you sign in to Google\"\n", 238 | "4. Click on right arrow under \"App passwords\"\n", 239 | "5. Type in new App name and hit \"Create\"\n", 240 | "\n", 241 | "Additional support [here](https://support.google.com/mail/answer/185833?hl=en)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [] 250 | } 251 | ], 252 | "metadata": { 253 | "kernelspec": { 254 | "display_name": "Python 3 (ipykernel)", 255 | "language": "python", 256 | "name": "python3" 257 | }, 258 | "language_info": { 259 | "codemirror_mode": { 260 | "name": "ipython", 261 | "version": 3 262 | }, 263 | "file_extension": ".py", 264 | "mimetype": "text/x-python", 265 | "name": "python", 266 | "nbconvert_exporter": "python", 267 | "pygments_lexer": "ipython3", 268 | "version": "3.12.2" 269 | } 270 | }, 271 | "nbformat": 4, 272 | "nbformat_minor": 4 273 | } 274 | -------------------------------------------------------------------------------- /session-1/example_2-email_broadcast.py: 
# Replicating Maven Broadcast Function (Script version)
# ABB #1 - Session 1
#
# Code authored by: Shaw Talebi

# IMPORTS
import csv
import smtplib
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

from top_secret import app_password

# DEFINE VARIABLES
course_name = "AI Builders Bootcamp"
start_date = "Nov 8th, 2024"
end_date = "Dec 20th, 2024"
portal_open_date = "Nov 4th, 2024"

# CONSTRUCT EMAIL
subject = f"Welcome to {course_name}!"


# PEP 8 (E731): use a def rather than assigning a lambda to a name
def body(student_name):
    """Return the personalized welcome-email body for *student_name*."""
    return f"""Hey {student_name},

I'm super excited to have you here!

We'll be getting started on {start_date} and finishing up on {end_date}.

Keep your eye out for the course portal, which will open up on {portal_open_date}.

Let me know if you have any questions in the meantime :)

-Shaw"""


def read_recipients(filename):
    """Read recipients from a CSV file with 'Name' and 'Email' columns.

    Returns a list of (first_name, email) tuples; the first name is the
    portion of 'Name' before the first space.
    """
    recipients = []
    # newline="" is the csv module's documented way to open CSV files
    with open(filename, mode="r", newline="") as csvfile:
        for row in csv.DictReader(csvfile):
            first_name = row["Name"].split(" ")[0]
            recipients.append((first_name, row["Email"]))
    return recipients


def send_broadcast(sender_email, password, recipients_list):
    """Send the personalized welcome email to every (name, email) pair.

    Connects to Gmail's SMTP server over STARTTLS and sends one message
    per recipient, printing a confirmation line for each.
    """
    # connect to the server
    with smtplib.SMTP("smtp.gmail.com", 587) as server:
        server.starttls()  # upgrade the connection to TLS
        server.login(sender_email, password)  # log into email account

        # send the email to each recipient with personalized content
        for name, email in recipients_list:
            # set up the MIME message
            message = MIMEMultipart()
            message["From"] = sender_email
            message["To"] = email
            message["Subject"] = subject

            # email body with personalization
            personalized_body = body(name)
            message.attach(MIMEText(personalized_body, "plain"))

            # send the email
            server.sendmail(sender_email, email, message.as_string())
            print(f"Email sent successfully to {name} at {email}!")


# entry-point guard: previously this script sent real emails at import time
if __name__ == "__main__":
    print(subject)
    print(body("Test"))

    # READ NAMES/EMAILS FROM CSV
    recipients_list = read_recipients("data/emails.csv")
    print('\n')
    print("Recipients List:", recipients_list)

    # SEND EMAILS
    send_broadcast("your email here", app_password, recipients_list)
'luca@example.com'), ('Elena', 'elena@example.com')]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "# load contact info\n", 73 | "filename = \"data/emails.csv\"\n", 74 | "recipients_list = []\n", 75 | "\n", 76 | "# read CSV file and populate the recipients list\n", 77 | "with open(filename, mode='r') as csvfile:\n", 78 | " csv_reader = csv.DictReader(csvfile)\n", 79 | " for row in csv_reader:\n", 80 | " name = row[\"Name\"].split(\" \")[0]\n", 81 | " email = row[\"Email\"]\n", 82 | " recipients_list.append((name, email))\n", 83 | "\n", 84 | "print(recipients_list)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 3, 90 | "metadata": { 91 | "jupyter": { 92 | "source_hidden": true 93 | } 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# recipients_list = [('Shaw', 'shawhintalebi@gmail.com')]" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "#### read survey results" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": { 111 | "jupyter": { 112 | "source_hidden": true 113 | } 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "filename = \"data/survey.csv\"\n", 118 | "df_survey = pd.read_csv(filename)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": { 125 | "jupyter": { 126 | "source_hidden": true 127 | } 128 | }, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/html": [ 133 | "
\n", 134 | "\n", 147 | "\n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | "
What is your current level of experience with AI/ML?How many years of programming experience do you have?Which programming language(s) are you most comfortable with?
01-3 years4-6 yearsPython
11-3 years4-6 yearsPython
2<1 year<1 yearPython
34-6 years7+ yearsPython
4<1 year7+ yearsPHP, Javascript
\n", 189 | "
" 190 | ], 191 | "text/plain": [ 192 | " What is your current level of experience with AI/ML? \\\n", 193 | "0 1-3 years \n", 194 | "1 1-3 years \n", 195 | "2 <1 year \n", 196 | "3 4-6 years \n", 197 | "4 <1 year \n", 198 | "\n", 199 | " How many years of programming experience do you have? \\\n", 200 | "0 4-6 years \n", 201 | "1 4-6 years \n", 202 | "2 <1 year \n", 203 | "3 7+ years \n", 204 | "4 7+ years \n", 205 | "\n", 206 | " Which programming language(s) are you most comfortable with? \n", 207 | "0 Python \n", 208 | "1 Python \n", 209 | "2 Python \n", 210 | "3 Python \n", 211 | "4 PHP, Javascript " 212 | ] 213 | }, 214 | "execution_count": 5, 215 | "metadata": {}, 216 | "output_type": "execute_result" 217 | } 218 | ], 219 | "source": [ 220 | "df_survey.head()" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 6, 226 | "metadata": { 227 | "jupyter": { 228 | "source_hidden": true 229 | } 230 | }, 231 | "outputs": [ 232 | { 233 | "name": "stdout", 234 | "output_type": "stream", 235 | "text": [ 236 | "23\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "# response count\n", 242 | "class_size = 43\n", 243 | "num_responses = len(df_survey)\n", 244 | "print(num_responses)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 7, 250 | "metadata": { 251 | "jupyter": { 252 | "source_hidden": true 253 | } 254 | }, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "What is your current level of experience with AI/ML?\n", 261 | "<1 year 14\n", 262 | "1-3 years 7\n", 263 | "4-6 years 1\n", 264 | "7+ years 1\n", 265 | "Name: count, dtype: int64\n" 266 | ] 267 | } 268 | ], 269 | "source": [ 270 | "# AI experience distribution\n", 271 | "ai_experience = df_survey.iloc[:,0].value_counts()\n", 272 | "print(ai_experience)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 8, 278 | "metadata": { 279 | "jupyter": { 280 | "source_hidden": true 281 | 
} 282 | }, 283 | "outputs": [ 284 | { 285 | "name": "stdout", 286 | "output_type": "stream", 287 | "text": [ 288 | "How many years of programming experience do you have?\n", 289 | "7+ years 9\n", 290 | "1-3 years 8\n", 291 | "<1 year 4\n", 292 | "4-6 years 2\n", 293 | "Name: count, dtype: int64\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "# coding experience distribution\n", 299 | "coding_experience = df_survey.iloc[:,1].value_counts()\n", 300 | "print(coding_experience)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 9, 306 | "metadata": { 307 | "jupyter": { 308 | "source_hidden": true 309 | } 310 | }, 311 | "outputs": [ 312 | { 313 | "name": "stdout", 314 | "output_type": "stream", 315 | "text": [ 316 | " Count\n", 317 | "Language \n", 318 | "Python 18\n", 319 | "C++ 7\n", 320 | "JavaScript 4\n", 321 | "PHP 3\n", 322 | "Bash 2\n", 323 | "Java 2\n", 324 | "TypeScript 1\n", 325 | "goLang 1\n", 326 | "R 1\n", 327 | "Fortran 1\n", 328 | "Algol 1\n", 329 | "Basic 1\n", 330 | "SQL 1\n", 331 | "HTML 1\n", 332 | "CSS 1\n", 333 | "C# 0\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "# coding language distribution\n", 339 | "\n", 340 | "# define coding languages\n", 341 | "lang_list = ['Python', 'PHP', 'JavaScript', 'Bash', 'C++', 'Java', 'TypeScript', 'goLang', 'C#', 'R', 'Fortran', 'Algol', 'Basic', 'SQL', 'HTML', 'CSS']\n", 342 | "\n", 343 | "# create a dictionary to store the count for each language\n", 344 | "lang_dict = {}\n", 345 | "\n", 346 | "# count occurrences of each language in the Series\n", 347 | "for lang in lang_list:\n", 348 | " lang_dict[lang] = df_survey.iloc[:,2].str.lower().str.count(r'\\b' + lang.lower() + r'\\b').sum()\n", 349 | "\n", 350 | "# convert the counts to a DataFrame for easier readability\n", 351 | "df_lang = pd.DataFrame(list(lang_dict.items()), columns=['Language', 'Count']).set_index('Language').sort_values(by='Count', ascending=False)\n", 352 | "print(df_lang)" 353 | ] 354 | }, 355 | { 356 | 
"cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "### construct message" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 10, 365 | "metadata": { 366 | "jupyter": { 367 | "source_hidden": true 368 | } 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "def remove_first_last_lines(text):\n", 373 | " lines = text.splitlines()\n", 374 | " return \"\\n\".join(lines[1:-1])\n", 375 | "\n", 376 | "def remove_first_two_lines(text):\n", 377 | " lines = text.splitlines()\n", 378 | " return \"\\n\".join(lines[2:])" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 11, 384 | "metadata": { 385 | "jupyter": { 386 | "source_hidden": true 387 | } 388 | }, 389 | "outputs": [], 390 | "source": [ 391 | "subject = \"The results are in...\"\n", 392 | "body = lambda student_name : f\"\"\"Hey {student_name},\n", 393 | "\n", 394 | "I wanted to share the pre-course survey results before our first session tomorrow.\n", 395 | "\n", 396 | "We had {num_responses} responses out of the {class_size} enrolled ({int(100*round(num_responses/class_size,2))}% response rate).\n", 397 | "\n", 398 | "Here's a breakdown of this cohort's incoming experience 👇\n", 399 | "\n", 400 | "AI/ML Experience:\n", 401 | "{remove_first_last_lines(str(ai_experience))}\n", 402 | "\n", 403 | "Programming Experience:\n", 404 | "{remove_first_last_lines(str(coding_experience))}\n", 405 | "\n", 406 | "Top Languages:\n", 407 | "{remove_first_two_lines(str(df_lang[:4]))}\n", 408 | "\n", 409 | "Looking forward to meeting you all tomorrow!\n", 410 | "\n", 411 | "-Shaw\n", 412 | "\n", 413 | "P.S. 
If you haven't completed the pre-course survey yet, you can access it here: https://forms.gle/YQ5DbbovRUf5Utvc8\"\"\"" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": 12, 419 | "metadata": { 420 | "jupyter": { 421 | "source_hidden": true 422 | } 423 | }, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "Hey Test,\n", 430 | "\n", 431 | "I wanted to share the pre-course survey results before our first session tomorrow.\n", 432 | "\n", 433 | "We had 23 responses out of the 43 enrolled (53% response rate).\n", 434 | "\n", 435 | "Here's a breakdown of this cohort's incoming experience 👇\n", 436 | "\n", 437 | "AI/ML Experience:\n", 438 | "<1 year 14\n", 439 | "1-3 years 7\n", 440 | "4-6 years 1\n", 441 | "7+ years 1\n", 442 | "\n", 443 | "Programming Experience:\n", 444 | "7+ years 9\n", 445 | "1-3 years 8\n", 446 | "<1 year 4\n", 447 | "4-6 years 2\n", 448 | "\n", 449 | "Top Languages:\n", 450 | "Python 18\n", 451 | "C++ 7\n", 452 | "JavaScript 4\n", 453 | "PHP 3\n", 454 | "\n", 455 | "Looking forward to meeting you all tomorrow!\n", 456 | "\n", 457 | "-Shaw\n", 458 | "\n", 459 | "P.S. 
If you haven't completed the pre-course survey yet, you can access it here: https://forms.gle/YQ5DbbovRUf5Utvc8\n" 460 | ] 461 | } 462 | ], 463 | "source": [ 464 | "subject = \"The results are in...\"\n", 465 | "print(body(\"Test\"))" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "### send emails" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 13, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "name": "stdout", 482 | "output_type": "stream", 483 | "text": [ 484 | "Email sent successfully to Shaw at shaw@example.com!\n", 485 | "Email sent successfully to Luca at luca@example.com!\n", 486 | "Email sent successfully to Elena at elena@example.com!\n" 487 | ] 488 | } 489 | ], 490 | "source": [ 491 | "# Define your email credentials and recipient\n", 492 | "sender_email = \"your email here\"\n", 493 | "password = app_password\n", 494 | "\n", 495 | "# Connect to the server\n", 496 | "with smtplib.SMTP(\"smtp.gmail.com\", 587) as server:\n", 497 | " server.starttls() # Start TLS encryption\n", 498 | " server.login(sender_email, password) # Log into email account\n", 499 | "\n", 500 | " # Send the email to each recipient with personalized content\n", 501 | " for name, email in recipients_list:\n", 502 | " # Set up the MIME\n", 503 | " message = MIMEMultipart()\n", 504 | " message[\"From\"] = sender_email\n", 505 | " message[\"To\"] = email\n", 506 | " message[\"Subject\"] = subject\n", 507 | " \n", 508 | " # Email body with personalization\n", 509 | " personalized_body = body(name)\n", 510 | " message.attach(MIMEText(personalized_body, \"plain\"))\n", 511 | "\n", 512 | " # Send the email\n", 513 | " server.sendmail(sender_email, email, message.as_string())\n", 514 | " print(f\"Email sent successfully to {name} at {email}!\")" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": null, 520 | "metadata": {}, 521 | "outputs": [], 522 | "source": [] 523 | } 524 | 
], 525 | "metadata": { 526 | "kernelspec": { 527 | "display_name": "Python 3 (ipykernel)", 528 | "language": "python", 529 | "name": "python3" 530 | }, 531 | "language_info": { 532 | "codemirror_mode": { 533 | "name": "ipython", 534 | "version": 3 535 | }, 536 | "file_extension": ".py", 537 | "mimetype": "text/x-python", 538 | "name": "python", 539 | "nbconvert_exporter": "python", 540 | "pygments_lexer": "ipython3", 541 | "version": "3.12.2" 542 | } 543 | }, 544 | "nbformat": 4, 545 | "nbformat_minor": 4 546 | } 547 | -------------------------------------------------------------------------------- /session-1/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | ipykernel 3 | pandas 4 | -------------------------------------------------------------------------------- /session-1/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-1/slides.pdf -------------------------------------------------------------------------------- /session-1/top_secret.py: -------------------------------------------------------------------------------- 1 | app_password = "paste app password here" -------------------------------------------------------------------------------- /session-2/data/transformed_data.csv: -------------------------------------------------------------------------------- 1 | body_length,contains_lol,contains_omg,contains_attached,contains_attachment,contains_order_confirmation,contains_payment_summary,sender_has_common_domain,is_personal 2 | 1830,False,False,False,False,False,False,False,True 3 | 1522,False,False,False,False,False,False,True,True 4 | 1838,False,False,False,False,False,False,False,True 5 | 251,False,False,False,True,False,False,False,True 6 | 3170,False,False,False,False,False,False,False,True 7 | 
2782,False,False,False,False,False,False,False,True 8 | 11026,False,False,False,False,False,False,False,True 9 | 7025,False,False,False,False,False,False,False,True 10 | 3265,False,False,False,True,False,False,True,True 11 | 3007,False,False,False,True,False,False,False,True 12 | 1954,False,False,False,False,False,False,True,True 13 | 1600,False,False,False,False,False,False,False,True 14 | 1841,False,False,False,False,False,False,False,True 15 | 1841,False,False,False,False,False,False,False,True 16 | 201,False,False,False,False,False,False,False,True 17 | 8520,False,False,False,False,False,False,False,True 18 | 7454,False,False,False,False,False,False,False,True 19 | 8520,False,False,False,False,False,False,False,True 20 | 8520,False,False,False,False,False,False,False,True 21 | 201,False,False,False,False,False,False,False,True 22 | 201,False,False,False,False,False,False,False,True 23 | 3801,False,False,False,False,False,False,False,True 24 | 1068,False,False,False,False,False,False,False,True 25 | 1593,False,False,False,False,False,False,False,True 26 | 1955,False,False,False,False,False,False,False,True 27 | 4499,False,False,False,False,False,False,False,True 28 | 5088,False,False,False,False,False,False,False,True 29 | 57838,False,False,False,False,False,False,False,True 30 | 4420,False,False,False,False,False,False,False,True 31 | 384,True,False,True,False,False,False,True,True 32 | 298,False,False,True,False,False,False,True,True 33 | 845,False,False,False,False,False,False,True,True 34 | 115,False,False,False,False,False,False,True,True 35 | 2,False,False,False,False,False,False,True,True 36 | 1376,False,False,False,False,False,False,True,True 37 | 1146,False,False,False,False,False,False,True,True 38 | 822,True,False,False,True,False,False,True,True 39 | 6847,False,False,True,False,False,False,True,True 40 | 6323,False,False,False,False,False,False,True,True 41 | 4230,False,False,False,False,False,False,True,True 42 | 
2020,False,False,False,True,False,False,True,True 43 | 1608,False,False,False,False,False,False,True,True 44 | 1119,False,False,False,False,False,False,False,True 45 | 773,False,False,False,False,False,False,True,True 46 | 640,False,False,False,False,False,False,False,True 47 | 2246,False,False,False,False,False,False,False,True 48 | 1865,False,False,True,False,False,False,False,True 49 | 4720,False,False,False,False,False,False,False,True 50 | 16327,False,False,True,False,False,False,False,True 51 | 6165,False,False,False,True,False,False,True,True 52 | 1931,False,False,False,False,False,False,False,True 53 | 325,False,False,False,False,False,False,True,True 54 | 926,False,False,False,False,False,False,True,True 55 | 776,False,False,False,False,False,False,True,True 56 | 260,False,False,False,True,False,False,True,True 57 | 1405,True,False,False,False,False,False,True,True 58 | 1338,True,False,False,False,False,False,False,True 59 | 13259,False,False,False,False,False,False,True,True 60 | 0,False,False,False,False,False,False,False,True 61 | 53,False,False,False,False,False,False,True,True 62 | 2040,False,False,False,False,False,False,True,True 63 | 14690,False,False,False,False,False,False,True,True 64 | 4830,False,False,True,False,False,False,True,True 65 | 3466,False,False,False,False,False,False,True,True 66 | 3181,False,False,False,False,False,False,True,True 67 | 10438,False,False,False,False,False,False,True,True 68 | 1470,False,False,False,True,True,False,True,True 69 | 1321,False,False,False,False,False,False,True,True 70 | 580,False,False,False,False,False,False,True,True 71 | 0,False,False,False,False,False,False,False,True 72 | 17408,False,False,False,False,False,False,True,True 73 | 16971,False,False,False,False,False,False,True,True 74 | 22849,False,False,False,False,False,False,True,True 75 | 728,False,False,False,False,False,False,True,True 76 | 599,False,False,False,False,False,False,True,True 77 | 480,False,False,False,False,False,False,True,True 
78 | 367,False,False,False,False,False,False,True,True 79 | 4,False,False,False,False,False,False,True,True 80 | 1552,False,False,False,False,False,False,True,True 81 | 331,False,False,False,False,False,False,False,True 82 | 85,False,False,False,False,False,False,True,True 83 | 34,False,False,False,False,False,False,True,True 84 | 1247,False,False,False,False,False,False,True,True 85 | 3027,False,False,False,False,False,False,True,True 86 | 23190,True,False,False,False,False,False,True,True 87 | 14691,False,False,False,False,False,False,True,True 88 | 23705,False,False,False,False,False,False,True,True 89 | 1371,False,False,False,False,False,False,True,True 90 | 1373,False,False,False,False,False,False,True,True 91 | 21076,False,False,False,False,True,False,True,True 92 | 2640,False,False,False,False,True,False,False,True 93 | 3511,False,False,False,False,False,False,False,True 94 | 27,False,False,False,False,False,False,True,True 95 | 2534,False,False,False,False,False,False,True,True 96 | 4220,False,False,False,False,False,False,True,True 97 | 4041,False,False,False,False,False,False,True,True 98 | 1073,False,False,False,False,False,False,False,True 99 | 11,False,False,False,False,False,False,True,True 100 | 85,False,False,False,False,False,False,True,True 101 | 471,False,False,True,False,False,False,True,True 102 | 381,False,False,True,False,False,False,True,True 103 | 211,False,False,False,False,False,False,True,True 104 | 5258,False,False,False,False,False,False,True,True 105 | 4017,False,False,False,False,False,False,True,True 106 | 17,False,False,False,False,False,False,True,True 107 | 4884,False,False,False,False,True,False,True,True 108 | 5083,False,False,False,False,False,False,True,True 109 | 5301,False,False,False,False,False,False,True,True 110 | 4791,False,False,False,False,False,False,True,True 111 | 4577,False,False,False,False,False,False,True,True 112 | 7449,False,False,False,False,False,False,False,True 113 | 
861,False,False,False,False,False,False,True,True 114 | 501,False,False,False,False,False,False,True,True 115 | 390,False,False,False,False,False,False,True,True 116 | 284,False,False,False,True,False,False,False,True 117 | 258,False,False,False,True,False,False,False,True 118 | 532,False,False,False,False,False,False,True,True 119 | 360,False,False,False,False,False,False,True,True 120 | 654,False,False,False,False,False,False,True,True 121 | 127,False,False,False,False,False,False,True,True 122 | 31,False,False,False,False,False,False,True,True 123 | 1323,False,False,False,False,False,False,True,True 124 | 1289,False,False,False,False,False,False,True,True 125 | 4,False,False,False,False,False,False,True,True 126 | 3313,False,False,False,False,False,False,True,True 127 | 305,False,False,False,False,False,False,True,True 128 | 201,False,False,False,False,False,False,True,True 129 | 409,False,False,False,False,False,False,True,True 130 | 3035,True,False,False,False,False,False,True,True 131 | 2857,False,False,False,False,False,False,True,True 132 | 2624,False,False,False,False,False,False,True,True 133 | 2375,False,False,False,False,False,False,True,True 134 | 17911,False,False,False,False,False,False,True,True 135 | 257,False,False,False,True,False,False,False,True 136 | 492,False,False,False,False,False,False,True,True 137 | 8092,False,False,False,False,False,False,True,True 138 | 1772,False,False,False,False,False,False,True,True 139 | 8239,False,False,False,False,False,False,True,True 140 | 11113,False,False,False,False,False,False,True,True 141 | 4437,False,False,True,False,False,False,True,True 142 | 3844,False,False,True,False,False,False,True,True 143 | 2931,False,False,True,False,False,False,True,True 144 | 314,False,False,False,True,False,False,False,True 145 | 3603,False,False,True,False,False,False,True,True 146 | 6514,False,False,True,False,False,False,True,True 147 | 3337,False,False,False,False,False,False,True,True 148 | 
3244,False,False,False,False,False,False,True,True 149 | 1529,False,False,False,False,False,False,False,True 150 | 23901,False,True,False,False,False,False,True,True 151 | 23428,False,True,False,False,False,False,True,True 152 | 1616,False,False,False,False,False,False,False,False 153 | 1164,False,False,False,False,False,False,True,False 154 | 16654,False,False,False,False,False,False,False,False 155 | 78,False,False,False,False,False,False,True,False 156 | 2550,False,False,False,False,False,False,False,False 157 | 2452,False,False,False,False,False,False,False,False 158 | 1737,False,False,False,False,False,False,False,False 159 | 1542,False,False,False,False,False,False,False,False 160 | 1107,False,False,False,False,False,False,False,False 161 | 1641,False,False,False,False,False,False,False,False 162 | 1713,False,False,False,False,False,False,False,False 163 | 1737,False,False,False,False,False,False,False,False 164 | 1074,False,False,False,False,False,False,False,False 165 | 4451,True,False,False,False,False,False,False,False 166 | 21592,False,False,False,False,False,False,False,False 167 | 9581,False,False,False,False,False,False,False,False 168 | 4086,False,False,False,False,False,False,False,False 169 | 11073,False,False,False,False,False,False,False,False 170 | 1456,False,False,False,False,False,False,False,False 171 | 720,False,False,False,False,False,False,False,False 172 | 2422,False,False,False,False,False,False,False,False 173 | 3766,False,True,False,False,False,False,False,False 174 | 1477,False,False,False,False,False,False,False,False 175 | 2712,False,False,False,False,False,False,False,False 176 | 2295,False,False,False,False,False,False,False,False 177 | 2841,False,False,False,False,False,False,False,False 178 | 3427,False,False,False,False,False,False,False,False 179 | 2872,False,False,False,False,False,False,True,False 180 | 2490,False,False,False,False,False,False,False,False 181 | 677,False,False,False,False,False,False,False,False 182 | 
2283,False,False,False,False,False,False,False,False 183 | 528,False,False,False,False,False,False,False,False 184 | 3338,False,False,False,False,False,False,True,False 185 | 2983,False,False,False,False,False,False,True,False 186 | 0,False,False,False,False,False,False,False,False 187 | 4790,True,True,False,False,False,False,True,False 188 | 0,False,False,False,False,False,False,True,False 189 | 25289,False,False,False,False,False,False,False,False 190 | 58,False,False,False,False,False,False,True,False 191 | 1947,False,False,True,False,False,False,True,False 192 | 883,False,False,False,False,False,False,False,False 193 | 855,False,False,False,False,False,False,False,False 194 | 2035,False,False,False,False,False,False,False,False 195 | 2424,False,False,False,False,False,False,False,False 196 | 605,False,False,False,False,False,True,False,False 197 | 612,False,False,False,False,False,True,False,False 198 | 617,False,False,False,False,False,True,False,False 199 | 615,False,False,False,False,False,True,False,False 200 | 617,False,False,False,False,False,True,False,False 201 | 3975,False,False,False,False,False,False,False,False 202 | 616,False,False,False,False,False,True,False,False 203 | 6167,False,False,False,False,False,False,False,False 204 | 615,False,False,False,False,False,True,False,False 205 | 613,False,False,False,False,False,True,False,False 206 | 607,False,False,False,False,False,True,False,False 207 | 636,False,False,False,False,False,True,False,False 208 | 633,False,False,False,False,False,True,False,False 209 | 635,False,False,False,False,False,True,False,False 210 | 3017,False,False,False,False,False,False,False,False 211 | 3017,False,False,False,False,False,False,False,False 212 | 637,False,False,False,False,False,True,False,False 213 | 644,False,False,False,False,False,True,False,False 214 | 18824,False,False,False,False,False,False,False,False 215 | 647,False,False,False,False,False,True,False,False 216 | 
9982,False,False,False,False,False,False,False,False 217 | 1333,False,False,False,False,False,False,False,False 218 | 9158,False,False,False,False,False,False,False,False 219 | 645,False,False,False,False,False,True,False,False 220 | 9573,False,False,False,False,False,False,False,False 221 | 20297,False,False,False,False,False,False,False,False 222 | 10222,False,False,False,False,False,False,False,False 223 | 647,False,False,False,False,False,True,False,False 224 | 7078,False,False,False,False,False,False,False,False 225 | 646,False,False,False,False,False,True,False,False 226 | 3926,False,False,False,False,False,False,False,False 227 | 1276,False,False,False,False,False,False,False,False 228 | 4223,False,False,False,False,False,False,False,False 229 | 645,False,False,False,False,False,True,False,False 230 | 3627,False,False,False,False,False,False,False,False 231 | 1148,False,False,False,False,False,False,False,False 232 | 3825,False,False,False,False,False,False,False,False 233 | 644,False,False,False,False,False,True,False,False 234 | 18682,False,False,False,False,False,False,False,False 235 | 638,False,False,False,False,False,True,False,False 236 | 5736,False,False,False,False,False,False,False,False 237 | 9269,False,False,False,False,False,False,False,False 238 | 9326,False,False,False,False,False,False,False,False 239 | 11111,False,False,False,False,False,False,False,False 240 | 636,False,False,False,False,False,True,False,False 241 | 3200,False,False,False,False,False,False,False,False 242 | 11024,False,False,False,False,False,False,False,False 243 | 633,False,False,False,False,False,True,False,False 244 | 14645,False,False,False,False,False,False,False,False 245 | 14231,False,False,False,False,False,False,False,False 246 | 635,False,False,False,False,False,True,False,False 247 | 2439,False,False,False,False,False,False,False,False 248 | 2439,False,False,False,False,False,False,False,False 249 | 14950,False,False,False,False,False,False,False,False 250 | 
25518,False,False,False,False,False,False,False,False 251 | 17292,False,False,False,False,False,False,False,False 252 | 22354,False,False,False,False,False,False,False,False 253 | 637,False,False,False,False,False,True,False,False 254 | 30169,False,False,False,False,False,False,False,False 255 | 2464,False,False,False,False,False,False,False,False 256 | 535,False,False,False,False,False,False,False,False 257 | 31044,False,False,False,False,False,False,False,False 258 | 37546,False,False,False,False,False,False,False,False 259 | 2463,False,False,False,False,False,False,False,False 260 | 22601,False,False,False,False,False,False,False,False 261 | 15373,False,False,False,False,False,False,False,False 262 | 11570,False,False,False,False,False,False,False,False 263 | 2812,False,False,False,False,False,False,False,False 264 | 21623,False,False,False,False,False,False,False,False 265 | 18536,False,False,False,False,False,False,False,False 266 | 23076,False,False,False,False,False,False,False,False 267 | 2464,False,False,False,False,False,False,False,False 268 | 13046,False,False,False,False,False,False,False,False 269 | 12586,False,False,False,False,False,False,False,False 270 | 2459,False,False,False,False,False,False,False,False 271 | 15562,False,False,False,False,False,False,False,False 272 | 2460,False,False,False,False,False,False,False,False 273 | 28681,False,False,False,False,False,False,False,False 274 | 3185,False,False,False,False,False,False,False,False 275 | 33824,False,False,False,False,False,False,False,False 276 | 2455,False,False,False,False,False,False,False,False 277 | 27622,False,False,False,False,False,False,False,False 278 | 25305,False,False,False,False,False,False,False,False 279 | 2452,False,False,False,False,False,False,False,False 280 | 3248,False,False,False,False,False,False,False,False 281 | 501,False,False,False,False,False,False,False,False 282 | 14414,False,False,False,False,False,False,False,False 283 | 
14563,False,False,False,False,False,False,False,False 284 | 20847,False,False,False,False,False,False,False,False 285 | 2450,False,False,False,False,False,False,False,False 286 | 20246,False,False,False,False,False,False,False,False 287 | 21494,False,False,False,False,False,False,False,False 288 | 15083,False,False,False,False,False,False,False,False 289 | 2448,False,False,False,False,False,False,False,False 290 | 13933,False,False,False,False,False,False,False,False 291 | 11642,False,False,False,False,False,False,False,False 292 | 2456,False,False,False,False,False,False,False,False 293 | 2450,False,False,False,False,False,False,False,False 294 | 2456,False,False,False,False,False,False,False,False 295 | 19151,False,False,False,False,False,False,False,False 296 | 20385,False,False,False,False,False,False,False,False 297 | 2453,False,False,False,False,False,False,False,False 298 | 12549,False,False,False,False,False,False,False,False 299 | 27430,False,False,False,False,False,False,False,False 300 | 11894,False,False,False,False,False,False,False,False 301 | 25131,False,False,False,False,False,False,False,False 302 | -------------------------------------------------------------------------------- /session-2/example_1-gmail_ETL.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "773f2b77-a3c6-47b8-8246-1b958555f4af", 6 | "metadata": {}, 7 | "source": [ 8 | "# ETL of Gmail Data\n", 9 | "## ABB #1 - Session 2\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "62058d9d-adc2-4a6f-a60e-ff2abca7d93b", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "aa6c91d1-058e-49a7-8296-75bb925c2e8b", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import imaplib\n", 30 | "import email\n", 31 | "from email.policy 
import default\n", 32 | "import pandas as pd\n", 33 | "\n", 34 | "from top_secret import app_password" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "7e99a8de", 40 | "metadata": {}, 41 | "source": [ 42 | "### functions" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "id": "eb2bc87b-3641-4953-88de-261aaabd48ca", 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "def parse_emails(mail, mailbox_name):\n", 53 | " \"\"\"\n", 54 | " Parse emails from a given mailbox and return a dataframe with the emails and labels.\n", 55 | " \"\"\"\n", 56 | "\n", 57 | " # Step 2: Select the folder\n", 58 | " mail.select(mailbox_name) # 'inbox' or 'spam'\n", 59 | "\n", 60 | " # Step 3: Search for emails\n", 61 | " _, messages = mail.search(None, \"ALL\")\n", 62 | "\n", 63 | " # Step 4: Extract emails\n", 64 | " # Fetch and parse the emails\n", 65 | " email_data = []\n", 66 | " for msg_id in messages[0].split():\n", 67 | " _, msg_data = mail.fetch(msg_id, \"(RFC822)\")\n", 68 | " \n", 69 | " for response_part in msg_data:\n", 70 | " if isinstance(response_part, tuple):\n", 71 | " msg = email.message_from_bytes(response_part[1], policy=default)\n", 72 | " subject = msg[\"subject\"]\n", 73 | " from_ = msg[\"from\"]\n", 74 | " if msg.is_multipart():\n", 75 | " body = \"\"\n", 76 | " for part in msg.walk():\n", 77 | " if part.get_content_type() == \"text/plain\":\n", 78 | " body += part.get_payload(decode=True).decode('utf-8', errors='ignore')\n", 79 | " else:\n", 80 | " body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')\n", 81 | "\n", 82 | " email_data.append({\n", 83 | " \"subject\": subject,\n", 84 | " \"from\": from_,\n", 85 | " \"body\": body\n", 86 | " })\n", 87 | "\n", 88 | " return email_data" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 3, 94 | "id": "4300d448", 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "def extract_folder_names(mail, keyword):\n", 99 | " 
\"\"\"\n", 100 | " Extract folder names containing a given keyword.\n", 101 | " \"\"\"\n", 102 | " # Get list of folders\n", 103 | " _, folders = mail.list()\n", 104 | "\n", 105 | " # Extract folder names containing \"Personal\"\n", 106 | " folder_list = []\n", 107 | " for folder in folders:\n", 108 | " # Decode from bytes to string and split by quotes\n", 109 | " folder_str = folder.decode('utf-8')\n", 110 | " # Extract folder name (it's between the last quote marks)\n", 111 | " folder_name = folder_str.split('\"')[-2]\n", 112 | " \n", 113 | " if keyword in folder_name:\n", 114 | " folder_list.append(folder_name)\n", 115 | "\n", 116 | " return folder_list" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 4, 122 | "id": "dd1396b8", 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "def parse_emails_from_keyword(mail, num_emails, keyword, label):\n", 127 | " \"\"\"\n", 128 | " Parse emails from a given keyword and return a dataframe with the emails and labels.\n", 129 | " \"\"\"\n", 130 | "\n", 131 | " folder_list = extract_folder_names(mail, keyword)\n", 132 | "\n", 133 | " email_data = []\n", 134 | " for mailbox_name in folder_list:\n", 135 | " try:\n", 136 | " email_data = email_data + parse_emails(mail, mailbox_name)\n", 137 | " except:\n", 138 | " print(f\"Could not parse {mailbox_name}\")\n", 139 | "\n", 140 | " if len(email_data) > num_emails:\n", 141 | " break\n", 142 | "\n", 143 | " df = pd.DataFrame(email_data)\n", 144 | " df[\"label\"] = label\n", 145 | " \n", 146 | " return df" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "id": "6f0e9770-2ec3-47ea-bd42-884c53a40918", 152 | "metadata": {}, 153 | "source": [ 154 | "### Extract" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 5, 160 | "id": "6b44d835", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "data": { 165 | "text/plain": [ 166 | "('OK', [b'swcballa@gmail.com authenticated (Success)'])" 167 | ] 168 | }, 
169 | "execution_count": 5, 170 | "metadata": {}, 171 | "output_type": "execute_result" 172 | } 173 | ], 174 | "source": [ 175 | "# define email details\n", 176 | "imap_server = \"imap.gmail.com\"\n", 177 | "email_address = \"your email here\"\n", 178 | "\n", 179 | "# establish connection to the server\n", 180 | "mail = imaplib.IMAP4_SSL(imap_server)\n", 181 | "mail.login(email_address, app_password)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 6, 187 | "id": "39722d2a", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "Could not parse Personal/Family/Amoo Ali\n", 195 | "254\n" 196 | ] 197 | } 198 | ], 199 | "source": [ 200 | "# extract personal emails\n", 201 | "num_emails = 150\n", 202 | "keyword = \"Personal\"\n", 203 | "label = \"personal\"\n", 204 | "\n", 205 | "df_personal = parse_emails_from_keyword(mail, num_emails, keyword, label)\n", 206 | "print(len(df_personal))" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "id": "dcb6e0c1", 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "Could not parse STV/Amazon Assoc\n", 220 | "160\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "# extract non-personal emails\n", 226 | "keyword = \"STV\"\n", 227 | "label = \"not personal\"\n", 228 | "\n", 229 | "df_not_personal = parse_emails_from_keyword(mail, num_emails, keyword, label)\n", 230 | "print(len(df_not_personal))" 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 8, 236 | "id": "2aec83d4", 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "('BYE', [b'LOGOUT Requested'])" 243 | ] 244 | }, 245 | "execution_count": 8, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "# log out of the server\n", 252 | "mail.logout()" 253 | 
] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "id": "4ed3e5b4", 258 | "metadata": {}, 259 | "source": [ 260 | "### Transform" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 9, 266 | "id": "35ea97fc", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "300\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "# merge emails into a single dataframe\n", 279 | "df = pd.concat([df_personal[:num_emails], df_not_personal[:num_emails]])\n", 280 | "print(len(df))" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 10, 286 | "id": "9534cb4a", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# feature engineering\n", 291 | "\n", 292 | "# length of the body\n", 293 | "df[\"body_length\"] = df[\"body\"].apply(len)\n", 294 | "\n", 295 | "# contains \"lol\"\n", 296 | "df[\"contains_lol\"] = df[\"body\"].apply(lambda x: \"lol\" in x.lower())\n", 297 | "\n", 298 | "# contains \"OMG\"\n", 299 | "df[\"contains_omg\"] = df[\"body\"].apply(lambda x: \"omg\" in x.lower())\n", 300 | "\n", 301 | "# contains \"attached\" or \"see attached\" or \"see attachment\"\n", 302 | "df[\"contains_attached\"] = df[\"body\"].apply(lambda x: \"attached\" in x.lower())\n", 303 | "df[\"contains_attachment\"] = df[\"body\"].apply(lambda x: \"attachment\" in x.lower())\n", 304 | "\n", 305 | "# contains \"Order Confirmation\"\n", 306 | "df[\"contains_order_confirmation\"] = df[\"body\"].apply(lambda x: \"order confirmation\" in x.lower())\n", 307 | "\n", 308 | "# contains \"payment summary\"\n", 309 | "df[\"contains_payment_summary\"] = df[\"body\"].apply(lambda x: \"payment summary\" in x.lower())\n", 310 | "\n", 311 | "# sender is common person domain (gmail, yahoo, hotmail) \n", 312 | "df[\"sender_has_common_domain\"] = df[\"from\"].apply(lambda x: any(domain in x.lower() for domain in [\"gmail\", \"yahoo\", \"hotmail\"]))\n", 313 | "\n", 314 | "# 
is personal email\n", 315 | "df[\"is_personal\"] = df[\"label\"] == \"personal\"" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 11, 321 | "id": "e7c1e97f", 322 | "metadata": {}, 323 | "outputs": [ 324 | { 325 | "data": { 326 | "text/html": [ 327 | "
\n", 328 | "\n", 341 | "\n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | "
body_lengthcontains_lolcontains_omgcontains_attachedcontains_attachmentcontains_order_confirmationcontains_payment_summarysender_has_common_domainis_personal
01830FalseFalseFalseFalseFalseFalseFalseTrue
11522FalseFalseFalseFalseFalseFalseTrueTrue
21838FalseFalseFalseFalseFalseFalseFalseTrue
3251FalseFalseFalseTrueFalseFalseFalseTrue
43170FalseFalseFalseFalseFalseFalseFalseTrue
\n", 419 | "
" 420 | ], 421 | "text/plain": [ 422 | " body_length contains_lol contains_omg contains_attached \\\n", 423 | "0 1830 False False False \n", 424 | "1 1522 False False False \n", 425 | "2 1838 False False False \n", 426 | "3 251 False False False \n", 427 | "4 3170 False False False \n", 428 | "\n", 429 | " contains_attachment contains_order_confirmation contains_payment_summary \\\n", 430 | "0 False False False \n", 431 | "1 False False False \n", 432 | "2 False False False \n", 433 | "3 True False False \n", 434 | "4 False False False \n", 435 | "\n", 436 | " sender_has_common_domain is_personal \n", 437 | "0 False True \n", 438 | "1 True True \n", 439 | "2 False True \n", 440 | "3 False True \n", 441 | "4 False True " 442 | ] 443 | }, 444 | "execution_count": 11, 445 | "metadata": {}, 446 | "output_type": "execute_result" 447 | } 448 | ], 449 | "source": [ 450 | "# drop original columns\n", 451 | "df_transformed = df.iloc[:, 4:]\n", 452 | "df_transformed.head()" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "id": "3658bda0", 458 | "metadata": {}, 459 | "source": [ 460 | "### Load" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": 12, 466 | "id": "55644e43", 467 | "metadata": {}, 468 | "outputs": [], 469 | "source": [ 470 | "# save to csv\n", 471 | "df_transformed.to_csv(\"data/transformed_data.csv\", index=False)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "e36b6ea9", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 3 (ipykernel)", 486 | "language": "python", 487 | "name": "python3" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 3 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython3", 499 | 
"version": "3.9.12" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 5 504 | } 505 | -------------------------------------------------------------------------------- /session-2/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | ipykernel 3 | pandas 4 | scikit-learn 5 | matplotlib 6 | numpy 7 | -------------------------------------------------------------------------------- /session-2/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-2/slides.pdf -------------------------------------------------------------------------------- /session-2/top_secret.py: -------------------------------------------------------------------------------- 1 | app_password = "your app password here" -------------------------------------------------------------------------------- /session-3/data/transformed_data.csv: -------------------------------------------------------------------------------- 1 | body_length,contains_lol,contains_omg,contains_attached,contains_attachment,contains_order_confirmation,contains_payment_summary,sender_has_common_domain,is_personal,label_0shot,correct_0shot,label_fewshot,correct_fewshot 2 | 1830,False,False,False,False,False,False,False,True,False,False,False,False 3 | 1522,False,False,False,False,False,False,True,True,True,True,True,True 4 | 1838,False,False,False,False,False,False,False,True,False,False,False,False 5 | 251,False,False,False,True,False,False,False,True,False,False,False,False 6 | 3170,False,False,False,False,False,False,False,True,False,False,False,False 7 | 2782,False,False,False,False,False,False,False,True,False,False,False,False 8 | 11026,False,False,False,False,False,False,False,True,False,False,False,False 9 | 7025,False,False,False,False,False,False,False,True,False,False,False,False 10 | 
3265,False,False,False,True,False,False,True,True,False,False,True,True 11 | 3007,False,False,False,True,False,False,False,True,False,False,False,False 12 | 1954,False,False,False,False,False,False,True,True,False,False,True,True 13 | 1600,False,False,False,False,False,False,False,True,False,False,False,False 14 | 1841,False,False,False,False,False,False,False,True,False,False,True,True 15 | 1841,False,False,False,False,False,False,False,True,False,False,True,True 16 | 2562,False,False,False,False,False,False,True,True,False,False,False,False 17 | 201,False,False,False,False,False,False,False,True,False,False,False,False 18 | 8520,False,False,False,False,False,False,False,True,False,False,False,False 19 | 7454,False,False,False,False,False,False,False,True,False,False,False,False 20 | 8520,False,False,False,False,False,False,False,True,False,False,False,False 21 | 8520,False,False,False,False,False,False,False,True,False,False,False,False 22 | 201,False,False,False,False,False,False,False,True,False,False,False,False 23 | 201,False,False,False,False,False,False,False,True,False,False,False,False 24 | 3801,False,False,False,False,False,False,False,True,False,False,False,False 25 | 1068,False,False,False,False,False,False,False,True,True,True,True,True 26 | 1593,False,False,False,False,False,False,False,True,True,True,True,True 27 | 1955,False,False,False,False,False,False,False,True,True,True,True,True 28 | 4499,False,False,False,False,False,False,False,True,True,True,True,True 29 | 5088,False,False,False,False,False,False,False,True,True,True,True,True 30 | 57838,False,False,False,False,False,False,False,True,False,False,True,True 31 | 4420,False,False,False,False,False,False,False,True,True,True,True,True 32 | 384,True,False,True,False,False,False,True,True,False,False,True,True 33 | 298,False,False,True,False,False,False,True,True,False,False,False,False 34 | 845,False,False,False,False,False,False,True,True,False,False,True,True 35 | 
115,False,False,False,False,False,False,True,True,False,False,True,True 36 | 2,False,False,False,False,False,False,True,True,False,False,True,True 37 | 1376,False,False,False,False,False,False,True,True,False,False,False,False 38 | 1146,False,False,False,False,False,False,True,True,False,False,False,False 39 | 822,True,False,False,True,False,False,True,True,False,False,False,False 40 | 6847,False,False,True,False,False,False,True,True,False,False,False,False 41 | 6323,False,False,False,False,False,False,True,True,False,False,False,False 42 | 4230,False,False,False,False,False,False,True,True,False,False,False,False 43 | 2020,False,False,False,True,False,False,True,True,False,False,False,False 44 | 1608,False,False,False,False,False,False,True,True,False,False,False,False 45 | 1119,False,False,False,False,False,False,False,True,False,False,False,False 46 | 773,False,False,False,False,False,False,True,True,False,False,False,False 47 | 640,False,False,False,False,False,False,False,True,False,False,False,False 48 | 2246,False,False,False,False,False,False,False,True,False,False,False,False 49 | 1865,False,False,True,False,False,False,False,True,False,False,False,False 50 | 4720,False,False,False,False,False,False,False,True,False,False,False,False 51 | 16327,False,False,True,False,False,False,False,True,False,False,False,False 52 | 6165,False,False,False,True,False,False,True,True,False,False,False,False 53 | 1931,False,False,False,False,False,False,False,True,False,False,True,True 54 | 325,False,False,False,False,False,False,True,True,False,False,False,False 55 | 926,False,False,False,False,False,False,True,True,True,True,True,True 56 | 776,False,False,False,False,False,False,True,True,False,False,False,False 57 | 260,False,False,False,True,False,False,True,True,False,False,False,False 58 | 1405,True,False,False,False,False,False,True,True,False,False,False,False 59 | 1338,True,False,False,False,False,False,False,True,False,False,False,False 60 | 
13259,False,False,False,False,False,False,True,True,False,False,False,False 61 | 3,False,False,False,False,False,False,False,True,False,False,False,False 62 | 53,False,False,False,False,False,False,True,True,True,True,True,True 63 | 2040,False,False,False,False,False,False,True,True,False,False,True,True 64 | 14690,False,False,False,False,False,False,True,True,False,False,False,False 65 | 4830,False,False,True,False,False,False,True,True,False,False,True,True 66 | 3466,False,False,False,False,False,False,True,True,False,False,False,False 67 | 3181,False,False,False,False,False,False,True,True,False,False,False,False 68 | 10438,False,False,False,False,False,False,True,True,False,False,False,False 69 | 1470,False,False,False,True,True,False,True,True,False,False,False,False 70 | 1321,False,False,False,False,False,False,True,True,False,False,True,True 71 | 580,False,False,False,False,False,False,True,True,False,False,True,True 72 | 3,False,False,False,False,False,False,False,True,False,False,False,False 73 | 17408,False,False,False,False,False,False,True,True,True,True,True,True 74 | 16971,False,False,False,False,False,False,True,True,True,True,True,True 75 | 22849,False,False,False,False,False,False,True,True,False,False,True,True 76 | 728,False,False,False,False,False,False,True,True,False,False,True,True 77 | 599,False,False,False,False,False,False,True,True,False,False,True,True 78 | 480,False,False,False,False,False,False,True,True,False,False,True,True 79 | 367,False,False,False,False,False,False,True,True,False,False,True,True 80 | 4,False,False,False,False,False,False,True,True,True,True,True,True 81 | 1552,False,False,False,False,False,False,True,True,False,False,False,False 82 | 331,False,False,False,False,False,False,False,True,False,False,False,False 83 | 85,False,False,False,False,False,False,True,True,True,True,True,True 84 | 34,False,False,False,False,False,False,True,True,True,True,True,True 85 | 
1247,False,False,False,False,False,False,True,True,False,False,True,True 86 | 3027,False,False,False,False,False,False,True,True,False,False,True,True 87 | 23190,True,False,False,False,False,False,True,True,False,False,False,False 88 | 14691,False,False,False,False,False,False,True,True,False,False,True,True 89 | 23705,False,False,False,False,False,False,True,True,True,True,True,True 90 | 1371,False,False,False,False,False,False,True,True,False,False,True,True 91 | 1373,False,False,False,False,False,False,True,True,False,False,True,True 92 | 21076,False,False,False,False,True,False,True,True,False,False,False,False 93 | 2640,False,False,False,False,True,False,False,True,False,False,False,False 94 | 3511,False,False,False,False,False,False,False,True,False,False,False,False 95 | 27,False,False,False,False,False,False,True,True,True,True,True,True 96 | 2534,False,False,False,False,False,False,True,True,False,False,False,False 97 | 4220,False,False,False,False,False,False,True,True,False,False,False,False 98 | 4041,False,False,False,False,False,False,True,True,False,False,False,False 99 | 1073,False,False,False,False,False,False,False,True,True,True,True,True 100 | 11,False,False,False,False,False,False,True,True,True,True,True,True 101 | 85,False,False,False,False,False,False,True,True,True,True,True,True 102 | 471,False,False,True,False,False,False,True,True,False,False,False,False 103 | 381,False,False,True,False,False,False,True,True,False,False,True,True 104 | 211,False,False,False,False,False,False,True,True,False,False,True,True 105 | 5258,False,False,False,False,False,False,True,True,False,False,True,True 106 | 4017,False,False,False,False,False,False,True,True,True,True,True,True 107 | 17,False,False,False,False,False,False,True,True,True,True,True,True 108 | 4884,False,False,False,False,True,False,True,True,False,False,False,False 109 | 5083,False,False,False,False,False,False,True,True,False,False,False,False 110 | 
5301,False,False,False,False,False,False,True,True,False,False,False,False 111 | 4791,False,False,False,False,False,False,True,True,False,False,False,False 112 | 4577,False,False,False,False,False,False,True,True,False,False,False,False 113 | 7449,False,False,False,False,False,False,False,True,False,False,False,False 114 | 861,False,False,False,False,False,False,True,True,True,True,True,True 115 | 501,False,False,False,False,False,False,True,True,True,True,True,True 116 | 390,False,False,False,False,False,False,True,True,True,True,True,True 117 | 284,False,False,False,True,False,False,False,True,False,False,False,False 118 | 258,False,False,False,True,False,False,False,True,False,False,False,False 119 | 532,False,False,False,False,False,False,True,True,True,True,True,True 120 | 360,False,False,False,False,False,False,True,True,True,True,True,True 121 | 654,False,False,False,False,False,False,True,True,True,True,True,True 122 | 127,False,False,False,False,False,False,True,True,False,False,True,True 123 | 31,False,False,False,False,False,False,True,True,True,True,True,True 124 | 1323,False,False,False,False,False,False,True,True,False,False,True,True 125 | 1289,False,False,False,False,False,False,True,True,False,False,True,True 126 | 4,False,False,False,False,False,False,True,True,False,False,True,True 127 | 3313,False,False,False,False,False,False,True,True,True,True,True,True 128 | 305,False,False,False,False,False,False,True,True,False,False,True,True 129 | 201,False,False,False,False,False,False,True,True,False,False,False,False 130 | 409,False,False,False,False,False,False,True,True,True,True,True,True 131 | 3035,True,False,False,False,False,False,True,True,True,True,True,True 132 | 2857,False,False,False,False,False,False,True,True,True,True,True,True 133 | 2624,False,False,False,False,False,False,True,True,True,True,True,True 134 | 2375,False,False,False,False,False,False,True,True,True,True,True,True 135 | 
17911,False,False,False,False,False,False,True,True,False,False,False,False 136 | 257,False,False,False,True,False,False,False,True,True,True,True,True 137 | 492,False,False,False,False,False,False,True,True,False,False,False,False 138 | 8092,False,False,False,False,False,False,True,True,False,False,False,False 139 | 1772,False,False,False,False,False,False,True,True,False,False,False,False 140 | 8239,False,False,False,False,False,False,True,True,False,False,False,False 141 | 11113,False,False,False,False,False,False,True,True,False,False,False,False 142 | 4437,False,False,True,False,False,False,True,True,False,False,False,False 143 | 3844,False,False,True,False,False,False,True,True,False,False,False,False 144 | 2931,False,False,True,False,False,False,True,True,False,False,False,False 145 | 314,False,False,False,True,False,False,False,True,False,False,False,False 146 | 3603,False,False,True,False,False,False,True,True,False,False,False,False 147 | 6514,False,False,True,False,False,False,True,True,False,False,False,False 148 | 3337,False,False,False,False,False,False,True,True,False,False,True,True 149 | 3244,False,False,False,False,False,False,True,True,False,False,True,True 150 | 1529,False,False,False,False,False,False,False,True,False,False,False,False 151 | 23901,False,True,False,False,False,False,True,True,False,False,True,True 152 | 1616,False,False,False,False,False,False,False,False,False,True,False,True 153 | 1164,False,False,False,False,False,False,True,False,False,True,False,True 154 | 16654,False,False,False,False,False,False,False,False,False,True,False,True 155 | 78,False,False,False,False,False,False,True,False,False,True,False,True 156 | 2550,False,False,False,False,False,False,False,False,False,True,False,True 157 | 2452,False,False,False,False,False,False,False,False,False,True,False,True 158 | 1737,False,False,False,False,False,False,False,False,False,True,False,True 159 | 
1542,False,False,False,False,False,False,False,False,False,True,False,True 160 | 1107,False,False,False,False,False,False,False,False,False,True,False,True 161 | 1641,False,False,False,False,False,False,False,False,False,True,False,True 162 | 1713,False,False,False,False,False,False,False,False,False,True,False,True 163 | 1737,False,False,False,False,False,False,False,False,False,True,False,True 164 | 1074,False,False,False,False,False,False,False,False,False,True,False,True 165 | 4451,True,False,False,False,False,False,False,False,False,True,False,True 166 | 21592,False,False,False,False,False,False,False,False,False,True,False,True 167 | 9581,False,False,False,False,False,False,False,False,False,True,False,True 168 | 4086,False,False,False,False,False,False,False,False,False,True,False,True 169 | 11073,False,False,False,False,False,False,False,False,False,True,False,True 170 | 1456,False,False,False,False,False,False,False,False,False,True,False,True 171 | 720,False,False,False,False,False,False,False,False,False,True,False,True 172 | 2422,False,False,False,False,False,False,False,False,False,True,False,True 173 | 3766,False,True,False,False,False,False,False,False,False,True,False,True 174 | 1477,False,False,False,False,False,False,False,False,False,True,False,True 175 | 2712,False,False,False,False,False,False,False,False,False,True,False,True 176 | 2295,False,False,False,False,False,False,False,False,False,True,False,True 177 | 2841,False,False,False,False,False,False,False,False,False,True,False,True 178 | 3427,False,False,False,False,False,False,False,False,False,True,False,True 179 | 2872,False,False,False,False,False,False,True,False,False,True,False,True 180 | 2490,False,False,False,False,False,False,False,False,False,True,False,True 181 | 677,False,False,False,False,False,False,False,False,False,True,False,True 182 | 2283,False,False,False,False,False,False,False,False,False,True,False,True 183 | 
528,False,False,False,False,False,False,False,False,False,True,False,True 184 | 3338,False,False,False,False,False,False,True,False,False,True,False,True 185 | 2983,False,False,False,False,False,False,True,False,False,True,False,True 186 | 3,False,False,False,False,False,False,False,False,False,True,False,True 187 | 4790,True,True,False,False,False,False,True,False,False,True,True,False 188 | 3,False,False,False,False,False,False,True,False,False,True,False,True 189 | 25289,False,False,False,False,False,False,False,False,False,True,False,True 190 | 58,False,False,False,False,False,False,True,False,False,True,True,False 191 | 1947,False,False,True,False,False,False,True,False,False,True,False,True 192 | 883,False,False,False,False,False,False,False,False,False,True,False,True 193 | 855,False,False,False,False,False,False,False,False,False,True,False,True 194 | 2035,False,False,False,False,False,False,False,False,False,True,False,True 195 | 2424,False,False,False,False,False,False,False,False,False,True,False,True 196 | 605,False,False,False,False,False,True,False,False,False,True,False,True 197 | 612,False,False,False,False,False,True,False,False,False,True,False,True 198 | 617,False,False,False,False,False,True,False,False,False,True,False,True 199 | 615,False,False,False,False,False,True,False,False,False,True,False,True 200 | 617,False,False,False,False,False,True,False,False,False,True,False,True 201 | 3975,False,False,False,False,False,False,False,False,False,True,False,True 202 | 616,False,False,False,False,False,True,False,False,False,True,False,True 203 | 6167,False,False,False,False,False,False,False,False,False,True,False,True 204 | 615,False,False,False,False,False,True,False,False,False,True,False,True 205 | 613,False,False,False,False,False,True,False,False,False,True,False,True 206 | 607,False,False,False,False,False,True,False,False,False,True,False,True 207 | 636,False,False,False,False,False,True,False,False,False,True,False,True 208 | 
633,False,False,False,False,False,True,False,False,False,True,False,True 209 | 635,False,False,False,False,False,True,False,False,False,True,False,True 210 | 3017,False,False,False,False,False,False,False,False,False,True,False,True 211 | 3017,False,False,False,False,False,False,False,False,False,True,False,True 212 | 637,False,False,False,False,False,True,False,False,False,True,False,True 213 | 644,False,False,False,False,False,True,False,False,False,True,False,True 214 | 18824,False,False,False,False,False,False,False,False,False,True,False,True 215 | 647,False,False,False,False,False,True,False,False,False,True,False,True 216 | 9982,False,False,False,False,False,False,False,False,False,True,False,True 217 | 1333,False,False,False,False,False,False,False,False,False,True,False,True 218 | 9158,False,False,False,False,False,False,False,False,False,True,False,True 219 | 645,False,False,False,False,False,True,False,False,False,True,False,True 220 | 9573,False,False,False,False,False,False,False,False,False,True,False,True 221 | 20297,False,False,False,False,False,False,False,False,False,True,False,True 222 | 10222,False,False,False,False,False,False,False,False,False,True,False,True 223 | 647,False,False,False,False,False,True,False,False,False,True,False,True 224 | 7078,False,False,False,False,False,False,False,False,False,True,False,True 225 | 646,False,False,False,False,False,True,False,False,False,True,False,True 226 | 3926,False,False,False,False,False,False,False,False,False,True,False,True 227 | 1276,False,False,False,False,False,False,False,False,False,True,False,True 228 | 4223,False,False,False,False,False,False,False,False,False,True,False,True 229 | 645,False,False,False,False,False,True,False,False,False,True,False,True 230 | 3627,False,False,False,False,False,False,False,False,False,True,False,True 231 | 1148,False,False,False,False,False,False,False,False,False,True,False,True 232 | 
3825,False,False,False,False,False,False,False,False,False,True,False,True 233 | 644,False,False,False,False,False,True,False,False,False,True,False,True 234 | 18682,False,False,False,False,False,False,False,False,False,True,False,True 235 | 638,False,False,False,False,False,True,False,False,False,True,False,True 236 | 5736,False,False,False,False,False,False,False,False,False,True,False,True 237 | 9269,False,False,False,False,False,False,False,False,False,True,False,True 238 | 9326,False,False,False,False,False,False,False,False,False,True,False,True 239 | 11111,False,False,False,False,False,False,False,False,False,True,False,True 240 | 636,False,False,False,False,False,True,False,False,False,True,False,True 241 | 3200,False,False,False,False,False,False,False,False,False,True,False,True 242 | 11024,False,False,False,False,False,False,False,False,False,True,False,True 243 | 633,False,False,False,False,False,True,False,False,False,True,False,True 244 | 14645,False,False,False,False,False,False,False,False,False,True,False,True 245 | 14231,False,False,False,False,False,False,False,False,False,True,False,True 246 | 635,False,False,False,False,False,True,False,False,False,True,False,True 247 | 2439,False,False,False,False,False,False,False,False,False,True,False,True 248 | 2439,False,False,False,False,False,False,False,False,False,True,False,True 249 | 14950,False,False,False,False,False,False,False,False,False,True,False,True 250 | 25518,False,False,False,False,False,False,False,False,False,True,False,True 251 | 17292,False,False,False,False,False,False,False,False,False,True,False,True 252 | 22354,False,False,False,False,False,False,False,False,False,True,False,True 253 | 637,False,False,False,False,False,True,False,False,False,True,False,True 254 | 30169,False,False,False,False,False,False,False,False,False,True,False,True 255 | 2464,False,False,False,False,False,False,False,False,False,True,False,True 256 | 
535,False,False,False,False,False,False,False,False,False,True,False,True 257 | 31044,False,False,False,False,False,False,False,False,False,True,False,True 258 | 37546,False,False,False,False,False,False,False,False,False,True,False,True 259 | 2463,False,False,False,False,False,False,False,False,False,True,False,True 260 | 22601,False,False,False,False,False,False,False,False,False,True,False,True 261 | 15373,False,False,False,False,False,False,False,False,False,True,False,True 262 | 11570,False,False,False,False,False,False,False,False,False,True,False,True 263 | 2812,False,False,False,False,False,False,False,False,False,True,False,True 264 | 21623,False,False,False,False,False,False,False,False,False,True,False,True 265 | 18536,False,False,False,False,False,False,False,False,False,True,False,True 266 | 23076,False,False,False,False,False,False,False,False,False,True,False,True 267 | 2464,False,False,False,False,False,False,False,False,False,True,False,True 268 | 13046,False,False,False,False,False,False,False,False,False,True,False,True 269 | 12586,False,False,False,False,False,False,False,False,False,True,False,True 270 | 2459,False,False,False,False,False,False,False,False,False,True,False,True 271 | 15562,False,False,False,False,False,False,False,False,False,True,False,True 272 | 2460,False,False,False,False,False,False,False,False,False,True,False,True 273 | 28681,False,False,False,False,False,False,False,False,False,True,False,True 274 | 3185,False,False,False,False,False,False,False,False,False,True,False,True 275 | 33824,False,False,False,False,False,False,False,False,False,True,False,True 276 | 2455,False,False,False,False,False,False,False,False,False,True,False,True 277 | 27622,False,False,False,False,False,False,False,False,False,True,False,True 278 | 25305,False,False,False,False,False,False,False,False,False,True,False,True 279 | 2452,False,False,False,False,False,False,False,False,False,True,False,True 280 | 
3248,False,False,False,False,False,False,False,False,False,True,False,True 281 | 501,False,False,False,False,False,False,False,False,False,True,False,True 282 | 14414,False,False,False,False,False,False,False,False,False,True,False,True 283 | 14563,False,False,False,False,False,False,False,False,False,True,False,True 284 | 20847,False,False,False,False,False,False,False,False,False,True,False,True 285 | 2450,False,False,False,False,False,False,False,False,False,True,False,True 286 | 20246,False,False,False,False,False,False,False,False,False,True,False,True 287 | 21494,False,False,False,False,False,False,False,False,False,True,False,True 288 | 15083,False,False,False,False,False,False,False,False,False,True,False,True 289 | 2448,False,False,False,False,False,False,False,False,False,True,False,True 290 | 13933,False,False,False,False,False,False,False,False,False,True,False,True 291 | 11642,False,False,False,False,False,False,False,False,False,True,False,True 292 | 2456,False,False,False,False,False,False,False,False,False,True,False,True 293 | 2450,False,False,False,False,False,False,False,False,False,True,False,True 294 | 2456,False,False,False,False,False,False,False,False,False,True,False,True 295 | 19151,False,False,False,False,False,False,False,False,False,True,False,True 296 | 20385,False,False,False,False,False,False,False,False,False,True,False,True 297 | 2453,False,False,False,False,False,False,False,False,False,True,False,True 298 | 12549,False,False,False,False,False,False,False,False,False,True,False,True 299 | 27430,False,False,False,False,False,False,False,False,False,True,False,True 300 | 11894,False,False,False,False,False,False,False,False,False,True,False,True 301 | 25131,False,False,False,False,False,False,False,False,False,True,False,True 302 | -------------------------------------------------------------------------------- /session-3/example_1-paper_summarizer.ipynb: 
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "5a4ef775-778d-4c0f-b901-65df2ba5caf5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Summarizing Research Papers with GPT-4o\n", 9 | "## ABB #1 - Session 3\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "ddf11ab2-418e-4ea3-a3b8-2d2a09232c88", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "c2b91857-1b75-4b5e-ac03-0bb05bf9e75f", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import fitz # PyMuPDF\n", 30 | "from IPython.display import Markdown, display\n", 31 | "from openai import OpenAI\n", 32 | "from top_secret import my_sk" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "id": "4db523c7-0b98-402d-91c4-d5e4c21c2d1a", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# setup api client\n", 43 | "client = OpenAI(api_key=my_sk)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "84d88996-64c0-43b0-af2a-0a4af4f46152", 49 | "metadata": {}, 50 | "source": [ 51 | "### 1) Extract text" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 2, 57 | "id": "c06f8b84-23cf-49c3-a024-ebf98b1b8666", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "filepath = \"papers/attention-is-all-you-need.pdf\"" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "id": "e42881f0-32bf-4733-809a-b290118c9760", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "pdf = fitz.open(filepath)\n", 72 | "text = \"\".join([page.get_text() for page in pdf])" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "id": "eb264250-d62c-472b-ad3e-21bbd263551e", 78 | "metadata": {}, 79 | "source": [ 80 | "### 2) Write prompt" 81 | ] 82 | }, 83 | { 84 | 
"cell_type": "code", 85 | "execution_count": 4, 86 | "id": "4aaab4a9-cfe5-4488-87b3-c173fc96320c", 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "prompt = f\"\"\"# System Role: Research Paper Summarizer\n", 91 | "\n", 92 | "Your task is to act as an academic summarizer, condensing research papers into concise, clear, and accessible summaries. Follow these guidelines:\n", 93 | "\n", 94 | "## Structure\n", 95 | "\n", 96 | "Always use the following structure for your summary:\n", 97 | "\n", 98 | "- **Title**: The paper's title.\n", 99 | "- **Authors**: Key authors and their affiliations (if available).\n", 100 | "- **Abstract**: A paraphrased version of the paper's abstract.\n", 101 | "- **Objective**: The research goal or question being addressed.\n", 102 | "- **Methodology**: Key methods or experiments used.\n", 103 | "- **Findings**: Main results or insights.\n", 104 | "- **Conclusion**: The authors' conclusion and implications.\n", 105 | "- **Relevance**: Why this research is significant or what problem it addresses.\n", 106 | "\n", 107 | "## Style\n", 108 | "\n", 109 | "- Write in a neutral and academic tone.\n", 110 | "- Use simple, precise language to ensure clarity for a broad audience.\n", 111 | "\n", 112 | "## Length\n", 113 | "\n", 114 | "- Keep summaries concise (150-300 words) unless otherwise specified.\n", 115 | "\n", 116 | "## Audience\n", 117 | "\n", 118 | "- Assume the audience has general technical knowledge but may not be familiar with the specific field of the paper.\n", 119 | "\n", 120 | "## Special Instructions\n", 121 | "\n", 122 | "- If specific sections of the paper are missing, skip them and note it clearly (e.g., \"Authors section not specified\").\n", 123 | "- Avoid subjective opinions or interpretations beyond the paper's content.\n", 124 | "\n", 125 | "## Citations\n", 126 | "\n", 127 | "- Provide clear citations in APA format (if needed).\n", 128 | "\n", 129 | "---\n", 130 | "\n", 131 | "You are summarizing the 
following research paper:\n", 132 | "\n", 133 | "{text}\n", 134 | "\"\"\"" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": "9bc17b34-940f-4ff3-ad67-3b2fd7ee5a48", 140 | "metadata": {}, 141 | "source": [ 142 | "### 3) Summarize Paper with GPT-4o" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "id": "2a27209d-5711-4eab-9828-97d301d17bdf", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "- **Title**: Attention Is All You Need\n", 156 | "- **Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin (Google Brain, Google Research, University of Toronto)\n", 157 | "- **Abstract**: This paper introduces the Transformer, a new architecture for sequence transduction that relies entirely on attention mechanisms, eliminating the need for recurrence and convolutions. The Transformer demonstrates superior performance in machine translation tasks, achieving state-of-the-art BLEU scores for English-to-German and English-to-French translations, while also being more efficient in training time and parallelization.\n", 158 | "\n", 159 | "- **Objective**: The primary goal of this research is to propose a novel model architecture for sequence transduction that improves upon existing recurrent and convolutional models by leveraging attention mechanisms exclusively.\n", 160 | "\n", 161 | "- **Methodology**: The Transformer model is structured with an encoder-decoder architecture, where both components utilize multi-head self-attention and feed-forward neural networks. 
The authors conducted experiments on the WMT 2014 English-to-German and English-to-French translation tasks, comparing the performance and training efficiency of the Transformer against existing state-of-the-art models.\n", 162 | "\n", 163 | "- **Findings**: The Transformer achieved a BLEU score of 28.4 for English-to-German and 41.8 for English-to-French translations, surpassing previous models by significant margins and requiring less training time. The model also showed strong generalization to other tasks, such as English constituency parsing.\n", 164 | "\n", 165 | "- **Conclusion**: The authors conclude that the Transformer model, by relying solely on attention mechanisms, not only achieves state-of-the-art performance in translation tasks but also offers advantages in training speed and efficiency. They express optimism about the potential applications of attention-based models in various domains beyond text.\n", 166 | "\n", 167 | "- **Relevance**: This research is significant as it introduces a transformative approach to sequence modeling, addressing limitations of traditional recurrent and convolutional networks, and lays the groundwork for further advancements in natural language processing and other fields requiring sequence transduction.\n", 168 | "\n", 169 | "**Citation**: Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, Ł., & Polosukhin, I. (2023). Attention Is All You Need. 31st Conference on Neural Information Processing Systems (NIPS 2017). 
arXiv:1706.03762v7 [cs.CL].\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# make api call\n", 175 | "response = client.chat.completions.create(\n", 176 | " model=\"gpt-4o-mini\",\n", 177 | " messages=[\n", 178 | " {\"role\": \"system\", \"content\": \"Research paper summarizer.\"},\n", 179 | " {\"role\": \"user\", \"content\": prompt}\n", 180 | " ], \n", 181 | " temperature = 0.5\n", 182 | ")\n", 183 | "\n", 184 | "# extract response\n", 185 | "summary = response.choices[0].message.content\n", 186 | "print(summary)" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "id": "d8e89e07-f2d5-4eec-8f83-ba8da7fb2772", 192 | "metadata": {}, 193 | "source": [ 194 | "### 4) Display Markdown" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 9, 200 | "id": "a058e3ea-480e-4f61-96cf-ddfced60c94b", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "data": { 205 | "text/markdown": [ 206 | "- **Title**: Attention Is All You Need\n", 207 | "- **Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Łukasz Kaiser, Illia Polosukhin (Google Brain, Google Research, University of Toronto)\n", 208 | "- **Abstract**: This paper introduces the Transformer, a new architecture for sequence transduction that relies entirely on attention mechanisms, eliminating the need for recurrence and convolutions. 
The Transformer demonstrates superior performance in machine translation tasks, achieving state-of-the-art BLEU scores for English-to-German and English-to-French translations, while also being more efficient in training time and parallelization.\n", 209 | "\n", 210 | "- **Objective**: The primary goal of this research is to propose a novel model architecture for sequence transduction that improves upon existing recurrent and convolutional models by leveraging attention mechanisms exclusively.\n", 211 | "\n", 212 | "- **Methodology**: The Transformer model is structured with an encoder-decoder architecture, where both components utilize multi-head self-attention and feed-forward neural networks. The authors conducted experiments on the WMT 2014 English-to-German and English-to-French translation tasks, comparing the performance and training efficiency of the Transformer against existing state-of-the-art models.\n", 213 | "\n", 214 | "- **Findings**: The Transformer achieved a BLEU score of 28.4 for English-to-German and 41.8 for English-to-French translations, surpassing previous models by significant margins and requiring less training time. The model also showed strong generalization to other tasks, such as English constituency parsing.\n", 215 | "\n", 216 | "- **Conclusion**: The authors conclude that the Transformer model, by relying solely on attention mechanisms, not only achieves state-of-the-art performance in translation tasks but also offers advantages in training speed and efficiency. 
They express optimism about the potential applications of attention-based models in various domains beyond text.\n", 217 | "\n", 218 | "- **Relevance**: This research is significant as it introduces a transformative approach to sequence modeling, addressing limitations of traditional recurrent and convolutional networks, and lays the groundwork for further advancements in natural language processing and other fields requiring sequence transduction.\n", 219 | "\n", 220 | "**Citation**: Vaswani, A., Shazeer, N., Parmar, N., Uszkoreit, J., Jones, L., Gomez, A. N., Kaiser, Ł., & Polosukhin, I. (2023). Attention Is All You Need. 31st Conference on Neural Information Processing Systems (NIPS 2017). arXiv:1706.03762v7 [cs.CL]." 221 | ], 222 | "text/plain": [ 223 | "" 224 | ] 225 | }, 226 | "metadata": {}, 227 | "output_type": "display_data" 228 | } 229 | ], 230 | "source": [ 231 | "display(Markdown(summary))" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "id": "d0aad5dc-bbc8-4ac9-8421-1e9ae83fcdc4", 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [] 241 | } 242 | ], 243 | "metadata": { 244 | "kernelspec": { 245 | "display_name": "Python 3 (ipykernel)", 246 | "language": "python", 247 | "name": "python3" 248 | }, 249 | "language_info": { 250 | "codemirror_mode": { 251 | "name": "ipython", 252 | "version": 3 253 | }, 254 | "file_extension": ".py", 255 | "mimetype": "text/x-python", 256 | "name": "python", 257 | "nbconvert_exporter": "python", 258 | "pygments_lexer": "ipython3", 259 | "version": "3.12.2" 260 | } 261 | }, 262 | "nbformat": 4, 263 | "nbformat_minor": 5 264 | } 265 | -------------------------------------------------------------------------------- /session-3/example_2-text-classifier.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "773f2b77-a3c6-47b8-8246-1b958555f4af", 6 | "metadata": {}, 7 | "source": [ 
8 | "# Text Classification with GPT-4o\n", 9 | "## ABB #1 - Session 3\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "62058d9d-adc2-4a6f-a60e-ff2abca7d93b", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "aa6c91d1-058e-49a7-8296-75bb925c2e8b", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "\n", 31 | "from openai import OpenAI\n", 32 | "from top_secret import my_sk" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "id": "28da37e3-47a3-4fc5-9722-e638d8fdc79e", 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# setup api client\n", 43 | "client = OpenAI(api_key=my_sk)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "id": "7e99a8de", 49 | "metadata": {}, 50 | "source": [ 51 | "### functions" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "7822e88b-1968-4458-ad40-326b7c93a06e", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "def manual_feature_engineering(df):\n", 62 | " \"\"\"\n", 63 | " Generate a suite of manually defined features\n", 64 | " \"\"\"\n", 65 | " \n", 66 | " # length of the body\n", 67 | " df[\"body_length\"] = df[\"body\"].apply(len)\n", 68 | " \n", 69 | " # contains \"lol\"\n", 70 | " df[\"contains_lol\"] = df[\"body\"].apply(lambda x: \"lol\" in x.lower())\n", 71 | " \n", 72 | " # contains \"OMG\"\n", 73 | " df[\"contains_omg\"] = df[\"body\"].apply(lambda x: \"omg\" in x.lower())\n", 74 | " \n", 75 | " # contains \"attached\" or \"see attached\" or \"see attachment\"\n", 76 | " df[\"contains_attached\"] = df[\"body\"].apply(lambda x: \"attached\" in x.lower())\n", 77 | " df[\"contains_attachment\"] = df[\"body\"].apply(lambda x: \"attachment\" in x.lower())\n", 78 | " \n", 79 | " # contains \"Order Confirmation\"\n", 80 | " 
df[\"contains_order_confirmation\"] = df[\"body\"].apply(lambda x: \"order confirmation\" in x.lower())\n", 81 | " \n", 82 | " # contains \"payment summary\"\n", 83 | " df[\"contains_payment_summary\"] = df[\"body\"].apply(lambda x: \"payment summary\" in x.lower())\n", 84 | " \n", 85 | " # sender is common person domain (gmail, yahoo, hotmail) \n", 86 | " df[\"sender_has_common_domain\"] = df[\"from\"].apply(lambda x: any(domain in x.lower() for domain in [\"gmail\", \"yahoo\", \"hotmail\"]))\n", 87 | " \n", 88 | " # is personal email\n", 89 | " df[\"is_personal\"] = df[\"label\"] == \"personal\"\n", 90 | "\n", 91 | " return df" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 4, 97 | "id": "7c73701a-ae49-4b74-8339-94776fb3d850", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "def generate_label(prompt_template, subject, sender, body):\n", 102 | " \"\"\"\n", 103 | " Function to generate 0-shot label for email based on subject, sender, and body\n", 104 | " \"\"\"\n", 105 | " prompt = prompt_template(subject, sender, body)\n", 106 | " \n", 107 | " response = client.chat.completions.create(\n", 108 | " model=\"gpt-4o-mini\",\n", 109 | " messages=[\n", 110 | " {\"role\": \"system\", \"content\": \"Email Classifer\"},\n", 111 | " {\"role\": \"user\", \"content\": prompt}\n", 112 | " ], \n", 113 | " temperature = 0.25,\n", 114 | " max_completion_tokens=25,\n", 115 | " )\n", 116 | " \n", 117 | " # extract response\n", 118 | " return response.choices[0].message.content" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "id": "6f0e9770-2ec3-47ea-bd42-884c53a40918", 124 | "metadata": {}, 125 | "source": [ 126 | "### Load Data" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 5, 132 | "id": "3a26e03d-eb6a-4ea3-89b2-335803ea3514", 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "df = pd.read_csv(\"data/emails.csv\").astype(str)" 137 | ] 138 | }, 139 | { 140 | "cell_type": 
"markdown", 141 | "id": "44ad81b2-2a15-4165-a47d-42307b87aad3", 142 | "metadata": {}, 143 | "source": [ 144 | "### Feature Engineering (Manual)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 6, 150 | "id": "9534cb4a", 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "data": { 155 | "text/html": [ 156 | "
\n", 157 | "\n", 170 | "\n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | "
subjectfrombodylabelbody_lengthcontains_lolcontains_omgcontains_attachedcontains_attachmentcontains_order_confirmationcontains_payment_summarysender_has_common_domainis_personal
0Thank you Shawhin for your RSVPEvite <info@mailva.evite.com>Thank you for your RSVP\\n\\n\\nYou replied Yes f...personal1830FalseFalseFalseFalseFalseFalseFalseTrue
1Find Date for Knocking at Doorifyahuna@gmail.comSo you're gonna talk to your dad about this ri...personal1522FalseFalseFalseFalseFalseFalseTrueTrue
2Thank you Shawhin for your RSVPEvite <info@mailva.evite.com>Thank you for your RSVP\\n\\n\\nYou replied Yes f...personal1838FalseFalseFalseFalseFalseFalseFalseTrue
3Folder shared with you: \"Knocking at The Door\"\"Ifeoma Ahuna (via Google Drive)\" <drive-share...I've shared an item with you:\\r\\n\\r\\nKnocking ...personal251FalseFalseFalseTrueFalseFalseFalseTrue
4The Colony Shoreline Trail 5K and 15K Registra...RunSignup <info+auto@runsignup.com>[1]The Colony Shoreline Trail 5K and 15K\\r\\n\\r...personal3170FalseFalseFalseFalseFalseFalseFalseTrue
\n", 272 | "
" 273 | ], 274 | "text/plain": [ 275 | " subject \\\n", 276 | "0 Thank you Shawhin for your RSVP \n", 277 | "1 Find Date for Knocking at Door \n", 278 | "2 Thank you Shawhin for your RSVP \n", 279 | "3 Folder shared with you: \"Knocking at The Door\" \n", 280 | "4 The Colony Shoreline Trail 5K and 15K Registra... \n", 281 | "\n", 282 | " from \\\n", 283 | "0 Evite \n", 284 | "1 ifyahuna@gmail.com \n", 285 | "2 Evite \n", 286 | "3 \"Ifeoma Ahuna (via Google Drive)\" \n", 288 | "\n", 289 | " body label body_length \\\n", 290 | "0 Thank you for your RSVP\\n\\n\\nYou replied Yes f... personal 1830 \n", 291 | "1 So you're gonna talk to your dad about this ri... personal 1522 \n", 292 | "2 Thank you for your RSVP\\n\\n\\nYou replied Yes f... personal 1838 \n", 293 | "3 I've shared an item with you:\\r\\n\\r\\nKnocking ... personal 251 \n", 294 | "4 [1]The Colony Shoreline Trail 5K and 15K\\r\\n\\r... personal 3170 \n", 295 | "\n", 296 | " contains_lol contains_omg contains_attached contains_attachment \\\n", 297 | "0 False False False False \n", 298 | "1 False False False False \n", 299 | "2 False False False False \n", 300 | "3 False False False True \n", 301 | "4 False False False False \n", 302 | "\n", 303 | " contains_order_confirmation contains_payment_summary \\\n", 304 | "0 False False \n", 305 | "1 False False \n", 306 | "2 False False \n", 307 | "3 False False \n", 308 | "4 False False \n", 309 | "\n", 310 | " sender_has_common_domain is_personal \n", 311 | "0 False True \n", 312 | "1 True True \n", 313 | "2 False True \n", 314 | "3 False True \n", 315 | "4 False True " 316 | ] 317 | }, 318 | "execution_count": 6, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "df = manual_feature_engineering(df)\n", 325 | "df.head()" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "id": "eb8e8fff-cee3-4bed-a875-ff63aa37bf90", 331 | "metadata": {}, 332 | "source": [ 333 | "### Text Classification with GPT-4o" 
334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "id": "198a79f1-f15d-4ab5-a529-ba7b959fbfae", 339 | "metadata": {}, 340 | "source": [ 341 | "#### 0-shot" 342 | ] 343 | }, 344 | { 345 | "cell_type": "code", 346 | "execution_count": 7, 347 | "id": "5f5a8ec7-3c7b-4051-a77b-b3cf82709b0d", 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "# prompt\n", 352 | "prompt_template = lambda subject, sender, body : f\"\"\"You are an intelligent assistant that classifies emails based on whether they are personal or not. \\\n", 353 | "Given an email's subject, sender, and body, determine if the email is personal (indicated by 1) or not personal (indicated by 0). A personal email typically \\\n", 354 | "includes messages from friends, family, or individuals addressing personal topics. Non-personal emails include promotional content, work-related messages, \\\n", 355 | "newsletters, or automated notifications.\n", 356 | "\n", 357 | "Input Email:\n", 358 | "Subject: {subject}\n", 359 | "Sender: {sender}\n", 360 | "Body: {body}\n", 361 | "\n", 362 | "Instructions:\n", 363 | "Carefully analyze the subject, sender, and body to understand the context and tone of the email.\n", 364 | "Return:\n", 365 | "1 if the email is personal.\n", 366 | "0 if the email is not personal.\n", 367 | "\n", 368 | "Output: [Your classification: 1 or 0]\n", 369 | "\"\"\"" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 8, 375 | "id": "c9fd5623-1ccb-4ecd-8abd-2f4a4f3fda12", 376 | "metadata": {}, 377 | "outputs": [ 378 | { 379 | "name": "stdout", 380 | "output_type": "stream", 381 | "text": [ 382 | "CPU times: user 3.56 s, sys: 176 ms, total: 3.74 s\n", 383 | "Wall time: 2min 38s\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "%%time\n", 389 | "# intialize list to store labels\n", 390 | "label_0shot_list = []\n", 391 | "\n", 392 | "# generate labels for each row\n", 393 | "for index, row in df.iterrows():\n", 394 | " 
label_0shot_list.append(generate_label(prompt_template, row['subject'], row['from'], row['body']))" 395 | ] 396 | }, 397 | { 398 | "cell_type": "code", 399 | "execution_count": 9, 400 | "id": "27b40dde-2b69-4630-8a8a-840410ea8dc1", 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "['0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '[Your classification: 0]', '1', '0', '0', '0', '[Your classification: 1]', '1', '1', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '1', 'Output: 1', '1', '0', '0', '1', '1', '1', '1', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '[Your classification: 0]', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '[Your classification: 0]', '0', '0', '0', '0', '0', '0', '0']\n" 408 | ] 409 | } 410 | ], 411 | 
"source": [ 412 | "print(label_0shot_list)" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 10, 418 | "id": "21a66448-37a3-40d7-aae7-895869fd68a9", 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# add label to df\n", 423 | "df['label_0shot'] = [\"1\" in label for label in label_0shot_list]\n", 424 | "df['correct_0shot'] = df['is_personal']==df['label_0shot']" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 11, 430 | "id": "369f4883-5fc0-449b-a3e0-22e0aca7666a", 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "name": "stdout", 435 | "output_type": "stream", 436 | "text": [ 437 | "0.63\n" 438 | ] 439 | } 440 | ], 441 | "source": [ 442 | "# compare ground truth to 0-shot label\n", 443 | "print(df['correct_0shot'].sum()/len(df))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "id": "9b155e45-72e4-454e-871b-e2d037b3452f", 449 | "metadata": {}, 450 | "source": [ 451 | "#### few-shot" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 12, 457 | "id": "5c4d4c93-7c5e-46fe-b8d3-7c659fe40d95", 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "# prompt\n", 462 | "prompt_template_fewshot = lambda subject, sender, body :f\"\"\"You are an intelligent assistant that classifies emails as personal or not personal. Given an email's subject, sender, and body, determine if the email is personal (indicated by 1) or not personal (indicated by 0). A personal email typically includes messages from friends, family, or individuals addressing personal topics. 
Non-personal emails include promotional content, work-related messages, newsletters, or automated notifications.\n", 463 | "\n", 464 | "##Instructions:\n", 465 | "Carefully analyze the subject, sender, and body to understand the context and tone of the email.\n", 466 | "Return:\n", 467 | "1 if the email is personal.\n", 468 | "0 if the email is not personal.\n", 469 | "\n", 470 | "##Examples:\n", 471 | "**Example 1:**\n", 472 | "\n", 473 | "Subject: {df['subject'][0]}\n", 474 | "Sender: {df['from'][0]}\n", 475 | "Body: {df['body'][0]}\n", 476 | "Output: {int(df['is_personal'][0])}\n", 477 | "\n", 478 | "**Example 2:**\n", 479 | "\n", 480 | "Subject: {df['subject'][1]}\n", 481 | "Sender: {df['from'][1]}\n", 482 | "Body: {df['body'][1]}\n", 483 | "Output: {int(df['is_personal'][1])}\n", 484 | "\n", 485 | "**Example 3:**\n", 486 | "\n", 487 | "Subject: {df['subject'][150]}\n", 488 | "Sender: {df['from'][150]}\n", 489 | "Body: {df['body'][150]}\n", 490 | "Output: {int(df['is_personal'][150])}\n", 491 | "\n", 492 | "**Input Email:**\n", 493 | "\n", 494 | "Subject: {subject}\n", 495 | "Sender: {sender}\n", 496 | "Body: {body}\n", 497 | "Output: [Your classification: 1 or 0]\n", 498 | "\"\"\"" 499 | ] 500 | }, 501 | { 502 | "cell_type": "code", 503 | "execution_count": 13, 504 | "id": "d31ad359-a0b7-4503-9e8c-4a8b1d902730", 505 | "metadata": {}, 506 | "outputs": [ 507 | { 508 | "name": "stdout", 509 | "output_type": "stream", 510 | "text": [ 511 | "CPU times: user 4.87 s, sys: 251 ms, total: 5.12 s\n", 512 | "Wall time: 4min 21s\n" 513 | ] 514 | } 515 | ], 516 | "source": [ 517 | "%%time\n", 518 | "# intialize list to store labels\n", 519 | "label_fewshot_list = []\n", 520 | "\n", 521 | "# generate labels for each row\n", 522 | "for index, row in df.iterrows():\n", 523 | " label_fewshot_list.append(generate_label(prompt_template_fewshot, row['subject'], row['from'], row['body']))" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": 14, 529 | "id": 
"237d3648-d041-481c-872d-a1eab08fa363", 530 | "metadata": {}, 531 | "outputs": [ 532 | { 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "['0', '1', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '0', '0', '1', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 541 | "print(label_fewshot_list)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 15, 547 | "id": "3b614bce-c2f0-4cd1-bfba-c16e7b243287", 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "# add label to df\n", 552 | "df['label_fewshot'] = 
[\"1\" in label for label in label_fewshot_list]\n", 553 | "df['correct_fewshot'] = df['is_personal']==df['label_fewshot']" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 16, 559 | "id": "aba0703e-4c20-443b-ae09-19abd9ed7747", 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "name": "stdout", 564 | "output_type": "stream", 565 | "text": [ 566 | "0.7433333333333333\n" 567 | ] 568 | } 569 | ], 570 | "source": [ 571 | "# compare ground truth to few-shot label\n", 572 | "print(df['correct_fewshot'].sum()/len(df))" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 17, 578 | "id": "e7c1e97f", 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "# save data to file\n", 583 | "df_transformed = df.iloc[:, 4:]\n", 584 | "df_transformed.to_csv(\"data/transformed_data.csv\", index=False)" 585 | ] 586 | }, 587 | { 588 | "cell_type": "markdown", 589 | "id": "4303b0ae-a9f3-4877-98c0-07c81ae1537c", 590 | "metadata": {}, 591 | "source": [ 592 | "**Bonus:** train a classifier using noisy labels from GPT-4o-mini using [Example 2](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_2-email_classifier.ipynb) from Session 2" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": null, 598 | "id": "b033bcb8-d81c-43c2-af65-6979b5a88e1d", 599 | "metadata": {}, 600 | "outputs": [], 601 | "source": [] 602 | } 603 | ], 604 | "metadata": { 605 | "kernelspec": { 606 | "display_name": "Python 3 (ipykernel)", 607 | "language": "python", 608 | "name": "python3" 609 | }, 610 | "language_info": { 611 | "codemirror_mode": { 612 | "name": "ipython", 613 | "version": 3 614 | }, 615 | "file_extension": ".py", 616 | "mimetype": "text/x-python", 617 | "name": "python", 618 | "nbconvert_exporter": "python", 619 | "pygments_lexer": "ipython3", 620 | "version": "3.12.2" 621 | } 622 | }, 623 | "nbformat": 4, 624 | "nbformat_minor": 5 625 | } 626 | 
-------------------------------------------------------------------------------- /session-3/example_3-local_visual_QA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "f785df67-ce40-44b5-8375-9af6d3c5420a", 6 | "metadata": {}, 7 | "source": [ 8 | "# Local Visual QA with LLaMA 3.2 Vision\n", 9 | "## ABB #1 - Session 3\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "ccab25a6-d20f-450e-9981-d43bd29a3181", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "fcc60723-d912-40ed-9b22-260b88d2182e", 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stderr", 30 | "output_type": "stream", 31 | "text": [ 32 | "/Users/shawhin/Documents/_code/_stv/ABB/AI-Builders-Bootcamp-1/session-3/s3-env/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 33 | " from .autonotebook import tqdm as notebook_tqdm\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "import ollama\n", 39 | "import gradio as gr\n", 40 | "import time" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "cebe3d9d-0759-4832-820f-0d555357bc1f", 46 | "metadata": {}, 47 | "source": [ 48 | "### basic usage" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 2, 54 | "id": "c72e1dd0-b473-49fe-a1eb-0f3fa8a98340", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "{'status': 'success'}" 61 | ] 62 | }, 63 | "execution_count": 2, 64 | "metadata": {}, 65 | "output_type": "execute_result" 66 | } 67 | ], 68 | "source": [ 69 | "# pull model\n", 70 | "ollama.pull('llama3.2-vision')" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "id": "53fd54ff-0e11-4a92-aa29-3735be934fb4", 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "The abstract of the paper states that it proposes a new system for translating text from German to English. 
The system, known as BLEU (Bilingual Evaluation Understudy), uses deep learning techniques to generate more accurate and natural-sounding translations than existing methods.\n", 84 | "\n", 85 | "**Key Features:**\n", 86 | "\n", 87 | "* **Parallelization**: The model achieves significant speedup by dividing the translation task into smaller sub-tasks that can be processed in parallel.\n", 88 | "* **Improved Accuracy**: The system produces more accurate and natural-sounding translations than existing methods.\n", 89 | "* **Efficient Training**: The model is trained using a combination of supervised and unsupervised learning techniques, which allows it to learn from large amounts of data quickly and efficiently.\n", 90 | "\n", 91 | "**Impact:**\n", 92 | "\n", 93 | "The paper presents a new approach to machine translation that has the potential to significantly improve the accuracy and efficiency of language translation systems. By leveraging deep learning techniques and parallelization, the system can handle complex tasks such as translating text from one language to another while maintaining the nuances and context of the original text." 
94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "# interact with model (locally)\n", 99 | "stream = ollama.chat(\n", 100 | " model='llama3.2-vision',\n", 101 | " messages=[{\n", 102 | " 'role': 'user',\n", 103 | " 'content': 'What is this paper about?',\n", 104 | " 'images': ['papers/attention-is-all-you-need.png']\n", 105 | " }],\n", 106 | " stream=True,\n", 107 | ")\n", 108 | "\n", 109 | "for chunk in stream:\n", 110 | " print(chunk['message']['content'], end='', flush=True)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "b402a518-7d2d-4158-93c3-dcddbbf5ce1e", 116 | "metadata": {}, 117 | "source": [ 118 | "### gradio UI" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 4, 124 | "id": "0e3ee81f-4a84-4fca-a2a0-01a8e7783b2e", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "# Function to interact with the Ollama model\n", 129 | "def stream_chat(message, history):\n", 130 | " \"\"\"\n", 131 | " Streams the response from the Ollama model and sends it to the Gradio UI.\n", 132 | " \n", 133 | " Args:\n", 134 | " message (str): The user input message.\n", 135 | " history (list): A list of previous conversation messages.\n", 136 | " \n", 137 | " Yields:\n", 138 | " str: The chatbot's response chunk by chunk.\n", 139 | " \"\"\"\n", 140 | " # Append the user message to the conversation history\n", 141 | " history.append({\"role\": \"user\", \"content\": message[\"text\"], \"images\":message[\"files\"]})\n", 142 | " \n", 143 | " # Initialize streaming from Ollama\n", 144 | " stream = ollama.chat(\n", 145 | " model='llama3.2-vision',\n", 146 | " messages=history, # Full chat history including the current user message\n", 147 | " stream=True,\n", 148 | " )\n", 149 | " \n", 150 | " response_text = \"\"\n", 151 | " for chunk in stream:\n", 152 | " content = chunk['message']['content']\n", 153 | " response_text += content\n", 154 | " yield response_text # Send the response incrementally to the UI\n", 155 | "\n", 156 
| " # Append the assistant's full response to the history\n", 157 | " history.append({\"role\": \"assistant\", \"content\": response_text})" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "id": "2072e9ee-c43b-4702-a79e-29e7ac8f3a91", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "* Running on local URL: http://127.0.0.1:7860\n", 171 | "\n", 172 | "To create a public link, set `share=True` in `launch()`.\n" 173 | ] 174 | }, 175 | { 176 | "data": { 177 | "text/html": [ 178 | "
" 179 | ], 180 | "text/plain": [ 181 | "" 182 | ] 183 | }, 184 | "metadata": {}, 185 | "output_type": "display_data" 186 | }, 187 | { 188 | "data": { 189 | "text/plain": [] 190 | }, 191 | "execution_count": 5, 192 | "metadata": {}, 193 | "output_type": "execute_result" 194 | } 195 | ], 196 | "source": [ 197 | "# Create a Gradio ChatInterface\n", 198 | "gr.ChatInterface(\n", 199 | " fn=stream_chat, # The function handling the chat\n", 200 | " type=\"messages\", # Using \"messages\" to enable chat-style conversation\n", 201 | " examples=[{\"text\": \"What is this paper about?\", \"files\": ['papers/attention-is-all-you-need.png']}], # Example inputs\n", 202 | " multimodal=True,\n", 203 | ").launch()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "d18e4b9a-ebeb-458e-bce5-b4812937febb", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3 (ipykernel)", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.12.2" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 5 236 | } 237 | -------------------------------------------------------------------------------- /session-3/papers/attention-is-all-you-need.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-3/papers/attention-is-all-you-need.pdf -------------------------------------------------------------------------------- /session-3/papers/attention-is-all-you-need.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-3/papers/attention-is-all-you-need.png -------------------------------------------------------------------------------- /session-3/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | ipykernel 3 | PyMuPDF 4 | openai 5 | ollama 6 | gradio 7 | -------------------------------------------------------------------------------- /session-3/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-3/slides.pdf -------------------------------------------------------------------------------- /session-3/top_secret.py: -------------------------------------------------------------------------------- 1 | my_sk = "your sk here" -------------------------------------------------------------------------------- /session-4/data/chunk_embeddings.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-4/data/chunk_embeddings.pt -------------------------------------------------------------------------------- /session-4/data/survey.csv: -------------------------------------------------------------------------------- 1 | "Timestamp","What is your current level of experience with AI/ML?","How many years of programming experience do you have?","Which programming language(s) are you most comfortable with?","What is your dream outcome for this course?","How many hours per week can you dedicate to this project outside of live sessions?","Is there anything else you'd like to share?" 
2 | "2024/10/23 3:50:16 PM CST","1-3 years","4-6 years","Python","Hands on projects, and switch to Data science career path from research","6-10","Really excited about the course" 3 | "2024/10/23 11:10:31 PM CST","1-3 years","4-6 years","Python","Get to know Shaw better! Plus, have a clear path with resources to guide you on where to go and what to do for the future","5","" 4 | "2024/10/24 2:28:16 AM CST","<1 year","<1 year","Python","Fundamentals understanding, Hands on skills, small projects running in diff frameworks, create a small GH portfolio.","It depends if I find it interesting","Answers correspond to specific Ai hands on skills, industry experience around 12 years." 5 | "2024/10/24 3:18:45 AM CST","4-6 years","7+ years","Python","Build multiple AI applications","2","" 6 | "2024/10/24 6:26:56 AM CST","<1 year","7+ years","PHP, Javascript","I would like to learn how to practically setup LLM application in a production environment so that I can start creating some AI web apps for internal and external use. ","10","" 7 | "2024/10/24 2:49:01 PM CST","<1 year","7+ years","Python","Ability to train model based on a data set, and how to do predictive analysis.","20 hours","" 8 | "2024/10/25 3:55:59 AM CST","<1 year","1-3 years","I used to code in a previous life and started learning Python this year (intermediate level) and comfortable in BASH","Develop a good foundation in AI/ML. ","10","Great idea, glad to see it growing! " 9 | "2024/10/26 9:16:00 AM CST","1-3 years","7+ years","python, javascript, java, c++, typescript, goLang","I love to figure out good strategies to use in my software solutions using AI","10","" 10 | "2024/10/30 8:07:45 AM CST","<1 year","7+ years","C#, C++. 
Java","Learn to decide and advice between different data science archictures and options","4","" 11 | "2024/10/30 8:27:13 AM CST","7+ years","7+ years","Python, C++ (including CUDA C), R, Javascript, PHP","Implement practical AI cases","10","" 12 | "2024/10/30 9:57:49 AM CST","<1 year","7+ years","PHP, some JavaScript","Foundation in AI/ML","8 hrs","" 13 | "2024/10/30 10:10:27 AM CST","<1 year","<1 year","Python","Land a program manager job working with AI product/program","4","do I get help outside of classroom if I'm stuck (coding, etc)?" 14 | "2024/10/31 9:05:28 AM CST","<1 year","1-3 years","Python","Build an app","4","" 15 | "2024/10/31 5:11:27 PM CST","1-3 years","7+ years","c# python","able to create a LLM from scratch","12","no" 16 | "2024/11/04 10:22:23 AM CST","<1 year","<1 year","Python","Get a new job","5","" 17 | "2024/11/04 1:46:30 PM CST","1-3 years","7+ years","Python","At my former job I had a coworker who had the tedious task of making a weekly report summarizing local news for the boss. At the time I was completely sure that was something AI could do, but I didn't have the time nor the knowledge to develop such a thing. I constantly find myself having ideas like this, so a dream outcome would be finishing the course with at least a roadmap for making such an app.","6 to 8 are already on my calendar, but I can do more if I get too excited. 
","" 18 | "2024/11/04 2:59:01 PM CST","1-3 years","1-3 years","Python, bash","Build cool products","10","" 19 | "2024/11/05 12:16:31 AM CST","<1 year","<1 year","Python","Be able to be comfortable with GenAi","20","" 20 | "2024/11/05 12:56:29 AM CST","<1 year","1-3 years","Fortran / Algol / Basic -- 30+ years ago :) (but I was pretty good at it)","A certificate to showcase my new skills, A jump start to do my own programing and be able to communicate with programmers","8-10 hours, flexible for more","On a journey from power generation (Caterpillar) sales professional to AI 'something'" 21 | "2024/11/05 5:48:09 AM CST","<1 year","1-3 years","T-SQL","Setup my own environment to compare Machine Learning statistics created by my companies Data Scientists against my own environment.","3","" 22 | "2024/11/05 9:42:43 AM CST","<1 year","1-3 years","HTML, CSS, Python","Be able to create and deploy my own Ai powered apps","4","" 23 | "2024/11/06 10:53:18 AM CST","1-3 years","1-3 years","Python","Build and launch a product","5","" 24 | "2024/11/07 9:30:00 AM CST","<1 year","1-3 years","Python, C","Getting skills to build AI technologies for many projects","8","" 25 | "2024/11/07 11:17:28 AM CST","<1 year","<1 year","Python","Being able to implement llms into projects ","10","No" 26 | "2024/11/07 12:35:01 PM CST","1-3 years","7+ years","Python SQL R","Landing a new job","5-10","" 27 | "2024/11/07 2:08:57 PM CST","1-3 years","7+ years","SAS, SQL, Python, Scala","to deepen my understanding of advanced AI techniques.And I also aim to expand my professional network and collaborate with like-minded individuals to explore new opportunities in the AI landscape.","10 - 15 Hrs per week","" 28 | "2024/11/07 2:35:51 PM CST","<1 year","4-6 years","Python, java, c#","Be able to train a model and have some practical usage of llm.","2","" 29 | "2024/11/07 2:52:33 PM CST","<1 year","7+ years","C#, C++, Java","Learn to identify the ML solution and lead projects based on AI","3","" 30 | 
"2024/11/07 5:01:38 PM CST","<1 year","7+ years","SQL","Able to put into practice AI/ML for real bunsiess solutions","4","" 31 | "2024/11/07 8:04:05 PM CST","1-3 years","1-3 years","Python","Master AI powered productivity tools to streamline regulatory compliance work","7","Looking for useful practical sample projects implementations that can be adapted in my work - Financial management and regulatory compliance" 32 | "2024/11/07 8:36:26 PM CST","<1 year","7+ years","PHP, SQL","Learn Python related to AI. Implement prototypes.","10","" 33 | "2024/11/08 12:00:22 AM CST","7+ years","7+ years","Python, R but I'm very rusty","1) build GPT that accesses functions and APIs, 2) hands-on fine-tune a model using LoRA 3) write and debug Python code (with AI assistance) that accesses ChatGPT, Perplexity and Google search, 4) build working agent(s) that can output results in less than 10 seconds 5) possibly use a RAG efficiently for booklength PDF texts ","5-10","I probably understand already the concepts you are going to teach, but I want more current hands-on experience as I am an executive --I was a programmer a long time ago. I am very rusty with Python programming and need to relearn it with professional tools (including Github, VSCode or PyCharm, Canvas or Projects) and best practices like using Docker and practicing best security. I need to expertly know how to protect our data and conversations from the foundational models training on it." 
34 | "2024/11/08 12:25:02 AM CST","1-3 years","4-6 years","Python, C#, C++","Fully understand AI and LLM, and how to build one using Python","2","" 35 | "2024/11/08 2:03:37 AM CST","<1 year","1-3 years","JavaScript ","Be able to comfortably add AI into daily or business use cases ","10","" 36 | "2024/11/08 9:02:27 AM CST","4-6 years","1-3 years","Pascal","Build my own AI product with the AI services available these days","3","I'd love to network with classmates, and it would be super awesome if we have a slack community to stay in touch!" 37 | "2024/11/08 9:48:30 AM CST","<1 year","<1 year","Python, NextJS","I want to build my own MicroSaaS products","4 - 5","Looking forward to building my own stuff" 38 | "2024/11/15 8:34:32 AM CST","1-3 years","7+ years","python, GoLang, Java, c++, f#","I love to apply ai solution to existing and future back-end projects","10","looking forward to learning deep ai concepts" -------------------------------------------------------------------------------- /session-4/example_2-blog_semantic_search.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "44353ae1-41a5-4e8e-bdfb-fec1f445f239", 6 | "metadata": {}, 7 | "source": [ 8 | "# Semantic Search with Text Embeddings\n", 9 | "## ABB #1 - Session 4\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "9e22ee10-b6c8-4e59-babd-366b41f4a357", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "18c3df73-038b-44b9-9bf2-526dc485c311", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import os\n", 30 | "from bs4 import BeautifulSoup\n", 31 | "import json\n", 32 | "from sentence_transformers import SentenceTransformer\n", 33 | "import torch\n", 34 | "from IPython.display import display, Markdown\n", 35 | "from functions import 
*" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "id": "3667ac64-c7f7-4a29-9bf4-9eda692145c8", 41 | "metadata": {}, 42 | "source": [ 43 | "### 1) chunk articles" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 2, 49 | "id": "a53d05bc-7ace-455a-a6bd-bdf45116d916", 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# Get all HTML files from raw directory\n", 54 | "filename_list = [\"articles/\"+f for f in os.listdir('articles')]\n", 55 | "\n", 56 | "chunk_list = []\n", 57 | "for filename in filename_list:\n", 58 | "\n", 59 | " # read html file\n", 60 | " with open(filename, 'r', encoding='utf-8') as file:\n", 61 | " html_content = file.read()\n", 62 | "\n", 63 | " # Parse HTML\n", 64 | " soup = BeautifulSoup(html_content, 'html.parser')\n", 65 | " \n", 66 | " # Get article title\n", 67 | " article_title = soup.find('title').get_text().strip() if soup.find('title') else \"Untitled\"\n", 68 | " \n", 69 | " # Initialize variables\n", 70 | " article_content = []\n", 71 | " current_section = \"Main\" # Default section if no headers found\n", 72 | " \n", 73 | " # Find all headers and text content\n", 74 | " content_elements = soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol'])\n", 75 | "\n", 76 | " # iterate through elements and extract text with metadata\n", 77 | " for element in content_elements:\n", 78 | " if element.name in ['h1', 'h2', 'h3']:\n", 79 | " current_section = element.get_text().strip()\n", 80 | " elif element.name in ['p', 'ul', 'ol']:\n", 81 | " text = element.get_text().strip()\n", 82 | " # Only add non-empty content that's at least 30 characters long\n", 83 | " if text and len(text) >= 30:\n", 84 | " article_content.append({\n", 85 | " 'article_title': article_title,\n", 86 | " 'section': current_section,\n", 87 | " 'text': text\n", 88 | " })\n", 89 | "\n", 90 | " # add article content to list\n", 91 | " chunk_list.extend(article_content)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | 
"execution_count": 3, 97 | "id": "b70b7fbf-c363-4a5a-84b8-be342a450dd4", 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "# save chunk list to file\n", 102 | "filename='data/chunk_list.json'\n", 103 | "with open(filename, 'w', encoding='utf-8') as f:\n", 104 | " json.dump(chunk_list, f, indent=4, ensure_ascii=False)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "6e50b033-8519-4b93-b8ac-b05eed841604", 110 | "metadata": {}, 111 | "source": [ 112 | "### 2) embed chunks" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 4, 118 | "id": "52e1ddfa-8691-4cb0-8eef-74d7dc7db933", 119 | "metadata": {}, 120 | "outputs": [ 121 | { 122 | "name": "stdout", 123 | "output_type": "stream", 124 | "text": [ 125 | "Num chunks: 778\n" 126 | ] 127 | } 128 | ], 129 | "source": [ 130 | "# define text to embed\n", 131 | "text_list = []\n", 132 | "for content in chunk_list:\n", 133 | " # concatenate title and section header\n", 134 | " context = content['article_title'] + \" - \" + content['section'] + \": \"\n", 135 | " # append paragraph text, truncated so context + text stays within 512 characters\n", 136 | " text = context + content['text'][:512-len(context)]\n", 137 | " \n", 138 | " text_list.append(text)\n", 139 | "print(\"Num chunks:\",len(text_list))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 5, 145 | "id": "d03fe32a-230a-416a-ac7b-a25d85a41b05", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "# load model\n", 150 | "model = SentenceTransformer(\"multi-qa-MiniLM-L6-cos-v1\")\n", 151 | "\n", 152 | "# compute embeddings\n", 153 | "chunk_embeddings = model.encode(text_list)\n", 154 | "chunk_embeddings.shape\n", 155 | "\n", 156 | "# save chunk embeddings to file\n", 157 | "torch.save(chunk_embeddings, 'data/chunk_embeddings.pt')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "id": "c9ac9ac6-134e-421c-904e-ec6bf81834db", 163 | "metadata": {}, 164 | "source": [ 165 | "### 
3) semantic search" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 6, 171 | "id": "43e73516-fdfd-4963-ad19-2c3e413ab5a3", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "torch.Size([1, 778])\n", 179 | "tensor([0.0781, 0.1124, 0.0879, 0.1028, 0.0970])\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "# define query\n", 185 | "query = \"What is a token?\"\n", 186 | "query_embedding = model.encode(query)\n", 187 | "\n", 188 | "# compute similarity between query and all chunks\n", 189 | "similarities = model.similarity(query_embedding, chunk_embeddings)\n", 190 | "print(similarities.shape)\n", 191 | "print(similarities[0][:5])" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 7, 197 | "id": "c3aaf616-c522-45c1-a89b-3af6ae4166bb", 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "# define search parameters\n", 202 | "temp = 0.1\n", 203 | "k=3\n", 204 | "threshold = 0.05\n", 205 | "\n", 206 | "# Rescale similarities via softmax\n", 207 | "scores = torch.nn.functional.softmax(similarities/temp, dim=1)\n", 208 | "\n", 209 | "# Get sorted indices and scores\n", 210 | "sorted_indices = scores.argsort(descending=True)[0]\n", 211 | "sorted_scores = scores[0][sorted_indices]\n", 212 | "\n", 213 | "# Filter by threshold and get top k\n", 214 | "filtered_indices = [\n", 215 | " idx.item() for idx, score in zip(sorted_indices, sorted_scores) \n", 216 | " if score.item() >= threshold\n", 217 | "][:k]\n", 218 | "\n", 219 | "# Get corresponding content items and scores\n", 220 | "top_results = [chunk_list[i] for i in filtered_indices]\n", 221 | "result_scores = [scores[0][i].item() for i in filtered_indices]" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "5b9d785f-a1df-40aa-b762-2b79f3748df5", 227 | "metadata": {}, 228 | "source": [ 229 | "### 4) display results" 230 | ] 231 | }, 232 | { 233 | "cell_type": 
"code", 234 | "execution_count": 8, 235 | "id": "246b8d0f-beea-406b-81b0-484171fb6231", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "results_markdown = \"\"\n", 240 | "for i, result in enumerate(top_results, start=1):\n", 241 | " results_markdown += f\"{i}. **Article title:** {result['article_title']} \\n\"\n", 242 | " results_markdown += f\" **Section:** {result['section']} \\n\"\n", 243 | " results_markdown += f\" **Snippet:** {result['text']} \\n\\n\"" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 9, 249 | "id": "97df4231-af86-4dd3-b806-cab95bb4b5d2", 250 | "metadata": {}, 251 | "outputs": [ 252 | { 253 | "data": { 254 | "text/markdown": [ 255 | "1. **Article title:** Cracking Open the OpenAI (Python) API \n", 256 | " **Section:** 2) OpenAI’s (Python) API \n", 257 | " **Snippet:** Tokens, in the context of LLMs, are essentially a set of numbers representing a set of words and characters. For example, “The” could be a token, “ end” (with the space) could be another, and “.” another. \n", 258 | "\n" 259 | ], 260 | "text/plain": [ 261 | "" 262 | ] 263 | }, 264 | "metadata": {}, 265 | "output_type": "display_data" 266 | } 267 | ], 268 | "source": [ 269 | "display(Markdown(results_markdown))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 10, 275 | "id": "b101df6f-93a0-4a32-a362-e067397a9140", 276 | "metadata": {}, 277 | "outputs": [ 278 | { 279 | "data": { 280 | "text/markdown": [ 281 | "1. **Article title:** LLM Fine-tuning — FAQs \n", 282 | " **Section:** RAG vs Fine-tuning? \n", 283 | " **Snippet:** We’ve already mentioned situations where RAG and fine-tuning perform well. However, since this is such a common question, it’s worth reemphasizing when each approach works best. \n", 284 | "\n", 285 | "2. **Article title:** LLM Fine-tuning — FAQs \n", 286 | " **Section:** RAG vs Fine-tuning? 
\n", 287 | " **Snippet:** RAG is when we inject relevant context into an LLM’s input prompt so that it can generate more helpful responses. For example, if we have a domain-specific knowledge base (e.g., internal company documents and emails), we might identify the items most relevant to the user’s query so that an LLM can synthesize information in an accurate and digestible way. \n", 288 | "\n", 289 | "3. **Article title:** LLM Fine-tuning — FAQs \n", 290 | " **Section:** RAG vs Fine-tuning? \n", 291 | " **Snippet:** Here’s high-level guidance on when to use each. \n", 292 | "\n" 293 | ], 294 | "text/plain": [ 295 | "" 296 | ] 297 | }, 298 | "metadata": {}, 299 | "output_type": "display_data" 300 | } 301 | ], 302 | "source": [ 303 | "# bringing it all together\n", 304 | "query = \"What's the difference between RAG and Fine-tuning?\"\n", 305 | "results_markdown = semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=3, threshold=0)\n", 306 | "display(Markdown(results_markdown))" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "id": "390287f1-7ea2-4627-a646-e2ff9101b1e3", 313 | "metadata": {}, 314 | "outputs": [], 315 | "source": [] 316 | } 317 | ], 318 | "metadata": { 319 | "kernelspec": { 320 | "display_name": "Python 3 (ipykernel)", 321 | "language": "python", 322 | "name": "python3" 323 | }, 324 | "language_info": { 325 | "codemirror_mode": { 326 | "name": "ipython", 327 | "version": 3 328 | }, 329 | "file_extension": ".py", 330 | "mimetype": "text/x-python", 331 | "name": "python", 332 | "nbconvert_exporter": "python", 333 | "pygments_lexer": "ipython3", 334 | "version": "3.12.2" 335 | } 336 | }, 337 | "nbformat": 4, 338 | "nbformat_minor": 5 339 | } 340 | -------------------------------------------------------------------------------- /session-4/example_3-blog_QA_RAG.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": 
"markdown", 5 | "id": "22c6a7a2-26c9-4e14-aafa-5cf069d02e46", 6 | "metadata": {}, 7 | "source": [ 8 | "# Article Series QA Assistant with RAG\n", 9 | "## ABB #1 - Session 4\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "bdff7bfa-ebdb-4323-b16a-d69aa948eb5b", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "dd3ebb26-94ac-4c47-a5fc-170c8caa0cd9", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import json\n", 30 | "from sentence_transformers import SentenceTransformer\n", 31 | "import torch\n", 32 | "from IPython.display import display, Markdown\n", 33 | "from functions import *\n", 34 | "\n", 35 | "from openai import OpenAI\n", 36 | "from top_secret import my_sk\n", 37 | "\n", 38 | "import os \n", 39 | "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "id": "953a58cc-0b6b-4529-a2af-c1f58facd0ae", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# setup api client\n", 50 | "client = OpenAI(api_key=my_sk)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "id": "7dbe890f-5c07-44c9-8afb-dbc819e82660", 56 | "metadata": {}, 57 | "source": [ 58 | "### load data & model" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 3, 64 | "id": "029c926b-6f42-4af6-816a-e34dc20aab8c", 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "# load chunks\n", 69 | "filename = 'data/chunk_list.json'\n", 70 | "with open(filename, 'r', encoding='utf-8') as f:\n", 71 | " chunk_list = json.load(f)\n", 72 | "\n", 73 | "# load embeddings\n", 74 | "chunk_embeddings = torch.load('data/chunk_embeddings.pt', weights_only=False)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 4, 80 | "id": "07c9b8c5-105a-492e-a14e-fb711e735f29", 81 | "metadata": {}, 82 | "outputs": 
[ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Num chunks: 778\n", 88 | "(778, 384)\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "print(\"Num chunks:\",len(chunk_list))\n", 94 | "print(chunk_embeddings.shape)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 5, 100 | "id": "5fff575e-faed-4d42-9c19-c4325f1e8e4e", 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "# load model\n", 105 | "model = SentenceTransformer(\"multi-qa-MiniLM-L6-cos-v1\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "f109f511-8897-4e69-aa99-30a2ff79b0c9", 111 | "metadata": {}, 112 | "source": [ 113 | "### 1) define query" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 6, 119 | "id": "6da59e64-ba7b-4150-8615-e23fe9b71a40", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "# define query\n", 124 | "query = \"When does it make sense to use RAG vs fine-tuning?\"" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "id": "bba55143-e771-4603-98a6-80b9f758d26d", 130 | "metadata": {}, 131 | "source": [ 132 | "### 2) context retreival" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 7, 138 | "id": "ce377b19-c0ec-4519-9cd1-e119911f109a", 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "results_markdown = semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=10, threshold=0.01)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 8, 148 | "id": "d16694e7-6dc2-4630-80cb-e1d087e38f7f", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/markdown": [ 154 | "1. **Article title:** LLM Fine-tuning — FAQs \n", 155 | " **Section:** RAG vs Fine-tuning? \n", 156 | " **Snippet:** We’ve already mentioned situations where RAG and fine-tuning perform well. 
However, since this is such a common question, it’s worth reemphasizing when each approach works best. \n", 157 | "\n", 158 | "2. **Article title:** LLM Fine-tuning — FAQs \n", 159 | " **Section:** RAG vs Fine-tuning? \n", 160 | " **Snippet:** Here’s high-level guidance on when to use each. \n", 161 | "\n", 162 | "3. **Article title:** How to Improve LLMs with RAG \n", 163 | " **Section:** Why we care \n", 164 | " **Snippet:** Previous articles in this series discussed fine-tuning, which adapts an existing model for a particular use case. While this is an alternative way to endow an LLM with specialized knowledge, empirically, fine-tuning seems to be less effective than RAG at doing this [1]. \n", 165 | "\n", 166 | "4. **Article title:** LLM Fine-tuning — FAQs \n", 167 | " **Section:** RAG vs Fine-tuning? \n", 168 | " **Snippet:** RAG is when we inject relevant context into an LLM’s input prompt so that it can generate more helpful responses. For example, if we have a domain-specific knowledge base (e.g., internal company documents and emails), we might identify the items most relevant to the user’s query so that an LLM can synthesize information in an accurate and digestible way. \n", 169 | "\n", 170 | "5. **Article title:** LLM Fine-tuning — FAQs \n", 171 | " **Section:** RAG vs Fine-tuning? \n", 172 | " **Snippet:** Notice that these approaches are not mutually exclusive. In fact, the original RAG system proposed by Facebook researchers used fine-tuning to better use retrieved information for generating responses [4]. \n", 173 | "\n", 174 | "6. **Article title:** How to Improve LLMs with RAG \n", 175 | " **Section:** Some Nuances \n", 176 | " **Snippet:** Document preparation—The quality of a RAG system is driven by how well useful information can be extracted from source documents. For example, if a document is unformatted and full of images and tables, it will be more difficult to parse than a well-formatted text file. \n", 177 | "\n", 178 | "7. 
**Article title:** How to Improve LLMs with RAG \n", 179 | " **Section:** Some Nuances \n", 180 | " **Snippet:** While the steps for building a RAG system are conceptually simple, several nuances can make building one (in the real world) more complicated. \n", 181 | "\n", 182 | "8. **Article title:** LLM Fine-tuning — FAQs \n", 183 | " **Section:** When NOT to Fine-tune \n", 184 | " **Snippet:** The effectiveness of any approach will depend on the details of the use case. For example, fine-tuning is less effective than retrieval augmented generation (RAG) to provide LLMs with specialized knowledge [1]. \n", 185 | "\n", 186 | "9. **Article title:** How to Improve LLMs with RAG \n", 187 | " **Section:** How it works \n", 188 | " **Snippet:** There are 2 key elements of a RAG system: a retriever and a knowledge base. \n", 189 | "\n", 190 | "10. **Article title:** How to Improve LLMs with RAG \n", 191 | " **Section:** Why we care \n", 192 | " **Snippet:** Notice that RAG does not fundamentally change how we use an LLM; it's still prompt-in and response-out. RAG simply augments this process (hence the name). \n", 193 | "\n" 194 | ], 195 | "text/plain": [ 196 | "" 197 | ] 198 | }, 199 | "metadata": {}, 200 | "output_type": "display_data" 201 | } 202 | ], 203 | "source": [ 204 | "display(Markdown(results_markdown))" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "id": "f6442014-564f-4413-9c5f-d2e3b2e55d11", 210 | "metadata": {}, 211 | "source": [ 212 | "### 3) prompt engineering" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 9, 218 | "id": "48874ba8-97a1-40ab-9364-90c2c7d96203", 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "prompt_template = lambda query, results_markdown : f\"\"\" You are an AI assistant tasked with answering user questions based on excerpts from blog posts. Use the following snippets to \\\n", 223 | "provide accurate, concise, and synthesized answers. 
If the snippets don’t provide enough information, let the user know and suggest further exploration.\n", 224 | "\n", 225 | "## Question:\n", 226 | "{query}\n", 227 | "\n", 228 | "## Relevant Snippets:\n", 229 | "{results_markdown}\n", 230 | "\n", 231 | "---\n", 232 | "\n", 233 | "## Response:\n", 234 | "Provide a clear and concise response below, synthesizing information from the snippets and referencing them directly. If additional information is \\\n", 235 | "required, suggest further follow-ups or note what’s missing.\n", 236 | "\"\"\"" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 10, 242 | "id": "6022b678-3522-471f-b8b9-0f896e4eedb2", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "prompt = prompt_template(query, results_markdown)\n", 247 | "# print(prompt)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "d0551c2a-fde3-4d48-a5d7-d7dfb6f362c4", 253 | "metadata": {}, 254 | "source": [ 255 | "### 4) prompt GPT-4o-mini" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 11, 261 | "id": "ab389a06-8359-49d1-96db-1663c2fdfc0d", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "# make api call\n", 266 | "response = client.chat.completions.create(\n", 267 | " model=\"gpt-4o-mini\",\n", 268 | " messages=[\n", 269 | " {\"role\": \"user\", \"content\": prompt}\n", 270 | " ], \n", 271 | " temperature = 0.5\n", 272 | ")\n", 273 | "\n", 274 | "# extract response\n", 275 | "answer = response.choices[0].message.content" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "id": "18a8da58-78a8-4136-8823-40e576d2115b", 281 | "metadata": {}, 282 | "source": [ 283 | "### 5) display results" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 12, 289 | "id": "3a4dbc7d-88c6-4a4d-9406-a96f16918f16", 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "\n", 297 | 
"When does it make sense to use RAG vs fine-tuning?\n", 298 | "\n" 299 | ] 300 | }, 301 | { 302 | "data": { 303 | "text/markdown": [ 304 | "When deciding between Retrieval-Augmented Generation (RAG) and fine-tuning for enhancing large language models (LLMs), consider the following:\n", 305 | "\n", 306 | "1. **RAG** is ideal when you need to inject relevant context into the model's input to improve response quality. It works well with domain-specific knowledge bases, allowing the model to synthesize information from relevant documents effectively (Snippet 4). This method is particularly useful when fine-tuning is less effective at providing specialized knowledge (Snippet 8).\n", 307 | "\n", 308 | "2. **Fine-tuning** adapts an existing model for a specific use case but is generally considered less effective than RAG for embedding specialized knowledge (Snippet 3). It can be beneficial when you have a well-defined dataset and the goal is to customize the model's behavior more fundamentally.\n", 309 | "\n", 310 | "3. Both approaches can be used together; for instance, the original RAG system utilized fine-tuning to enhance how retrieved information is employed in generating responses (Snippet 5).\n", 311 | "\n", 312 | "In summary, use RAG when you need to leverage external knowledge sources for better context in responses. Opt for fine-tuning when you want to fundamentally adjust the model's capabilities for a specific task, keeping in mind that it may be less effective for specialized knowledge compared to RAG. If you need more detailed guidance on specific use cases, further exploration of the topic may be beneficial." 
313 | ], 314 | "text/plain": [ 315 | "" 316 | ] 317 | }, 318 | "metadata": {}, 319 | "output_type": "display_data" 320 | } 321 | ], 322 | "source": [ 323 | "print()\n", 324 | "print(query)\n", 325 | "print()\n", 326 | "display(Markdown(answer))" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 13, 332 | "id": "9c238da9-3bf7-4333-965c-285be39218b5", 333 | "metadata": {}, 334 | "outputs": [ 335 | { 336 | "data": { 337 | "text/markdown": [ 338 | "The benefits of fine-tuning large language models (LLMs) include:\n", 339 | "\n", 340 | "1. **Improved Performance for Specific Tasks**: Fine-tuned models can outperform larger pre-trained models for particular use cases, even when clever prompt engineering is applied (Snippet 6).\n", 341 | "\n", 342 | "2. **Lower Inference Costs**: Fine-tuning can lead to reduced inference costs, making it a practical choice for deploying AI assistants (Snippet 9).\n", 343 | "\n", 344 | "3. **Customization**: Fine-tuning allows for the adaptation of a model to specialized knowledge or tasks, enhancing its relevance and effectiveness (Snippet 2).\n", 345 | "\n", 346 | "4. **Quality of Training Data**: The performance of a fine-tuned model is heavily influenced by the quality of the training dataset used, emphasizing the importance of data preparation (Snippet 7).\n", 347 | "\n", 348 | "However, it is important to note that fine-tuning is not a one-size-fits-all solution. It may not be as effective as other techniques like retrieval augmented generation (RAG) for certain applications (Snippet 1), and it can incur an \"alignment tax,\" where performance may drop in some tasks (Snippet 5). \n", 349 | "\n", 350 | "For further exploration, consider looking into specific use cases where fine-tuning has shown significant benefits or challenges." 
def parse_html_content(html_content):
    """
    Parse HTML content and extract structured content with sections and paragraphs.

    Args:
        html_content (str): Raw HTML content to parse

    Returns:
        list: List of dictionaries with 'article_title', 'section', and 'text' keys
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Resolve the article title once; fall back when no <title> tag exists
    title_tag = soup.find('title')
    article_title = title_tag.get_text().strip() if title_tag else "Untitled"

    structured_content = []
    section = "Main"  # default section used until the first header is seen

    # Walk headers and text elements in document order
    for element in soup.find_all(['h1', 'h2', 'h3', 'p', 'ul', 'ol']):
        text = element.get_text().strip()
        if element.name in ('h1', 'h2', 'h3'):
            # Headers open a new section for subsequent paragraphs/lists
            section = text
        elif text and len(text) >= 30:
            # Keep only non-trivial paragraph/list content (>= 30 chars)
            structured_content.append({
                'article_title': article_title,
                'section': section,
                'text': text,
            })

    return structured_content
def compute_query_similarities(query, model, chunk_embeddings):
    """
    Computes similarity scores between a query and a list of chunk embeddings.

    Args:
        query (str): The query string to compute similarities for.
        model: Embedding model exposing encode() and similarity() methods.
        chunk_embeddings (torch.Tensor): Precomputed embeddings for the content chunks.

    Returns:
        torch.Tensor: Similarity scores between the query and each content chunk.
    """
    # Embed the query, then score it against every chunk embedding in one call
    embedded_query = model.encode(query)
    return model.similarity(embedded_query, chunk_embeddings)
106 | """ 107 | results_markdown = "" 108 | for i, result in enumerate(top_results, start=1): 109 | results_markdown += f"{i}. **Article title:** {result['article_title']} \n" 110 | results_markdown += f" **Section:** {result['section']} \n" 111 | results_markdown += f" **Snippet:** {result['text']} \n\n" 112 | 113 | return results_markdown 114 | 115 | def semantic_search(query, model, chunk_embeddings, chunk_list, temp=0.1, k=3, threshold=0.05): 116 | """ 117 | Executes a semantic search pipeline: computes similarities, retrieves top results, and formats them into Markdown. 118 | 119 | Args: 120 | query (str): The search query string. 121 | model: The embedding model with methods for encoding and similarity computation. 122 | chunk_embeddings (torch.Tensor): A tensor of precomputed chunk embeddings. 123 | chunk_list (list): A list of content chunks corresponding to the embeddings. 124 | temp (float, optional): Temperature for softmax rescaling. Defaults to 0.1. 125 | k (int, optional): Number of top items to return. Defaults to 3. 126 | threshold (float, optional): Minimum similarity score for inclusion. Defaults to 0.05. 127 | 128 | Returns: 129 | str: Formatted Markdown string of the top results. 
def answer_query(query, results_markdown, prompt_template, client):
    """
    Answer a user query with GPT-4o-mini, grounded in semantic search results.

    Args:
        query (str): The user's question.
        results_markdown (str): Markdown-formatted snippets from semantic search.
        prompt_template: Callable mapping (query, results_markdown) to a prompt string.
        client: OpenAI API client used to create the chat completion.

    Returns:
        str: The model's generated answer.
    """
    # Build the final prompt by injecting the query and retrieved context
    filled_prompt = prompt_template(query, results_markdown)

    # Single-turn chat completion; temperature 0.5 matches the notebook example
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": filled_prompt}],
        temperature=0.5,
    )

    return completion.choices[0].message.content
-------------------------------------------------------------------------------- 1 | # Session 5: Software 3.0, Fine-tuning 2 | 3 | Links: 4 | - [Example 1: Fine-tuning BERT for Text Classification](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_1-finetune_bert_classifier.ipynb) 5 | - [Example 2: Fine-tuning a LinkedIn Post Writer](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/example_2-linkedin_post_writer.ipynb) 6 | - [Slides](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-5/slides.pdf) 7 | 8 | ## How to run the examples 9 | 10 | 1. Clone this repo 11 | 2. Navigate to downloaded folder and create new venv 12 | ``` 13 | python -m venv s5-env 14 | ``` 15 | 3. Activate venv 16 | ``` 17 | # mac/linux 18 | source s5-env/bin/activate 19 | 20 | # windows 21 | .\s5-env\Scripts\activate.bat 22 | ``` 23 | 4. Install dependencies 24 | ``` 25 | pip install -r requirements.txt 26 | ``` 27 | 5. Launch Jupyter Lab 28 | ``` 29 | jupyter lab 30 | ``` 31 | -------------------------------------------------------------------------------- /session-5/data/LI_posts.numbers: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-5/data/LI_posts.numbers -------------------------------------------------------------------------------- /session-5/data/valid-data.jsonl: -------------------------------------------------------------------------------- 1 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nSharing 5 AI project ideas blog\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", 
"content": "Within 24 hours, my latest blog was read by 10k people\u2026 \n\nIt fleshes out the 5 AI project ideas (you can build this weekend) that I shared here on Monday. \n\nI break each idea down step-by-step and share some code for getting started. \n\nHere\u2019s the link if you want to check it out \ud83d\udc47 \n\nhttps://lnkd.in/gDvxCHx5\n\n--\nMedia: Article screenshot"}]} 2 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\n3 ways to make LLMs multimodal and sharing blog post\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Over the past year, we\u2019ve seen LLMs gain the ability to see (i.e. process images) and hear (i.e. process audio directly), but how does that work under the hood? \n\nHere are 3 general ways to do that \ud83d\udc47 \n\n1. LLM + Tools: Augment LLMs with pre-built components \n\n2. LLM + Adapters: Augment LLMs with multi-modal encoders or decoders, which are aligned via adapter fine-tuning \n\n3. Unified Models: Expand LLM architecture to fuse modalities at pre-training \n\nIn my latest blog post in Towards Data Science, I discuss these three approaches and share example code for (locally) using LLaMA 3.2 Vision to perform various image-to-text tasks. 
\n\nCheck it out here: https://lnkd.in/gGcvVk9u\n\n--\nMedia: Article screenshot"}]} 3 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nHitting 1M views on YouTube\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "This has been a month of milestones for my YouTube channel. \n\nLast week, I crossed 1,000,000 views \ud83e\udd2f\u2026 a number too big to fully comprehend.\n\nI started this journey by making content that would have been helpful to a past version of myself. \n\nAnd as a Physics PhD student, I didn\u2019t think that would be relevant to a lot of people \ud83d\ude05 \n\nBut, to my surprise, it seems there are many people (not just PhD students) who have an appetite for learning about data science and AI. \n\nWhich makes me pretty lucky \ud83d\ude01 \n\nThank you to everyone who watched \ud83d\ude4f.. \n\n.. Next stop 10M??\n\n--\nMedia: Image"}]} 4 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nExplaining multimodal embeddings and sharing blog post\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Although text and images may look very different to us, to a neural network, these are represented via the same mathematical object, i.e., a vector. \n\nTherefore, we can image vector spaces where text and images look similar, given they represent the same underlying information. \n\nThis is the key idea behind multimodal embeddings. 
\n\nIn my latest blog post, I share how we can generate such representations along with two practical use cases for them: 1) 0-shot Image Classification and 2) Image search. \n\nCheck it out here \ud83d\udc47 https://lnkd.in/g-Dvy9dF\n\n--\nMedia: Article screenshot"}]} 5 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nRecap of past 12 months on my 29th bday\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "I turned 29 today. \n\n28 was wild\u2026 here are some things that happened. \n\n>> 1.2M people watched my YouTube videos \n>> My YT channel went from 2k \u2014> 31k subs \n>> 600 people joined The Data Entrepreneurs Discord \n>> I started a podcast with Ifeoma Ahuna \n\nBut most importantly\u2026 \n>> I survived my 1st year of entrepreneurship! \n\nWhile this past year has been harder than I expected (emotionally), going down this path is the best decision I\u2019ve made in my 29 years of life. \n\nEntrepreneurship allows me to live a life better aligned with my values (relationships, growth, and freedom), which I feel is key to a fulfilling life. \n\nExcited to see where Year 2 takes me (other than turning 30 \ud83d\ude05)\n\n--\nMedia: Image"}]} 6 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nMultimodal RAG and its value. 
Share blog post\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Two major topics in AI this year were multimodal models and RAG. \n\nBut\u2026 what if we brought these ideas together? Multimodal RAG systems are capable of processing text and non-text data. \n\nThis enables more powerful solutions beyond what is conveyed by text alone. \n\nFor example, it could analyze someone\u2019s facial expressions and speech tonality to give a richer context to a meeting\u2019s transcription. \n\nIn my latest blog post, I give a beginner-friendly guide to Multimodal RAG and share an example implementation using LLaMA 3.2 Vision and CLIP. \n\nCheck it out here \ud83d\udc47 \n\nhttps://lnkd.in/grY5bGDz\n\n--\nMedia: Article screenshot"}]} 7 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nQuitting consoling to focus on content creation\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "I\u2019m quitting consulting to focus on YouTube. \n\nHere\u2019s why. \ud83d\udc47 \n\n1) YouTube is working! (80% of my traffic comes from YT) \n\n2) It allows me to focus on one thing (i.e. avoid distractions) \n\n3) Making 1 video a week gives me a clear, quantifiable goal that I can use to frame my learning and product development. \n\nWhile committing to one thing is scary (especially something unpredictable like YouTube \ud83d\ude05), it is also necessary for success. \n\nThe reason (IMO) is that commitment backs you into a corner, forcing you to innovate and learn new things. 
\n\nYou can watch this experiment risk-free on my YouTube channel \ud83d\ude09 \n\n(\ud83d\udd17 link in comments)\n\n--\nMedia: Image"}]} 8 | {"messages": [{"role": "system", "content": "LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur"}, {"role": "user", "content": "Write a LinkedIn post based on the following idea:\nGetting a data science job was hard until I learned these 5 things\n\nInclude:\n- A compelling opening line that hooks the reader\n- Copy that expands upon the idea in valuable way\n- A call to action or share relevant content\n\nOutput:\n"}, {"role": "assistant", "content": "Getting a Data Science Job was hard until I learned these 5 things \ud83d\udc47 \n\n1. Adapt your resume to the specific role \n\nYou have about 7 seconds of a hiring manager's attention when reviewing your resume; use it to call out all the skills they listed in the job description. \n\n2. Do real-world projects \n\nWhile toy projects from class or Kaggle are a great way to learn, hiring managers care about real-world impact. \n\n3. Make a portfolio \n\nDespite the free no-code tools for making a portfolio website, most people don't make one. That makes this a super easy way to stand out. \n\n4. Stay out of the weeds (unless they ask) \n\nThis was one of my greatest struggles. When someone asked about a project, I would dive right into the technical details. \n\nThis is the best way to lose someone's attention (even data scientists). \n\n5. Be yourself (not who you think they want to hire) \n\nWhen I interviewed for my full-time data science role, I dove into a rant about my love for optimization. \n\nMost people get confused and awkward when I discuss such topics, but not the interviewers. They got pumped up... 
that's when I knew I found my people \ud83d\ude01 \n\nWhat was an unlock for you when applying to data science roles?\n\n--\nMedia: Image"}]} 9 | -------------------------------------------------------------------------------- /session-5/example_2-linkedin_post_writer.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "11c076dd-4e32-431c-9793-e0784197e3d1", 6 | "metadata": {}, 7 | "source": [ 8 | "# Fine-tuning GPT-4o to Write LinkedIn Posts\n", 9 | "## ABB #1 - Session 5\n", 10 | "\n", 11 | "Code authored by: Shaw Talebi" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "id": "7c7812b7-9bb9-47da-8116-f6af553e18c5", 17 | "metadata": {}, 18 | "source": [ 19 | "### imports" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "id": "8369ae91-d925-4ed1-9a85-32fb8e64a153", 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import csv\n", 30 | "import json\n", 31 | "import random\n", 32 | "\n", 33 | "from openai import OpenAI\n", 34 | "from top_secret import my_sk \n", 35 | "\n", 36 | "# connect to openai API\n", 37 | "client = OpenAI(api_key=my_sk)" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "id": "b67b2a2d-75c7-4913-b603-93127ece77fc", 43 | "metadata": {}, 44 | "source": [ 45 | "### Read data" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "id": "4bc4f088-1331-4de4-a252-c60210479042", 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# load csv of YouTube comments\n", 56 | "idea_list = []\n", 57 | "copy_list = []\n", 58 | "media_list = []\n", 59 | "\n", 60 | "with open('data/LI_posts.csv', mode ='r') as file:\n", 61 | " file = csv.reader(file)\n", 62 | " \n", 63 | " # read file line by line\n", 64 | " for line in file:\n", 65 | " # skip first line\n", 66 | " if line[0]=='Idea':\n", 67 | " continue\n", 68 | " \n", 69 | " # append comments and responses to 
respective lists\n", 70 | " idea_list.append(line[0])\n", 71 | " copy_list.append(line[1])\n", 72 | " media_list.append(line[2])" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 3, 78 | "id": "5bb3cc63-84cb-40ff-bc75-ad40447d46a3", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "50\n", 86 | "50\n", 87 | "50\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "print(len(idea_list))\n", 93 | "print(len(copy_list))\n", 94 | "print(len(media_list))" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "id": "9afd558b-b292-4582-a965-04ebeb46c403", 100 | "metadata": {}, 101 | "source": [ 102 | "### Create training examples" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 4, 108 | "id": "f6ed7112-b939-41d4-9fc2-077c9c85229a", 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# construct training examples\n", 113 | "example_list = []\n", 114 | "\n", 115 | "system_prompt = \"LinkedIn Post Writer for Shaw Talebi, AI educator and entrepreneur\"\n", 116 | "\n", 117 | "prompt_template = lambda idea_string : f\"\"\"Write a LinkedIn post based on the following idea:\n", 118 | "{idea_string}\n", 119 | "\n", 120 | "Include:\n", 121 | "- A compelling opening line that hooks the reader\n", 122 | "- Copy that expands upon the idea in valuable way\n", 123 | "- A call to action or share relevant content\n", 124 | "\n", 125 | "Output:\n", 126 | "\"\"\"\n", 127 | "\n", 128 | "for i in range(len(idea_list)): \n", 129 | " system_dict = {\"role\": \"system\", \"content\": system_prompt}\n", 130 | " user_dict = {\"role\": \"user\", \"content\": prompt_template(idea_list[i])}\n", 131 | " assistant_dict = {\"role\": \"assistant\", \"content\": copy_list[i] + \"\\n\\n--\\nMedia: \" + media_list[i]}\n", 132 | " \n", 133 | " messages_list = [system_dict, user_dict, assistant_dict]\n", 134 | " \n", 135 | " example_list.append({\"messages\": messages_list})" 
136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 5, 141 | "id": "1c847ac6-89de-413f-a7c4-854e9a514658", 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "Write a LinkedIn post based on the following idea:\n", 149 | "3 types of AI Tik Tok\n", 150 | "\n", 151 | "Include:\n", 152 | "- A compelling opening line that hooks the reader\n", 153 | "- Copy that expands upon the idea in valuable way\n", 154 | "- A call to action or share relevant content\n", 155 | "\n", 156 | "Output:\n", 157 | "\n", 158 | "A problem with AI today is that it means different things to different people. \n", 159 | "\n", 160 | "This framework from Andrej Karpathy helped give me much more clarity 👇 \n", 161 | "\n", 162 | "Software 1.0 = Rule-based software systems. Humans program computers to solve problems step-by-step. \n", 163 | "\n", 164 | "Software 2.0 = Computers program themselves by seeing examples (i.e. machine learning) \n", 165 | "\n", 166 | "Software 3.0 = Repurposing general-purpose ML models for specific use cases (i.e. GenAI + Foundation Models) \n", 167 | "\n", 168 | "But… what’s Software 4.0 going to be? 
🤔\n", 169 | "\n", 170 | "--\n", 171 | "Media: Video\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "print(example_list[0]['messages'][1]['content'])\n", 177 | "print(example_list[0]['messages'][2]['content'])" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 6, 183 | "id": "daa80954-4589-4850-8b69-b2427828247b", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "data": { 188 | "text/plain": [ 189 | "50" 190 | ] 191 | }, 192 | "execution_count": 6, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "len(example_list)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "id": "39301b03-69be-4f2c-b7fd-44e17a22be2a", 204 | "metadata": {}, 205 | "source": [ 206 | "### Create train/validation split" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 7, 212 | "id": "6d43a2f6-b5b8-40c6-92c9-f4ac00c35643", 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "# randomly pick out validation examples\n", 217 | "num_examples = 10\n", 218 | "validation_index_list = random.sample(range(0, len(example_list)-1), num_examples)\n", 219 | "validation_data_list = [example_list[index] for index in validation_index_list]\n", 220 | "\n", 221 | "for example in validation_data_list:\n", 222 | " example_list.remove(example)" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 8, 228 | "id": "7bf38b30-6c4e-4956-96e6-b7bc5f92423f", 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "40\n", 236 | "10\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "print(len(example_list))\n", 242 | "print(len(validation_data_list))" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 9, 248 | "id": "09999da1-e0a4-4bee-949d-c31006dad700", 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "# write examples to file\n", 253 | "with 
open('data/train-data.jsonl', 'w') as train_file:\n", 254 | " for example in example_list:\n", 255 | " json.dump(example, train_file)\n", 256 | " train_file.write('\\n')\n", 257 | "\n", 258 | "with open('data/valid-data.jsonl', 'w') as valid_file:\n", 259 | " for example in validation_data_list:\n", 260 | " json.dump(example, valid_file)\n", 261 | " valid_file.write('\\n')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "id": "ce1eb41d-f61b-493e-b2dc-1abc458ba9a4", 267 | "metadata": {}, 268 | "source": [ 269 | "### Upload data to OpenAI" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 10, 275 | "id": "4f3ff2b6-7530-48f9-837b-86000d7f72c6", 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "train_file = client.files.create(\n", 280 | " file = open(\"data/train-data.jsonl\", \"rb\"),\n", 281 | " purpose = \"fine-tune\"\n", 282 | ")\n", 283 | "\n", 284 | "valid_file = client.files.create(\n", 285 | " file = open(\"data/valid-data.jsonl\", \"rb\"),\n", 286 | " purpose = \"fine-tune\"\n", 287 | ")" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "d753629d-45e5-4d0f-b0bd-0bef1c51062c", 293 | "metadata": {}, 294 | "source": [ 295 | "### Fine-tune model" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 11, 301 | "id": "88425464-a1ec-4230-a025-19f9a848a73a", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "FineTuningJob(id='ftjob-eCS6EchA0sb7hWMrOQlZITRQ', created_at=1734050118, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-KjWERyZ9WLUqIdrdMeJh4zC0', result_files=[], seed=616771098, status='validating_files', trained_tokens=None, training_file='file-2qUUvaBrn3qmzK8UjBZwdD', 
validation_file='file-61iDsGKpr4LM9ssQknQiC5', estimated_finish=None, integrations=[], user_provided_suffix='LI-post-writer')" 308 | ] 309 | }, 310 | "execution_count": 11, 311 | "metadata": {}, 312 | "output_type": "execute_result" 313 | } 314 | ], 315 | "source": [ 316 | "client.fine_tuning.jobs.create(\n", 317 | " training_file = train_file.id,\n", 318 | " validation_file = valid_file.id,\n", 319 | " suffix = \"LI-post-writer\",\n", 320 | " model = \"gpt-4o-mini-2024-07-18\"\n", 321 | ")" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "id": "3c650e56-77a0-4a76-a52e-6d9194fb6278", 327 | "metadata": {}, 328 | "source": [ 329 | "### Evaluate fine-tuned model" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 12, 335 | "id": "dd26bc6e-7bee-45f2-88a2-e419de0fb762", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "def generate_post(system_prompt, model_name, idea):\n", 340 | " response = client.chat.completions.create(\n", 341 | " model=model_name,\n", 342 | " messages=[\n", 343 | " {\"role\": \"system\", \"content\": system_prompt},\n", 344 | " {\"role\": \"user\", \"content\": prompt_template(idea)}\n", 345 | " ],\n", 346 | " temperature=0.7,\n", 347 | " )\n", 348 | " return response.choices[0].message.content" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 13, 354 | "id": "7f5b4455-b219-4fee-95ec-d99ab052acbd", 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "idea = \"Python was hard until I learned these 5 things\"" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 14, 364 | "id": "17be2632-d9ee-44b9-9569-19e274f0709f", 365 | "metadata": {}, 366 | "outputs": [ 367 | { 368 | "name": "stdout", 369 | "output_type": "stream", 370 | "text": [ 371 | "🚀 Struggling with Python? You're not alone. It was a tough nut to crack until I discovered these 5 game-changing strategies.\n", 372 | "\n", 373 | "1. 
**Think in Data Structures:** Understanding lists, dictionaries, and sets is crucial. They're the backbone of efficient Python coding, allowing you to solve complex problems with ease.\n", 374 | "\n", 375 | "2. **Master List Comprehensions:** Transform your loops into concise, readable expressions. This not only saves time but also boosts your code’s performance.\n", 376 | "\n", 377 | "3. **Leverage Libraries:** Python’s strength lies in its vast ecosystem of libraries. Familiarize yourself with pandas for data manipulation, NumPy for numerical computations, and requests for HTTP requests.\n", 378 | "\n", 379 | "4. **Embrace the Zen of Python:** \"Readability counts.\" Keep your code clean and simple. Follow PEP 8 guidelines to ensure your code is not just functional but also elegant.\n", 380 | "\n", 381 | "5. **Practice, Practice, Practice:** There's no substitute for hands-on experience. Challenge yourself with real-world problems and projects to solidify your understanding and confidence.\n", 382 | "\n", 383 | "Ready to level up your Python skills? Start by implementing these strategies today. Share your favorite Python tips in the comments or connect with me for more insights into mastering this versatile language! 🐍🔧\n", 384 | "\n", 385 | "#Python #CodingTips #TechEducation #AI #DataScience\n" 386 | ] 387 | } 388 | ], 389 | "source": [ 390 | "# GPT-4o (no fine-tuning)\n", 391 | "model_name = \"gpt-4o\"\n", 392 | "system_prompt_long = \"You are an AI assistant helping Shaw Talebi, an AI educator and entrepreneur, craft LinkedIn posts. Your goal is to generate posts \\\n", 393 | "that reflect Shaw Talebi's voice: authoritative yet approachable, insightful yet concise. Shaw Talebi's posts aim to educate and inspire professionals \\\n", 394 | "in the tech and AI space. Focus on providing value, discussing new trends, or offering actionable advice, while keeping the tone professional but \\\n", 395 | "conversational. 
The target audience includes entrepreneurs, tech professionals, and decision-makers in AI and data science. Always ensure the post is \\\n", 396 | "relevant, engaging, and on-brand for Shaw Talebi's public persona.\"\n", 397 | "\n", 398 | "# print(system_prompt_long, \"\\n--\")\n", 399 | "print(generate_post(system_prompt_long, model_name, idea))" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": 15, 405 | "id": "5cc2e3fb-a692-47eb-87e8-4e9ae788f0ac", 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "Python was hard until I learned these 5 things 👇 \n", 413 | "\n", 414 | "1) Use a good IDE \n", 415 | "\n", 416 | "2) Learn by building projects \n", 417 | "\n", 418 | "3) Use ChatGPT \n", 419 | "\n", 420 | "4) Break things down into smaller problems \n", 421 | "\n", 422 | "5) Use the Python documentation \n", 423 | "\n", 424 | "I share my full Python learning journey here 👇 \n", 425 | "\n", 426 | "https://lnkd.in/gZy68cZC \n", 427 | "\n", 428 | "#Python #Programming #AI \n", 429 | "\n", 430 | "--\n", 431 | "Media: Meme\n" 432 | ] 433 | } 434 | ], 435 | "source": [ 436 | "# GPT-4o-mini (fine-tuned)\n", 437 | "model_name = \"ft:gpt-4o-mini-2024-07-18:shawhin-talebi-ventures-llc:li-post-writer:Adk6A5Pd\"\n", 438 | "\n", 439 | "# print(system_prompt, \"\\n--\")\n", 440 | "print(generate_post(system_prompt, model_name, idea))" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 16, 446 | "id": "55c2cd55-477e-464f-8480-f9e354fb4341", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [ 450 | "# # delete files (after fine-tuning is done)\n", 451 | "# client.files.delete(train_file.id)\n", 452 | "# client.files.delete(valid_file.id)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "id": "683d94f8-03f1-4418-927c-5b00c45e683a", 459 | "metadata": {}, 460 | "outputs": [], 461 | "source": [] 462 | } 463 | 
], 464 | "metadata": { 465 | "kernelspec": { 466 | "display_name": "Python 3 (ipykernel)", 467 | "language": "python", 468 | "name": "python3" 469 | }, 470 | "language_info": { 471 | "codemirror_mode": { 472 | "name": "ipython", 473 | "version": 3 474 | }, 475 | "file_extension": ".py", 476 | "mimetype": "text/x-python", 477 | "name": "python", 478 | "nbconvert_exporter": "python", 479 | "pygments_lexer": "ipython3", 480 | "version": "3.12.2" 481 | } 482 | }, 483 | "nbformat": 4, 484 | "nbformat_minor": 5 485 | } 486 | -------------------------------------------------------------------------------- /session-5/requirements.txt: -------------------------------------------------------------------------------- 1 | jupyterlab 2 | ipykernel 3 | ipywidgets 4 | datasets 5 | transformers[torch] 6 | evaluate 7 | numpy 8 | openai -------------------------------------------------------------------------------- /session-5/slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-5/slides.pdf -------------------------------------------------------------------------------- /session-5/top_secret.py: -------------------------------------------------------------------------------- 1 | my_sk = "your sk here" -------------------------------------------------------------------------------- /session-6/ABB-1-session_6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ShawhinT/AI-Builders-Bootcamp-1/175b96b2324bb8bf28050f20808ba24bbd160f6a/session-6/ABB-1-session_6.pdf --------------------------------------------------------------------------------