├── Chapter9_notebooks ├── requirements__Ch9_Completing_a_Complex_Analysis_with_a_Team_of_LLM_Agents.txt ├── README.md ├── record.pickle ├── requirements__Ch9_Advanced_Methods_with_Chains.txt ├── requirements__Ch9_Advanced_LangChain_Configurations_and_Pipeline.txt ├── requirements__Ch9_Retrieve_Content_from_a_YouTube_Video_and_Summarize.txt └── requirements__Ch9_RAGLlamaIndex_Prompt_Compression.txt ├── Chapter5_notebooks ├── README.md └── requirements__Ch5_Text_Classification_Traditional_ML.txt ├── Chapter6_notebooks ├── README.md └── requirements__Ch6_Text_Classification_DL.txt ├── Chapter8_notebooks ├── README.md ├── requirements__Ch8_Setting_Up_Close_Source_and_Open_Source_LLMs.txt ├── requirements__Ch8_Setting_Up_LangChain_Configurations_and_Pipeline.txt └── mocked_up_physician_records.csv ├── Chapter4_notebooks ├── README.md ├── requirements__Ch4_NER_and_POS.txt ├── requirements__Ch4_Preprocessing_Pipeline.txt └── Ch4_Preprocessing_Pipeline.ipynb ├── LICENSE └── README.md /Chapter9_notebooks/requirements__Ch9_Completing_a_Complex_Analysis_with_a_Team_of_LLM_Agents.txt: -------------------------------------------------------------------------------- 1 | pyautogen==0.2.23 -------------------------------------------------------------------------------- /Chapter5_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Mastering NLP from Foundations to LLMs: Chapter 5 2 | All code for Chapter 5 has been verified and updated for 2025. -------------------------------------------------------------------------------- /Chapter6_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Mastering NLP from Foundations to LLMs: Chapter 6 2 | All code for Chapter 6 has been verified and updated for 2025. 
-------------------------------------------------------------------------------- /Chapter8_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Mastering NLP from Foundations to LLMs: Chapter 8 2 | All code for Chapter 8 has been verified and updated for 2025. -------------------------------------------------------------------------------- /Chapter9_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Mastering NLP from Foundations to LLMs: Chapter 9 2 | All code for Chapter 9 has been verified and updated for 2025. -------------------------------------------------------------------------------- /Chapter4_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # Mastering NLP from Foundations to LLMs: Chapter 4 2 | All code for Chapter 4 has been verified and updated for 2025. 3 | -------------------------------------------------------------------------------- /Chapter9_notebooks/record.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/HEAD/Chapter9_notebooks/record.pickle -------------------------------------------------------------------------------- /Chapter8_notebooks/requirements__Ch8_Setting_Up_Close_Source_and_Open_Source_LLMs.txt: -------------------------------------------------------------------------------- 1 | openai==1.17.0 2 | regex==2023.12.25 3 | scikit-image==0.19.3 4 | scikit-learn==1.2.2 5 | scipy==1.11.4 6 | transformers==4.39.3 -------------------------------------------------------------------------------- /Chapter9_notebooks/requirements__Ch9_Advanced_Methods_with_Chains.txt: -------------------------------------------------------------------------------- 1 | langchain==0.1.16 2 | langchain-community==0.0.32 3 | langchain-core==0.3.74 4 | langchain-openai==0.1.3 5 | 
langchain-text-splitters==0.0.1 6 | -------------------------------------------------------------------------------- /Chapter8_notebooks/requirements__Ch8_Setting_Up_LangChain_Configurations_and_Pipeline.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu==1.8.0 2 | langchain==0.1.16 3 | langchain-community 4 | langchain-core==0.1.42 5 | langchain-text-splitters==0.0.1 6 | regex==2023.12.25 7 | sentence-transformers==2.6.1 -------------------------------------------------------------------------------- /Chapter9_notebooks/requirements__Ch9_Advanced_LangChain_Configurations_and_Pipeline.txt: -------------------------------------------------------------------------------- 1 | faiss-cpu==1.8.0 2 | gpt4all==1.0.12 3 | langchain==0.1.16 4 | langchain-community 5 | langchain-core==0.3.74 6 | langchain-text-splitters==0.0.1 7 | openai==0.28.1 8 | sentence-transformers==2.6.1 9 | -------------------------------------------------------------------------------- /Chapter4_notebooks/requirements__Ch4_NER_and_POS.txt: -------------------------------------------------------------------------------- 1 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 2 | spacy==3.7.4 3 | spacy-legacy==3.0.12 4 | spacy-loggers==1.0.5 5 | -------------------------------------------------------------------------------- /Chapter4_notebooks/requirements__Ch4_Preprocessing_Pipeline.txt: -------------------------------------------------------------------------------- 1 | autocorrect==2.6.1 2 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 3 | nltk==3.8.1 4 | num2words==0.5.13 5 | regex==2023.12.25 6 | scipy==1.11.4 7 | 
-------------------------------------------------------------------------------- /Chapter5_notebooks/requirements__Ch5_Text_Classification_Traditional_ML.txt: -------------------------------------------------------------------------------- 1 | autocorrect==2.6.1 2 | datasets==2.18.0 3 | en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl#sha256=86cc141f63942d4b2c5fcee06630fd6f904788d2f0ab005cce45aadb8fb73889 4 | huggingface-hub==0.20.3 5 | matplotlib==3.7.1 6 | matplotlib-inline==0.1.6 7 | matplotlib-venn==0.11.10 8 | nltk==3.8.1 9 | num2words==0.5.13 10 | regex==2023.12.25 11 | scikit-image==0.19.3 12 | scikit-learn==1.2.2 13 | scipy==1.11.4 14 | spacy==3.7.4 15 | spacy-legacy==3.0.12 16 | spacy-loggers==1.0.5 -------------------------------------------------------------------------------- /Chapter6_notebooks/requirements__Ch6_Text_Classification_DL.txt: -------------------------------------------------------------------------------- 1 | accelerate==0.29.2 2 | autocorrect==2.6.1 3 | datasets==2.18.0 4 | evaluate==0.4.1 5 | matplotlib==3.7.1 6 | matplotlib-inline==0.1.6 7 | matplotlib-venn==0.11.10 8 | nltk==3.8.1 9 | num2words==0.5.13 10 | regex==2023.12.25 11 | scikit-image==0.19.3 12 | scikit-learn==1.2.2 13 | scipy==1.11.4 14 | spacy==3.7.4 15 | spacy-legacy==3.0.12 16 | spacy-loggers==1.0.5 17 | sympy 18 | torch @ https://download.pytorch.org/whl/cu121/torch-2.2.1%2Bcu121-cp310-cp310-linux_x86_64.whl#sha256=1adf430f01ff649c848ac021785e18007b0714fdde68e4e65bd0c640bf3fb8e1 19 | transformers==4.28.0 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 
7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter8_notebooks/mocked_up_physician_records.csv: -------------------------------------------------------------------------------- 1 | "Title: Mocked up record 2 | Physician Name: Dr. ABC 3 | Date: June 25, 2099 4 | Patient ID: 987654321 5 | Chief Complaint: Abdominal pain 6 | 7 | History of Present Illness: 8 | The patient, Mr. John Anderson, a 42-year-old male, presents today with a chief complaint of abdominal pain. He is married and resides with his wife and two children. Mr. Anderson recently returned from a business trip to Europe about two weeks ago. He denies any respiratory symptoms or exposure to sick individuals during his travel. 9 | 10 | During the evaluation, Mr. Anderson revealed a pertinent family history of cardiovascular disease, with his father having suffered a myocardial infarction in his 60s. He also reports that his maternal grandmother had type 2 diabetes. Mr. 
Anderson denies any personal history of chronic illnesses, surgeries, or hospitalizations. 11 | 12 | Regarding his chief complaint, Mr. Anderson describes the abdominal pain as a dull, intermittent ache located in the lower right quadrant. He rates the pain as 5 out of 10 in severity. The pain is exacerbated by physical activity and seems to worsen after meals. He denies any associated symptoms such as nausea, vomiting, or changes in bowel movements. 13 | 14 | Based on the information provided, further assessment and diagnostic tests will be performed to determine the underlying cause of Mr. Anderson's abdominal pain." 15 | "Title: Mocked up record 16 | Physician Name: Dr. ABC 17 | Date: November 15, 2099 18 | Patient ID: 123456789 19 | Chief Complaint: Fatigue and joint pain 20 | 21 | History of Present Illness: 22 | The patient, Ms. Sarah Thompson, a 57-year-old female, presents today with complaints of fatigue and joint pain. Ms. Thompson is widowed and lives alone. She has no recent history of travel outside the country. 23 | 24 | During the evaluation, Ms. Thompson reveals a pertinent family history of autoimmune diseases, with her sister being diagnosed with rheumatoid arthritis. She also reports a personal history of hypothyroidism, which is being managed with thyroid hormone replacement therapy. 25 | 26 | Regarding her chief complaint, Ms. Thompson describes the fatigue as persistent and overwhelming, affecting her ability to perform daily activities. She rates her fatigue as 8 out of 10 in severity. Additionally, she reports joint pain primarily in her knees and wrists, which is worse in the morning and improves with movement throughout the day. She denies any swelling or redness in the affected joints. 27 | 28 | Given the clinical presentation, further investigation will be carried out to explore potential causes for Ms. Thompson's fatigue and joint pain. This may include laboratory tests, imaging studies, and consultation with specialists as necessary." 
29 | "Title: Mocked up record 30 | Physician Name: Dr. ABC 31 | Date: November 28, 2099 32 | Patient ID: 987654321 33 | Chief Complaint: Migraine Headaches 34 | 35 | History of Present Illness: 36 | Title: Mocked up record 37 | The patient, Mr. Michael Johnson, a 40-year-old male, presents today with a chief complaint of recurring migraine headaches. He is married and lives with his spouse and two children. Mr. Johnson has not traveled recently outside of his local area. 38 | 39 | During the evaluation, Mr. Johnson reports a family history of migraine headaches, with his mother and sister both experiencing similar symptoms. He denies any significant past medical conditions, surgeries, or hospitalizations. He mentions occasional stress and irregular sleep patterns due to his demanding work schedule. 40 | 41 | Regarding his chief complaint, Mr. Johnson describes his headaches as recurrent episodes of moderate to severe throbbing pain, usually localized to one side of his head. He experiences associated symptoms such as sensitivity to light and sound, as well as nausea and occasional vomiting. The migraines typically last for several hours and occur once or twice a month. 42 | 43 | Based on the information provided, further assessment will be conducted to manage Mr. Johnson's migraines. A detailed headache diary will be recommended to track the frequency, duration, and associated triggers of his headaches. Lifestyle modifications, stress management techniques, and targeted medications will be discussed to alleviate his symptoms and improve his quality of life." 44 | "Title: Mocked up record 45 | Physician Name: Dr. ABC 46 | Date: July 10, 2099 47 | Patient ID: 246813579 48 | Chief Complaint: Pregnancy Follow-up 49 | 50 | History of Present Illness: 51 | The patient, Mrs. Emily Adams, a 30-year-old female, presents today for a routine pregnancy follow-up. She is currently 32 weeks pregnant, with a due date of August 27th, 2099. Mrs. 
Adams is married and lives with her husband. 52 | 53 | During the evaluation, Mrs. Adams reveals a family history of gestational diabetes, with her mother having developed the condition during her own pregnancies. She mentions no personal history of significant medical conditions, surgeries, or complications in previous pregnancies. 54 | 55 | Regarding her chief complaint, Mrs. Adams reports typical discomforts associated with the third trimester of pregnancy, including backache, frequent urination, and occasional heartburn. She denies any vaginal bleeding, severe abdominal pain, or significant changes in fetal movements. Mrs. Adams mentions adhering to a well-balanced diet and regular exercise routine to maintain her overall health during pregnancy. 56 | 57 | Based on the information provided, a routine prenatal examination will be conducted to monitor the progress of Mrs. Adams' pregnancy. This will include assessing her blood pressure, weight gain, fundal height measurement, and fetal heart rate monitoring. Discussions about childbirth preparation, breastfeeding, and postnatal care will also be addressed to ensure a healthy and smooth transition into motherhood." 
58 | -------------------------------------------------------------------------------- /Chapter9_notebooks/requirements__Ch9_Retrieve_Content_from_a_YouTube_Video_and_Summarize.txt: -------------------------------------------------------------------------------- 1 | accelerate==1.6.0 2 | aiohappyeyeballs==2.6.1 3 | aiohttp==3.11.16 4 | aiosignal==1.3.1 5 | alembic==1.13.2 6 | annotated-types==0.7.0 7 | anyio==4.4.0 8 | argon2-cffi==23.1.0 9 | argon2-cffi-bindings==21.2.0 10 | arrow==1.3.0 11 | asgiref==3.8.1 12 | asttokens==2.4.1 13 | async-lru==2.0.5 14 | async-timeout==4.0.2 15 | asyncer==0.0.8 16 | attrs==23.1.0 17 | autocorrect==2.6.1 18 | babel==2.17.0 19 | backoff==2.2.1 20 | bcrypt==4.2.0 21 | beautifulsoup4==4.12.3 22 | bleach==6.2.0 23 | blis==1.2.0 24 | build==1.2.2 25 | cachetools==5.5.0 26 | catalogue==2.0.10 27 | certifi==2023.5.7 28 | cffi==1.17.1 29 | charset-normalizer==3.1.0 30 | chroma-hnswlib==0.7.3 31 | chromadb==0.4.24 32 | click==8.1.7 33 | cloudpathlib==0.21.0 34 | colorama==0.4.6 35 | coloredlogs==15.0.1 36 | comm==0.2.2 37 | confection==0.1.5 38 | contourpy==1.3.1 39 | cycler==0.12.1 40 | cymem==2.0.11 41 | dataclasses-json==0.6.7 42 | datasets==3.5.0 43 | debugpy==1.8.5 44 | decorator==5.1.1 45 | defusedxml==0.7.1 46 | Deprecated==1.2.14 47 | dill==0.3.8 48 | dirtyjson==1.0.8 49 | diskcache==5.6.3 50 | distro==1.9.0 51 | docker==7.1.0 52 | docopt==0.6.2 53 | docstring_parser==0.16 54 | embedchain==0.1.100 55 | en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 56 | evaluate==0.4.3 57 | executing==2.1.0 58 | faiss-cpu==1.10.0 59 | fastapi==0.114.0 60 | fastjsonschema==2.21.1 61 | filelock==3.16.0 62 | flatbuffers==24.3.25 63 | fonttools==4.56.0 64 | fqdn==1.5.1 65 | frozenlist==1.3.3 66 | fsspec==2024.9.0 67 | gensim==4.3.3 68 | google-api-core==2.19.2 69 | google-auth==2.34.0 70 | 
google-cloud-aiplatform==1.65.0 71 | google-cloud-bigquery==3.25.0 72 | google-cloud-core==2.4.1 73 | google-cloud-resource-manager==1.12.5 74 | google-cloud-storage==2.18.2 75 | google-crc32c==1.6.0 76 | google-resumable-media==2.7.2 77 | googleapis-common-protos==1.65.0 78 | gpt4all==1.0.12 79 | gptcache==0.1.44 80 | greenlet==3.0.3 81 | grpc-google-iam-v1==0.13.1 82 | grpcio==1.66.1 83 | grpcio-status==1.62.3 84 | h11==0.14.0 85 | httpcore==1.0.5 86 | httptools==0.6.1 87 | httpx==0.28.1 88 | httpx-sse==0.4.0 89 | huggingface-hub==0.30.1 90 | humanfriendly==10.0 91 | idna==3.4 92 | imageio==2.37.0 93 | importlib_metadata==8.4.0 94 | importlib_resources==6.4.4 95 | ipykernel==6.29.5 96 | ipython==8.27.0 97 | ipywidgets==8.1.5 98 | isoduration==20.11.0 99 | jedi==0.19.1 100 | Jinja2==3.1.6 101 | jiter==0.5.0 102 | joblib==1.4.2 103 | json5==0.10.0 104 | jsonpatch==1.33 105 | jsonpointer==3.0.0 106 | jsonschema==4.23.0 107 | jsonschema-specifications==2024.10.1 108 | jupyter==1.1.1 109 | jupyter-console==6.6.3 110 | jupyter-events==0.12.0 111 | jupyter-lsp==2.2.5 112 | jupyter_client==8.6.2 113 | jupyter_core==5.7.2 114 | jupyter_server==2.15.0 115 | jupyter_server_terminals==0.5.3 116 | jupyterlab==4.3.6 117 | jupyterlab_pygments==0.3.0 118 | jupyterlab_server==2.27.3 119 | jupyterlab_widgets==3.0.13 120 | kiwisolver==1.4.8 121 | kubernetes==30.1.0 122 | langchain==0.1.20 123 | langchain-community==0.0.38 124 | langchain-core==0.1.53 125 | langchain-openai==0.0.5 126 | langchain-text-splitters==0.0.2 127 | langcodes==3.5.0 128 | langsmith==0.1.147 129 | language_data==1.3.0 130 | lazy_loader==0.4 131 | llama-index==0.10.7 132 | llama-index-agent-openai==0.1.7 133 | llama-index-core==0.10.68.post1 134 | llama-index-embeddings-openai==0.1.11 135 | llama-index-legacy==0.9.48.post4 136 | llama-index-llms-openai==0.1.31 137 | llama-index-multi-modal-llms-openai==0.1.9 138 | llama-index-postprocessor-longllmlingua==0.1.2 139 | llama-index-program-openai==0.1.7 140 | 
llama-index-question-gen-openai==0.1.3 141 | llama-index-readers-file==0.1.33 142 | llmlingua==0.2.2 143 | Mako==1.3.5 144 | marisa-trie==1.2.1 145 | Markdown==3.7 146 | markdown-it-py==3.0.0 147 | MarkupSafe==2.1.5 148 | marshmallow==3.22.0 149 | matplotlib==3.10.1 150 | matplotlib-inline==0.1.7 151 | mdurl==0.1.2 152 | mistune==3.1.3 153 | mmh3==4.1.0 154 | monotonic==1.6 155 | mpmath==1.3.0 156 | multidict==6.0.4 157 | multiprocess==0.70.16 158 | murmurhash==1.0.12 159 | mypy-extensions==1.0.0 160 | narwhals==1.31.0 161 | nbclient==0.10.2 162 | nbconvert==7.16.6 163 | nbformat==5.10.4 164 | nest-asyncio==1.6.0 165 | networkx==3.4.2 166 | nltk==3.9.1 167 | notebook==7.3.3 168 | notebook_shim==0.2.4 169 | num2words==0.5.14 170 | numpy==1.26.4 171 | oauthlib==3.2.2 172 | onnxruntime==1.19.2 173 | openai==1.75.0 174 | opentelemetry-api==1.27.0 175 | opentelemetry-exporter-otlp-proto-common==1.27.0 176 | opentelemetry-exporter-otlp-proto-grpc==1.27.0 177 | opentelemetry-instrumentation==0.48b0 178 | opentelemetry-instrumentation-asgi==0.48b0 179 | opentelemetry-instrumentation-fastapi==0.48b0 180 | opentelemetry-proto==1.27.0 181 | opentelemetry-sdk==1.27.0 182 | opentelemetry-semantic-conventions==0.48b0 183 | opentelemetry-util-http==0.48b0 184 | orjson==3.10.7 185 | overrides==7.7.0 186 | packaging==23.2 187 | pandas==2.2.2 188 | pandocfilters==1.5.1 189 | parso==0.8.4 190 | pillow==11.1.0 191 | platformdirs==4.2.2 192 | plotly==6.0.1 193 | posthog==3.6.3 194 | preshed==3.0.9 195 | prometheus_client==0.21.1 196 | prompt_toolkit==3.0.47 197 | propcache==0.3.1 198 | proto-plus==1.24.0 199 | protobuf==4.25.4 200 | psutil==6.0.0 201 | pulsar-client==3.5.0 202 | pure_eval==0.2.3 203 | pyarrow==19.0.1 204 | pyasn1==0.6.0 205 | pyasn1_modules==0.4.0 206 | pyautogen==0.8.7 207 | pycparser==2.22 208 | pydantic==2.9.0 209 | pydantic-settings==2.8.1 210 | pydantic_core==2.23.2 211 | Pygments==2.18.0 212 | pyparsing==3.2.3 213 | pypdf==6.0.0 214 | PyPika==0.48.9 215 | 
pyproject_hooks==1.1.0 216 | pyreadline3==3.4.1 217 | pysbd==0.3.4 218 | python-dateutil==2.9.0.post0 219 | python-dotenv==1.0.1 220 | python-json-logger==3.3.0 221 | pytube==15.0.0 222 | pytz==2024.1 223 | pywin32==306 224 | pywinpty==2.0.15 225 | PyYAML==6.0.2 226 | pyzmq==26.2.0 227 | referencing==0.36.2 228 | regex==2024.7.24 229 | requests==2.32.3 230 | requests-oauthlib==2.0.0 231 | requests-toolbelt==1.0.0 232 | rfc3339-validator==0.1.4 233 | rfc3986-validator==0.1.1 234 | rich==13.8.0 235 | rpds-py==0.24.0 236 | rsa==4.9 237 | safetensors==0.5.3 238 | schema==0.7.7 239 | scikit-image==0.25.2 240 | scikit-learn==1.6.1 241 | scipy==1.13.1 242 | Send2Trash==1.8.3 243 | sentence-transformers==4.0.2 244 | shapely==2.0.6 245 | shellingham==1.5.4 246 | six==1.16.0 247 | smart-open==7.1.0 248 | sniffio==1.3.1 249 | soupsieve==2.6 250 | spacy==3.8.4 251 | spacy-legacy==3.0.12 252 | spacy-loggers==1.0.5 253 | SQLAlchemy==2.0.34 254 | srsly==2.5.1 255 | stack-data==0.6.3 256 | starlette==0.38.4 257 | striprtf==0.0.26 258 | sympy==1.13.1 259 | tenacity==8.5.0 260 | termcolor==3.0.1 261 | terminado==0.18.1 262 | thinc==8.3.4 263 | threadpoolctl==3.6.0 264 | tifffile==2025.3.30 265 | tiktoken==0.5.2 266 | tinycss2==1.4.0 267 | tokenizers==0.21.1 268 | torch==2.6.0 269 | tornado==6.4.1 270 | tqdm==4.67.1 271 | traitlets==5.14.3 272 | transformers==4.51.2 273 | typer==0.12.5 274 | types-python-dateutil==2.9.0.20241206 275 | typing-inspect==0.9.0 276 | typing_extensions==4.12.2 277 | tzdata==2024.1 278 | uri-template==1.3.0 279 | urllib3==2.0.3 280 | uvicorn==0.30.6 281 | wasabi==1.1.3 282 | watchfiles==0.24.0 283 | wcwidth==0.2.13 284 | weasel==0.4.1 285 | webcolors==24.11.1 286 | webencodings==0.5.1 287 | websocket-client==1.8.0 288 | websockets==13.0.1 289 | widgetsnbextension==4.0.13 290 | wordcloud==1.9.4 291 | wrapt==1.16.0 292 | xxhash==3.5.0 293 | yarl==1.20.0 294 | youtube-transcript-api==1.2.2 295 | zipp==3.20.1 296 | zstandard==0.23.0 297 | 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Mastering NLP from Foundations to LLMs 2 | ``` 3 | All code was refactored and updated as of April 2025. 4 | ``` 5 | 6 | Mastering NLP from Foundations to LLMs 7 | This is the code repository for [Mastering NLP from Foundations to LLMs](https://www.packtpub.com/product/mastering-nlp-from-foundations-to-llms/9781804619186), published by Packt. 8 | 9 | **Apply advanced rule-based techniques to LLMs and solve real-world business problems using Python** 10 | 11 | ## About the Authors 12 | - [Lior Gazit](https://www.linkedin.com/in/liorgazit) is a highly skilled ML professional with a proven track record of success in building and leading teams that use ML to drive business growth. He is an expert in NLP and has successfully developed innovative ML pipelines and products. He holds a master’s degree and has published in peer-reviewed journals and conferences. As a senior director of an ML group in the financial sector and a principal ML advisor at an emerging start-up, Lior is a respected leader in the industry, with a wealth of knowledge and experience to share. With much passion and inspiration, Lior is dedicated to using ML to drive positive change and growth in his organizations. 13 | 14 | - [Meysam Ghaffari](https://www.linkedin.com/in/meysam-ghaffari-ph-d-a2553088/) is a senior data scientist with a strong background in NLP and deep learning. He currently works at MSKCC, where he specializes in developing and improving ML and NLP models for healthcare problems. He has over nine years of experience in ML and over four years of experience in NLP and deep learning. He received his Ph.D. 
in computer science from Florida State University, his MS in computer science – artificial intelligence from the Isfahan University of Technology, and his BS in computer science from Iran University of Science and Technology. He also worked as a post-doctoral research associate at the University of Wisconsin-Madison before joining MSKCC. 15 | 16 | 17 | Enhance your NLP proficiency with modern frameworks like LangChain, explore mathematical foundations and code samples, and gain expert insights into current and future trends. 18 | 19 | ### Key Features 20 | * Learn how to build Python-driven solutions with a focus on NLP, LLMs, RAGs, and GPT 21 | * Master embedding techniques and machine learning principles for real-world applications 22 | * Understand the mathematical foundations of NLP and deep learning designs 23 | * Purchase of the print or Kindle book includes a free PDF eBook 24 | 25 | If you feel this book is for you, get your [copy](https://www.amazon.com/Mastering-NLP-Foundations-LLMs-Techniques/dp/1804619183/ref=sr_1_1?sr=8-1) today! 26 | 27 | ### Book Description 28 | Do you want to master Natural Language Processing (NLP) but don’t know where to begin? This book will give you the right head start. Written by leaders in machine learning and NLP, Mastering NLP from Foundations to LLMs provides an in-depth introduction to techniques. Starting with the mathematical foundations of machine learning (ML), you’ll gradually progress to advanced NLP applications such as large language models (LLMs) and AI applications. You’ll get to grips with linear algebra, optimization, probability, and statistics, which are essential for understanding and implementing machine learning and NLP algorithms. You’ll also explore general machine learning techniques and find out how they relate to NLP. Next, you’ll learn how to preprocess text data, explore methods for cleaning and preparing text for analysis, and understand how to do text classification. 
You’ll get all of this and more along with complete Python code samples. 29 | 30 | By the end of the book, the advanced topics of LLMs’ theory, design, and applications will be discussed along with the future trends in NLP, which will feature expert opinions. You’ll also get to strengthen your practical skills by working on sample real-world NLP business problems and solutions. 31 | 32 | ### What you will learn 33 | * Master the mathematical foundations of machine learning and NLP; implement advanced techniques for preprocessing text data and analysis; design ML-NLP systems in Python 34 | * Model and classify text using traditional machine learning and deep learning methods 35 | * Understand the theory and design of LLMs and their implementation for various applications in AI 36 | * Explore NLP insights, trends, and expert opinions on its future direction and potential 37 | 38 | ## Instructions and Navigation 39 | All of the code is organized into folders. 40 | 41 | The code will look like the following: 42 | ``` 43 | import pandas as pd 44 | import matplotlib.pyplot as plt 45 | # Load the record dict from URL 46 | import requests 47 | import pickle 48 | ``` 49 | 50 | ### Who this book is for 51 | This book is for deep learning and machine learning researchers, NLP practitioners, ML/NLP educators, and STEM students. Professionals working with text data as part of their projects will also find plenty of useful information in this book. Beginner-level familiarity with machine learning and a basic working knowledge of Python will help you get the best out of this book. 52 | 53 | With the following software and hardware list you can run all code files present in the book (Chapters 1-11). 
54 | 55 | ### Software and Hardware List 56 | 57 | | Chapter | Software required | OS required | 58 | | -------- | -------------------------------------------------------------------------------------| -----------------------------------| 59 | | 1-11 | Access to a Python environment via one of the following: Accessing Google Colab, which is free and easy from any browser on any device (recommended). A local/cloud development environment of Python with the ability to install public packages and access OpenAI’s API | Windows, macOS or Linux | 60 | | 1-11 | Sufficient computation resources, as follows: The previously recommended free access to Google Colab includes a free GPU instance. If opting to avoid Google Colab, the local/cloud environment should have a GPU for several code examples | | 61 | 62 | 63 | ### Table of Contents 64 | 1. Navigating the NLP Landscape: A comprehensive introduction 65 | 1. Mastering Linear Algebra, Probability, and Statistics for Machine Learning and NLP 66 | 1. Unleashing Machine Learning Potentials in NLP 67 | 1. Streamlining Text Preprocessing Techniques for Optimal NLP Performance ([Notebooks for chapter 4](https://github.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/tree/main/Chapter4_notebooks)) 68 | 1. Empowering Text Classification: Leveraging Traditional Machine Learning Techniques ([Notebooks for chapter 5](https://github.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/tree/main/Chapter5_notebooks)) 69 | 1. Text Classification Reimagined: Delving Deep into Deep Learning Language Models ([Notebooks for chapter 6](https://github.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/tree/main/Chapter6_notebooks)) 70 | 1. Demystifying Large Language Models: Theory, Design, and Langchain Implementation 71 | 1. 
Accessing the Power of Large Language Models: Advanced Setup and Integration with RAG ([Notebooks for chapter 8](https://github.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/tree/main/Chapter8_notebooks)) 72 | 1. Exploring the Frontiers: Advanced Applications and Innovations Driven by LLMs ([Notebooks for chapter 9](https://github.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/tree/main/Chapter9_notebooks)) 73 | 1. Riding the Wave: Analyzing Past, Present, and Future Trends Shaped by LLMs and AI 74 | 1. Exclusive Industry Insights: Perspectives and Predictions from World Class Experts 75 | -------------------------------------------------------------------------------- /Chapter9_notebooks/requirements__Ch9_RAGLlamaIndex_Prompt_Compression.txt: -------------------------------------------------------------------------------- 1 | absl-py==1.4.0 2 | accelerate==1.5.2 3 | aiohappyeyeballs==2.6.1 4 | aiohttp==3.11.15 5 | aiosignal==1.3.2 6 | alabaster==1.0.0 7 | albucore==0.0.23 8 | albumentations==2.0.5 9 | ale-py==0.10.2 10 | altair==5.5.0 11 | annotated-types==0.7.0 12 | anyio==4.9.0 13 | argon2-cffi==23.1.0 14 | argon2-cffi-bindings==21.2.0 15 | array_record==0.7.1 16 | arviz==0.21.0 17 | astropy==7.0.1 18 | astropy-iers-data==0.2025.4.14.0.37.22 19 | astunparse==1.6.3 20 | atpublic==5.1 21 | attrs==25.3.0 22 | audioread==3.0.1 23 | autograd==1.7.0 24 | babel==2.17.0 25 | backcall==0.2.0 26 | backports.tarfile==1.2.0 27 | banks==2.1.1 28 | beautifulsoup4==4.13.4 29 | betterproto==2.0.0b6 30 | bigframes==1.42.0 31 | bigquery-magics==0.9.0 32 | bitsandbytes==0.45.5 33 | bleach==6.2.0 34 | blinker==1.9.0 35 | blis==1.3.0 36 | blosc2==3.3.0 37 | bokeh==3.6.3 38 | Bottleneck==1.4.2 39 | bqplot==0.12.44 40 | branca==0.8.1 41 | CacheControl==0.14.2 42 | cachetools==5.5.2 43 | catalogue==2.0.10 44 | certifi==2025.1.31 45 | cffi==1.17.1 46 | chardet==5.2.0 47 | charset-normalizer==3.4.1 48 | chex==0.1.89 49 | clarabel==0.10.0 50 | click==8.1.8 51 | 
cloudpathlib==0.21.0 52 | cloudpickle==3.1.1 53 | cmake==3.31.6 54 | cmdstanpy==1.2.5 55 | colorama==0.4.6 56 | colorcet==3.1.0 57 | colorlover==0.3.0 58 | colour==0.1.5 59 | community==1.0.0b1 60 | confection==0.1.5 61 | cons==0.4.6 62 | contourpy==1.3.2 63 | cramjam==2.10.0 64 | cryptography==43.0.3 65 | cuda-python==12.6.2.post1 66 | cudf-cu12 @ https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl 67 | cudf-polars-cu12==25.2.2 68 | cufflinks==0.17.3 69 | cuml-cu12==25.2.1 70 | cupy-cuda12x==13.3.0 71 | cuvs-cu12==25.2.1 72 | cvxopt==1.3.2 73 | cvxpy==1.6.5 74 | cycler==0.12.1 75 | cyipopt==1.5.0 76 | cymem==2.0.11 77 | Cython==3.0.12 78 | dask==2024.12.1 79 | dask-cuda==25.2.0 80 | dask-cudf-cu12==25.2.2 81 | dask-expr==1.1.21 82 | dataclasses-json==0.6.7 83 | datascience==0.17.6 84 | datasets==3.5.0 85 | db-dtypes==1.4.2 86 | dbus-python==1.2.18 87 | debugpy==1.8.0 88 | decorator==4.4.2 89 | defusedxml==0.7.1 90 | Deprecated==1.2.18 91 | diffusers==0.32.2 92 | dill==0.3.8 93 | dirtyjson==1.0.8 94 | distributed==2024.12.1 95 | distributed-ucxx-cu12==0.42.0 96 | distro==1.9.0 97 | dlib==19.24.6 98 | dm-tree==0.1.9 99 | docker-pycreds==0.4.0 100 | docstring_parser==0.16 101 | docutils==0.21.2 102 | dopamine_rl==4.1.2 103 | duckdb==1.2.2 104 | earthengine-api==1.5.11 105 | easydict==1.13 106 | editdistance==0.8.1 107 | eerepr==0.1.1 108 | einops==0.8.1 109 | en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85 110 | entrypoints==0.4 111 | et_xmlfile==2.0.0 112 | etils==1.12.2 113 | etuples==0.3.9 114 | Farama-Notifications==0.0.4 115 | fastai==2.7.19 116 | fastcore==1.7.29 117 | fastdownload==0.0.7 118 | fastjsonschema==2.21.1 119 | fastprogress==1.0.3 120 | fastrlock==0.8.3 121 | filelock==3.18.0 122 | filetype==1.2.0 123 | firebase-admin==6.7.0 124 | 
Flask==3.1.0 125 | flatbuffers==25.2.10 126 | flax==0.10.5 127 | folium==0.19.5 128 | fonttools==4.57.0 129 | frozendict==2.4.6 130 | frozenlist==1.5.0 131 | fsspec==2024.12.0 132 | future==1.0.0 133 | gast==0.6.0 134 | gcsfs==2025.3.2 135 | GDAL==3.6.4 136 | gdown==5.2.0 137 | geemap==0.35.3 138 | geocoder==1.38.1 139 | geographiclib==2.0 140 | geopandas==1.0.1 141 | geopy==2.4.1 142 | gin-config==0.5.0 143 | gitdb==4.0.12 144 | GitPython==3.1.44 145 | glob2==0.7 146 | google==2.0.3 147 | google-ai-generativelanguage==0.6.15 148 | google-api-core==2.24.2 149 | google-api-python-client==2.164.0 150 | google-auth==2.38.0 151 | google-auth-httplib2==0.2.0 152 | google-auth-oauthlib==1.2.1 153 | google-cloud-aiplatform==1.88.0 154 | google-cloud-bigquery==3.31.0 155 | google-cloud-bigquery-connection==1.18.2 156 | google-cloud-bigquery-storage==2.30.0 157 | google-cloud-bigtable==2.30.0 158 | google-cloud-core==2.4.3 159 | google-cloud-dataproc==5.18.1 160 | google-cloud-datastore==2.21.0 161 | google-cloud-firestore==2.20.1 162 | google-cloud-functions==1.20.3 163 | google-cloud-iam==2.19.0 164 | google-cloud-language==2.17.1 165 | google-cloud-pubsub==2.29.0 166 | google-cloud-resource-manager==1.14.2 167 | google-cloud-spanner==3.53.0 168 | google-cloud-storage==2.19.0 169 | google-cloud-translate==3.20.2 170 | google-colab @ file:///colabtools/dist/google_colab-1.0.0.tar.gz 171 | google-crc32c==1.7.1 172 | google-genai==1.10.0 173 | google-generativeai==0.8.4 174 | google-pasta==0.2.0 175 | google-resumable-media==2.7.2 176 | google-spark-connect==0.5.2 177 | googleapis-common-protos==1.70.0 178 | googledrivedownloader==1.1.0 179 | graphviz==0.20.3 180 | greenlet==3.2.0 181 | griffe==1.7.2 182 | grpc-google-iam-v1==0.14.2 183 | grpc-interceptor==0.15.4 184 | grpcio==1.71.0 185 | grpcio-status==1.71.0 186 | grpclib==0.4.7 187 | gspread==6.2.0 188 | gspread-dataframe==4.0.0 189 | gym==0.25.2 190 | gym-notices==0.0.8 191 | gymnasium==1.1.1 192 | h11==0.14.0 193 | 
h2==4.2.0 194 | h5netcdf==1.6.1 195 | h5py==3.13.0 196 | hdbscan==0.8.40 197 | highspy==1.9.0 198 | holidays==0.70 199 | holoviews==1.20.2 200 | hpack==4.1.0 201 | html5lib==1.1 202 | httpcore==1.0.8 203 | httpimport==1.4.1 204 | httplib2==0.22.0 205 | httpx==0.28.1 206 | httpx-sse==0.4.0 207 | huggingface-hub==0.30.2 208 | humanize==4.12.2 209 | hyperframe==6.1.0 210 | hyperopt==0.2.7 211 | ibis-framework==9.5.0 212 | idna==3.10 213 | imageio==2.37.0 214 | imageio-ffmpeg==0.6.0 215 | imagesize==1.4.1 216 | imbalanced-learn==0.13.0 217 | immutabledict==4.2.1 218 | importlib_metadata==8.6.1 219 | importlib_resources==6.5.2 220 | imutils==0.5.4 221 | inflect==7.5.0 222 | iniconfig==2.1.0 223 | intel-cmplr-lib-ur==2025.1.0 224 | intel-openmp==2025.1.0 225 | ipyevents==2.0.2 226 | ipyfilechooser==0.6.0 227 | ipykernel==6.17.1 228 | ipyleaflet==0.19.2 229 | ipyparallel==8.8.0 230 | ipython==7.34.0 231 | ipython-genutils==0.2.0 232 | ipython-sql==0.5.0 233 | ipytree==0.2.2 234 | ipywidgets==7.7.1 235 | itsdangerous==2.2.0 236 | jaraco.classes==3.4.0 237 | jaraco.context==6.0.1 238 | jaraco.functools==4.1.0 239 | jax==0.5.2 240 | jax-cuda12-pjrt==0.5.1 241 | jax-cuda12-plugin==0.5.1 242 | jaxlib==0.5.1 243 | jeepney==0.9.0 244 | jellyfish==1.1.0 245 | jieba==0.42.1 246 | Jinja2==3.1.6 247 | jiter==0.9.0 248 | joblib==1.4.2 249 | jsonpatch==1.33 250 | jsonpickle==4.0.5 251 | jsonpointer==3.0.0 252 | jsonschema==4.23.0 253 | jsonschema-specifications==2024.10.1 254 | jupyter-client==6.1.12 255 | jupyter-console==6.1.0 256 | jupyter-leaflet==0.19.2 257 | jupyter-server==1.16.0 258 | jupyter_core==5.7.2 259 | jupyterlab_pygments==0.3.0 260 | jupyterlab_widgets==3.0.14 261 | kaggle==1.7.4.2 262 | kagglehub==0.3.11 263 | keras==3.10.0 264 | keras-hub==0.18.1 265 | keras-nlp==0.18.1 266 | keyring==25.6.0 267 | keyrings.google-artifactregistry-auth==1.1.2 268 | kiwisolver==1.4.8 269 | langchain==0.3.23 270 | langchain-community==0.3.21 271 | langchain-core==0.3.74 272 | 
langchain-text-splitters==0.3.8 273 | langcodes==3.5.0 274 | langsmith==0.3.31 275 | language_data==1.3.0 276 | launchpadlib==1.10.16 277 | lazr.restfulclient==0.14.4 278 | lazr.uri==1.0.6 279 | lazy_loader==0.4 280 | libclang==18.1.1 281 | libcudf-cu12 @ https://pypi.nvidia.com/libcudf-cu12/libcudf_cu12-25.2.1-py3-none-manylinux_2_28_x86_64.whl 282 | libcugraph-cu12==25.2.0 283 | libcuml-cu12==25.2.1 284 | libcuvs-cu12==25.2.1 285 | libkvikio-cu12==25.2.1 286 | libraft-cu12==25.2.0 287 | librosa==0.11.0 288 | libucx-cu12==1.18.0 289 | libucxx-cu12==0.42.0 290 | lightgbm==4.5.0 291 | linkify-it-py==2.0.3 292 | llama-cloud==0.1.18 293 | llama-cloud-services==0.6.12 294 | llama-index==0.12.31 295 | llama-index-agent-openai==0.4.6 296 | llama-index-cli==0.4.1 297 | llama-index-core==0.12.31 298 | llama-index-embeddings-openai==0.3.1 299 | llama-index-indices-managed-llama-cloud==0.6.11 300 | llama-index-llms-openai==0.3.37 301 | llama-index-multi-modal-llms-openai==0.4.3 302 | llama-index-postprocessor-longllmlingua==0.4.0 303 | llama-index-program-openai==0.3.1 304 | llama-index-question-gen-openai==0.3.0 305 | llama-index-readers-file==0.4.7 306 | llama-index-readers-llama-parse==0.4.0 307 | llama-parse==0.6.12 308 | llmlingua==0.2.2 309 | llvmlite==0.43.0 310 | locket==1.0.0 311 | logical-unification==0.4.6 312 | lxml==5.3.2 313 | Mako==1.1.3 314 | marisa-trie==1.2.1 315 | Markdown==3.8 316 | markdown-it-py==3.0.0 317 | MarkupSafe==3.0.2 318 | marshmallow==3.26.1 319 | matplotlib==3.10.0 320 | matplotlib-inline==0.1.7 321 | matplotlib-venn==1.1.2 322 | mdit-py-plugins==0.4.2 323 | mdurl==0.1.2 324 | miniKanren==1.0.3 325 | missingno==0.5.2 326 | mistune==3.1.3 327 | mizani==0.13.3 328 | mkl==2025.0.1 329 | ml-dtypes==0.4.1 330 | mlxtend==0.23.4 331 | more-itertools==10.6.0 332 | moviepy==1.0.3 333 | mpmath==1.3.0 334 | msgpack==1.1.0 335 | multidict==6.4.3 336 | multipledispatch==1.0.0 337 | multiprocess==0.70.16 338 | multitasking==0.0.11 339 | murmurhash==1.0.12 
340 | music21==9.3.0 341 | mypy-extensions==1.0.0 342 | namex==0.0.8 343 | narwhals==1.35.0 344 | natsort==8.4.0 345 | nbclassic==1.2.0 346 | nbclient==0.10.2 347 | nbconvert==7.16.6 348 | nbformat==5.10.4 349 | ndindex==1.9.2 350 | nest-asyncio==1.6.0 351 | networkx==3.4.2 352 | nibabel==5.3.2 353 | nltk==3.9.1 354 | notebook==6.5.7 355 | notebook_shim==0.2.4 356 | numba==0.60.0 357 | numba-cuda==0.2.0 358 | numexpr==2.10.2 359 | numpy==2.0.2 360 | nvidia-cublas-cu12==12.4.5.8 361 | nvidia-cuda-cupti-cu12==12.4.127 362 | nvidia-cuda-nvcc-cu12==12.5.82 363 | nvidia-cuda-nvrtc-cu12==12.4.127 364 | nvidia-cuda-runtime-cu12==12.4.127 365 | nvidia-cudnn-cu12==9.1.0.70 366 | nvidia-cufft-cu12==11.2.1.3 367 | nvidia-curand-cu12==10.3.5.147 368 | nvidia-cusolver-cu12==11.6.1.9 369 | nvidia-cusparse-cu12==12.3.1.170 370 | nvidia-cusparselt-cu12==0.6.2 371 | nvidia-ml-py==12.570.86 372 | nvidia-nccl-cu12==2.21.5 373 | nvidia-nvcomp-cu12==4.2.0.11 374 | nvidia-nvjitlink-cu12==12.4.127 375 | nvidia-nvtx-cu12==12.4.127 376 | nvtx==0.2.11 377 | nx-cugraph-cu12 @ https://pypi.nvidia.com/nx-cugraph-cu12/nx_cugraph_cu12-25.2.0-py3-none-any.whl 378 | oauth2client==4.1.3 379 | oauthlib==3.2.2 380 | openai==1.75.0 381 | opencv-contrib-python==4.11.0.86 382 | opencv-python==4.11.0.86 383 | opencv-python-headless==4.11.0.86 384 | openpyxl==3.1.5 385 | opentelemetry-api==1.32.1 386 | opentelemetry-sdk==1.32.1 387 | opentelemetry-semantic-conventions==0.53b1 388 | opt_einsum==3.4.0 389 | optax==0.2.4 390 | optree==0.15.0 391 | orbax-checkpoint==0.11.12 392 | orjson==3.10.16 393 | osqp==1.0.3 394 | packaging==24.2 395 | pandas==2.2.2 396 | pandas-datareader==0.10.0 397 | pandas-gbq==0.28.0 398 | pandas-stubs==2.2.2.240909 399 | pandocfilters==1.5.1 400 | panel==1.6.2 401 | param==2.2.0 402 | parso==0.8.4 403 | parsy==2.1 404 | partd==1.4.2 405 | pathlib==1.0.1 406 | patsy==1.0.1 407 | peewee==3.17.9 408 | peft==0.14.0 409 | pexpect==4.9.0 410 | pickleshare==0.7.5 411 | pillow==11.1.0 412 
| platformdirs==4.3.7 413 | plotly==5.24.1 414 | plotnine==0.14.5 415 | pluggy==1.5.0 416 | ply==3.11 417 | polars==1.21.0 418 | pooch==1.8.2 419 | portpicker==1.5.2 420 | preshed==3.0.9 421 | prettytable==3.16.0 422 | proglog==0.1.11 423 | progressbar2==4.5.0 424 | prometheus_client==0.21.1 425 | promise==2.3 426 | prompt_toolkit==3.0.51 427 | propcache==0.3.1 428 | prophet==1.1.6 429 | proto-plus==1.26.1 430 | protobuf==5.29.4 431 | psutil==5.9.5 432 | psycopg2==2.9.10 433 | ptyprocess==0.7.0 434 | py-cpuinfo==9.0.0 435 | py4j==0.10.9.7 436 | pyarrow==18.1.0 437 | pyasn1==0.6.1 438 | pyasn1_modules==0.4.2 439 | pycairo==1.28.0 440 | pycocotools==2.0.8 441 | pycparser==2.22 442 | pydantic==2.11.3 443 | pydantic-settings==2.9.1 444 | pydantic_core==2.33.1 445 | pydata-google-auth==1.9.1 446 | pydot==3.0.4 447 | pydotplus==2.0.2 448 | PyDrive==1.3.1 449 | PyDrive2==1.21.3 450 | pyerfa==2.0.1.5 451 | pygame==2.6.1 452 | pygit2==1.17.0 453 | Pygments==2.18.0 454 | PyGObject==3.42.0 455 | PyJWT==2.10.1 456 | pylibcudf-cu12 @ https://pypi.nvidia.com/pylibcudf-cu12/pylibcudf_cu12-25.2.1-cp311-cp311-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl 457 | pylibcugraph-cu12==25.2.0 458 | pylibraft-cu12==25.2.0 459 | pymc==5.21.2 460 | pymystem3==0.2.0 461 | pynndescent==0.5.13 462 | pynvjitlink-cu12==0.5.2 463 | pynvml==12.0.0 464 | pyogrio==0.10.0 465 | Pyomo==6.8.2 466 | PyOpenGL==3.1.9 467 | pyOpenSSL==24.2.1 468 | pyparsing==3.2.3 469 | pypdf==6.0.0 470 | pyperclip==1.9.0 471 | pyproj==3.7.1 472 | pyshp==2.3.1 473 | PySocks==1.7.1 474 | pyspark==3.5.5 475 | pytensor==2.30.3 476 | pytest==8.3.5 477 | python-apt==0.0.0 478 | python-box==7.3.2 479 | python-dateutil==2.8.2 480 | python-dotenv==1.1.0 481 | python-louvain==0.16 482 | python-slugify==8.0.4 483 | python-snappy==0.7.3 484 | python-utils==3.9.1 485 | pytz==2025.2 486 | pyviz_comms==3.0.4 487 | PyYAML==6.0.2 488 | pyzmq==24.0.1 489 | raft-dask-cu12==25.2.0 490 | rapids-dask-dependency==25.2.0 491 | ratelim==0.1.6 
492 | referencing==0.36.2 493 | regex==2024.11.6 494 | requests==2.32.3 495 | requests-oauthlib==2.0.0 496 | requests-toolbelt==1.0.0 497 | requirements-parser==0.9.0 498 | rich==13.9.4 499 | rmm-cu12==25.2.0 500 | roman-numerals-py==3.1.0 501 | rpds-py==0.24.0 502 | rpy2==3.5.17 503 | rsa==4.9.1 504 | safetensors==0.5.3 505 | scikit-image==0.25.2 506 | scikit-learn==1.6.1 507 | scipy==1.14.1 508 | scooby==0.10.0 509 | scs==3.2.7.post2 510 | seaborn==0.13.2 511 | SecretStorage==3.3.3 512 | Send2Trash==1.8.3 513 | sentence-transformers==3.4.1 514 | sentencepiece==0.2.0 515 | sentry-sdk==2.26.1 516 | setproctitle==1.3.5 517 | shap==0.47.1 518 | shapely==2.1.0 519 | shellingham==1.5.4 520 | simple-parsing==0.1.7 521 | simplejson==3.20.1 522 | simsimd==6.2.1 523 | six==1.17.0 524 | sklearn-compat==0.1.3 525 | sklearn-pandas==2.2.0 526 | slicer==0.0.8 527 | smart-open==7.1.0 528 | smmap==5.0.2 529 | sniffio==1.3.1 530 | snowballstemmer==2.2.0 531 | sortedcontainers==2.4.0 532 | soundfile==0.13.1 533 | soupsieve==2.6 534 | soxr==0.5.0.post1 535 | spacy==3.8.5 536 | spacy-legacy==3.0.12 537 | spacy-loggers==1.0.5 538 | spanner-graph-notebook==1.1.6 539 | Sphinx==8.2.3 540 | sphinxcontrib-applehelp==2.0.0 541 | sphinxcontrib-devhelp==2.0.0 542 | sphinxcontrib-htmlhelp==2.1.0 543 | sphinxcontrib-jsmath==1.0.1 544 | sphinxcontrib-qthelp==2.0.0 545 | sphinxcontrib-serializinghtml==2.0.0 546 | SQLAlchemy==2.0.40 547 | sqlglot==25.20.2 548 | sqlparse==0.5.3 549 | srsly==2.5.1 550 | stanio==0.5.1 551 | statsmodels==0.14.4 552 | stringzilla==3.12.4 553 | striprtf==0.0.26 554 | sympy==1.13.1 555 | tables==3.10.2 556 | tabulate==0.9.0 557 | tbb==2022.1.0 558 | tblib==3.1.0 559 | tcmlib==1.3.0 560 | tenacity==9.1.2 561 | tensorboard==2.18.0 562 | tensorboard-data-server==0.7.2 563 | tensorflow==2.18.0 564 | tensorflow-datasets==4.9.8 565 | tensorflow-hub==0.16.1 566 | tensorflow-io-gcs-filesystem==0.37.1 567 | tensorflow-metadata==1.17.1 568 | tensorflow-probability==0.25.0 569 | 
tensorflow-text==2.18.1 570 | tensorflow_decision_forests==1.11.0 571 | tensorstore==0.1.73 572 | termcolor==3.0.1 573 | terminado==0.18.1 574 | text-unidecode==1.3 575 | textblob==0.19.0 576 | tf-slim==1.1.0 577 | tf_keras==2.18.0 578 | thinc==8.3.6 579 | threadpoolctl==3.6.0 580 | tifffile==2025.3.30 581 | tiktoken==0.9.0 582 | timm==1.0.15 583 | tinycss2==1.4.0 584 | tokenizers==0.21.1 585 | toml==0.10.2 586 | toolz==0.12.1 587 | torch @ https://download.pytorch.org/whl/cu124/torch-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl 588 | torchaudio @ https://download.pytorch.org/whl/cu124/torchaudio-2.6.0%2Bcu124-cp311-cp311-linux_x86_64.whl 589 | torchsummary==1.5.1 590 | torchvision @ https://download.pytorch.org/whl/cu124/torchvision-0.21.0%2Bcu124-cp311-cp311-linux_x86_64.whl 591 | tornado==6.4.2 592 | tqdm==4.67.1 593 | traitlets==5.7.1 594 | traittypes==0.2.1 595 | transformers==4.51.3 596 | treelite==4.4.1 597 | treescope==0.1.9 598 | triton==3.2.0 599 | tweepy==4.15.0 600 | typeguard==4.4.2 601 | typer==0.15.2 602 | types-pytz==2025.2.0.20250326 603 | types-setuptools==78.1.0.20250329 604 | typing-inspect==0.9.0 605 | typing-inspection==0.4.0 606 | typing_extensions==4.13.2 607 | tzdata==2025.2 608 | tzlocal==5.3.1 609 | uc-micro-py==1.0.3 610 | ucx-py-cu12==0.42.0 611 | ucxx-cu12==0.42.0 612 | umap-learn==0.5.7 613 | umf==0.10.0 614 | uritemplate==4.1.1 615 | urllib3==2.3.0 616 | vega-datasets==0.9.0 617 | wadllib==1.3.6 618 | wandb==0.19.9 619 | wasabi==1.1.3 620 | wcwidth==0.2.13 621 | weasel==0.4.1 622 | webcolors==24.11.1 623 | webencodings==0.5.1 624 | websocket-client==1.8.0 625 | websockets==15.0.1 626 | Werkzeug==3.1.3 627 | widgetsnbextension==3.6.10 628 | wordcloud==1.9.4 629 | wrapt==1.17.2 630 | wurlitzer==3.1.1 631 | xarray==2025.1.2 632 | xarray-einstats==0.8.0 633 | xgboost==2.1.4 634 | xlrd==2.0.1 635 | xxhash==3.5.0 636 | xyzservices==2025.1.0 637 | yarl==1.19.0 638 | ydf==0.11.0 639 | yellowbrick==1.5 640 | yfinance==0.2.55 641 | zict==3.0.0 
642 | zipp==3.21.0 643 | zstandard==0.23.0 644 | -------------------------------------------------------------------------------- /Chapter4_notebooks/Ch4_Preprocessing_Pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "attachments": {}, 5 | "cell_type": "markdown", 6 | "metadata": { 7 | "id": "UMqBL77hMXP2" 8 | }, 9 | "source": [ 10 | "# Text preprocessing pipeline\n", 11 | "Authors: \n", 12 | " - [Lior Gazit](https://www.linkedin.com/in/liorgazit). \n", 13 | " - [Meysam Ghaffari](https://www.linkedin.com/in/meysam-ghaffari-ph-d-a2553088/). \n", 14 | "\n", 15 | "This notebook is taught and reviewed in our book: \n", 16 | "**[Mastering NLP from Foundations to LLMs](https://www.amazon.com/dp/1804619183)** \n", 17 | "![image.png]()\n", 18 | "\n", 19 | "This Colab notebook is referenced in our book's Github repo: \n", 20 | "https://github.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs \n", 21 | "\n", 22 | " \"Open\n", 23 | "" 24 | ] 25 | }, 26 | { 27 | "attachments": {}, 28 | "cell_type": "markdown", 29 | "metadata": { 30 | "id": "dO3hrUVTRoMN" 31 | }, 32 | "source": [ 33 | "\\*Note: This notebook is updated and validated for 2025 \n", 34 | "**The purpose of this notebook:** \n", 35 | "As demonstrated in Chapter 4 of the book, text preprocessing is one of the most fundamental practices of NLP. \n", 36 | "In this notebook we walk you through a variety of preprocessing functions and show how they come together to a solid pipeline. 
\n", 37 | "\n", 38 | "**Requirements:** \n", 39 | "* When running in Colab, use this runtime notebook setting: `Python 3, CPU` \n" 40 | ] 41 | }, 42 | { 43 | "attachments": {}, 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "id": "g54Uf66Vz9Fi" 47 | }, 48 | "source": [ 49 | ">*```Disclaimer: The content and ideas presented in this notebook are solely those of the authors and do not represent the views or intellectual property of the authors' employers.```*" 50 | ] 51 | }, 52 | { 53 | "attachments": {}, 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "ayuVflRZ5w9C" 57 | }, 58 | "source": [ 59 | "Install:" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "colab": { 67 | "base_uri": "https://localhost:8080/" 68 | }, 69 | "id": "DhrOK1E1lxTo", 70 | "outputId": "92ab8692-a3eb-4dfe-e82f-2b82d6bae042" 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "# REMARK:\n", 75 | "# If the code below errors out due to a Python package discrepancy, it may be because new versions are causing it.\n", 76 | "# In which case, set \"default_installations\" to False to revert to the original image:\n", 77 | "default_installations = True\n", 78 | "if default_installations:\n", 79 | " %pip -q install num2words autocorrect nltk\n", 80 | "else:\n", 81 | " import requests\n", 82 | " text_file_path = \"requirements__Ch4_Preprocessing_Pipeline.txt\"\n", 83 | " url = \"https://raw.githubusercontent.com/PacktPublishing/Mastering-NLP-from-Foundations-to-LLMs/main/Chapter4_notebooks/\" + text_file_path\n", 84 | " res = requests.get(url)\n", 85 | " with open(text_file_path, \"w\") as f:\n", 86 | " f.write(res.text)\n", 87 | "\n", 88 | " %pip install -r requirements__Ch4_Preprocessing_Pipeline.txt" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 5, 94 | "metadata": { 95 | "colab": { 96 | "base_uri": "https://localhost:8080/" 97 | }, 98 | "id": "3M3KTFuMSgCf", 99 | "outputId": 
"90ed8f76-68e3-4211-9d21-b730f1ce864f" 100 | }, 101 | "outputs": [ 102 | { 103 | "name": "stderr", 104 | "output_type": "stream", 105 | "text": [ 106 | "[nltk_data] Downloading package punkt to\n", 107 | "[nltk_data] C:\\Users\\gazit\\AppData\\Roaming\\nltk_data...\n", 108 | "[nltk_data] Unzipping tokenizers\\punkt.zip.\n", 109 | "[nltk_data] Downloading package stopwords to\n", 110 | "[nltk_data] C:\\Users\\gazit\\AppData\\Roaming\\nltk_data...\n", 111 | "[nltk_data] Unzipping corpora\\stopwords.zip.\n", 112 | "[nltk_data] Downloading package wordnet to\n", 113 | "[nltk_data] C:\\Users\\gazit\\AppData\\Roaming\\nltk_data...\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# Imports:\n", 119 | "import re\n", 120 | "from num2words import num2words\n", 121 | "import nltk; nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet')\n", 122 | "from nltk.corpus import stopwords\n", 123 | "from nltk.stem.porter import PorterStemmer\n", 124 | "from nltk.stem import WordNetLemmatizer\n", 125 | "from autocorrect import Speller\n", 126 | "\n" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 6, 132 | "metadata": { 133 | "id": "ivh-TwcqSnVd" 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "# Preprocessing functions:\n", 138 | "def decode(text):\n", 139 | " \"\"\"\n", 140 | " The function takes in a string of text as input\n", 141 | " and extracts the subject line and body text from the text\n", 142 | " using regular expressions. 
It then formats the extracted\n", 143 | " text into a single string and returns it as output.\n", 144 | "\n", 145 | " Input: str\n", 146 | " Output: str\n", 147 | " \"\"\"\n", 148 | " text = re.sub(\"\\\\n|\\\\r|\\\\t|-\", \" \", text)\n", 149 | " subject_line_search = re.search(r\"(.*?)\", text, flags=re.S)\n", 150 | " body_text_search = re.search(r\"(.*?)\", text, flags=re.S)\n", 151 | "\n", 152 | " formated_output = \"\"\n", 153 | " if subject_line_search:\n", 154 | " formated_output = formated_output + subject_line_search.groups()[0] + \". \"\n", 155 | " if body_text_search:\n", 156 | " formated_output = formated_output + body_text_search.groups()[0] + \".\"\n", 157 | " return formated_output\n", 158 | "\n", 159 | "\n", 160 | "def digits_to_words(match):\n", 161 | " \"\"\"\n", 162 | " Convert string digits to the English words. The function distinguishes between\n", 163 | " cardinal and ordinal.\n", 164 | " E.g. \"2\" becomes \"two\", while \"2nd\" becomes \"second\"\n", 165 | "\n", 166 | " Input: str\n", 167 | " Output: str\n", 168 | " \"\"\"\n", 169 | " suffixes = ['st', 'nd', 'rd', 'th']\n", 170 | " # Making sure it's lower cased so not to rely on previous possible actions:\n", 171 | " string = match[0].lower()\n", 172 | " if string[-2:] in suffixes:\n", 173 | " type='ordinal'\n", 174 | " string = string[:-2]\n", 175 | " else:\n", 176 | " type='cardinal'\n", 177 | "\n", 178 | " return num2words(string, to=type)\n", 179 | "\n", 180 | "\n", 181 | "def spelling_correction(text):\n", 182 | " \"\"\"\n", 183 | " Replace misspelled words with the correct spelling.\n", 184 | "\n", 185 | " Input: str\n", 186 | " Output: str\n", 187 | " \"\"\"\n", 188 | " corrector = Speller()\n", 189 | " spells = [corrector(word) for word in text.split()]\n", 190 | " return \" \".join(spells)\n", 191 | "\n", 192 | "\n", 193 | "def remove_stop_words(text):\n", 194 | " \"\"\"\n", 195 | " Remove stopwords.\n", 196 | "\n", 197 | " Input: str\n", 198 | " Output: str\n", 199 | " \"\"\"\n", 
200 | " stopwords_set = set(stopwords.words('english'))\n", 201 | " return \" \".join([word for word in text.split() if word not in stopwords_set])\n", 202 | "\n", 203 | "\n", 204 | "def stemming(text):\n", 205 | " \"\"\"\n", 206 | " Perform stemming of each word individually.\n", 207 | "\n", 208 | " Input: str\n", 209 | " Output: str\n", 210 | " \"\"\"\n", 211 | " stemmer = PorterStemmer()\n", 212 | " return \" \".join([stemmer.stem(word) for word in text.split()])\n", 213 | "\n", 214 | "\n", 215 | "def lemmatizing(text):\n", 216 | " \"\"\"\n", 217 | " Perform lemmatization for each word individually.\n", 218 | "\n", 219 | " Input: str\n", 220 | " Output: str\n", 221 | " \"\"\"\n", 222 | " lemmatizer = WordNetLemmatizer()\n", 223 | " return \" \".join([lemmatizer.lemmatize(word) for word in text.split()])\n", 224 | "\n", 225 | "\n" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 7, 231 | "metadata": { 232 | "id": "U7wnjrxqS0Ub" 233 | }, 234 | "outputs": [], 235 | "source": [ 236 | "# Preprocessing pipeline:\n", 237 | "def preprocessing(input_text, printing=False):\n", 238 | " \"\"\"\n", 239 | " This function represents a complete pipeline for text preprocessing.\n", 240 | "\n", 241 | " Code design note: The fact that we update variable \"output\" instead of\n", 242 | " creating new variables with new names as we go, allows us to change the\n", 243 | " order of the actions or add/remove actions easily.\n", 244 | "\n", 245 | " Input: str\n", 246 | " Output: str\n", 247 | " \"\"\"\n", 248 | " output = input_text\n", 249 | " # Decode/remove encoding:\n", 250 | " output = decode(output)\n", 251 | " print(\"\\nDecode/remove encoding:\\n \", output)\n", 252 | "\n", 253 | " # Lower casing:\n", 254 | " output = output.lower()\n", 255 | " print(\"\\nLower casing:\\n \", output)\n", 256 | "\n", 257 | " # Convert digits to words:\n", 258 | " # The following regex syntax looks for consecutive digits optionally followed by an ordinal 
suffix:\n", 259 | " output = re.sub(r'\\d+(st)?(nd)?(rd)?(th)?', digits_to_words, output, flags=re.IGNORECASE)\n", 260 | " print(\"\\nDigits to words\\n \", output)\n", 261 | "\n", 262 | " # Remove punctuations and other special characters:\n", 263 | " output = re.sub('[^ A-Za-z0-9]+', '', output)\n", 264 | " print(\"\\nRemove punctuations and other special characters\\n \", output)\n", 265 | "\n", 266 | " # Spelling corrections:\n", 267 | " output = spelling_correction(output)\n", 268 | " print(\"\\nSpelling corrections:\\n \", output)\n", 269 | "\n", 270 | "\n", 271 | " # Remove stop words:\n", 272 | " output = remove_stop_words(output)\n", 273 | " print(\"\\nRemove stop words:\\n \", output)\n", 274 | "\n", 275 | " # Stemming:\n", 276 | " output = stemming(output)\n", 277 | " print(\"\\nStemming:\\n \", output)\n", 278 | "\n", 279 | " # Lemmatizing:\n", 280 | " output = lemmatizing(output)\n", 281 | " print(\"\\nLemmatizing:\\n \", output)\n", 282 | "\n", 283 | " return output" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 8, 289 | "metadata": { 290 | "colab": { 291 | "base_uri": "https://localhost:8080/" 292 | }, 293 | "id": "2Duu0SyeS2jW", 294 | "outputId": "619a7797-6e09-4e2a-a3a3-f2d61968c906" 295 | }, 296 | "outputs": [ 297 | { 298 | "name": "stdout", 299 | "output_type": "stream", 300 | "text": [ 301 | "This is the input raw text:\n", 302 | "\n", 303 | "\" Employees detailsAttached are 2 files,\n", 304 | "1st one is pairoll, 2nd is healtcare!\"\n", 305 | "\n", 306 | "\n", 307 | "Decode/remove encoding:\n", 308 | " Employees details. Attached are 2 files, 1st one is pairoll, 2nd is healtcare!.\n", 309 | "\n", 310 | "Lower casing:\n", 311 | " employees details. attached are 2 files, 1st one is pairoll, 2nd is healtcare!.\n", 312 | "\n", 313 | "Digits to words\n", 314 | " employees details. 
attached are two files, first one is pairoll, second is healtcare!.\n", 315 | "\n", 316 | "Remove punctuations and other special characters\n", 317 | " employees details attached are two files first one is pairoll second is healtcare\n", 318 | "\n", 319 | "Spelling corrections:\n", 320 | " employees details attached are two files first one is payroll second is healthcare\n", 321 | "\n", 322 | "Remove stop words:\n", 323 | " employees details attached two files first one payroll second healthcare\n", 324 | "\n", 325 | "Stemming:\n", 326 | " employe detail attach two file first one payrol second healthcar\n", 327 | "\n", 328 | "Lemmatizing:\n", 329 | " employe detail attach two file first one payrol second healthcar\n", 330 | "\n", 331 | "----------------------------\n", 332 | "This is the preprocessed text:\n", 333 | " employe detail attach two file first one payrol second healthcar\n" 334 | ] 335 | } 336 | ], 337 | "source": [ 338 | "# Applying preprocessing:\n", 339 | "raw_text_input = \"\"\"\n", 340 | "\" Employees detailsAttached are 2 files,\\n1st one is pairoll, 2nd is healtcare!\"\n", 341 | "\"\"\"\n", 342 | "print(f\"This is the input raw text:\\n{raw_text_input}\")\n", 343 | "\n", 344 | "print(f\"\\n----------------------------\\nThis is the preprocessed text:\\n {preprocessing(raw_text_input, printing=True)}\")" 345 | ] 346 | } 347 | ], 348 | "metadata": { 349 | "colab": { 350 | "provenance": [] 351 | }, 352 | "kernelspec": { 353 | "display_name": "Python 3", 354 | "name": "python3" 355 | }, 356 | "language_info": { 357 | "codemirror_mode": { 358 | "name": "ipython", 359 | "version": 3 360 | }, 361 | "file_extension": ".py", 362 | "mimetype": "text/x-python", 363 | "name": "python", 364 | "nbconvert_exporter": "python", 365 | "pygments_lexer": "ipython3", 366 | "version": "3.11.4" 367 | } 368 | }, 369 | "nbformat": 4, 370 | "nbformat_minor": 0 371 | } 372 | --------------------------------------------------------------------------------