├── global_requirements.txt ├── 03-Deep Learning NLP (Models) ├── 3.5-Transformers │ ├── Theory.md │ └── Theory │ │ ├── 3.5.2-attention.md │ │ ├── 3.5.1-seq2seq.md │ │ └── 3.5.3-self attention.md ├── Deep Learning NLP (Models).md ├── 3.1-CNNS │ └── Theory.md ├── 3.4-LSTM │ └── Theory.md ├── 3.3-GRU │ └── Theory.md └── 3.2-RNNs │ ├── 3.2-RNNs.ipynb │ └── Theory.md ├── 04-crewai-agents ├── 4.2-Multi Agent Systems (CrewAI) │ └── README.md └── 4.1-AI Agents using CrewAI ( Abu Bakr Soliman) │ ├── crewai-agents │ ├── crewai_agents │ │ ├── __init__.py │ │ ├── agents │ │ │ ├── __init__.py │ │ │ ├── a1_search_queries_agent.py │ │ │ ├── a4_procurement_report.py │ │ │ ├── a2_search_engine_agent.py │ │ │ └── a3_scraping_agent.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── t2_search_engine_task.py │ │ │ ├── t4_procurement_report_task.py │ │ │ ├── t1_search_queries_task.py │ │ │ └── t3_scraping_task.py │ │ ├── config.py │ │ └── utilis.py │ ├── outputs │ │ └── ai-agent-output │ │ │ ├── step_3_scraping_results.json │ │ │ ├── step_4_procurement_report.html │ │ │ ├── step_1_suggested_search_queries.json │ │ │ └── step_2_search_results.json │ ├── requirements.txt │ ├── .gitignore │ ├── tests │ │ └── test.py │ ├── examples │ │ ├── ex2_run_search_engine_agent.py │ │ ├── ex1_run_search_queries_agent.py │ │ └── ex3_run_procurement_report_agent.py │ └── README.md │ └── README.MD ├── 01-Text-Preprocessing ├── Text-Preprocessing.md ├── requirements.txt └── 1.1-Text-Preprocessing │ ├── Theory.md │ └── preprocessing.ipynb ├── Questions └── assets │ ├── image.png │ ├── image1.png │ ├── image10.png │ ├── image11.png │ ├── image12.png │ ├── image13.png │ ├── image14.png │ ├── image15.png │ ├── image16.png │ ├── image17.png │ ├── image18.png │ ├── image19.png │ ├── image2.png │ ├── image20.png │ ├── image21.png │ ├── image22.png │ ├── image23.png │ ├── image24.png │ ├── image25.png │ ├── image26.png │ ├── image27.png │ ├── image28.png │ ├── image29.png │ ├── image3.png │ ├── image30.png │ ├── image31.png │ ├── image32.png │ ├── image33.png │ ├── image34.png │ ├── image4.png │ ├── image5.png │ ├── image6.png │ ├── image7.png │ ├── image8.png │ └── image9.png ├── 02-Word Embeddings ├── requirements.txt ├── 2.5-FastText │ ├── Theory.md │ └── 2.5-fast_text.ipynb ├── Word Embeddings.md ├── 2.2-BOW │ ├── Theory.md │ └── 2.2-BOW.ipynb ├── 2.3-TF_IDF │ ├── Theory.md │ └── 2.3-TF-IDF.ipynb ├── 2.1-Label Encoder and One Hot Encoder │ ├── Theory.md │ └── 2.1-label_and_oneHot_Encoder.ipynb └── 2.4-Word2Vec │ └── Theory.md ├── Data └── data.md ├── _config.yaml ├── LICENSE └── README.md /global_requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/Deep Learning NLP (Models).md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /04-crewai-agents/4.2-Multi Agent Systems (CrewAI)/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01-Text-Preprocessing/Text-Preprocessing.md: 
-------------------------------------------------------------------------------- 1 | # 01-Text-Preprocessing 2 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Questions/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image.png -------------------------------------------------------------------------------- /01-Text-Preprocessing/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas version>=2.2.3 2 | nltk version>=3.9.1 3 | emoji version>=2.14.1 4 | -------------------------------------------------------------------------------- /Questions/assets/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image1.png -------------------------------------------------------------------------------- /Questions/assets/image10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image10.png -------------------------------------------------------------------------------- /Questions/assets/image11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image11.png -------------------------------------------------------------------------------- /Questions/assets/image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image12.png -------------------------------------------------------------------------------- /Questions/assets/image13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image13.png -------------------------------------------------------------------------------- /Questions/assets/image14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image14.png -------------------------------------------------------------------------------- /Questions/assets/image15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image15.png 
-------------------------------------------------------------------------------- /Questions/assets/image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image16.png -------------------------------------------------------------------------------- /Questions/assets/image17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image17.png -------------------------------------------------------------------------------- /Questions/assets/image18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image18.png -------------------------------------------------------------------------------- /Questions/assets/image19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image19.png -------------------------------------------------------------------------------- /Questions/assets/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image2.png -------------------------------------------------------------------------------- /Questions/assets/image20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image20.png -------------------------------------------------------------------------------- /Questions/assets/image21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image21.png -------------------------------------------------------------------------------- /Questions/assets/image22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image22.png -------------------------------------------------------------------------------- /Questions/assets/image23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image23.png -------------------------------------------------------------------------------- /Questions/assets/image24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image24.png -------------------------------------------------------------------------------- /Questions/assets/image25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image25.png -------------------------------------------------------------------------------- /Questions/assets/image26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image26.png -------------------------------------------------------------------------------- /Questions/assets/image27.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image27.png -------------------------------------------------------------------------------- /Questions/assets/image28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image28.png -------------------------------------------------------------------------------- /Questions/assets/image29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image29.png -------------------------------------------------------------------------------- /Questions/assets/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image3.png -------------------------------------------------------------------------------- /Questions/assets/image30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image30.png -------------------------------------------------------------------------------- /Questions/assets/image31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image31.png -------------------------------------------------------------------------------- /Questions/assets/image32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image32.png -------------------------------------------------------------------------------- /Questions/assets/image33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image33.png -------------------------------------------------------------------------------- /Questions/assets/image34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image34.png -------------------------------------------------------------------------------- /Questions/assets/image4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image4.png -------------------------------------------------------------------------------- /Questions/assets/image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image5.png -------------------------------------------------------------------------------- /Questions/assets/image6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image6.png -------------------------------------------------------------------------------- /Questions/assets/image7.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image7.png -------------------------------------------------------------------------------- /Questions/assets/image8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image8.png -------------------------------------------------------------------------------- /Questions/assets/image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image9.png -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_3_scraping_results.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_4_procurement_report.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/requirements.txt: -------------------------------------------------------------------------------- 1 | crewai 2 | agentops 3 | tavily-python 4 | scrapegraph-py 5 | langchain -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .venv 3 | .env 4 | __pycache__/ 5 | *.pyc 6 | *.log 7 | agentops.log 8 | agentops-tmp.log -------------------------------------------------------------------------------- /02-Word Embeddings/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas version>=2.2.3 2 | nltk version>=3.9.1 3 | emoji version>=2.14.1 4 | contractions>=0.1.73 5 | scikit-learn>=1.5.2 6 | numpy>=1.26.3 7 | gensim>=4.3.2 8 | fasttext>=0.9.2 9 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_1_suggested_search_queries.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | "Professional Development Book Egypt | Best Prices", 4 | "Professional Development Book Egypt | Compare Prices Across Sites" 5 | ] 6 | } -------------------------------------------------------------------------------- /Data/data.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | ## 01- Text Preprocessing 4 | [yelp_academic_dataset_tip](https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset/data) 5 | 6 | 7 | ## 02- Word Embeddings 8 | 9 | [IMDB Dataset Movie Reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/config.py: 
-------------------------------------------------------------------------------- 1 | from crewai import LLM 2 | 3 | 4 | # set the output directory for the agent 5 | output_dir= r"outputs/ai-agent-output" 6 | 7 | 8 | # Initialize LLM 9 | llm = LLM( 10 | model="ollama/deepseek-r1", 11 | base_url="http://localhost:11434", 12 | temperature=0.5 13 | ) 14 | 15 | 16 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.5-FastText/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 |
5 | 6 | [02 - BOW](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 7 |
8 | 9 | [03 - TF-IDF](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.3-TF_IDF) 10 |
11 | 12 | [04 - Word2Vec](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.4-Word2Vec) 13 |
14 | 15 | ## 05 - FastText 16 | 17 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a1_search_queries_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai_agents.config import llm 3 | 4 | 5 | search_queries_recommendation_agent = Agent( 6 | role="Search Queries Recommendation Agent", 7 | goal="\n".join([ 8 | "To provide a list of suggested search queries to be passed to the search engine.", 9 | "The queries must be varied and looking for specific items." 10 | ]), 11 | backstory="The agent is designed to help in looking for products by providing a list of suggested search queries to be passed to the search engine based on the context provided.", 12 | llm=llm, 13 | verbose=True, 14 | ) -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a4_procurement_report.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai_agents.config import llm 3 | 4 | 5 | procurement_report_author_agent = Agent( 6 | role="Procurement Report Author Agent", 7 | goal="To generate a professional, dynamic HTML page for the procurement report that incorporates product data, price comparisons, and company-specific insights.", 8 | backstory=( 9 | "The agent is designed to assist in generating a professional HTML page for a procurement report. " 10 | "It gathers data from various websites, compares product prices, and structures the report according to the company's specific requirements. " 11 | "The agent should tailor the report by considering the company's procurement goals, budget constraints, and preferred suppliers." 12 | ), 13 | llm=llm, 14 | verbose=True, 15 | ) -------------------------------------------------------------------------------- /_config.yaml: -------------------------------------------------------------------------------- 1 | title: NLP-Tea 2 | author: Mohammad Fawzy 3 | description: my journey learning Natural Language Processing. It includes theory notes, code examples, and useful resources for understanding and applying NLP concepts. 
4 | remote_theme: daattali/beautiful-jekyll@6.0.1 5 | 6 | ############################################### 7 | # --- List of links in the navigation bar --- # 8 | ############################################### 9 | 10 | navbar-links: 11 | About Me: https://www.linkedin.com/in/mohammad-fawzy-438b05261/ 12 | 13 | ################ 14 | # --- Logo --- # 15 | ################ 16 | 17 | avatar: "/assets/img/avatar-icon.png" 18 | round-avatar: true 19 | 20 | 21 | social-network-links: 22 | email: "moha.fawzy63@gmail.com" 23 | linkedin: mohammad-fawzy-438b05261 24 | rss: true # remove this line if you don't want to show an RSS link at the bottom 25 | github: Fawzy-AI-Explorer 26 | kaggle: mohammadfawzy 27 | youtube: "@kiloeducation360" 28 | telegram: mohammad_fawzy_m 29 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t2_search_engine_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | import os 5 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 6 | from crewai_agents.config import output_dir 7 | 8 | class SignleSearchResult(BaseModel): 9 | title: str 10 | url: str 11 | content: str 12 | score: float 13 | search_query: str 14 | 15 | class AllSearchResults(BaseModel): 16 | results: List[SignleSearchResult] 17 | 18 | search_engine_task = Task( 19 | description="\n".join([ 20 | "The task is to search for products based on the suggested search queries.", 21 | ]), 22 | expected_output="A JSON object containing the search results.", 23 | output_json=AllSearchResults, 24 | output_file=os.path.join(output_dir, "step_2_search_results.json"), 25 | agent=search_engine_agent, 26 | ) 27 | # {queries} → pulls the list generated by Task #1. 28 | # after Task #2 completes, the context will contain a Python list under the key results. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Mohmmad Fawzy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/utilis.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Load environment variables from a .env file 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | 8 | def get_agentops_api_key() -> str: 9 | """ 10 | Returns AgentOps API key from the environment. 11 | """ 12 | key = os.getenv("AGENTOPS_API_KEY") 13 | if not key: 14 | raise RuntimeError("AGENTOPS_API_KEY not found in environment") 15 | return key 16 | 17 | def set_agentops_api_key(api_key): 18 | """ 19 | Sets AgentOps API key as an environment variable. 20 | """ 21 | os.environ["AGENTOPS_API_KEY"] = api_key 22 | 23 | 24 | def get_tavily_api_key() -> str: 25 | """ 26 | Returns TAVILY_API_KEY from the environment. 27 | """ 28 | key = os.getenv("TAVILY_API_KEY") 29 | if not key: 30 | raise RuntimeError("TAVILY_API_KEY not found in environment") 31 | return key 32 | 33 | def get_scrap_api_key() -> str: 34 | """ 35 | Returns SCRAP_API_KEY from the environment. 36 | """ 37 | key = os.getenv("SCRAP_API_KEY") 38 | if not key: 39 | raise RuntimeError("SCRAP_API_KEY not found in environment") 40 | return key -------------------------------------------------------------------------------- /02-Word Embeddings/Word Embeddings.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | ## What is Word Embedding ? 4 | 5 | Inputs to Machine learning algorithms are Numbers (Scalars, Vectors).
6 | Text must be converted into vectors.
7 | 8 | Word embedding is a way of representing words as vectors in a multi-dimensional space, where the distance between vectors reflects the similarity and relationships between the words.
9 | 10 | In other words, it represents words in a way that machines can understand.
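As a quick illustration of the idea, the sketch below turns a tiny toy corpus into count vectors with scikit-learn (already listed in this section's requirements). It is only a sketch, not code from this repo's notebooks, and the sentences and variable names are made up for the example:

```python
# Sketch: converting raw text into numeric vectors (assumes scikit-learn is installed)
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["i love nlp", "nlp loves vectors"]   # toy documents, for illustration only

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)           # one row per document, one column per vocabulary word

print(vectorizer.get_feature_names_out())      # the learned vocabulary
print(X.toarray())                             # each document represented as a vector of counts
```

Each approach listed below is essentially a different rule for filling in those numbers.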
11 | 12 | There are two main Approaches for word embedding: 13 | - Frequency Based Embedding 14 | - [Label (integer) Encoding](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 15 | - [One-Hot encoded vector](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 16 | - [Bag of Word (BOW) Count Vector](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 17 | - [Term Frequency- Inverse Document frequency (TF-IDF) Vector](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.3-TF_IDF) 18 | - Prediction Based Embedding 19 | - Word2Vec 20 | - CBOW 21 | - Skip Gram 22 | - Negative Sampling 23 | - Fast Text 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a2_search_engine_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai.tools import tool 3 | 4 | from tavily import TavilyClient 5 | 6 | from crewai_agents.config import llm 7 | from crewai_agents.utilis import get_tavily_api_key 8 | 9 | tavily_api_key = get_tavily_api_key() 10 | tavily_client = TavilyClient(tavily_api_key) 11 | 12 | 13 | @tool # Decorator indicating this function interacts with an external tool (Tavily) 14 | def search_engine_tool(query: str): 15 | """Useful for search-based queries. Use this to find current information about any query related pages using a search engine""" 16 | print(f"[DEBUG] Searching with query: {query}") 17 | return tavily_client.search(query) 18 | 19 | search_engine_agent = Agent( 20 | role="Search Engine Agent", 21 | goal=( 22 | "You are a web search expert. \n" 23 | "When you need to look up a product, call the tool **search_engine_tool**. \n" 24 | "Format your tool call exactly as:\n\n" 25 | "Action: search_engine_tool\n" 26 | "Action Input: {\"query\": \"\"}\n\n" 27 | "Then wait for the Observation before proceeding." 28 | ), 29 | # goal="To search for products based on the suggested search query", 30 | backstory="The agent is designed to help in looking for products by searching for products based on the suggested search queries.", 31 | llm=llm, 32 | verbose=True, 33 | tools=[search_engine_tool] # New 34 | ) -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t4_procurement_report_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | import os 3 | from crewai_agents.agents.a4_procurement_report import procurement_report_author_agent 4 | from crewai_agents.config import output_dir 5 | 6 | 7 | 8 | procurement_report_author_task = Task( 9 | description="\n".join([ 10 | "The task is to generate a professional HTML page for the procurement report with the following structure:", 11 | "1. Executive Summary: A brief overview of the procurement process and key findings.", 12 | "2. Introduction: An introduction to the purpose and scope of the report, including company-specific insights.", 13 | "3. Methodology: A detailed description of the methods used to gather and compare prices from different sources.", 14 | "4. 
Findings: A dynamic table displaying product data (title, price, capacity, material) at least 5 products sourced from multiple websites.", 15 | "5. Analysis: In-depth analysis of the findings, highlighting significant trends, price discrepancies, and recommendations for suppliers.", 16 | "6. Recommendations: Actionable procurement recommendations based on the analysis, including potential supplier choices.", 17 | "7. Conclusion: A concise summary of the report with key takeaways and next steps.", 18 | "8. Appendices: Any supplementary data, charts, or raw product data." 19 | ]), 20 | 21 | expected_output="A professional, fully formatted HTML procurement report with dynamic content based on provided product data.", 22 | output_file=os.path.join(output_dir, "step_4_procurement_report.html"), 23 | agent=procurement_report_author_agent, 24 | ) -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory/3.5.2-attention.md: -------------------------------------------------------------------------------- 1 | ## Attention Mechanism 2 | To solve the bottleneck issue, the **Attention mechanism** was introduced. Instead of relying on a single context vector, Attention assigns different weights to different parts of the input sequence, allowing the decoder to focus on relevant words at each step. 3 | - enabling the decoder to look at all encoder outputs (Weighted). 4 | - Reduces the reliance on a single context vector. 5 | 6 | **Benefits of Attention** 7 | - **Improves performance on long sequences** by dynamically selecting relevant parts of the input. 8 | - **Eliminates the fixed-size bottleneck** by allowing the decoder to access all hidden states of the encoder. 9 | 10 | ![image](https://github.com/user-attachments/assets/78f2ca58-ddb5-4d22-9a82-4b95f37f6cb0) 11 | 12 | ![image](https://github.com/user-attachments/assets/9332af03-e0dd-48a3-ae13-dbdd7d8942f4) 13 | 14 | ### Attention Block Calculations 15 | 16 | - Inputs : (S(i), h1,h2,h3,.....,hn) 17 | - Output : S(i)~ 18 | 19 | 1. Calc Score 20 | Score (a,b) = a.b or f(W.a + W.b) 21 | 22 | - Score (s0, h1) = α1 23 | - Score (s0, h2) = α2 24 | - Score (s0, h3) = α3 25 | 3. Soft max over Scores 26 | - (α1 + α2 + α3 = 1) 27 | 4. context vector =>> 28 | - c(0) = α1.h1 + α2.h2 + α3.h3 29 | 5. Combine Context and Decoder State 30 | - s(0)~ = tanh (s0, c0) 31 | --- 32 | - α(i) = Score (s0, hi) 33 | - softmax (α) 34 | - C(0) = SUM (α(i).h(i)) 35 | - s(0)~ = tanh (s0, c0) 36 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.2-BOW/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 |
5 | 6 | # 02 - Bag Of Words 7 | 8 | ## What is Bag of Words (BoW)? 9 | 10 | convert text into numerical. It treats a document as an unordered collection (or "bag") of words, ignoring word order and structure. Each document is represented as a vector where each dimension corresponds to the frequency (or presence) of a word from a vocabulary(Unique Words). 11 | 12 | 13 | 14 | ## Steps 15 | 16 | 1. Prepare your corpus 17 | 2. Preprocessing (corpus) 18 | 3. Create Vocabulary (unique words in the corpus) 19 | 4. Calculate count of vocab words (histogram) in each document 20 | - For Each Doc: create a vector of word counts 21 | - Calculate the count of each vocab word 22 | - Each position in the vector corresponds to a word in the vocabulary (number of times that word appears in the document) 23 | 24 | 25 | For documents not considered during Vocab design , they may contain some words not in vocabulary (Out of Vocab). Those words are ignored. 26 | 27 | 28 | ## Limitations: 29 | - No context 30 | - Ignores word order, syntax, and semantic relationships 31 | - High dimensionality 32 | - large vocabulary (large Number of Unique Words) => 33 | - Sparse data 34 | - Most values are zeros 35 | - BoW is designed for representing entire documents (or sentences) as vectors, not individual words 36 | 37 | 38 | 39 | W1 W2 W3 W4 ...............Wv ==> Vocab (Unique Words) 40 | Doc1 [ ] => len = len(vocab) = len (Unique words) 41 | Doc2 [ ] 42 | Doc3 [ ] 43 | 44 | --- 45 | --- 46 | --- 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t1_search_queries_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | import json 5 | import os 6 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 7 | from crewai_agents.config import output_dir 8 | 9 | # no_keywords=10 10 | class SuggestedSearchQueries(BaseModel): 11 | queries: List[str] = Field(..., title="Suggested search queries to be passed to the search engine", 12 | min_items=1, max_items=3) 13 | 14 | search_queries_recommendation_task = Task( 15 | description="\n".join([ 16 | "Rankyx is looking to buy {product_name} at the best prices (value for a price strategy)", 17 | "The campany target any of these websites to buy from: {websites_list}", 18 | "The company wants to reach all available proucts on the internet to be compared later in another stage.", 19 | "The stores must sell the product in {country_name}", 20 | "Generate at maximum {no_keywords} queries.", 21 | "The search keywords must be in {language} language.", 22 | "Search keywords must contains specific brands, types or technologies. Avoid general keywords.", 23 | "The search query must reach an ecommerce webpage for product, and not a blog or listing page." 24 | ]), 25 | expected_output="A JSON object containing a list of suggested search queries.", 26 | output_json=SuggestedSearchQueries, 27 | output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json"), 28 | agent=search_queries_recommendation_agent 29 | ) 30 | 31 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 32 | # CrewAI will automatically make that available to 33 | # Task #2 under the name of the field—here, queries. 
34 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT IMPORTANT IMPORTANT IMPORTANT 35 | 36 | # In sequential mode, CrewAI will automatically merge the fields of Task #1’s output_json model into the next task’s context -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.1-CNNS/Theory.md: -------------------------------------------------------------------------------- 1 | # 1D CNN 2 | 1D CNN (1-dimensional convolutional neural network) is a type of neural network that learns patterns in 1D data. It’s often used for: 3 | - Time series data 4 | - Text or word sequences 5 | - Sensor data 6 | - Audio signals 7 | 8 | 9 | Imagine you have a row of numbers (Word embeddings) 10 | - A 1D CNN uses a small filter (like a window) that slides over the row and detects patterns 11 | 12 | 13 | Benefits of 1D CNN 14 | - Fast and efficient 15 | - Good at finding local patterns 16 | - Needs fewer parameters than RNNs or LSTMs 17 | - Can handle long sequences if combined with pooling 18 | 19 | 20 | Shape of Input 21 | A 1D CNN expects input like this: 22 | - (samples, sequence_length, channels) 23 | - (200, 100, 30) => 200 Sentences , each one 100 word, each word vec 30 24 | 25 | 26 | 27 | (10, 3) → 10 words per sentence, 3 features per word 28 | - ![image](https://github.com/user-attachments/assets/d9ef5745-7585-44b9-9bcb-81839013731a) 29 | 30 | Conv1D layer: 1 filter, kernel size = 3 (3*3(As number of features = 3)) 31 | - ![image](https://github.com/user-attachments/assets/2c1799dd-becc-496b-a01c-d61d985556a1) 32 | 33 | - output shape = 10-3+1=8 => (8, 1) 34 | 35 | Conv1D layer 2 filters, kernel size = 3 (3*3(As number of features = 3)) 36 | - output shape = 10-3+1=8 => (8, 2) 37 | - ![image](https://github.com/user-attachments/assets/cfd84896-5c1c-4b7d-a835-7fcc34d4e959) 38 | 39 | 40 | 41 | - Input shape (99, 30) 42 | - Conv1D (10 Filter, Shape = 3, padding = "Valid", stride = 1) 43 | - (99 - 3)/1 + 1 = 97 44 | - Shape = (97, 10) 45 | - Conv1D (20 Filter, Shape = 3, padding = "Valid", stride = 2) 46 | - (97 - 3)/2 + 1 = 48 47 | - Shape = (48, 20) 48 | - MaxPooling1D layer: pool size = 2 49 | - output shape = (24, 20) 50 | 51 | ![image](https://github.com/user-attachments/assets/b33793be-3f17-43aa-a655-30221d9e43cf) 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory/3.5.1-seq2seq.md: -------------------------------------------------------------------------------- 1 | ## Encoder-Decoder Sequence-to-Sequence Model 2 | The **Encoder-Decoder** architecture commonly used for tasks that involve transforming one sequence into another, such as **machine translation, text summarization. 3 | 4 | ### two main components: 5 | 1. **Encoder**: Processes the input sequence and converts it into a fixed-length context vector (hidden state). This vector captures the Whole of the input. 6 | compressed summary of the entire input sequence, capturing its meaning and structure. This vector is then passed to the **decoder**, which generates the output sequence. 7 | a(t) = F(Wxa * X + Waa * a(t-1) + ba) 8 | size of hidden = number of nodes in RNN 9 | 2. Decoder: Takes the context vector and generates the output sequence, step by step. 
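To make the two components concrete, here is a minimal runnable sketch. It assumes PyTorch and GRU cells purely for illustration; it is not code from this repository, and the notebooks here may use a different framework:

```python
# Minimal encoder-decoder sketch (assumption: PyTorch, GRU cells)
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=32, hid_dim=64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)

    def forward(self, src):                    # src: (batch, src_len) of token ids
        _, hidden = self.rnn(self.emb(src))    # hidden: (1, batch, hid_dim) = context vector
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=32, hid_dim=64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim, vocab_size)

    def forward(self, prev_token, hidden):     # prev_token: (batch, 1) = previously emitted token
        output, hidden = self.rnn(self.emb(prev_token), hidden)
        return self.out(output), hidden        # logits over the vocabulary + updated state

# Toy usage: the whole source sentence is squeezed into `context`,
# then the decoder generates one token at a time from that single vector.
enc, dec = Encoder(vocab_size=100), Decoder(vocab_size=100)
src = torch.randint(0, 100, (2, 7))            # 2 sentences, 7 tokens each
context = enc(src)
logits, state = dec(torch.randint(0, 100, (2, 1)), context)
print(logits.shape)                            # torch.Size([2, 1, 100])
```

That single `context` tensor is exactly the fixed-size bottleneck discussed in the cons below.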
10 | 11 | This architecture was originally built using **Recurrent Neural Networks (RNNs)**, specifically **Long Short-Term Memory (LSTM)** and **Gated Recurrent Unit (GRU)** networks. 12 | 13 | --- 14 | **Cons:** 15 | - **Bottleneck issue**: 16 | - Single, fixed-size context vector Capture the meaning of Entire Input Sequence. 17 | - Single, fixed-size context vector limits the ability to store long Sequencies. 18 | - **Sequential Processing (NO parallelize)**: Since RNNs process sequences step-by-step, they cannot be easily parallelized. 19 | - **Struggles with very long sequences**: LSTMs and GRUs still struggle with very long dependencies, even though they improve over simple RNNs. 20 | 21 | ``` 22 | Encoder : 23 | h0 = 0 24 | h1 = f (Wxh.X1 + Whh.h0) 25 | h2 = f (Wxh.X2 + Whh.h1) 26 | h3 = f (Wxh.X3 + Whh.h2) 27 | Decoder : 28 | s0 = h3 || y0 = 29 | s1 = f (Wys.Y0 + Wss.S0) || Y1 = softmax (Wsy.S1) 30 | s2 = f (Wys.Y1 + Wss.S1) || Y2 = softmax (Wsy.S2) 31 | s3 = f (Wys.Y2 + Wss.S2) || Y3 = softmax (Wsy.S3) 32 | ``` 33 | 34 | ![image](https://github.com/user-attachments/assets/ea62fdc0-7289-4aa6-bd03-1ba27ced51c4) 35 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/README.MD: -------------------------------------------------------------------------------- 1 | # CrewAI 2 | 3 | ## Agent & Crew 4 | 5 | *What Is an AI Agent?* 6 | 7 | - An **AI agent** is a system or program designed to autonomously perform tasks on behalf نيابةً of a 8 | user or another system. It perceives its environment through inputs, takes actions based on its 9 | reasoning and planning capabilities, and works to achieve predefined goals 10 | - Human Delegate يُوَكل Agent to do some thing 11 | - **Autonomy**: الاستقلاليه Agents operate with a degree of independence, deciding actions without 12 | continuous human prompts 13 | 14 | *What Is an AI Crew?* 15 | 16 | - An **AI Crew** is a structured, multi-agent system where each individual agents with a defined 17 | role collaborate to perform a complex tasks that single agents cannot handle alone 18 | - **Role-Based Agents** : Each agent in the crew has a specific function 19 | - **Collaborative Workflows**: Agents share intermediate results, delegate sub-tasks, and 20 | iteratively refine outputs based on peer feedback 21 | - **Tool and API Integration**: Crews can use external tools—databases, ML models, web services 22 | 23 | ## Native skills or External Tools ? 24 | 25 | - **Native Skills** (internal skills): 26 | - These are the built-in abilities that an AI agent already has (Built-in knowledge) 27 | - Example: An AI agent that can read text, summarize it, and write responses all using its own programming or model. 28 | - No extra tools needed. 29 | - Faster, but sometimes limited in what it can do. 30 | - **External Help** (tool use or API integration) 31 | - AI agent uses outside tools or services to do tasks (Asking other tools to help) 32 | - Example: An AI agent that calls Google Translate to Translate or Use calendar to host a meeting. 33 | - More powerful and flexible, but sometimes slower or needs internet access. 34 | 35 | ## Sequential Flow VS Hierarchical Flow 36 | [DOC](https://docs.crewai.com/concepts/processes) 37 | - Sequential Flow : **Agents work one after another**, passing results 38 | - A → B → C → D 39 | - Hierarchical Flow : **One "manager agent" controls or coordinates other agents**, giving them tasks and combining results. 
40 | - A → B,C || B → F || 41 | - Ensure to provide a manager_llm or manager_agent 42 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t3_scraping_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | import os 5 | from crewai_agents.agents.a3_scraping_agent import scraping_agent 6 | from crewai_agents.config import output_dir 7 | 8 | 9 | class ProductSpec(BaseModel): 10 | specification_name: str 11 | specification_value: str 12 | 13 | class SingleExtractedProduct(BaseModel): 14 | page_url: str = Field(..., title="The original url of the product page") 15 | product_title: str = Field(..., title="The title of the product") 16 | product_image_url: str = Field(..., title="The url of the product image") 17 | product_url: str = Field(..., title="The url of the product") 18 | product_current_price: float = Field(..., title="The current price of the product") 19 | product_original_price: float = Field(title="The original price of the product before discount. Set to None if no discount", default=None) 20 | product_discount_percentage: float = Field(title="The discount percentage of the product. Set to None if no discount", default=None) 21 | 22 | product_specs: List[ProductSpec] = Field(..., title="The specifications of the product. Focus on the most important specs to compare.", min_items=1, max_items=5) 23 | 24 | agent_recommendation_rank: int = Field(..., title="The rank of the product to be considered in the final procurement report. (out of 5, Higher is Better) in the recommendation list ordering from the best to the worst") 25 | agent_recommendation_notes: List[str] = Field(..., title="A set of notes why would you recommend or not recommend this product to the company, compared to other products.") 26 | 27 | 28 | class AllExtractedProducts(BaseModel): 29 | products: List[SingleExtractedProduct] 30 | 31 | scraping_task = Task( 32 | description="\n".join([ 33 | "The task is to extract product details from any ecommerce store page url.", 34 | "The task has to collect results from multiple pages urls.", 35 | "Collect the best {top_recommendations_no} products from the search results.", 36 | "When you return your final JSON, it MUST use the top‑level key `products` (plural).", 37 | ]), 38 | expected_output="A JSON object containing products details", 39 | output_json=AllExtractedProducts, 40 | output_file=os.path.join(output_dir, "step_3_search_results.json"), 41 | agent=scraping_agent 42 | ) -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.4-LSTM/Theory.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 3 | 4 | ``` 5 | GRU 6 | ---- 7 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 8 | Gr = Sig ( Wxr.X(t) + Wcr.C(t-1) + br ) 9 | 10 | C~(t) = g (Wax.X(t) + Gr[Waa.C(t-1)] + ba) 11 | C(t) = Gu.C~(t) + (1-Gu).C(t-1) 12 | 13 | Y(t) = g (Wcy.a(t) + by) 14 | ``` 15 | 16 | ## LSTM 17 | 18 | 1. Removing Relevance Gate Gr 19 | ``` 20 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 21 | 22 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 23 | C(t) = Gu.C~(t) + (1-Gu).C(t-1) 24 | 25 | Y(t) = g (Wcy.C(t) + by) 26 | ``` 27 | 28 | 2. 
Split “Update Gate” into two gates: “Update Gate”, “Forget Gate” 29 | - Why Apply Constrain 30 | - C(t) = Gu.C~(t) + (1-Gu).C(t-1) 31 | - if take 40% from C~(t) must take 60% from C(t-1) 32 | - what if you nedd to take 70% and 70% 33 | ``` 34 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 35 | Gf = Sig ( Wxf.X(t) + Wcf.C(t-1) + bf ) 36 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 37 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 38 | 39 | Y(t) = g (Wcy.C(t) + by) 40 | ``` 41 | C~(t) => [-1, +1] 42 | C(t-1) => [-1, +1] 43 | if you do C~(t) + C(t-1) Range Not Bounded 44 | if you do 60% . C~(t) + 40% . C(t-1) Range Bounded from [-1, +1] 45 | 46 | 47 | 3. Bounded a 48 | 49 | ``` 50 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 51 | Gf = Sig ( Wxf.X(t) + Wcf.C(t-1) + bf ) 52 | 53 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 54 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 55 | a(t) = tanh (C(t)) => Bounded from [-1, +1] 56 | 57 | Y(t) = g (Wcy.a(t) + by) 58 | ``` 59 | 60 | 4. Output Gate (Go) 61 | ``` 62 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 63 | Gf = Sig ( Wxf.X(t) + Wcf.C(t-1) + bf ) 64 | Go = Sig ( Wxo.X(t) + Wco.C(t-1) + bo ) 65 | 66 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 67 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 68 | a(t) = Go( tanh (C(t)) ) => Bounded from [-1, +1] 69 | 70 | Y(t) = g (Wcy.a(t) + by) 71 | ``` 72 | 5. Input to Gates will be a(t-1) NOT C(t-1) As a is bounded 73 | ``` 74 | Gu = Sig ( Wxu.X(t) + Wau.a(t-1) + bu ) 75 | Gf = Sig ( Wxf.X(t) + Waf.a(t-1) + bf ) 76 | Go = Sig ( Wxo.X(t) + Wao.a(t-1) + bo ) 77 | 78 | C~(t) = tanh (Wax.X(t) + Waa.a(t-1) + ba) 79 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 80 | a(t) = Go( tanh (C(t)) ) => Bounded from [-1, +1] 81 | 82 | Y(t) = g (Wcy.a(t) + by) 83 | ``` 84 | 85 | LSTM : 86 | - 3 Inputs 87 | - 1. C(t-1) 88 | - 2. a(t-1) 89 | - 3. X(t) 90 | - 3 Outputs 91 | - 1. C(t) 92 | - 2. a(t) 93 | - 3. 
y(t) 94 | 95 | ![image](https://github.com/user-attachments/assets/0412a582-44f5-4c49-97fa-beafb49fa610) 96 | 97 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/tests/test.py: -------------------------------------------------------------------------------- 1 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 2 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 3 | 4 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 5 | from crewai_agents.tasks.t2_search_engine_task import search_engine_task 6 | 7 | from crewai_agents.agents.a3_scraping_agent import scraping_agent 8 | from crewai_agents.tasks.t3_scraping_task import scraping_task 9 | 10 | from crewai_agents.agents.a4_procurement_report import procurement_report_author_agent 11 | from crewai_agents.tasks.t4_procurement_report_task import procurement_report_author_task 12 | 13 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 14 | 15 | from crewai import Crew, Process 16 | 17 | 18 | 19 | def run_search_engine_agent(): 20 | """Run the search engine agent and return the results.""" 21 | print("Running search engine agent...") 22 | # Set the AgentOps API key 23 | api_key = get_agentops_api_key() 24 | set_agentops_api_key(api_key) 25 | print("AgentOps API key set successfully.") 26 | 27 | crew = Crew( 28 | agents=[ 29 | search_queries_recommendation_agent, 30 | search_engine_agent, 31 | scraping_agent, 32 | procurement_report_author_agent 33 | ], 34 | 35 | tasks=[ 36 | search_queries_recommendation_task, 37 | search_engine_task, 38 | scraping_task, 39 | procurement_report_author_task 40 | ], 41 | verbose=True, 42 | process=Process.sequential 43 | ) 44 | print("Crew initialized successfully.") 45 | 46 | results = crew.kickoff( 47 | inputs={ 48 | "product_name": "book for professional development", 49 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 50 | "country_name": "Egypt", 51 | "no_keywords": 3, 52 | "language":"english", 53 | "score_th":0.1, 54 | "top_recommendations_no": 5, 55 | } 56 | ) 57 | print("Crew kickoff completed successfully.") 58 | return results 59 | 60 | if __name__ == "__main__": 61 | results = run_search_engine_agent() 62 | print("Search queries recommendation task completed successfully.") 63 | print(f"Results: {results}") 64 | 65 | 66 | # To Run This Script: 67 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 68 | # python -m tests.test -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/examples/ex2_run_search_engine_agent.py: -------------------------------------------------------------------------------- 1 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 2 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 3 | 4 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 5 | from crewai_agents.tasks.t2_search_engine_task import search_engine_task 6 | 7 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 8 | from crewai import Crew, Process 9 | 10 | 11 | 12 | 13 | 14 | def run_search_engine_agent(): 15 | """Run the search engine agent and return the results.""" 16 | print("Running search engine agent...") 17 | # Set the 
AgentOps API key 18 | api_key = get_agentops_api_key() 19 | set_agentops_api_key(api_key) 20 | print("AgentOps API key set successfully.") 21 | 22 | crew = Crew( 23 | agents=[ 24 | search_queries_recommendation_agent, 25 | search_engine_agent 26 | ], 27 | 28 | tasks=[ 29 | search_queries_recommendation_task, 30 | search_engine_task 31 | ], 32 | verbose=True, 33 | process=Process.sequential 34 | ) 35 | print("Crew initialized successfully.") 36 | 37 | results = crew.kickoff( 38 | inputs={ 39 | "product_name": "book for professional development", 40 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 41 | "country_name": "Egypt", 42 | "no_keywords": 10, 43 | "language":"english", 44 | "score_th":0.1 45 | } 46 | ) 47 | 48 | print("Crew kickoff completed successfully.") 49 | return results 50 | 51 | if __name__ == "__main__": 52 | results = run_search_engine_agent() 53 | print("Search queries recommendation task completed successfully.") 54 | print(f"Results: {results}") 55 | 56 | # To Run This Script: 57 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 58 | # python -m examples.ex2_run_search_engine_agent 59 | 60 | 61 | ''' 62 | Task Execution Flow 63 | 1. prompt 64 | - replaces each {…} with the value from inputs in Task description : 65 | - prompt sent to your search_queries_recommendation_agent 66 | 2. Agent → LLM 67 | - LLM generates a response based on the prompt 68 | - LLM response is a almost string of JSON object 69 | 3. Validation (output_json=SuggestedSearchQueries) 70 | 4. save the output to a file (output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json")) 71 | - dict of List of strings (queries) in JSON format 72 | 73 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 74 | # CrewAI will automatically make that available to 75 | # Task #2 under the name of the field—here, queries. 
76 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT 77 | 78 | ''' -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/examples/ex1_run_search_queries_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Crew, Process 2 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 3 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 4 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 5 | 6 | 7 | def run_search_queries_agent(): 8 | """Run the search queries recommendation agent and return the results.""" 9 | 10 | api_key = get_agentops_api_key() 11 | set_agentops_api_key(api_key) 12 | 13 | crew = Crew( 14 | agents=[search_queries_recommendation_agent], 15 | tasks=[search_queries_recommendation_task], 16 | verbose=True, 17 | process=Process.sequential 18 | ) 19 | 20 | results = crew.kickoff( 21 | inputs={ # if the Task doesn't include any variables, you wouldn't need to include the inputs argument 22 | "product_name": "coffee machine for the office", 23 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 24 | "country_name": "Egypt", 25 | "no_keywords": 10, 26 | "language":"english" 27 | } 28 | ) 29 | return results 30 | 31 | def print_json(): 32 | import json 33 | with open(r"E:\DATA SCIENCE\projects\crewai-agents22\outputs\ai-agent-output\step_1_suggested_search_queries.json") as f: 34 | data = json.load(f) 35 | print("type of data: ", type(data)) # 36 | print("type of data[queries]: ", type(data["queries"])) # 37 | 38 | print(data["queries"], "\n") 39 | for q in data["queries"]: 40 | print(type(q), q) # 41 | 42 | 43 | if __name__ == "__main__": 44 | # results = run_search_queries_agent() 45 | print("Search queries recommendation task completed successfully.") 46 | # print(f"Results: {results}") # Pydantic object 47 | print("*"*90, "\n") 48 | print_json() 49 | 50 | # To Run This Script: 51 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 52 | # python -m examples.ex1_run_search_queries_agent 53 | 54 | 55 | ''' 56 | Task Execution Flow 57 | 1. prompt 58 | - replaces each {…} with the value from inputs in Task description : 59 | - prompt sent to your search_queries_recommendation_agent 60 | 2. Agent → LLM 61 | - LLM generates a response based on the prompt 62 | - LLM response is a almost string of JSON object 63 | 3. Validation (output_json=SuggestedSearchQueries) 64 | 4. save the output to a file (output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json")) 65 | - dict of List of strings (queries) in JSON format 66 | 67 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 68 | # CrewAI will automatically make that available to 69 | # Task #2 under the name of the field—here, queries. 70 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT 71 | 72 | ''' -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.3-GRU/Theory.md: -------------------------------------------------------------------------------- 1 | # GRU 2 | 3 | GRUs are an improved version of Recurrent Neural Networks (RNNs) designed to better capture long-term dependencies in sequential data. 
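In code, a GRU layer is used exactly like a plain RNN layer; the gates derived in the rest of this note live inside the cell. A small sketch, assuming PyTorch for illustration only (not code from this repository):

```python
# Sketch: a GRU layer consumes (batch, time, features) input like a vanilla RNN,
# but learns the update/relevance gates described below inside the cell
# (PyTorch calls the relevance gate the reset gate).
import torch
import torch.nn as nn

x = torch.randn(8, 50, 30)        # 8 sequences, 50 time steps, 30 features per step
gru = nn.GRU(input_size=30, hidden_size=64, batch_first=True)

outputs, last_hidden = gru(x)     # outputs: (8, 50, 64), last_hidden: (1, 8, 64)
print(outputs.shape, last_hidden.shape)
```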
4 | **RNNs** 5 | - RNNs maintain a hidden state a(t)​ that is updated at each time step t based on the input x(t)​ and the previous hidden state h(t-1)​. 6 | - a(t​) = g(W. x(t)​ + W​ ⋅ h(t−1) ​+ b ) 7 | - Challenges with Long-Term Dependencies: 8 | - Poor memory of long-term dependencies in sequences. 9 | - Vanishing & Exploding Gradient Problem 10 | - always update a if u work on videos 50 frames (ads appear from t=5 to t=8) network doesn't want to take these frames in history 11 | مش عايزة تاخدها معاها ملهاش لازمة يعني time steps وخلاص النتورك ممكن تبقا فيه Update انا مش عايز كل مرة اعمل 12 | 13 | GRUs introduce gates to control the flow of information, solving the vanishing gradient problem and improving long-term dependency handling. 14 | 15 | 16 | 17 | ``` 18 | RNNs 19 | 20 | a(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 21 | --------------------------- 22 | GRU 23 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 24 | a(t) = Gu.a~(t) + (1-Gu).a(t-1) 25 | 26 | if Gu=1 27 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 28 | a(t) = Gu.a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) ==> RNN || Update History with current input 29 | 30 | if Gu=0 31 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 32 | a(t) = a(t-1) ==> do not Update History || Drop the current input 33 | 34 | if Gu = 0.6 35 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 36 | a(t) = 0.6.a~(t) + 0.4.a(t-1) ==> take 60% from a~(t) and 40% from a(t-1) 37 | 38 | 39 | The update gate U decides how much of the previous hidden state (a(t−1) needs to be retained and how much of the new candidate hidden state a~(t) should replace it. 40 | 41 | ``` 42 | Gu ===> you will take the current time step in history or not ? 43 | if u need to forget All history and start from current time step 44 | 45 | ``` 46 | GRU 47 | Gu = Sig ( Wxu.X(t) + Wau.a(t-1) + bu ) 48 | Gr = Sig ( Wxr.X(t) + War.a(t-1) + br ) 49 | 50 | a~(t) = g (Wax.X(t) + Gr[Waa.a(t-1)] + ba) 51 | a(t) = Gu.a~(t) + (1-Gu).a(t-1) 52 | --------------- 53 | if Gr = 0, Gu = 1 ==> Traditional NN 54 | a~(t) = g (Wax.X(t) + ba) 55 | a(t) = a~(t) 56 | 57 | if Gr = 1, Gu = 1 ==> RNN 58 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 59 | a(t) = Gu.a~(t) 60 | 61 | ``` 62 | 63 | GRUs use two gates: 64 | - Update Gate (Gu​): Decides how much of the new information to use. 65 | - Balances **new information** a~(t) and **past information** a(t−1). 66 | - Gu = Sig ( Wxu.X(t) + Wau.a(t-1) + bu ) 67 | - Relevance Gate (Rt): Decides how much of the past information to forget. 68 | - Controls **how much past information to forget** while computing the new candidate activation. 
69 | - Gr = Sig ( Wxr.X(t) + War.a(t-1) + br ) 70 | 71 | 72 | ``` 73 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 74 | Gr = Sig ( Wxr.X(t) + Wcr.C(t-1) + br ) 75 | 76 | C~(t) = g (Wax.X(t) + Gr[Waa.C(t-1)] + ba) 77 | C(t) = Gu.C~(t) + (1-Gu).C(t-1) 78 | 79 | Y(t) = g (Wcy.a(t) + by) 80 | ``` 81 | 82 | 83 | ![image](https://github.com/user-attachments/assets/95737e76-f42a-4389-996e-2d662509f5f3) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a3_scraping_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai.tools import tool 3 | from scrapegraph_py import Client 4 | from pydantic import BaseModel, Field 5 | from typing import List 6 | 7 | from crewai_agents.config import llm 8 | from crewai_agents.utilis import get_scrap_api_key 9 | 10 | 11 | scrap_key = get_scrap_api_key() 12 | scrap_client = Client(api_key=scrap_key) 13 | 14 | 15 | 16 | 17 | 18 | class ProductSpec(BaseModel): 19 | specification_name: str 20 | specification_value: str 21 | 22 | class SingleExtractedProduct(BaseModel): 23 | page_url: str = Field(..., title="The original url of the product page") 24 | product_title: str = Field(..., title="The title of the product") 25 | product_image_url: str = Field(..., title="The url of the product image") 26 | product_url: str = Field(..., title="The url of the product") 27 | product_current_price: float = Field(..., title="The current price of the product") 28 | product_original_price: float = Field(title="The original price of the product before discount. Set to None if no discount", default=None) 29 | product_discount_percentage: float = Field(title="The discount percentage of the product. Set to None if no discount", default=None) 30 | 31 | product_specs: List[ProductSpec] = Field(..., title="The specifications of the product. Focus on the most important specs to compare.", min_items=1, max_items=5) 32 | 33 | agent_recommendation_rank: int = Field(..., title="The rank of the product to be considered in the final procurement report. 
(out of 5, Higher is Better) in the recommendation list ordering from the best to the worst") 34 | agent_recommendation_notes: List[str] = Field(..., title="A set of notes why would you recommend or not recommend this product to the company, compared to other products.") 35 | 36 | 37 | class AllExtractedProducts(BaseModel): 38 | products: List[SingleExtractedProduct] 39 | 40 | @tool 41 | def web_scraping_tool(page_url: str): 42 | """ 43 | An AI Tool to help an agent to scrape a web page 44 | 45 | Example: 46 | web_scraping_tool( 47 | page_url="https://www.noon.com/egypt-en/15-bar-fully-automatic-espresso-machine-1-8-l-1500" 48 | ) 49 | """ 50 | details = scrap_client.smartscraper( 51 | website_url=page_url, 52 | user_prompt="Extract ```json\n" + SingleExtractedProduct.schema_json() + "```\n From the web page" 53 | ) 54 | 55 | return { 56 | "page_url": page_url, 57 | "details": details 58 | } 59 | 60 | 61 | scraping_agent = Agent( 62 | role="Web scraping agent", 63 | # goal="To extract details from any website", 64 | goal="\n".join([ 65 | "To extract details from any website", 66 | "When you return your final JSON, it MUST use the top‑level key `products` (plural).", 67 | "Example:", 68 | "Final Answer:", 69 | "{", 70 | ' "products": [ { … }, { … } ]', 71 | "}" 72 | ]), 73 | backstory="The agent is designed to help in looking for required values from any website url. These details will be used to decide which best product to buy.", 74 | llm=llm, 75 | tools=[web_scraping_tool], 76 | verbose=True, 77 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Tea 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/) 4 | [![GitHub stars](https://img.shields.io/github/stars/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/stargazers) 5 | [![GitHub forks](https://img.shields.io/github/forks/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/network/members) 6 | [![GitHub watchers](https://img.shields.io/github/watchers/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/watchers) 7 | [![GitHub](https://img.shields.io/badge/GitHub-View_Project-blue?logo=GitHub)](https://github.com/Fawzy-AI-Explorer/NLP-Tea) 8 | 9 | ## Contents 10 | 11 | - [Introduction](#introduction) 12 | - [Content](#content) 13 | - [01-Text Preprocessing](#01-text-preprocessing) 14 | - [02-Embeddings](#02-embeddings) 15 | - [03-Models](#03-models) 16 | - [Installation](#installation) 17 | - [Templates](#templates) 18 | - [License](#license) 19 | - [Contributing](#contributing) 20 | 21 | ## Introduction 22 | 23 | This repository documents my journey learning Natural Language Processing. 24 | It includes theory notes, code examples, and useful resources for understanding and applying NLP concepts. 25 | 26 | ## Content 27 | 28 | This repository is divided into clear sections. 29 | Each section teaches something important about NLP. 30 | 31 | --- 32 | 33 | ## 01-Text Preprocessing 34 | 35 | Learn how to clean and prepare text for NLP. 
36 | Includes: 37 | - Removing stop words 38 | - Lowercasing 39 | - Tokenizing 40 | - And more 41 | 42 | --- 43 | 44 | ## 02-Embeddings 45 | 46 | Understand how to represent words as numbers (vectors) so machines can understand them. 47 | Includes: 48 | 49 | - **Label & One Hot Encoder** 50 | - **Bag Of Words** 51 | - **TF-IDF** 52 | - **Word2Vec** 53 | - **CBOW** 54 | - **Skip Gram** 55 | - **Negative Sampling** 56 | - **Fast Text** 57 | 58 | --- 59 | 60 | ## 03-Models 61 | Explore different models used in NLP. 62 | Includes: 63 | 64 | - **1D-CNN** 65 | - **RNN (Recurrent Neural Network)** 66 | - **LSTM (Long Short-Term Memory)** 67 | - **GRU (Gated Recurrent Unit)** 68 | - **Transformers** 69 | 70 | --- 71 | 72 | ## Installation 73 | 74 | To install NLP-Tea, clone the repository and install the required dependencies: 75 | 76 | ```sh 77 | git clone https://github.com/Fawzy-AI-Explorer/NLP-Tea.git 78 | cd NLP-Tea 79 | pip install -r requirements.txt 80 | ``` 81 | 82 | ## Templates 83 | 84 | Templates will be added here soon. 85 | 86 | --- 87 | 88 | ## License 89 | 90 | This project is licensed under the MIT License. See the LICENSE file for more details. 91 | 92 | ## Contributing 93 | 94 | Contributions are welcome! 95 | If you find something that can be improved, feel free to open an issue or submit a pull request. 96 | 97 | --- 98 | 99 | ## Future Topics 100 | 101 | Here are some topics planned for future inclusion: 102 | 103 | - LLMs Fine-Tuning 104 | - RAG 105 | - AI Agents using CrewAI 106 | 107 | --- 108 | 109 | Thank you for using NLP-Tea! If you have any questions or feedback, feel free to open an issue on GitHub. 110 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/examples/ex3_run_procurement_report_agent.py: -------------------------------------------------------------------------------- 1 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 2 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 3 | 4 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 5 | from crewai_agents.tasks.t2_search_engine_task import search_engine_task 6 | 7 | from crewai_agents.agents.a3_scraping_agent import scraping_agent 8 | from crewai_agents.tasks.t3_scraping_task import scraping_task 9 | 10 | from crewai_agents.agents.a4_procurement_report import procurement_report_author_agent 11 | from crewai_agents.tasks.t4_procurement_report_task import procurement_report_author_task 12 | 13 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 14 | 15 | from crewai import Crew, Process 16 | 17 | 18 | 19 | def run_search_engine_agent(): 20 | """Run the search engine agent and return the results.""" 21 | print("Running search engine agent...") 22 | # Set the AgentOps API key 23 | api_key = get_agentops_api_key() 24 | set_agentops_api_key(api_key) 25 | print("AgentOps API key set successfully.") 26 | 27 | crew = Crew( 28 | agents=[ 29 | search_queries_recommendation_agent, 30 | search_engine_agent, 31 | scraping_agent, 32 | procurement_report_author_agent 33 | ], 34 | 35 | tasks=[ 36 | search_queries_recommendation_task, 37 | search_engine_task, 38 | scraping_task, 39 | procurement_report_author_task 40 | ], 41 | verbose=True, 42 | process=Process.sequential 43 | ) 44 | print("Crew initialized successfully.") 45 | 46 | results = crew.kickoff( 47 | inputs={ 48 | 
"product_name": "book for professional development", 49 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 50 | "country_name": "Egypt", 51 | "no_keywords": 3, 52 | "language":"english", 53 | "score_th":0.1, 54 | "top_recommendations_no": 5, 55 | } 56 | ) 57 | print("Crew kickoff completed successfully.") 58 | return results 59 | 60 | if __name__ == "__main__": 61 | results = run_search_engine_agent() 62 | print("Search queries recommendation task completed successfully.") 63 | print(f"Results: {results}") 64 | 65 | 66 | # To Run This Script: 67 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 68 | # python -m examples.ex3_run_procurement_report_agent 69 | 70 | 71 | 72 | ''' 73 | Task Execution Flow 74 | 1. prompt 75 | - replaces each {…} with the value from inputs in Task description : 76 | - prompt sent to your search_queries_recommendation_agent 77 | 2. Agent → LLM 78 | - LLM generates a response based on the prompt 79 | - LLM response is a almost string of JSON object 80 | 3. Validation (output_json=SuggestedSearchQueries) 81 | 4. save the output to a file (output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json")) 82 | - dict of List of strings (queries) in JSON format 83 | 84 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 85 | # CrewAI will automatically make that available to 86 | # Task #2 under the name of the field—here, queries. 87 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT 88 | 89 | ''' -------------------------------------------------------------------------------- /02-Word Embeddings/2.3-TF_IDF/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 | 5 | [02 - BOW](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 6 | 7 | 8 | # 03 - TF-IDF 9 | 10 | ## What is TF-IDF? 11 | TF-IDF (Term Frequency–Inverse Document Frequency) is a numerical statistic that reflects how important a word is to a document in a collection (corpus). 12 | 13 | Terms like ` “the”, “on”, “at” ` may appear many times in the documents. their large counts means LOW discrimination power between documents. 14 | 15 | 16 | 17 | 18 | - Term Frequency (TF): 19 | how often a given word appears within a document, term importance within a single document. 20 | - `TF(t,d) = f(t,d) / SUM(f(t,d))` 21 | - `f(t,d)` =>> Number of times term t appears in document d 22 | - `SUM(f(t,d))` =>>> total number of terms in d = len(d) 23 | Same term has different TF values in different documents accoarding to How many times appears in this document. 24 | Term `t` appears 5 times in doc1 and 90 in doc2 25 | 26 | - Document Frequency (DF): 27 | - How many documents that a given term appear in it 28 | - `DF(t) = number of documents where the term "t" appears` 29 | term `t` appears in 10 DOCS 30 | - Inverse Document Frequency (IDF): 31 | - down scales words that appear a lot across documents. 
32 | - `IDF(t) = N/n` 33 | - `N` =>> total number of documents 34 | - `n` =>> number of documents where the term "t" appears 35 | term `t` appears in 1 DOCS over 80 Docs `DF = 1` `IDF = 80/1 = 80` 36 | - Low DF => High IDF 37 | term `t` appears in 80 DOCS over 80 Docs `DF = 80` `IDF = 80/80 = 1` 38 | - High DF => Low IDF 39 | 40 | - TF-IDF 41 | - highlight words that are `Frequent in a document` (High TF(t,d)) and `Less frequent across documents` (High IDF(t) = Low DF(t)) 42 | - `TF-IDF = TF * IDF` 43 | 44 | A high weight in tf–idf is reached by: 45 | - a high term frequency (in the given document) 46 | - a low document frequency of the term in the whole collection of documents (high Inverse 47 | Document Frequency) 48 | 49 | ## Steps 50 | 1. Corpus : A list of text documents. 51 | 2. Vocabulary : Unique Words 52 | 3. Calculate Term Frequency (TF) (BOW) 53 | - For Each Word (t) in Vocab : 54 | - For Each Doc (d) in Corpus 55 | - Calc TF(t,d) 56 | 4. Calculate Inverse Document Frequency (IDF) 57 | - For Each Word (t) in Vocab : 58 | - Clac IDF(t) = Log(N/n) 59 | 5. Construct TF-IDF Matrix 60 | - Rows : Docs 61 | - Cols : Vocab terms 62 | 63 | - For Each Word (t) in Vocab : 64 | - Clac IDF(t) 65 | - For Each Doc (d) in Corpus 66 | - Calc TF(t,d) 67 | - Calc TF-IDF(t,d) = TF(t,d) * IDF(t) 68 | 69 | 70 | 71 | ![image](https://github.com/user-attachments/assets/0be29ba1-2fc2-4fce-8aea-4600a827fcdd) 72 | ![image](https://github.com/user-attachments/assets/5f2f708e-6090-4198-827e-0019315d7b45) 73 | ![image](https://github.com/user-attachments/assets/7c1fc473-17c3-4755-80b4-20d5eb9a5301) 74 | ![image](https://github.com/user-attachments/assets/cfe56ff0-3cfa-4185-85fa-489b00008f2a) 75 | ![image](https://github.com/user-attachments/assets/248c97ce-197b-4974-9900-7bdf3f97d9b7) 76 | ![image](https://github.com/user-attachments/assets/5001a6ed-f0a6-48f7-814f-9bbee9bf3301) 77 | 78 | 79 | 80 | 81 | 82 | ## Limitations of TF-IDF 83 | 84 | - No semantic 85 | - TF-IDF treats words independently, so it doesn't capture meaning or word order. 86 | - Sparse 87 | - For large vocabularies, TF-IDF creates large sparse matrices. 88 | - OOV 89 | - Can't handle words that weren’t seen during training 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.1-Label Encoder and One Hot Encoder/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | ## 01- Label Encoder 4 | 5 | ### What It Is 6 | 7 | Represent text data as an integr values (mapping each unique Word to a unique integer (scalar) ) 8 | 9 | Used more in classical ML Algorithms (Structure Data) Features with Ordinal Characteristics ("small", "medium", "large") 10 | 11 | Suitable for tree-based models (e.g., Decision Trees, Random Forests) that do not assume ordinal relationships. 12 | 13 | Limitations : 14 | 15 | - Lack of Semantic Information: 16 | 17 | - Since each word is mapped to a single integer, the numeric distance between two encoded words depends olely on the integer values of the two given words, not on the semantic similarity between the words. 18 | 19 | - Unsuitable for NLP 20 | - we want representations that capture the meaning and relationships between words. Label encoding fails to capture semantic and contextual information because it encodes each word independently as a scalar. 21 | 22 | 23 | ### Steps 24 | 25 | 1. create corpus = list of all Dos [Doc1, Doc2, Doc3, ......] 26 | 2. 
preprocessing (take Doc as an input, out Tokens) List[List[str]] 27 | 3. Build a Vocabulary (Unique Words) 28 | - Combine tokens from all documents and create a set of unique words 29 | 4. Integer Mapping 30 | - Map each unique word to a unique integer 31 | 32 | 5. Transform the Documents 33 | - Replace each word in each document with its corresponding integer according to the mapping. 34 | 6. Post-Processing 35 | - Pad sequences: Ensure all sequences have the same length. 36 | - add Padding to ensure that all Docs has the same Lenght. 37 | 38 | 39 | 40 | 41 | 42 | ## 02- One Hot Encoder 43 | 44 | Represent text as an Binary vectors. 45 | - The vector’s length equals the number of unique categories. 46 | - All elements of the vector are 0 except for one element, which is set to 1 to indicate the presence of that category. 47 | 48 | - Distance between two vectors of two words that are One-Hot Encoded is the same (either "2" for different words and "0" for same words) 49 | 50 | - High Dimensionality , length of Each Vector equal lenght voab (Unique words) (e.g. Unique=10000) 51 | 52 | - the Vector is a binaly (All 0 except one position only is 1) 53 | 54 | 55 | 56 | ### Steps 57 | 58 | 1. Apply Label Encoder (Mapp Each Unique Word to Integer Value) 59 | 2. Create Binary Vectors 60 | - For each unique Word, create a binary vector all 0 except the index of the integer, (lenght = len(Vocab) = len(Unique_Words)). 61 | 3. Transform 62 | - Replace each Word with its corresponding binary vector. 63 | 64 | ''' 65 | 66 | Doc1: "cat sat on the mat" 67 | Doc2: "dog barked at the cat" 68 | --------------------- 69 | corpus = ["cat sat on the mat", "dog barked at the cat" ] 70 | processed_corpus = [ [ "cat", "sat", "on", "the", "mat" ], [ "dog", "barked", "at", "the", "cat" ]] 71 | Vocabulary (Unique Words) = ["at", "barked", "cat", "dog", "mat", "on", "sat", "the"] 72 | Label Encoding : 73 | ["at":0, "barked":1, "cat":2, "dog":3, "mat":4, "on":5, "sat":6, "the":7] 74 | One-Hot Encoding : 75 | "at" (index 0) : [1, 0, 0, 0, 0, 0, 0, 0] 76 | "barked" (index 1): [0, 1, 0, 0, 0, 0, 0, 0] 77 | "cat" (index 2) : [0, 0, 1, 0, 0, 0, 0, 0] 78 | "dog" (index 3) : [0, 0, 0, 1, 0, 0, 0, 0] 79 | "mat" (index 4) : [0, 0, 0, 0, 1, 0, 0, 0] 80 | "on" (index 5) : [0, 0, 0, 0, 0, 1, 0, 0] 81 | "sat" (index 6) : [0, 0, 0, 0, 0, 0, 1, 0] 82 | "the" (index 7) : [0, 0, 0, 0, 0, 0, 0, 1] 83 | 84 | Doc1 : "cat sat on the mat" 85 | = [ 86 | [0, 0, 1, 0, 0, 0, 0, 0], 87 | [0, 0, 0, 0, 0, 0, 1, 0] , 88 | [0, 0, 0, 0, 0, 1, 0, 0], 89 | [0, 0, 0, 0, 0, 0, 0, 1], 90 | [0, 0, 0, 0, 1, 0, 0, 0] 91 | ] 92 | 93 | 94 | ''' 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_2_search_results.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": [ 3 | { 4 | "title": "Business Strategies for Dummies", 5 | "url": "https://www.amazon.com/Business-Strategies-Dummies-Peter-McMullen/dp/1234567890", 6 | "content": "This book provides a comprehensive guide to business strategies, covering essential topics such as market analysis, financial planning, and leadership development.", 7 | "score": 0.9, 8 | "search_query": "best business strategies books Amazon Egypt" 9 | }, 10 | { 11 | "title": "Time Management Techniques for Students", 12 | "url": "https://www.jumia.com/eg/study-products/time-management-techniques-for-students", 13 | 
"content": "Learn effective time management techniques tailored for students, including prioritization, productivity tools, and study strategies.", 14 | "score": 0.85, 15 | "search_query": "time management techniques Jumia Egypt" 16 | }, 17 | { 18 | "title": "Project Management Guide for Startups", 19 | "url": "https://www.noon.com/eg/project-management-guide-for-startups", 20 | "content": "A step-by-step guide to managing projects effectively, focusing on small and medium-sized startups.", 21 | "score": 0.88, 22 | "search_query": "project management guide Noon Egypt" 23 | }, 24 | { 25 | "title": "Six Sigma Principles for Business Improvement", 26 | "url": "https://www.amazon.com/Six-Sigma-Principles-Business-Improvement/dp/9876543210", 27 | "content": "Essential Six Sigma principles and methodologies for improving business processes and quality.", 28 | "score": 0.92, 29 | "search_query": "Six Sigma principles Amazon Egypt" 30 | }, 31 | { 32 | "title": "Effective Communication Skills for Professionals", 33 | "url": "https://www.jumia.com/eg/professionals-communication-skills", 34 | "content": "Master effective communication skills to enhance professionalism and build stronger relationships in the workplace.", 35 | "score": 0.89, 36 | "search_query": "effective communication skills Jumia Egypt" 37 | }, 38 | { 39 | "title": "Digital Marketing Strategies for Small Businesses", 40 | "url": "https://www.noon.com/eg/digital-marketing-strategies-small-businesses", 41 | "content": "Learn how to use digital marketing tools and strategies to grow your small business online.", 42 | "score": 0.87, 43 | "search_query": "digital marketing strategies Noon Egypt" 44 | }, 45 | { 46 | "title": "Lean Manufacturing Techniques for Small Manufacturers", 47 | "url": "https://www.amazon.com/Learn-Lean-Manufacturing-Techniques/dp/1234567890", 48 | "content": "Essential lean manufacturing techniques to improve efficiency and reduce waste in small manufacturers.", 49 | "score": 0.91, 50 | "search_query": "lean manufacturing techniques Amazon Egypt" 51 | }, 52 | { 53 | "title": "Leadership and Negotiation Skills for Professionals", 54 | "url": "https://www.jumia.com/eg/leadership-negotiation-skills", 55 | "content": "Master leadership and negotiation skills to achieve better outcomes in professional relationships.", 56 | "score": 0.86, 57 | "search_query": "leadership and negotiation Jumia Egypt" 58 | }, 59 | { 60 | "title": "Financial Planning Guide for Entrepreneurs", 61 | "url": "https://www.noon.com/eg-financial-planning-entrepreneurship", 62 | "content": "A detailed guide to financial planning, helping entrepreneurs manage their finances effectively.", 63 | "score": 0.84, 64 | "search_query": "financial planning guide Noon Egypt" 65 | }, 66 | { 67 | "title": "Risk Management Strategies for Businesses", 68 | "url": "https://www.amazon.com/Risk-Management-Strategies-Businesses/dp/1234567890", 69 | "content": "Learn effective risk management strategies to protect your business from potential challenges.", 70 | "score": 0.88, 71 | "search_query": "risk management strategies Amazon Egypt" 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.2-RNNs/3.2-RNNs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "4834e2df", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch\n", 11 | "import 
torch.nn as nn\n", 12 | "import torch.nn.functional as F" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 7, 18 | "id": "3d439041", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "\n", 26 | "Final output keys: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import torch\n", 32 | "import torch.nn as nn\n", 33 | "import torch.nn.functional as F\n", 34 | "\n", 35 | "\n", 36 | "class SimpleRNN(nn.Module):\n", 37 | " def __init__(self, input_size=9, hidden_size=4, output_size=3):\n", 38 | " super(SimpleRNN, self).__init__()\n", 39 | "\n", 40 | " self.hidden_size = hidden_size\n", 41 | "\n", 42 | " self.input_to_hidden = nn.Linear(input_size, hidden_size) # 9x4\n", 43 | " self.hidden_to_hidden = nn.Linear(hidden_size, hidden_size) # 4x4\n", 44 | " self.hidden_to_output = nn.Linear(hidden_size, output_size) # 4x3\n", 45 | "\n", 46 | "\n", 47 | " def forward(self, inputs): # (10,9)\n", 48 | " steps_output, hidden_states = {}, {}\n", 49 | "\n", 50 | " hidden_states[-1] = torch.zeros((1, self.hidden_size)) # (1,4)\n", 51 | "\n", 52 | "\n", 53 | " for t in range(len(inputs)):\n", 54 | " x = inputs[t].reshape(1,9) # (1,9)\n", 55 | "\n", 56 | " hidden_cur = self.input_to_hidden(x) # (1,9) * (9,4) = (1,4)\n", 57 | " h_prev = self.hidden_to_hidden(hidden_states[t - 1]) # (1,4) * (4,4) = (1,4)\n", 58 | " hidden_states[t] = torch.tanh(hidden_cur + h_prev) # (1,4) + (1,4) = (1,4)\n", 59 | "\n", 60 | " y_t = self.hidden_to_output(hidden_states[t]) # (1,4) * (4,3) = (1,3)\n", 61 | "\n", 62 | " steps_output[t] = y_t\n", 63 | "\n", 64 | " return steps_output, hidden_states\n", 65 | "\n", 66 | "\n", 67 | "if __name__ == '__main__':\n", 68 | " sequence_length = 10\n", 69 | " input_size = 9\n", 70 | " hidden_size = 4\n", 71 | " output_size = 3\n", 72 | "\n", 73 | " model = SimpleRNN(input_size, hidden_size, output_size)\n", 74 | "\n", 75 | " inputs = [torch.randn(input_size) for _ in range(sequence_length)]\n", 76 | "\n", 77 | " output, hidden_states = model(inputs)\n", 78 | " print(\"\\nFinal output keys:\", output.keys())" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 8, 84 | "id": "de18d764", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "\n", 92 | "Final output keys: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "sequence_length = 10\n", 98 | "input_size = 9\n", 99 | "hidden_size = 4\n", 100 | "output_size = 3\n", 101 | "\n", 102 | "model = SimpleRNN(input_size, hidden_size, output_size)\n", 103 | "\n", 104 | "inputs = [torch.randn(input_size) for _ in range(sequence_length)] # 10 sequences, each 9 features\n", 105 | "\n", 106 | "output, hidden_states = model(inputs)\n", 107 | "print(\"\\nFinal output keys:\", output.keys())" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "46534bb0", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "myenv", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.12.6" 136 | 
} 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 5 140 | } 141 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory/3.5.3-self attention.md: -------------------------------------------------------------------------------- 1 | # Self Attention 2 | 3 | ## Goal 4 | - Each Word computes a new Embedding by attending to all other Wordss, weighted similarity. 5 | - to compute a weighted representation of a sequence by allowing each token to focus on ("attend to") other tokens in the sequence. 6 | In other words: 7 | - "How much should this word pay attention to other words ?" 8 | 9 | 10 | ## How 11 | 12 | ### Replace RNNs with Attention Blocks 13 | - In traditional RNNs, we process words one by one. 14 | - In self-attention, we process all words at once, using attention blocks instead of RNN cells. 15 | 16 | - For a sentence with 4 words → we use 4 attention blocks (one per word). 17 | - Each block 18 | - Input : word embedding Xi 19 | - outputs : new Wrd embeding Yi 20 | ### Each Block = One Word's Attention Processing 21 | - Each word Xi updates itself by looking at other words in the sentence and deciding how each one are important. 22 | ![image](https://github.com/user-attachments/assets/ad44df6e-b54c-40f8-b349-f5b3ab84ae6a) 23 | 24 | 25 | --- 26 | ### HOW 27 | e.g. we are on block 2, So Embedding of X2 Will Update 28 | 1. Similarity (Attention Scores) 29 | - W21 = cos Sim (X1,X2) = X1.X2 / |X1|*|X2| ,,,Range [-1, +1] 30 | - W22 = cos Sim (X2,X2) = X2.X2 / |X2|*|X2| ,,,will be max number X2 is Similar to X2 31 | - W23 = cos Sim (X3,X2) = X3.X2 / |X3|*|X2| 32 | - W24 = cos Sim (X4,X2) = X4.X2 / |X4|*|X2| 33 | 34 | scores SHowing how each Word Similar to X2 35 | scores showing how much attention X2 should give to each word 36 | COS Similarity Range from [1-, +1] Bounded But i want Probability Sum to ONE, So i will apply Sofe Max 37 | 38 | - Softmax (1,2,3,4) = 0.03, 0.08, 0.23, 0.64 39 | - Softmax (5,10,15,20) = 0, 0, 0.0067, 0.99 40 | - Softmax (10,20,30,40) = 0, 0, 0, 1 41 | We Want to Normalize the Scores 42 | 43 | ![image](https://github.com/user-attachments/assets/4d87739a-567e-41fc-9cc1-3d5e9b5fd6bc) 44 | 45 | 46 | 2. Normalize the Scores 47 | - W2j = W2j / SQRT(d) 48 | - d => Dim of Embedding NOT seq len 49 | 3. Softmax of Scores 50 | - Softmax (W2j) Range [0, +1] 51 | - e.g. W25 => tells us how X2, X5 are Similar and how much weight to assign to X5 52 | 53 | ![image](https://github.com/user-attachments/assets/2afdfb71-8811-4dff-ad2f-970a5c4a4075) 54 | 55 | 56 | 4. Weighted Sum = New Word Embedding 57 | - Y2 = SUM (W2j * Xj) 58 | the new representation for word 2 is a weighted SUM of all the words in the sentence, based on how much attention it gave to each one. 59 | 60 | ![image](https://github.com/user-attachments/assets/83e5b92e-446a-4c0c-951c-b1f452d1d129) 61 | 62 | 63 | ``` 64 | Wij = Xi.T * Xj / |Xi| * |Xj| 65 | ``` 66 | The attention score between word i and word j is the dot product between their embeddings. 67 | This works well — but we want more flexibility. 
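The boxed recap of these four steps follows right below, and the learned Q/K/V version comes after it. As a runnable reference, here is a minimal NumPy sketch of the plain version just described (similarity scores, scaling by sqrt(d), softmax, weighted sum); NumPy and the toy shapes are assumptions, and for simplicity it uses a scaled dot product without the explicit cosine normalization.

```python
# Minimal sketch of self-attention with no learned weights:
# dot-product scores, scaled by sqrt(d), softmax over each row, then a weighted sum.
import numpy as np

def simple_self_attention(X: np.ndarray) -> np.ndarray:
    d = X.shape[1]
    scores = X @ X.T / np.sqrt(d)                               # Wij = Xi . Xj / sqrt(d)
    weights = np.exp(scores - scores.max(axis=1, keepdims=True))
    weights = weights / weights.sum(axis=1, keepdims=True)      # softmax, each row sums to 1
    return weights @ X                                          # Yi = SUM_j (Wij * Xj)

X = np.random.randn(4, 8)       # toy input: 4 words, embedding size 8
Y = simple_self_attention(X)    # new embeddings, same shape (4, 8)
```

Each row of `Y` is the new embedding of one word, built as a weighted sum of all the word embeddings in the sentence.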
68 | ``` 69 | Wij = Xi.T * Xj / |Xi| * |Xj| 70 | Wij = Wij / SQRT (d) 71 | Wij = Softmax (Wij) 72 | Yi = SUM (Wij) * Xj 73 | ``` 74 | ![image](https://github.com/user-attachments/assets/2d706875-8b55-438a-bd8e-769d68726ac4) 75 | ``` 76 | select salary from t 77 | where Age = 20 >>>>>>>>>> 2000 78 | 79 | select salary from t 80 | where Age = 35 >>>>>>>>>> Not Found 81 | what if there is away to find it 82 | Similarity between Query and All Keys to find Age 35 similar to each keys 83 | Age = 35 , it it between (Similar) Age = 20 , 35 84 | We can say Salary = 0.5 * 2000 + 0.5 * 20000 85 | Salary = 0*200 + 0.5*2000 + 0.5*20000 + 0*200 = SUM (W*V) => Weighted Sum of values 86 | ``` 87 | ------------- 88 | ------------- 89 | 90 | - Query ===> 91 | - X we Work on it (Xi) (X2) Will Update 92 | - Keys ====> 93 | - Wij = Xi.T * Xj / |Xi| * |Xj| Here Xj are the Keys 94 | - Value ===> 95 | - Y2 = SUM (W2j * Xj) Here Xj are the Values 96 | 97 | 98 | Old 99 | ``` 100 | Wij = Xi.T * Xj / |Xi| * |Xj| 101 | Wij = Wij / SQRT (d) 102 | Wij = Softmax (Wij) 103 | Yi = SUM (Wij) * Xj 104 | ``` 105 | New 106 | ``` 107 | Q = Wq * Xi 108 | K = Wq * Xj 109 | V = Wq * Xj 110 | 111 | Wij = Q.T * K / |Q| * |K| 112 | Wij = Wij / SQRT (d) 113 | Wij = Softmax (Wij) 114 | Yi = SUM (Wij) * V 115 | ``` 116 | ![image](https://github.com/user-attachments/assets/063f17ad-1b82-4c04-8fd3-5ed566bdab42) 117 | 118 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.2-RNNs/Theory.md: -------------------------------------------------------------------------------- 1 | # RNNs (Recurrent Neural Networks) 2 | 3 | ## Sequential Data 4 | - Sequential data is any data where the order of elements matters. 5 | - Examples: Text, Videos, Speech 6 | - Sequence Models are designed for sequential data 7 | - Examples: RNNs, LSTMs, GRUs, Transformers 8 | 9 | ## Temporal vs. Spatial Data 10 | - Temporal Data: sequences that changr over time (related to **time**) 11 | - Time Series : Stock prices, Weather data 12 | - Speech signals: (audio changes over time) 13 | - Video: sequence of frames over time 14 | - Text: (sequence of words or characters) 15 | - Spatial Data: Refers to data associated with spatial locations.(structure in space(2-D grid)) 16 | - Images: Pixels arranged in a grid. 17 | - Video: Each video frame is essentially an image (Pixels) 18 | - maps: Geographic data for specific areas. 19 | 20 | ## What is RNNs ? 21 | - RNNs are neural networks specially designed for sequential data. 22 | - They remember past information using a "history vector". 23 | - Great for tasks where order and context matter (e.g. language, time series) 24 | 25 | ## Why RNN Not FC? 26 | - FC networks: 27 | - Expect fixed-size input and output 28 | - Can’t handle variable-length sequences well 29 | - Don’t remember previous inputs 30 | - Doesn’t retain info from earlier words/time steps. 31 | - Ignore the order of inputs 32 | - No temporal structure. 33 | - Traditional neural networks process input data without considering sequence or time-based dependencies. 34 | 35 | - RNNs 36 | - Sequential Data Handling (where the order of inputs matters) 37 | - RNNs retain information from previous steps, making them suitable for tasks that require understanding context or history 38 | - Efficiency with Variable-Length Inputs: RNNs can handle variable-length sequences naturally. FCNs require fixed-length inputs 39 | - Translation 40 | - "This is good" vs. "I can't say this is good." 
41 | - FC treats "good" the same in both sentences. 42 | - RNN understands the context around "good". 43 | 44 | ### How FC and RNN Process Video or Text: 45 | 46 | - FC 47 | - dataset (Videos) 48 | - For video with 100 frames, each 256x256: 49 | - Whole video = 3D tensor (100 x 256 x 256) fed at once 50 | - dataset (sentences) 51 | - For text: 52 | - Join all words into one long vector and feed at once 53 | - FC doesn’t see relationships between frames or words 54 | FC diesn't take relation Between Pixels or Words , Video or Sentence Feed to the Network 55 | - RNNs 56 | - Processes data step-by-step (Sequential): 57 | - For video: frame-by-frame 58 | - For text: word-by-word 59 | - At each step (t) Take 2 vecctor as an input: 60 | 1. Takes current input vector : Represent Cur Frame (t) 61 | 2. history vector (summary of all previous inputs) : Represent Frame 0 to Frame t-1) 62 | - Updates the history after each step 63 | 64 | ### How RNNs Handle Different Sizes Input. 65 | Sentences With n Words , Each Word Represented as vector ( len = 90 ) 66 | - Input Layer >> Number of Neurons : 90 Nodes 67 | - Hidden Layer (RNN) >> Number of Neurons: براحتك = History Vector Size 68 | - Output Layer >> Number of Neurons :Depends on the Task 69 | لي بقا مش مهم كل فيديو فيه كام فريم ؟ لانك مثلا اول فيديو فيه 25 فريم ف انت هتخش ع النتورك اول مرة باول فريم وتاني مرة ب (تاني فريم وهستوري) وهكذا مش فارقه معاك عدد الفريمز لانه كدا كدا مش هتدخلهم كلهم مرة واحده انت شغال فريم فريم انظر للكود اللي تحت لمزيد من التفاصيل . 70 | فكر ف الموضوع انه معاك فيديو هتدخله ل RNN Layer فريم فريم ومش هتخش ع الفيديو اللي بعده غير لما يخلص خالص كل الفريمات . 71 | - Each step depends on ALL the previous steps 72 | - (The k-th frame depends on all previous k-1 frames !!!! ) 73 | - at each step we have 2 inputs only : 74 | - The k-th feature vector for the k-th frame 75 | - History vector representing the frames from 1 to k-1 76 | - After each k-th step, the History vector will be updated to represent inputs from 1 to k !!!!! 77 | 78 | 79 | 80 | ## RNN Architecture 81 | 82 | ال RNN Layer هيشتغل على كل Frame لوحده ويعرف يستغل ال History او ال Frames السابقة. 83 | لو معاك 100 frame يعني السامبل الواحد فيه 100 Frame ==> 84 | هيبقى معاك 100 Vec هيدخله واحد واحد و مع كل vec داخل للنتورك بتدخل معاه vec 1 بيعبر عن ملخص كل اللي فات History 85 | لو انت عند ال Frame K هتدخل للنتورك ===> vec K بيعبر عن الحالى و Vec from 1 to K-1 ده ال History وطبعا كل مرة هتعمل Update لل History ده تضيف عليه ال frame الحالي 86 | ![image](https://github.com/user-attachments/assets/6af25a3e-68de-4f79-b16e-a7a3f9fa0db8) 87 | 88 | ``` 89 | in FC : Input x(Video/Sentence) fed at once 90 | 91 | a = g ( Wax . X + ba ) 92 | y = g ( Wya . a + by ) 93 | ---------------------------------------- 94 | ---------------------------------------- 95 | 96 | in RNNs : Work on Steps (X[0], X[1], ......., X[t]) 97 | 98 | a(t) = g ( Wax . X(t) + Waa . a(t-1) + ba) 99 | y(t) = g ( Wya . 
a(t) + by) 100 | ----------- 101 | a(0) = 0 ===> this is the History >>>>> History Vector Size = Number of Hidden State on RNN Layer 102 | ***** 103 | a(0) = 0 104 | a1 = g(Wax * X1 + Waa * a0 + ba) 105 | y1 = g(Wya * a1 + by) 106 | ***** 107 | a2 = g(Wax * X2 + Waa * a1 + ba) 108 | y2 = g(Wya * a2 + by) 109 | ***** 110 | a3 = g(Wax * X3 + Waa * a2 + ba) 111 | = g(Wax * X3 + Waa * (g(Wax * X2 + Waa * (g(Wax * X1 + Waa * a0 + ba)) + ba)) + ba) 112 | Wax ==> Shared Weights Through Time 113 | Waa ==> Shared Weights Through Time 114 | 115 | y3 = g(Way * a3 + b) 116 | y3 = g(X3, X2, X1 117 | 118 | ``` 119 | ![image](https://github.com/user-attachments/assets/42614b7e-1e26-4a48-868f-9307161879c1) 120 | 121 | 122 | ## RNNS Types 123 | - Ont to One 124 | - One to Many 125 | - Image Caption 126 | - Many to One 127 | - Sentiement Analysis 128 | - Many to Many 129 | - Translation 130 | 131 | -------------------------------------------------------------------------------- /01-Text-Preprocessing/1.1-Text-Preprocessing/Theory.md: -------------------------------------------------------------------------------- 1 | # Text Preprocessing 2 | 3 | # What ... ? 4 | - the process of cleaning and transforming raw text into a format suitable for NLP tasks 5 | - first step of NLP projects 6 | 7 | # Why ... ? 8 | - Text data often contains noise such as punctuation, special characters, and irrelevant symbols. Preprocessing helps remove these elements 9 | - Different forms of words (e.g., “run,” “running,” “ran”) can convey the same meaning but appear in different forms. Preprocessing techniques like stemming and lemmatization help standardize these variations 10 | - raw text has Mixed cases ("Hello" , "hello") Models treat "Hello" and "hello" as different words 11 | and more... 12 | 13 | # How ... ? 14 | 15 | ## Lowercase 16 | Converts text to lowercase ("Hello WORLD" =>>> "hello world") 17 | #### Apply & Avoid for: 18 | - apply If the case (Capital or lower) does not contain information 19 | - Search engines (to normalize queries) 20 | - If your goal is just to classify 21 | - Sentiment analysis, Spam Detection, Topic Classification (NLP, nlp) are Same 22 | - Avoid : 23 | - Machine translation 24 | - POS (Parts-of-speech tagging (like noun, verb, adjective)) 25 | 26 | Chat GPT Said:
27 | If you're not sure, just ask: 28 | || “Does capitalization change the meaning in my task?” || 29 | If no, lowercase away. If yes, preserve it 30 | 31 | ## Remove URLs, mentions, hashtags 32 | Deletes symbols like !@#,. and urls 33 | ### Apply & Avoid for: 34 | - Apply for : Social media analysis, Topic modeling 35 | - Avoid for: If URLs/hashtags carry meaning (trend analysis) 36 | 37 | ## Remove punctuation & numbers & White Spaces 38 | - Deletes noise like . , ! ? ) : " 123 39 | #### Apply & Avoid for: 40 | - Apply for : Sentiment analysis (if numbers are irrelevant), Document classification 41 | - Avoid : If punctuation carries emotion, number-sensitive 42 | - emotion detection : "Sad :(" 43 | - math problems 44 | - Financial/medical texts ("COVID-19") 45 | 46 | ## Tokenize 47 | Splits text into words or tokens ("I love NLP" → ["I", "love", "NLP"]) 48 | 49 | ## Remove stopwords 50 | Deletes (Stop Words) common words ("is", "the", "and"). 51 | #### Apply & Avoid for: 52 | - Apply for : Topic modeling 53 | - Avoid : If stop words carries Informations 54 | - Sentiment analysis ("not", "never" are stopwords but means negation) 55 | - Machine translation (stopwords are Important) 56 | 57 | 58 | ## Stemming & Lemmatization 59 | - return Word Base ("playing" => Play) 60 | #### Apply & Avoid for: 61 | - Apply for : Spam detection, Search engines, Sentiment analysis 62 | - Avoid for : generative tasks (Summarization or translation) 63 | 64 | ## Custom Rules 65 | - replace emojis with text "🙂" → "[smile]") Social media sentiment, reviews analysis 66 | 67 | 68 | 69 | 70 | --- 71 | Text preprocessing is task-specific
72 | The preprocessing steps you choose should always depend on:
73 | 74 | - NLP Task 75 | - Sentiment Analysis 76 | - Lowercasing, remove URLs, convert emojis to text 77 | - Avoid removing negations ("not") or emojis 78 | - Topic Classification 79 | - Lowercasing, stopword removal, stemming/lemmatization 80 | - Machine Translation 81 | - Keep sentence structure 82 | - Avoid removing punctuation and stopwords 83 | - Text Generation (GPT) 84 | - Avoid changing the text 85 | 86 | - Model 87 | - Traditional ML (SVM, Regression) 88 | - Heavier cleaning: lowercasing, stopword removal, stemming 89 | - Transformers (BERT) 90 | - Minimal cleaning 91 | 92 | - Dataset 93 | - Tweets 94 | - Product reviews 95 | - Scientific texts 96 | 97 | --- 98 | Chat GPT Said:
99 | Always Ask Yourself
100 | Before preprocessing, ask:
101 | - What is the goal of my task? 102 | - Will this step remove or distort useful information? 103 | - What model am I using, and does it need clean or natural text? 104 | --- 105 | 106 | 107 | ## Stemming & Lemmatization 108 | 109 | The goal of both stemming and lemmatization is to reduce: 110 | 111 | - inflectional forms and derivationally related forms of a word to a common base form 112 | 113 | 114 | ![image](https://github.com/user-attachments/assets/5e647b23-f61d-4a14-b1b4-da60ca14137c) 115 | 116 | #### Stemming 117 | 118 | - the process of reducing inflected words to their stem (removing common affixes (prefixes, suffixes) from words) 119 | - the process of removing the last few characters of a given word to obtain a shorter form, even if that form has no meaning on its own 120 | - Rule-based algorithm 121 | 122 | ![image](https://github.com/user-attachments/assets/8594aa9d-4acb-4930-8ca0-3e3c5b59e3e9) 123 | 124 | 125 | 126 | 127 | #### Lemmatization 128 | 129 | The purpose of lemmatization is the same as that of stemming, but it overcomes the drawbacks of stemming through the
130 | use of a vocabulary and morphological analysis of words.
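A quick runnable contrast of the two (NLTK is already listed in this section's requirements.txt; the word list is just illustrative), before the classic "saw" example below:

```python
# Rule-based stemmer vs. dictionary/POS-aware lemmatizer, using NLTK.
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download("wordnet", quiet=True)  # resource needed by the lemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in ["running", "studies", "better", "saw"]:
    print(
        word,
        "| stem:", stemmer.stem(word),
        "| lemma (as verb):", lemmatizer.lemmatize(word, pos="v"),
    )
# The Porter stemmer keeps "saw" as "saw"; the lemmatizer maps it to "see"
# only when told the token is used as a verb (pos="v").
```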
131 | 132 | the token saw
133 | - stemming might return just s, (remove aw) 134 | - lemmatization would attempt to return either see or saw 135 | - depending on whether the use of the token was as a verb or a noun. 136 | 137 | 138 | ![image](https://github.com/user-attachments/assets/faca7b47-8096-45e8-8b11-0b7025c81bbe) 139 | 140 | 141 | - Tokenization : 142 | - POS Tagging: Parts-of-speech tagging (like noun, verb, adjective, etc.) 143 | - Lemmatization: 144 | - Simple dictionary lookup. This works well for straightforward inflected forms, 145 | - Hand-crafted rule based system 146 | - Rules learned automatically from an annotated corpus. 147 | 148 | 149 | 150 | 151 | 152 | 153 | - Stemming: Faster, but may create Wrong root for words and lose meaning. This is known as "over stemming." 154 | 155 | - Lemmatization: slower, More accurate, preserves meaning and grammatical function. 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/README.md: -------------------------------------------------------------------------------- 1 | # CrewAI Procurement Agents 🤖 2 | 3 | [![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://www.python.org/downloads/) 4 | [![CrewAI](https://img.shields.io/badge/CrewAI-0.1.30+-orange.svg)](https://github.com/joaomdmoura/crewAI) 5 | [![LangChain](https://img.shields.io/badge/LangChain-0.0.335+-green.svg)](https://github.com/langchain-ai/langchain) 6 | [![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 7 | [![Contributions](https://img.shields.io/badge/Contributions-Welcome-brightgreen.svg)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/issues) 8 | [![Stars](https://img.shields.io/github/stars/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/stargazers) 9 | 10 | A modular AI agent system built with CrewAI for product research, procurement, and analysis in e-commerce environments. 11 | 12 | ## Project Overview 🔍 13 | 14 | This project implements a multi-agent system using CrewAI to automate the process of researching products, searching e-commerce websites, scraping relevant information, and generating procurement reports. The system is designed to help businesses make informed purchasing decisions by collecting and analyzing product data from various online sources. 15 | 16 | ## Features ✨ 17 | 18 | - **Search Query Generation**: AI agent that generates optimized search queries for product research 19 | - **Search Engine Processing**: Agent that queries e-commerce sites and extracts relevant results 20 | - **Web Scraping**: Agent that collects detailed product information from search results 21 | - **Procurement Reports**: Agent that analyzes scraped data and creates comprehensive procurement reports 22 | 23 | ## Installation 💻 24 | 25 | 1. Clone the repository: 26 | ```bash 27 | git clone https://github.com/Fawzy-AI-Explorer/NLP-Tea.git 28 | cd NLP-Tea/04-crewai-agents/4.1-AI\ Agents\ using\ CrewAI\ \(\ Abu\ Bakr\ Soliman\)/crewai-agents 29 | ``` 30 | 31 | 2. Create and activate a virtual environment (recommended): 32 | ```bash 33 | # For Windows 34 | python -m venv venv 35 | source venv\Scripts\activate 36 | ``` 37 | 38 | 3. Install required packages: 39 | ```bash 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | 4. 
Set up your environment variables: 44 | ```bash 45 | # Create a .env file with your API keys 46 | OPENAI_API_KEY=your_openai_api_key 47 | AGENTOPS_API_KEY=your_agentops_api_key 48 | # Add other API keys as needed 49 | ``` 50 | 51 | ## Project Structure 📂 52 | 53 | ``` 54 | crewai-agents/ 55 | │ 56 | │ 57 | ├── crewai_agents/ - Core module containing all agent definitions 58 | │ ├── __init__.py - Package initialization 59 | │ ├── config.py - Configuration settings 60 | │ ├── utilis.py - Utility functions 61 | │ │ 62 | │ ├── agents/ - Individual agent implementations 63 | │ │ ├── __init__.py 64 | │ │ ├── a1_search_queries_agent.py - Search query generation agent 65 | │ │ ├── a2_search_engine_agent.py - Search engine processing agent 66 | │ │ ├── a3_scraping_agent.py - Web scraping agent 67 | │ │ └── a4_procurement_report.py - Procurement report generation agent 68 | │ │ 69 | │ │ 70 | │ └── tasks/ - Task definitions for each agent 71 | │ ├── __init__.py 72 | │ ├── t1_search_queries_task.py - Search query generation task 73 | │ ├── t2_search_engine_task.py - Search engine task 74 | │ ├── t3_scraping_task.py - Web scraping task 75 | │ └── t4_procurement_report_task.py - Procurement report generation task 76 | │ 77 | ├── examples/ - Example scripts to run individual agents or full workflows 78 | │ ├── ex1_run_search_queries_agent.py - Run search queries agent 79 | │ ├── ex2_run_search_engine_agent.py - Run search engine agent 80 | │ └── ex3_run_procurement_report_agent.py - Run procurement report agent 81 | | 82 | │ 83 | ├── outputs/ - Output directory for agent results 84 | │ └── ai-agent-output/ - JSON outputs from agent runs 85 | │ ├── step_1_suggested_search_queries.json - Output from search queries agent 86 | │ ├── step_2_search_results.json - Output from search engine agent 87 | │ ├── step_3_scraping_results.json - Output from web scraping agent 88 | │ └── step_4_procurement_report.html - Final procurement report output 89 | │ 90 | ├── tests/ - Unit and integration tests 91 | │ └── test.py - Test script 92 | │ 93 | ├── requirements.txt - Project dependencies 94 | └── README.md - Project documentation 95 | ``` 96 | 97 | ## Output Files 📁 98 | 99 | The agents produce the following output files during execution: 100 | 101 | ``` 102 | outputs/ 103 | └── ai-agent-output/ 104 | ├── step_1_suggested_search_queries.json - Output from search queries agent 105 | ├── step_2_search_results.json - Output from search engine agent 106 | ├── step_3_scraping_results.json - Output from web scraping agent 107 | └── step_4_procurement_report.html - Final procurement report output (HTML format) 108 | ``` 109 | 110 | ## Usage 🚀 111 | 112 | ### 1. Generate Search Queries 🔎 113 | 114 | ```python 115 | from examples.ex1_run_search_queries_agent import run_search_queries_agent 116 | 117 | results = run_search_queries_agent() 118 | print(results) 119 | ``` 120 | 121 | ### 2. Run Search Engine Agent 🌐 122 | 123 | ```python 124 | from examples.ex2_run_search_engine_agent import run_search_engine_agent 125 | 126 | results = run_search_engine_agent() 127 | print(results) 128 | ``` 129 | 130 | ### 3. Generate Procurement Report 📊 131 | 132 | ```python 133 | from examples.ex3_run_procurement_report_agent import run_procurement_report_agent 134 | 135 | results = run_procurement_report_agent() 136 | print(results) 137 | ``` 138 | 139 | ## Complete Workflow 🔄 140 | 141 | workflow: 142 | 1. Generates optimized search queries for your product requirements 143 | 2. Searches e-commerce sites using these queries 144 | 3. 
Scrapes detailed product information from search results 145 | 4. Produces a comprehensive procurement report with recommendations 146 | 147 | ## License 📜 148 | 149 | ## Contributing 🤝 150 | 151 | Contributions are welcome! Please feel free to submit a Pull Request. 152 | 153 | ## Acknowledgments 🙏 154 | 155 | - Thanks to the [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/) for this [crash course](https://www.youtube.com/watch?v=DDR4A8-MLQs&t=1s) -------------------------------------------------------------------------------- /02-Word Embeddings/2.4-Word2Vec/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 | 5 | [02 - BOW](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 6 | 7 | [03 - TF-IDF](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.3-TF_IDF) 8 | 9 | ## 04 - Word2Vec 10 | 11 | Word2Vec is a neural network-based method that learns to represent words as vectors in a continuous vector (word embeddings). where words with similar meanings have similar vectors. Word2Vec provides a way to capture the semantic relationships between words through neural networks. 12 | 13 | Map each word to a dense vector such that words with common contexts in the corpus have similar vector representations. 14 | 15 | 16 | - Dimensionality Reduction: 17 | - Instead of representing words as one-hot vectors (High Dimension len vec = len(Vocab) = Number of unique words and sparse), Word2Vec produces dense vectors, low-dimensional vectors. 18 | - Semantic: 19 | - By Using `Context Words` Based on `window size` 20 | - Computational Efficiency: 21 | - By Using `Negative Sampling Method` 22 | 23 | ## Goal 24 | 25 | Not predict Context But to Leaern Vector representation of target words 26 | By predicting or using the context Words, Word2Vec `learns` the structure of the language. 27 | The main objective of Word2Vec is to learn word embeddings that: 28 | - Capture Semantic Relationships: Words with similar meanings are represented by similar vectors. 29 | 30 | ## Target, Context 31 | ![image](https://github.com/user-attachments/assets/425a0cf4-68c1-476c-9061-fa13ae335f18) 32 | 33 | - Target Word: 34 | - in a particular step: It is the `center` word that the model wants to learn a good representation for. 35 | - Each word will be a target in a specific step 36 | - Context Words: 37 | - The words surrounding the target word within window size. 38 | ![image](https://github.com/user-attachments/assets/d2155ea2-aadc-41f1-9d31-ab0e32ab1a47) 39 | 40 | `my name is mohammad fawzy` 41 | window size = 1 42 | `Target` => `Context` 43 | `my` => `name` 44 | `name` => `my, is` 45 | `is` => `name, mohammad` 46 | `mohammad` => `is, fawzy` 47 | `fawzy` => `mohammad` 48 | window size = 2 49 | `my` => `name , is` 50 | `name` => `my, is, mohammad` 51 | `is` => `my, name, mohammad, fawzy` 52 | `mohammad` => `name, is, fawzy` 53 | `fawzy` => `is, mohammad` 54 | 55 | 56 | 57 | 58 | 59 | ## How Does Word2Vec Work? 60 | 61 | Word2Vec Uses a shallow neural network that consists of an input layer, a hidden layer, and an output layer. 
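The bullets below spell this out layer by layer, and the Skip-Gram walkthrough later in this file goes deeper. As a compact reference, here is a minimal sketch of that three-layer setup in its skip-gram form; PyTorch, the toy sizes, and the names are assumptions, not training code from this repo. `nn.Embedding` plays the role of the one-hot input vector multiplied by the hidden-layer weight matrix W.

```python
# Minimal sketch of the input -> hidden (embedding) -> output architecture,
# skip-gram style: given a target word id, score every vocabulary word as a context word.
import torch
import torch.nn as nn

vocab_size, embedding_dim = 100, 16  # toy sizes

class SkipGramSketch(nn.Module):
    def __init__(self):
        super().__init__()
        # hidden layer W (vocab_size x embedding_dim): its rows become the word embeddings
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        # output layer W' (embedding_dim x vocab_size): one logit per vocabulary word
        self.out_proj = nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, target_ids):
        return self.out_proj(self.in_embed(target_ids))  # logits over the vocabulary

model = SkipGramSketch()
logits = model(torch.tensor([3]))                               # target word with id 3
loss = nn.functional.cross_entropy(logits, torch.tensor([2]))   # id 2 stands in for one context word
loss.backward()  # training repeats this over all (target, context) pairs
# After training, model.in_embed.weight (the hidden-layer matrix W) holds the word vectors.
```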
62 | 63 | - Use One Hot Encoding 64 | - defining target words and for each one define it's Context words 65 | 66 | - One Input Layer (Number of Neurons = Number of unique words = len Vocab) 67 | - One Hidden Layer (Embedding Size) 68 | - One output Layer (Number of Neurons = Number of unique words = len Vocab) 69 | 70 | 71 | ## Types of Word2Vec 72 | 1. Continuous Bag of Words (CBOW): 73 | - Given the `Context` words, Predict `Target` word 74 | - works well with large datasets and It is computationally more efficient. 75 | - ![image](https://github.com/user-attachments/assets/9ef1f8ad-df42-4b6a-b7c1-068db91398d5) 76 | 77 | 2. Skip-Gram: 78 | - Given the `Target` word, Predict `Context` words 79 | - Works well with smaller datasets and is particularly good at capturing rare words. 80 | - ![image](https://github.com/user-attachments/assets/98926dee-066c-4c9a-8566-a54724f4058e) 81 | 82 | 83 | 84 | 85 | 86 | 87 | # Skip-Gram 88 | 89 | - Input Layer: 90 | - Number of Neurons: Equal to the number of unique words (vocabulary size). 91 | - Representation: One-hot encoded vector representing the target word. 92 | 93 | - Hidden Layer: 94 | - Number of Neurons: Equal to the chosen embedding size. 95 | - Purpose: This layer learns to project the one-hot vector into a lower-dimensional space. The learned weights of this layer become the word embeddings. 96 | 97 | - Output Layer: 98 | - Number of Neurons: Equal to the number of unique words (vocabulary size). 99 | - Representation: Produces a probability (SoftMax) of all words in the vocabulary to predict which words are context. 100 | - maximize the probability of all context words together, given a center word 101 | - goal is not to predict context words, but to learn vector representation of words, It just happens that predicting context words 102 | ![image](https://github.com/user-attachments/assets/24b44754-51d4-407e-af57-5936b4795840) 103 | 104 | 105 | 106 | Vocab Size = 100 , Window Size = 1 107 | Each Word represented in Binary Vector (Len = 100) All 0 except the index 108 | Suppose the 109 | - target 110 | - Has 1 in position 3 [0 0 0 1 0 0 0 0 0 0.......] 111 | - context 112 | - Has 1 in position 2 [0 0 1 0 0 0 0 0 0 0.......] 113 | - Has 1 in position 4 [0 0 0 0 1 0 0 0 0 0.......] 114 | 115 | 1. One-Hot Encoding (Neurons = Voc size) 116 | 117 | 2. Defining Target and Context Words 118 | 119 | 3. Input Layer 120 | - Feed the Target Word (one-hot encoded vector for the target word) 121 | 122 | 4. Hidden Layer (Embedding Layer) 123 | - The one-hot vector is multiplied by a weight matrix W (vocab size × embedding size). 124 | - Since only one element in the one-hot vector is 1, the output is simply the row of W corresponding to that word. This row becomes the word embedding for the target word. 125 | - The training process adjusts the weights in W so that similar words (appearing in similar contexts) end up with similar vectors. 126 | - ![image](https://github.com/user-attachments/assets/635aa4dd-c693-40a6-8894-6c2c76d5d004) 127 | 128 | 129 | 5. Output Layer 130 | - The hidden layer output (the word embedding) is then passed through another weight matrix W′(embedding size × vocab size) to produce logits for every word in the vocabulary. 131 | - A softmax function is applied to these Logits to get a probability distribution over all words. This distribution reflects the probability of each word being a context word for the given target word. 
132 |    - We want to maximize the probability of all context words together, given the center word.
133 |    - Compute the error between the predicted probabilities and the actual context words, represented as a single vector (the sum of the context words' one-hot vectors).
134 |    - [0 0 1 0 1 0 0 0 0 0 .......]
135 |    - The network uses backpropagation to adjust both weight matrices W and W′.
136 | 6. Extracting the embeddings
137 |    - Once training is complete, the weights of the hidden-layer matrix W are used as the word embeddings.
138 |    - These embeddings capture the relationships between words based on their contexts in the training text.
139 | 
140 | ## Negative Sampling
141 | 
142 | Problems with Skip-Gram:
143 | ![image](https://github.com/user-attachments/assets/5960d9ba-3be6-4486-b282-94d0283395d0)
144 | 
145 | Softmax is computationally very expensive: it requires scanning through the entire
146 | output layer to compute a probability distribution over all V words in the vocabulary, and the vocabulary size may be in the millions.
147 | 
148 | This is a multi-class classification problem where the number of classes = V (e.g., 10,000 classes).
149 | We want to convert it from multi-class classification (softmax) into binary classification (sigmoid).
150 | 
151 | Negative Sampling:
152 | For each training sample, define:
153 | - the Context Words
154 |   - the positive context samples (Cpos)
155 | - for each Context Word
156 |   - K words that do not appear in the context (the Negative Samples).
157 | The new objective is to predict, for any given (target, word) pair, whether the word belongs to the target's context or not.
158 | Give the network two words => it predicts 1 for a (target, context) pair and 0 for a (target, negative) pair.
159 | This is binary classification.
160 | 
161 | ![image](https://github.com/user-attachments/assets/a7e7b459-1160-43a7-ab3d-1b2fc7eac1dd)
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 
--------------------------------------------------------------------------------
/02-Word Embeddings/2.3-TF_IDF/2.3-TF-IDF.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 8,
6 |    "id": "30fa1fe2",
7 |    "metadata": {},
8 |    "outputs": [
9 |     {
10 |      "name": "stderr",
11 |      "output_type": "stream",
12 |      "text": [
13 |       "[nltk_data] Downloading package stopwords to\n",
14 |       "[nltk_data]     C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n",
15 |       "[nltk_data]   Package stopwords is already up-to-date!\n",
16 |       "[nltk_data] Downloading package wordnet to\n",
17 |       "[nltk_data]     C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n",
18 |       "[nltk_data]   Package wordnet is already up-to-date!\n"
19 |      ]
20 |     }
21 |    ],
22 |    "source": [
23 |     "import pandas as pd\n",
24 |     "import numpy as np\n",
25 |     "import re\n",
26 |     "from nltk.stem import WordNetLemmatizer, PorterStemmer\n",
27 |     "import nltk\n",
28 |     "from nltk.corpus import stopwords\n",
29 |     "from nltk.tokenize import word_tokenize\n",
30 |     "import contractions\n",
31 |     "nltk.download('stopwords')\n",
32 |     "nltk.download('wordnet')\n",
33 |     "\n",
34 |     "from typing import List"
35 |    ]
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": 106,
40 |    "id": "6dcf6c88",
41 |    "metadata": {},
42 |    "outputs": [],
43 |    "source": [
44 |     "doc1 = \"Neural networks process data using deep learning algorithms in artificial intelligence.\"\n",
45 |     "doc2 = \"Artificial intelligence applies neural networks and deep learning to process large datasets.\"\n",
46 |     "\n",
47 |     "doc3 = \"Gasoline cars have combustion engines that power vehicles through fuel ignition.\"\n",
48
| "doc4 = \"Car engines burn gasoline in combustion chambers to move vehicles on the road.\"\n", 49 | "\n", 50 | "corpus = [doc1, doc2, doc3, doc4]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 107, 56 | "id": "f4af634d", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "Neural networks process data using deep learning algorithms in artificial intelligence.\n", 64 | "Length of document: 87\n", 65 | "\n", 66 | "Artificial intelligence applies neural networks and deep learning to process large datasets.\n", 67 | "Length of document: 92\n", 68 | "\n", 69 | "Gasoline cars have combustion engines that power vehicles through fuel ignition.\n", 70 | "Length of document: 80\n", 71 | "\n", 72 | "Car engines burn gasoline in combustion chambers to move vehicles on the road.\n", 73 | "Length of document: 78\n", 74 | "\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "for d in corpus:\n", 80 | " print(d)\n", 81 | " print(\"Length of document:\", len(d))\n", 82 | " print()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 108, 88 | "id": "c96b80e5", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "[['neural', 'network', 'process', 'data', 'using', 'deep', 'learning', 'algorithm', 'artificial', 'intelligence'], ['artificial', 'intelligence', 'applies', 'neural', 'network', 'deep', 'learning', 'process', 'large', 'datasets'], ['gasoline', 'car', 'combustion', 'engine', 'power', 'vehicle', 'fuel', 'ignition'], ['car', 'engine', 'burn', 'gasoline', 'combustion', 'chamber', 'move', 'vehicle', 'road']]\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "def preprocessing(text: str) -> list[str]:\n", 101 | "\n", 102 | " stop_words = set(stopwords.words('english'))\n", 103 | " lemmatizer = WordNetLemmatizer()\n", 104 | " stemmer = PorterStemmer()\n", 105 | "\n", 106 | " # Convert Text to Lowercase (Normalization)\n", 107 | " text_lower = text.lower()\n", 108 | " text_no_tags = re.sub(r'<[^>]+>', '', text_lower)\n", 109 | "\n", 110 | " # Contraction Handling\n", 111 | " text_no_tags = contractions.fix(text_no_tags)\n", 112 | "\n", 113 | " # Removing Punctuation\n", 114 | " text_no_punct = re.sub(r'[^a-zA-Z\\s]', '', text_no_tags) # \\' for keep apostrophes (e.g. don't, it's)\n", 115 | "\n", 116 | "\n", 117 | " # 3. Tokens\n", 118 | " tokens = re.split(r\"\\s+\", text_no_punct) \n", 119 | " tokens = [t for t in tokens if t]\n", 120 | " # or use nltk tokenizer\n", 121 | " tokens = word_tokenize(text_no_punct)\n", 122 | "\n", 123 | " # 4. Stop word removal\n", 124 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 125 | "\n", 126 | " # 5. 
Lemmatization \n", 127 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 128 | " # or stemmer\n", 129 | " stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]\n", 130 | "\n", 131 | " return lemma_tokens\n", 132 | "\n", 133 | "preprocessed_text = [preprocessing(doc) for doc in corpus]\n", 134 | "print(preprocessed_text)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 109, 140 | "id": "91ad8dd0", 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "term : learning\n", 148 | "doc :['neural', 'network', 'process', 'data', 'using', 'deep', 'learning', 'algorithm', 'artificial', 'intelligence']\n", 149 | "frequency : 1\n", 150 | "len doc : 10\n", 151 | "tf of 'learning' on doc 0: 0.1\n", 152 | "-----------------------\n", 153 | "term : learning\n", 154 | "doc :['artificial', 'intelligence', 'applies', 'neural', 'network', 'deep', 'learning', 'process', 'large', 'datasets']\n", 155 | "frequency : 1\n", 156 | "len doc : 10\n", 157 | "tf of 'learning' on doc 2: 0.1\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "all_tokens = [token for doc in preprocessed_text for token in doc]\n", 163 | "vocab = sorted(set(all_tokens))\n", 164 | "# print(len(all_tokens))\n", 165 | "\n", 166 | "\n", 167 | "def TF(term, doc) :\n", 168 | " term = term.lower()\n", 169 | " print(f\"term : {term}\")\n", 170 | " print(f\"doc :{doc}\")\n", 171 | " print(f\"frequency : {doc.count(term)}\")\n", 172 | " print(f\"len doc : {len(doc)}\")\n", 173 | " return doc.count(term) / len(doc)\n", 174 | "\n", 175 | "\n", 176 | "term = \"learning\"\n", 177 | "tf = TF(term, preprocessed_text[0])\n", 178 | "print(\"tf of 'learning' on doc 0: \", tf)\n", 179 | "print(\"-----------------------\")\n", 180 | "tf = TF(term, preprocessed_text[1])\n", 181 | "print(\"tf of 'learning' on doc 2: \", tf)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 110, 187 | "id": "ea9c103f", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "term : learning \n", 195 | "number of documents : 4 \n", 196 | "number of documents containing term : 2\n", 197 | "idf of 'learning' : 1.3333333333333333\n", 198 | "-----------------------\n", 199 | "term : statistic \n", 200 | "number of documents : 4 \n", 201 | "number of documents containing term : 0\n", 202 | "idf of 'statistics' : 4.0\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "def IDF(term, corpus):\n", 208 | " term = term.lower()\n", 209 | " N = len(corpus)\n", 210 | " n = sum(1 for doc in corpus if term in doc)\n", 211 | " print(f\"term : {term} \\nnumber of documents : {N} \\nnumber of documents containing term : {n}\")\n", 212 | "\n", 213 | " return N/(n+1)\n", 214 | "idf = IDF(\"learning\", preprocessed_text)\n", 215 | "print(\"idf of 'learning' : \", idf)\n", 216 | "print(\"-----------------------\")\n", 217 | "idf = IDF(\"statistic\", preprocessed_text)\n", 218 | "print(\"idf of 'statistics' : \", idf)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 111, 224 | "id": "b9d49d9a", 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "37\n" 232 | ] 233 | }, 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "(4, 25)" 238 | ] 239 | }, 240 | "execution_count": 111, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | 
"source": [ 246 | "all_tokens = [token for doc in preprocessed_text for token in doc]\n", 247 | "vocab = sorted(set(all_tokens))\n", 248 | "print(len(all_tokens))\n", 249 | "\n", 250 | "\n", 251 | "def TF(term: str, doc: list[str]) -> float:\n", 252 | " \"\"\"\n", 253 | " Calculate Term Frequency (TF) of a term in a document.\n", 254 | "\n", 255 | " Args:\n", 256 | " term (str): The term to calculate TF for.\n", 257 | " doc (list[str]): The document in which to calculate TF.\n", 258 | "\n", 259 | " Returns:\n", 260 | " float: The term frequency of the term in the document.\n", 261 | " \"\"\"\n", 262 | " term = term.lower()\n", 263 | " return doc.count(term) / len(doc)\n", 264 | " \n", 265 | "def IDF(term: str, corpus: List[list[str]]) -> float:\n", 266 | " \"\"\"\n", 267 | " Calculate Inverse Document Frequency (IDF) of a term in a corpus.\n", 268 | "\n", 269 | " Args:\n", 270 | " term (str): The term to calculate IDF for.\n", 271 | " corpus (List[list[str]]): The corpus in which to calculate IDF.\n", 272 | "\n", 273 | " Returns:\n", 274 | " float: The inverse document frequency of the term in the corpus.\n", 275 | " \"\"\"\n", 276 | " N = len(corpus)\n", 277 | " term = term.lower()\n", 278 | " num_docs_with_term = sum(1 for doc in corpus if term in doc)\n", 279 | " return N / (1 + num_docs_with_term)\n", 280 | "\n", 281 | "def TF_IDF(term: str, doc: list[str], corpus: List[list[str]]) -> float:\n", 282 | " \"\"\"\n", 283 | " Calculate TF-IDF of a term in a document within a corpus.\n", 284 | "\n", 285 | " Args:\n", 286 | " term (str): The term to calculate TF-IDF for.\n", 287 | " doc (list[str]): The document in which to calculate TF-IDF.\n", 288 | " corpus (List[list[str]]): The corpus in which to calculate TF-IDF.\n", 289 | "\n", 290 | " Returns:\n", 291 | " float: The TF-IDF score of the term in the document.\n", 292 | " \"\"\"\n", 293 | " tf = TF(term, doc)\n", 294 | " idf = IDF(term, corpus)\n", 295 | " return tf * idf\n", 296 | "\n", 297 | "\n", 298 | "\n", 299 | "tfidf_matrix = np.zeros((len(preprocessed_text), len(vocab)))\n", 300 | "for i, doc in enumerate(preprocessed_text):\n", 301 | " for j, term in enumerate(vocab):\n", 302 | " tfidf_matrix[i][j] = TF_IDF(term, doc, preprocessed_text)\n", 303 | "\n", 304 | "\n", 305 | "tfidf_matrix.shape" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 112, 311 | "id": "6afe47a3", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "[[1. 0.50909091 0. 0. ]\n", 319 | " [0.50909091 1. 0. 0. ]\n", 320 | " [0. 0. 1. 0.38984059]\n", 321 | " [0. 0. 0.38984059 1. 
]]\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "# calc similarity between documents\n", 327 | "from sklearn.metrics.pairwise import cosine_similarity\n", 328 | "similarity_matrix = cosine_similarity(tfidf_matrix)\n", 329 | "print(similarity_matrix)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "id": "4239f7c3", 335 | "metadata": {}, 336 | "source": [ 337 | "# Built in" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 91, 343 | "id": "0459ffb9", 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "['advanced' 'ai' 'algorithms' 'allows' 'artificial' 'benefits' 'branch'\n", 351 | " 'cars' 'data' 'decisions' 'designed' 'efficiency' 'electric'\n", 352 | " 'environmental' 'features' 'include' 'intelligence' 'learn' 'learning'\n", 353 | " 'machine' 'machines' 'make' 'patterns' 'popular' 'safety'\n", 354 | " 'transportation' 'uses' 'vehicles']\n", 355 | "--------------\n", 356 | "[[0. 0. 0. 0.36222393 0.36222393 0.\n", 357 | " 0. 0. 0.2855815 0.36222393 0. 0.\n", 358 | " 0. 0. 0. 0. 0.36222393 0.36222393\n", 359 | " 0. 0. 0.36222393 0.36222393 0. 0.\n", 360 | " 0. 0. 0. 0. ]\n", 361 | " [0. 0.36222393 0.36222393 0. 0. 0.\n", 362 | " 0.36222393 0. 0.2855815 0. 0. 0.\n", 363 | " 0. 0. 0. 0. 0. 0.\n", 364 | " 0.36222393 0.36222393 0. 0. 0.36222393 0.\n", 365 | " 0. 0. 0.36222393 0. ]\n", 366 | " [0.37796447 0. 0. 0. 0. 0.\n", 367 | " 0. 0.37796447 0. 0. 0.37796447 0.\n", 368 | " 0. 0. 0.37796447 0.37796447 0. 0.\n", 369 | " 0. 0. 0. 0. 0. 0.\n", 370 | " 0.37796447 0.37796447 0. 0. ]\n", 371 | " [0. 0. 0. 0. 0. 0.40824829\n", 372 | " 0. 0. 0. 0. 0. 0.40824829\n", 373 | " 0.40824829 0.40824829 0. 0. 0. 0.\n", 374 | " 0. 0. 0. 0. 0. 0.40824829\n", 375 | " 0. 0. 0. 
0.40824829]]\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 381 | "\n", 382 | "documents = [doc1, doc2, doc3, doc4]\n", 383 | "\n", 384 | "vectorizer = TfidfVectorizer(stop_words='english')\n", 385 | "X = vectorizer.fit_transform(documents)\n", 386 | "\n", 387 | "print(vectorizer.get_feature_names_out())\n", 388 | "print(\"--------------\")\n", 389 | "# Convert TF-IDF matrix to array and view it\n", 390 | "print(X.toarray())" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "id": "6c5ee47d", 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": "myenv", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.12.6" 419 | } 420 | }, 421 | "nbformat": 4, 422 | "nbformat_minor": 5 423 | } 424 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.5-FastText/2.5-fast_text.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.11","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":3316532,"sourceType":"datasetVersion","datasetId":10100}],"dockerImageVersionId":31012,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Libraries","metadata":{}},{"cell_type":"code","source":"import pandas as pd \n# preprocessing\nimport re\nimport nltk\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.corpus import stopwords\nnltk.download('stopwords')\nnltk.download('wordnet')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:17:58.089901Z","iopub.execute_input":"2025-04-19T17:17:58.090490Z","iopub.status.idle":"2025-04-19T17:18:02.530545Z","shell.execute_reply.started":"2025-04-19T17:17:58.090456Z","shell.execute_reply":"2025-04-19T17:18:02.529578Z"}},"outputs":[{"name":"stderr","text":"[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n[nltk_data] Package stopwords is already up-to-date!\n[nltk_data] Downloading package wordnet to /usr/share/nltk_data...\n[nltk_data] Package wordnet is already up-to-date!\n","output_type":"stream"},{"execution_count":1,"output_type":"execute_result","data":{"text/plain":"True"},"metadata":{}}],"execution_count":1},{"cell_type":"markdown","source":"# Data","metadata":{}},{"cell_type":"code","source":"file_path = r\"/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json\"\ndf = pd.read_json(file_path, lines=True)\ndf = df[:1000]\n\ntext_column = 
df['text']\ntext_column.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:02.531766Z","iopub.execute_input":"2025-04-19T17:18:02.532185Z","iopub.status.idle":"2025-04-19T17:18:10.677540Z","shell.execute_reply.started":"2025-04-19T17:18:02.532157Z","shell.execute_reply":"2025-04-19T17:18:10.676617Z"}},"outputs":[{"execution_count":2,"output_type":"execute_result","data":{"text/plain":"0 Avengers time with the ladies.\n1 They have lots of good deserts and tasty cuban...\n2 It's open even when you think it isn't\n3 Very decent fried chicken\n4 Appetizers.. platter special for lunch\nName: text, dtype: object"},"metadata":{}}],"execution_count":2},{"cell_type":"code","source":"df.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:10.678316Z","iopub.execute_input":"2025-04-19T17:18:10.678558Z","iopub.status.idle":"2025-04-19T17:18:10.684053Z","shell.execute_reply.started":"2025-04-19T17:18:10.678538Z","shell.execute_reply":"2025-04-19T17:18:10.683203Z"}},"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(1000, 5)"},"metadata":{}}],"execution_count":3},{"cell_type":"markdown","source":"# Preprocessing","metadata":{}},{"cell_type":"code","source":"def preprocess(text: str) -> list :\n text = text.lower()\n text = re.sub(r'[^a-zA-Z\\s]', '', text) # Remove all non-alphabetic characters\n text = re.sub(r'\\s+[a-zA-Z]\\s+', ' ', text) # Remove all single characters\n\n tokens = text.split()\n tokens = [t for t in tokens if len(t)>3] # Keep words with length >= 3\n\n stop_words = set(stopwords.words('english'))\n lemmatizer = WordNetLemmatizer() \n\n tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]\n return tokens if tokens else [] # Return an empty list if nothing remains\n\nsentences = [preprocess(t) for t in text_column] # List[List[str]]\nprint(sentences[:3])\nlen(sentences)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:10.685927Z","iopub.execute_input":"2025-04-19T17:18:10.686209Z","iopub.status.idle":"2025-04-19T17:18:13.899032Z","shell.execute_reply.started":"2025-04-19T17:18:10.686179Z","shell.execute_reply":"2025-04-19T17:18:13.898150Z"}},"outputs":[{"name":"stdout","text":"[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ['open', 'even', 'think', 'isnt']]\n","output_type":"stream"},{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"1000"},"metadata":{}}],"execution_count":4},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Official FastText","metadata":{}},{"cell_type":"code","source":"from gensim.models import FastText","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:13.903216Z","iopub.execute_input":"2025-04-19T17:18:13.903959Z","iopub.status.idle":"2025-04-19T17:18:54.429621Z","shell.execute_reply.started":"2025-04-19T17:18:13.903934Z","shell.execute_reply":"2025-04-19T17:18:54.428969Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"# Train FastText model\nFastText_model = FastText(\n sentences=sentences,\n vector_size=100,\n window=3,\n min_count=1,\n epochs=500\n)\nprint(FastText_model)\n# Save the model\nFastText_model.save(\"fasttext_model.model\")\nprint(\"model 
saved.\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:54.430461Z","iopub.execute_input":"2025-04-19T17:18:54.430938Z","iopub.status.idle":"2025-04-19T17:19:12.717566Z","shell.execute_reply.started":"2025-04-19T17:18:54.430916Z","shell.execute_reply":"2025-04-19T17:19:12.716625Z"}},"outputs":[{"name":"stdout","text":"FastText\nmodel saved.\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"vocab_size = len(FastText_model.wv)\nembedding_size = FastText_model.vector_size\n\n# Print vocabulary and embedding size\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.719832Z","iopub.execute_input":"2025-04-19T17:19:12.720076Z","iopub.status.idle":"2025-04-19T17:19:12.725155Z","shell.execute_reply.started":"2025-04-19T17:19:12.720058Z","shell.execute_reply":"2025-04-19T17:19:12.724125Z"}},"outputs":[{"name":"stdout","text":"Vocabulary Size: 2121\nEmbedding Size: 100\n","output_type":"stream"}],"execution_count":7},{"cell_type":"code","source":"similar_words = FastText_model.wv.most_similar('good', topn=10)\nprint(\"\\nSimilar\")\nprint(similar_words)\nprint(\"-\"*30) \nopposite_words = FastText_model.wv.most_similar(negative= 'good', topn=10)\nprint(\"\\n\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.726119Z","iopub.execute_input":"2025-04-19T17:19:12.726446Z","iopub.status.idle":"2025-04-19T17:19:12.750329Z","shell.execute_reply.started":"2025-04-19T17:19:12.726416Z","shell.execute_reply":"2025-04-19T17:19:12.749248Z"}},"outputs":[{"name":"stdout","text":"\nSimilar\n[('goodi', 0.8452088832855225), ('food', 0.5561390519142151), ('deliciously', 0.5237860083580017), ('neighborhood', 0.4961780607700348), ('deliciousness', 0.4901498854160309), ('deliciousthen', 0.4899592697620392), ('ipod', 0.4890640377998352), ('foodgreat', 0.47072115540504456), ('delicious', 0.46029138565063477), ('bollywood', 0.4556906819343567)]\n------------------------------\n\n [('postage', 0.3698478043079376), ('hermitage', 0.3362504243850708), ('cinco', 0.3282019793987274), ('lurk', 0.3278462290763855), ('professionalism', 0.31859180331230164), ('postal', 0.31132882833480835), ('prepaid', 0.30549004673957825), ('trip', 0.30125343799591064), ('trap', 0.2973553240299225), ('ease', 0.2962052822113037)]\n","output_type":"stream"}],"execution_count":8},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# pretrained FastText model","metadata":{}},{"cell_type":"code","source":"# Download Model\nimport urllib.request\nimport gzip\nimport os\nimport shutil","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.751634Z","iopub.execute_input":"2025-04-19T17:19:12.751965Z","iopub.status.idle":"2025-04-19T17:19:12.756263Z","shell.execute_reply.started":"2025-04-19T17:19:12.751942Z","shell.execute_reply":"2025-04-19T17:19:12.755380Z"}},"outputs":[],"execution_count":9},{"cell_type":"code","source":"# Download pretrained FastText model\nurl = \"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\"\noutput_gz = \"cc.en.300.bin.gz\"\noutput_bin = \"cc.en.300.bin\"\n\n# Download the .gz file\nprint(\"Downloading pretrained FastText model...\")\nurllib.request.urlretrieve(url, output_gz)\n\n# Unzip the .gz file\nprint(\"Unzipping the model...\")\nwith gzip.open(output_gz, 'rb') as f_in:\n 
with open(output_bin, 'wb') as f_out:\n shutil.copyfileobj(f_in, f_out)\nprint(\"model saved\")\n# Remove the .gz file to save space\nos.remove(output_gz)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.757106Z","iopub.execute_input":"2025-04-19T17:19:12.757990Z","iopub.status.idle":"2025-04-19T17:20:21.869886Z","shell.execute_reply.started":"2025-04-19T17:19:12.757961Z","shell.execute_reply":"2025-04-19T17:20:21.869086Z"}},"outputs":[{"name":"stdout","text":"Downloading pretrained FastText model...\nUnzipping the model...\nmodel saved\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"import fasttext","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:20:21.870774Z","iopub.execute_input":"2025-04-19T17:20:21.871056Z","iopub.status.idle":"2025-04-19T17:20:21.896035Z","shell.execute_reply.started":"2025-04-19T17:20:21.871036Z","shell.execute_reply":"2025-04-19T17:20:21.895138Z"}},"outputs":[],"execution_count":11},{"cell_type":"code","source":"print(\"Loading the model...\")\npretrained = fasttext.load_model(output_bin)\nprint(pretrained)\nvocab_size = len(pretrained.words)\nembedding_size = pretrained.get_dimension()\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")\n# --------------------\nsimilar_words = pretrained.get_nearest_neighbors(\"good\", k=10)\nprint(\"similar words\",similar_words)\nopposite_words = pretrained.get_nearest_neighbors(negative=[\"learning\"], k=10)\nprint(\"opposite words\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:20:21.897103Z","iopub.execute_input":"2025-04-19T17:20:21.897399Z","iopub.status.idle":"2025-04-19T17:20:44.757918Z","shell.execute_reply.started":"2025-04-19T17:20:21.897371Z","shell.execute_reply":"2025-04-19T17:20:44.756852Z"}},"outputs":[{"name":"stdout","text":"Loading the model...\n\nVocabulary Size: 2000000\nEmbedding Size: 300\nsimilar words [(0.7517593502998352, 'bad'), (0.7426098585128784, 'great'), (0.7299689054489136, 'decent'), (0.7123614549636841, 'nice'), (0.6796907186508179, 'Good'), (0.6737031936645508, 'excellent'), (0.669592022895813, 'goood'), (0.6602178812026978, 'ggod'), (0.6479219794273376, 'semi-good'), (0.6417751908302307, 'good.Good')]\n","output_type":"stream"},{"traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipykernel_31/297213840.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0msimilar_words\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpretrained\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_nearest_neighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"good\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"similar words\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msimilar_words\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mopposite_words\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mpretrained\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_nearest_neighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnegative\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"learning\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"opposite words\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopposite_words\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mTypeError\u001b[0m: _FastText.get_nearest_neighbors() got an unexpected keyword argument 'negative'"],"ename":"TypeError","evalue":"_FastText.get_nearest_neighbors() got an unexpected keyword argument 'negative'","output_type":"error"}],"execution_count":12},{"cell_type":"markdown","source":"- import fasttext\n - Facebook's original FastText package.\n - Faster and more memory efficient\n - Limited API (e.g., doesn't support negative sampling like Gensim does).\n - get_nearest_neighbors(negative) doesn’t exist in official fasttext\n - Used in real Prijects (Production)","metadata":{}},{"cell_type":"markdown","source":"- Gensim FastText\n - You can use: (positive, negative, most_similar, similarity, .....)\n - Slightly slower\n - For production embedding lookup, not as efficient as the original FastText.","metadata":{}},{"cell_type":"code","source":"from gensim.models.fasttext import load_facebook_model\n\npretrained = load_facebook_model(\"cc.en.300.bin\")\n# model = load_facebook_model(output_bin)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:22:21.519417Z","iopub.execute_input":"2025-04-19T17:22:21.519877Z","iopub.status.idle":"2025-04-19T17:24:27.142626Z","shell.execute_reply.started":"2025-04-19T17:22:21.519849Z","shell.execute_reply":"2025-04-19T17:24:27.141885Z"}},"outputs":[],"execution_count":13},{"cell_type":"code","source":"vocab_size = len(pretrained.wv)\nembedding_size = pretrained.wv.vector_size\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")\nsimilar = pretrained.wv.most_similar(\"learning\", topn=10)\nprint(\"similar words :\", similar)\n\nopposite_words = pretrained.wv.most_similar(negative=[\"learning\"],topn=10)\nprint(\"\\n\\nopposite words :\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:24:27.146361Z","iopub.execute_input":"2025-04-19T17:24:27.146631Z","iopub.status.idle":"2025-04-19T17:24:28.242978Z","shell.execute_reply.started":"2025-04-19T17:24:27.146609Z","shell.execute_reply":"2025-04-19T17:24:28.242127Z"}},"outputs":[{"name":"stdout","text":"Vocabulary Size: 2000000\nEmbedding Size: 300\nsimilar words : [('learing', 0.7456762194633484), ('Learning', 0.6895480751991272), ('learning.This', 0.687819242477417), ('learning.The', 0.6796228289604187), ('learning.It', 0.6753032207489014), ('learning.So', 0.6706693768501282), ('learning.What', 0.6673311591148376), ('learning.But', 0.6648256778717041), ('learning-', 0.6643092036247253), ('learning.As', 0.6633589267730713)]\n\n\nopposite words : [('19555', 0.2533474564552307), ('12291', 0.23999808728694916), ('10264', 0.2394980639219284), ('13107', 0.23354505002498627), ('8504', 0.23330195248126984), ('13223', 0.23251304030418396), ('7242', 0.23047803342342377), ('13466', 0.2299567013978958), ('10494', 0.22803275287151337), ('14138', 
0.2278987020254135)]\n","output_type":"stream"}],"execution_count":14},{"cell_type":"code","source":"pretrained.build_vocab(sentences, update=True)\npretrained.train(\n sentences,\n total_examples=len(sentences),\n epochs=10\n)\n# Print vocabulary and embedding size\nvocab_size = len(pretrained.wv)\nembedding_size = pretrained.vector_size\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:24:28.244081Z","iopub.execute_input":"2025-04-19T17:24:28.244314Z","iopub.status.idle":"2025-04-19T17:26:12.710272Z","shell.execute_reply.started":"2025-04-19T17:24:28.244297Z","shell.execute_reply":"2025-04-19T17:26:12.709490Z"}},"outputs":[{"name":"stdout","text":"Vocabulary Size: 2000000\nEmbedding Size: 300\n","output_type":"stream"}],"execution_count":15},{"cell_type":"code","source":"similar_words = pretrained.wv.most_similar(\"learn\", topn=10)\nopposite_words = pretrained.wv.most_similar(negative=\"learn\", topn=10)\nprint(similar_words, \"\\n\\n\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:26:12.711506Z","iopub.execute_input":"2025-04-19T17:26:12.711753Z","iopub.status.idle":"2025-04-19T17:26:13.760293Z","shell.execute_reply.started":"2025-04-19T17:26:12.711729Z","shell.execute_reply":"2025-04-19T17:26:13.759226Z"}},"outputs":[{"name":"stdout","text":"[('teach', 0.716772198677063), ('Learn', 0.7041028738021851), ('learned', 0.6968039274215698), ('learm', 0.6521831750869751), ('re-learn', 0.6518067717552185), ('discover', 0.6409897208213806), ('learn.If', 0.6341798901557922), ('relearn', 0.6159347295761108), ('leanr', 0.6142886877059937), ('understand', 0.6114104390144348)] \n\n [('.Rear', 0.22274798154830933), ('3.825', 0.20031915605068207), ('1.638', 0.19616979360580444), ('W52', 0.19612562656402588), ('3.725', 0.19571073353290558), ('9,677', 0.1925133764743805), ('2.101', 0.19243070483207703), ('2.675', 0.1889045089483261), ('3.425', 0.1883799433708191), ('2.76m', 0.1873113363981247)]\n","output_type":"stream"}],"execution_count":16},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]} -------------------------------------------------------------------------------- /01-Text-Preprocessing/1.1-Text-Preprocessing/preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "03384981", 6 | "metadata": {}, 7 | "source": [ 8 | "# Libraries" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 53, 14 | "id": "5baf0687", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading package wordnet to\n", 25 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import re\n", 33 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 34 | "import nltk\n", 35 | "from nltk.corpus import stopwords\n", 36 | "from nltk.tokenize import word_tokenize\n", 37 | 
"nltk.download('stopwords')\n", 38 | "nltk.download('wordnet')\n", 39 | "\n", 40 | "from typing import List" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "89132fd3", 46 | "metadata": {}, 47 | "source": [ 48 | "# Data" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "id": "1f1a8e5a", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
user_idbusiness_idtextdatecompliment_count
0AGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210
1NBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100
2-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080
3FjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_JwVery decent fried chicken2017-06-27 23:05:380
4ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5wwAppetizers.. platter special for lunch2012-10-06 19:43:090
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " user_id business_id \\\n", 132 | "0 AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", 133 | "1 NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", 134 | "2 -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", 135 | "3 FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", 136 | "4 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", 137 | "\n", 138 | " text date \\\n", 139 | "0 Avengers time with the ladies. 2012-05-18 02:17:21 \n", 140 | "1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n", 141 | "2 It's open even when you think it isn't 2013-08-18 00:56:08 \n", 142 | "3 Very decent fried chicken 2017-06-27 23:05:38 \n", 143 | "4 Appetizers.. platter special for lunch 2012-10-06 19:43:09 \n", 144 | "\n", 145 | " compliment_count \n", 146 | "0 0 \n", 147 | "1 0 \n", 148 | "2 0 \n", 149 | "3 0 \n", 150 | "4 0 " 151 | ] 152 | }, 153 | "execution_count": 3, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "json_file_path = r\"E:\\DATA SCIENCE\\NLP-Tea\\Data\\yelp_academic_dataset_tip.json\\yelp_academic_dataset_tip.json\"\n", 160 | "df = pd.read_json(json_file_path, lines=True)\n", 161 | "\n", 162 | "df.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "id": "e99b254f", 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "(908915, 5)\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "print(df.shape)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "id": "d8dfe82f", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "['Avengers time with the ladies.',\n", 193 | " 'They have lots of good deserts and tasty cuban sandwiches',\n", 194 | " \"It's open even when you think it isn't\",\n", 195 | " 'Very decent fried chicken',\n", 196 | " 'Appetizers.. platter special for lunch']" 197 | ] 198 | }, 199 | "execution_count": 5, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "text_data = list(df[\"text\"][:1000]) # First 1000 Row Only \n", 206 | "text_data[:5]" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "5cf06943", 212 | "metadata": {}, 213 | "source": [ 214 | "# Preprocessing" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 54, 220 | "id": "02269239", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "test_text = text_data[101]\n", 233 | "print(test_text)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "a13434b4", 239 | "metadata": {}, 240 | "source": [ 241 | "## Case Normalization (lowercase)\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 55, 247 | "id": "29303df8", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 255 | "lowercase text: self serve onions, relish, mayo? and free caramelized onions? 
yes!\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "text_lower = test_text.lower()\n", 261 | "print(f\"original text : {test_text}\")\n", 262 | "print(f\"lowercase text: {text_lower}\")\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "id": "1d864693", 268 | "metadata": {}, 269 | "source": [ 270 | "## Removes punctuation and digits" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 56, 276 | "id": "1b2e7d89", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 284 | "preprocessed : self serve onions relish mayo and free caramelized onions yes\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "text_lower = test_text.lower()\n", 290 | "text_no_punct = re.sub(r'[^a-zA-z\\s]', '', text_lower) # keep only letters and space\n", 291 | "text_no_punct = re.sub(r'[^a-zA-z\\s0-9]', '', text_lower) # Keep numbers \n", 292 | "\n", 293 | "\n", 294 | "print(f\"original text : {test_text}\")\n", 295 | "print(f\"preprocessed : {text_no_punct}\")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 66, 301 | "id": "9f6006e9", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "original text : don't\n", 309 | "with \\' : don't\n", 310 | "with out \\' : dont\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "text_no_punct1 = re.sub(r'[^a-zA-z\\s\\']', '', \"don't\")\n", 316 | "text_no_punct2 = re.sub(r'[^a-zA-z\\s]', '', \"don't\")\n", 317 | "\n", 318 | "print(f\"original text : don't\")\n", 319 | "print(f\"with \\\\' : {text_no_punct1}\")\n", 320 | "print(f\"with out \\\\' : {text_no_punct2}\")" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "id": "36ec81bf", 326 | "metadata": {}, 327 | "source": [ 328 | "## Tokens" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 57, 334 | "id": "a727391a", 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 342 | "preprocessed : ['self', 'serve', 'onions', 'relish', 'mayo', 'and', 'free', 'caramelized', 'onions', 'yes']\n", 343 | "preprocessed_1: ['self', 'serve', 'onions', 'relish', 'mayo', 'and', 'free', 'caramelized', 'onions', 'yes']\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "text_lower = test_text.lower()\n", 349 | "text_no_punct = re.sub(r'[^a-zA-z\\s]', '', text_lower) \n", 350 | "tokens = re.split(r\"\\s+\", text_no_punct) \n", 351 | "\n", 352 | "# or \n", 353 | "tokens_v1 = word_tokenize(text_no_punct)\n", 354 | "\n", 355 | "print(f\"original text : {test_text}\")\n", 356 | "print(f\"preprocessed : {tokens}\")\n", 357 | "print(f\"preprocessed_1: {tokens_v1}\")" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "id": "0f69d1da", 363 | "metadata": {}, 364 | "source": [ 365 | "## Removes stopwords\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "98b95e15", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? 
Yes!\n", 379 | "preprocessed : ['self', 'serve', 'onions', 'relish', 'mayo', 'free', 'caramelized', 'onions', 'yes']\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "text_lower = test_text.lower()\n", 385 | "text_no_punct = re.sub(r'[^a-zA-z\\s]', '', text_lower)\n", 386 | "tokens = re.split(r\"\\s+\", text_no_punct) \n", 387 | "\n", 388 | "stop_words = set(stopwords.words('english'))\n", 389 | "tokens = [token for token in tokens if token not in stop_words]\n", 390 | "\n", 391 | "\n", 392 | "\n", 393 | "print(f\"original text : {test_text}\")\n", 394 | "print(f\"preprocessed : {tokens}\")" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "id": "d221f8d2", 400 | "metadata": {}, 401 | "source": [ 402 | "## Stemming and lemmatization" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "id": "822ea890", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 416 | "preprocessed : ['self', 'serv', 'onion', 'relish', 'mayo', 'free', 'caramel', 'onion', 'ye']\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "# Initialize stemmer \n", 422 | "stemmer = PorterStemmer()\n", 423 | "stem_tokens = [stemmer.stem(token) for token in tokens]\n", 424 | "\n", 425 | "print(f\"original text : {test_text}\")\n", 426 | "print(f\"preprocessed : {stem_tokens}\")\n", 427 | "\n", 428 | "#server =>> serv\n", 429 | "# yes =>> ye" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "a0eb5608", 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? 
Yes!\n", 443 | "preprocessed : ['self', 'serve', 'onion', 'relish', 'mayo', 'free', 'caramelized', 'onion', 'yes']\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "# Initialize lemmatizer\n", 449 | "lemmatizer = WordNetLemmatizer()\n", 450 | "lemma_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n", 451 | "\n", 452 | "print(f\"original text : {test_text}\")\n", 453 | "print(f\"preprocessed : {lemma_tokens}\")\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 50, 459 | "id": "7d25b322", 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Word: running\n", 467 | " Stemmed: run\n", 468 | " Lemmatized: running\n", 469 | "\n", 470 | "Word: better\n", 471 | " Stemmed: better\n", 472 | " Lemmatized: better\n", 473 | "\n", 474 | "Word: flies\n", 475 | " Stemmed: fli\n", 476 | " Lemmatized: fly\n", 477 | "\n", 478 | "Word: cities\n", 479 | " Stemmed: citi\n", 480 | " Lemmatized: city\n", 481 | "\n", 482 | "Word: served\n", 483 | " Stemmed: serv\n", 484 | " Lemmatized: served\n", 485 | "\n", 486 | "Word: children\n", 487 | " Stemmed: children\n", 488 | " Lemmatized: child\n", 489 | "\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "words = [\"running\", \"better\", \"flies\", \"cities\", \"served\", \"children\"]\n", 495 | "\n", 496 | "for word in words:\n", 497 | " print(f\"Word: {word}\")\n", 498 | " print(f\" Stemmed: {stemmer.stem(word)}\")\n", 499 | " print(f\" Lemmatized: {lemmatizer.lemmatize(word)}\")\n", 500 | " print()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "id": "bdf159a4", 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "id": "a274ddb0", 514 | "metadata": {}, 515 | "source": [ 516 | "## ALL" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "id": "95ed33f8", 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "import pandas as pd\n", 527 | "import re\n", 528 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 529 | "import nltk\n", 530 | "import emoji\n", 531 | "from nltk.corpus import stopwords\n", 532 | "from nltk.tokenize import word_tokenize\n", 533 | "nltk.download('stopwords')\n", 534 | "nltk.download('wordnet')\n", 535 | "\n", 536 | "from typing import List" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "id": "685cadf7", 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], [\"'s\", 'open', 'even', 'think', \"n't\"], ['decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'lunch']]\n" 550 | ] 551 | } 552 | ], 553 | "source": [ 554 | "def preprocessing(text: str) -> list[str]:\n", 555 | " \"\"\"\n", 556 | " Preprocesses a given text:\n", 557 | " - Lowercases text\n", 558 | " - Removes punctuation and digits\n", 559 | " - Removes stopwords\n", 560 | " - Tokenizes into words\n", 561 | " - Applies lemmatization or stemming\n", 562 | "\n", 563 | " Args:\n", 564 | " document (str): The raw input text\n", 565 | "\n", 566 | " Returns:\n", 567 | " List of str: Cleaned and preprocessed text\n", 568 | "\n", 569 | " Example:\n", 570 | " >>> preprocess(\"I love Python! 
😊 It's awesome 👍\")\n", 571 | " ['love', 'python', 'smiling_face', 'awesome', 'thumbs_up']\n", 572 | " \"\"\"\n", 573 | "\n", 574 | " stop_words = set(stopwords.words('english'))\n", 575 | " lemmatizer = WordNetLemmatizer()\n", 576 | " stemmer = PorterStemmer()\n", 577 | "\n", 578 | " # Convert Text to Lowercase (Normalization)\n", 579 | " text_lower = text.lower()\n", 580 | "\n", 581 | " # Removing Punctuation\n", 582 | " text_no_punct = re.sub(r'[^a-zA-Z\\s\\']', '', text_lower) # \\' for keep apostrophes (e.g. don't, it's)\n", 583 | "\n", 584 | "\n", 585 | " # 3. Tokens\n", 586 | " tokens = re.split(r\"\\s+\", text_no_punct) \n", 587 | " tokens = [t for t in tokens if t]\n", 588 | " # or use nltk tokenizer\n", 589 | " tokens = word_tokenize(text_no_punct)\n", 590 | "\n", 591 | " # 4. Stop word removal\n", 592 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 593 | "\n", 594 | " # 5. Lemmatization \n", 595 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 596 | " # or stemmer\n", 597 | " stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]\n", 598 | "\n", 599 | " return lemma_tokens\n", 600 | "\n", 601 | "text_data = list(df[\"text\"][:100]) # First 1000 Row Only\n", 602 | "preprocessed_text = [preprocessing(text) for text in text_data]\n", 603 | "print(preprocessed_text[:5])" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "id": "1016ea24", 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/plain": [ 615 | "'I love pizza and grinning face with smiling eyes !'" 616 | ] 617 | }, 618 | "execution_count": 86, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "print(preprocessing(\"I love Python! 
😊 It's awesome 👍\"))\n", 625 | "text = emoji.demojize(\"I love 🍕 and 😄!\", delimiters=(\" \", \" \")) \n", 626 | "text = re.sub(r'_', ' ', text)\n", 627 | "text" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "e8531a7c", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "# Remove URLs, emails, and Twitter mentions\n", 638 | "text = re.sub(r'(https?://\\S+|www\\.\\S+)', ' ', text) # URLs\n", 639 | "text = re.sub(r'\\S+@\\S+', ' ', text) # Email addresses\n", 640 | "text = re.sub(r'@\\w+', ' ', text) # Mentions" 641 | ] 642 | } 643 | ], 644 | "metadata": { 645 | "kernelspec": { 646 | "display_name": "myenv", 647 | "language": "python", 648 | "name": "python3" 649 | }, 650 | "language_info": { 651 | "codemirror_mode": { 652 | "name": "ipython", 653 | "version": 3 654 | }, 655 | "file_extension": ".py", 656 | "mimetype": "text/x-python", 657 | "name": "python", 658 | "nbconvert_exporter": "python", 659 | "pygments_lexer": "ipython3", 660 | "version": "3.12.6" 661 | } 662 | }, 663 | "nbformat": 4, 664 | "nbformat_minor": 5 665 | } 666 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.2-BOW/2.2-BOW.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "aaef07ca", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "[nltk_data] Downloading package stopwords to\n", 14 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 15 | "[nltk_data] Package stopwords is already up-to-date!\n", 16 | "[nltk_data] Downloading package wordnet to\n", 17 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 18 | "[nltk_data] Package wordnet is already up-to-date!\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import numpy as np\n", 25 | "import re\n", 26 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 27 | "import nltk\n", 28 | "from nltk.corpus import stopwords\n", 29 | "from nltk.tokenize import word_tokenize\n", 30 | "import contractions\n", 31 | "nltk.download('stopwords')\n", 32 | "nltk.download('wordnet')\n", 33 | "\n", 34 | "from typing import List" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "3b88e66d", 40 | "metadata": {}, 41 | "source": [ 42 | "## Data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "id": "d27f3fbc", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | "
reviewsentiment
0One of the other reviewers has mentioned that ...positive
1A wonderful little production. <br /><br />The...positive
2I thought this was a wonderful way to spend ti...positive
3Basically there's a family where a little boy ...negative
4Petter Mattei's \"Love in the Time of Money\" is...positive
\n", 104 | "
" 105 | ], 106 | "text/plain": [ 107 | " review sentiment\n", 108 | "0 One of the other reviewers has mentioned that ... positive\n", 109 | "1 A wonderful little production.

The... positive\n", 110 | "2 I thought this was a wonderful way to spend ti... positive\n", 111 | "3 Basically there's a family where a little boy ... negative\n", 112 | "4 Petter Mattei's \"Love in the Time of Money\" is... positive" 113 | ] 114 | }, 115 | "execution_count": 2, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "file_path = r\"E:\\DATA SCIENCE\\NLP-Tea\\Data\\IMDB Dataset Movie Reviews\\IMDB Dataset.csv\"\n", 122 | "df = pd.read_csv(file_path)\n", 123 | "\n", 124 | "df.head() # (50000, 2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "id": "e7e24d28", 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "[\"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.

The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.

It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.

I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice (crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) Watching Oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.\",\n", 137 | " 'A wonderful little production.

The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.

The actors are extremely well chosen- Michael Sheen not only \"has got all the polari\" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\\'s of comedy and his life.

The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \\'dream\\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\\'s murals decorating every surface) are terribly well done.',\n", 138 | " 'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.

This was the most I\\'d laughed at one of Woody\\'s comedies in years (dare I say a decade?). While I\\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her \"sexy\" image and jumped right into a average, but spirited young woman.

This may not be the crown jewel of his career, but it was wittier than \"Devil Wears Prada\" and more interesting than \"Superman\" a great comedy to go see with friends.',\n", 139 | " \"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.

This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.

OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.

3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.\",\n", 140 | " 'Petter Mattei\\'s \"Love in the Time of Money\" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter.

This being a variation on the Arthur Schnitzler\\'s play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.

The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the case with most of the people we encounter.

The acting is good under Mr. Mattei\\'s direction. Steve Buscemi, Rosario Dawson, Carol Kane, Michael Imperioli, Adrian Grenier, and the rest of the talented cast, make these characters come alive.

We wish Mr. Mattei good luck and await anxiously for his next work.']" 141 | ] 142 | }, 143 | "execution_count": 3, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "text_data = list(df[\"review\"][:1000]) # First 1000 Row Only \n", 150 | "text_data[:5]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "id": "6677f1b3", 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "'a wonderful little production. the filming technique is very unassuming-\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "html_text = \"'A wonderful little production.

The filming technique is very unassuming-\"\n", 169 | "html_text = html_text.lower()\n", 170 | "clean_text = re.sub(r'<[^>]+>', '', html_text)\n", 171 | "print(clean_text)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "6f628691", 177 | "metadata": {}, 178 | "source": [ 179 | "## Preprocessing" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 42, 185 | "id": "a662d33a", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "[['one', 'reviewer', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'methe', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scene', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violence', 'hardcore', 'classic', 'use', 'wordit', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focus', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inwards', 'privacy', 'high', 'agenda', 'city', 'home', 'manyaryans', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'moreso', 'scuffle', 'death', 'stare', 'dodgy', 'dealing', 'shady', 'agreement', 'never', 'far', 'awayi', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'go', 'show', 'would', 'dare', 'forget', 'pretty', 'picture', 'painted', 'mainstream', 'audience', 'forget', 'charm', 'forget', 'romanceoz', 'mess', 'around', 'first', 'episode', 'ever', 'saw', 'struck', 'nasty', 'surreal', 'could', 'say', 'ready', 'watched', 'developed', 'taste', 'oz', 'got', 'accustomed', 'high', 'level', 'graphic', 'violence', 'violence', 'injustice', 'crooked', 'guard', 'sold', 'nickel', 'inmate', 'kill', 'order', 'get', 'away', 'well', 'mannered', 'middle', 'class', 'inmate', 'turned', 'prison', 'bitch', 'due', 'lack', 'street', 'skill', 'prison', 'experience', 'watching', 'oz', 'may', 'become', 'comfortable', 'uncomfortable', 'viewingthat', 'get', 'touch', 'darker', 'side'], ['wonderful', 'little', 'production', 'filming', 'technique', 'unassuming', 'oldtimebbc', 'fashion', 'give', 'comforting', 'sometimes', 'discomforting', 'sense', 'realism', 'entire', 'piece', 'actor', 'extremely', 'well', 'chosen', 'michael', 'sheen', 'got', 'polari', 'voice', 'pat', 'truly', 'see', 'seamless', 'editing', 'guided', 'reference', 'williams', 'diary', 'entry', 'well', 'worth', 'watching', 'terrificly', 'written', 'performed', 'piece', 'masterful', 'production', 'one', 'great', 'master', 'comedy', 'life', 'realism', 'really', 'come', 'home', 'little', 'thing', 'fantasy', 'guard', 'rather', 'use', 'traditional', 'dream', 'technique', 'remains', 'solid', 'disappears', 'play', 'knowledge', 'sens', 'particularly', 'scene', 'concerning', 'orton', 'halliwell', 'set', 'particularly', 'flat', 'halliwells', 'mural', 'decorating', 'every', 'surface', 'terribly', 'well', 'done'], ['thought', 'wonderful', 'way', 'spend', 'time', 'hot', 'summer', 'weekend', 'sitting', 'air', 'conditioned', 'theater', 'watching', 'lighthearted', 'comedy', 'plot', 'simplistic', 'dialogue', 'witty', 'character', 'likable', 'even', 'well', 'bread', 'suspected', 'serial', 'killer', 'may', 'disappointed', 'realize', 'match', 'point', 'risk', 'addiction', 'thought', 'proof', 'woody', 'allen', 'still', 'fully', 'control', 'style', 'many', 'u', 'grown', 'lovethis', 'would', 'laughed', 'one', 'woodys', 'comedy', 'year', 
'dare', 'say', 'decade', 'never', 'impressed', 'scarlet', 'johanson', 'managed', 'tone', 'sexy', 'image', 'jumped', 'right', 'average', 'spirited', 'young', 'womanthis', 'may', 'crown', 'jewel', 'career', 'wittier', 'devil', 'wear', 'prada', 'interesting', 'superman', 'great', 'comedy', 'go', 'see', 'friend']]\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "def preprocessing(text: str) -> list[str]:\n", 198 | " \"\"\"\n", 199 | " Preprocesses a given text:\n", 200 | " - Lowercases text\n", 201 | " - Contraction Handling\n", 202 | " - Removes punctuation and digits\n", 203 | " - Removes stopwords\n", 204 | " - Tokenizes into words\n", 205 | " - Applies lemmatization or stemming\n", 206 | "\n", 207 | " Args:\n", 208 | " document (str): The raw input text\n", 209 | "\n", 210 | " Returns:\n", 211 | " List of str: Cleaned and preprocessed text\n", 212 | " \"\"\"\n", 213 | "\n", 214 | " stop_words = set(stopwords.words('english'))\n", 215 | " lemmatizer = WordNetLemmatizer()\n", 216 | " stemmer = PorterStemmer()\n", 217 | "\n", 218 | " # Convert Text to Lowercase (Normalization)\n", 219 | " text_lower = text.lower()\n", 220 | " text_no_tags = re.sub(r'<[^>]+>', '', text_lower)\n", 221 | "\n", 222 | " # Contraction Handling\n", 223 | " text_no_tags = contractions.fix(text_no_tags)\n", 224 | "\n", 225 | " # Removing Punctuation\n", 226 | " text_no_punct = re.sub(r'[^a-zA-Z\\s]', '', text_no_tags) # \\' for keep apostrophes (e.g. don't, it's)\n", 227 | "\n", 228 | "\n", 229 | " # 3. Tokens\n", 230 | " tokens = re.split(r\"\\s+\", text_no_punct) \n", 231 | " tokens = [t for t in tokens if t]\n", 232 | " # or use nltk tokenizer\n", 233 | " tokens = word_tokenize(text_no_punct)\n", 234 | "\n", 235 | " # 4. Stop word removal\n", 236 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 237 | "\n", 238 | " # 5. 
Lemmatization \n", 239 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 240 | " # or stemmer\n", 241 | " stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]\n", 242 | "\n", 243 | " return lemma_tokens\n", 244 | "\n", 245 | "text_data = list(df[\"review\"][:100]) # First 100 Row Only\n", 246 | "preprocessed_text = [preprocessing(text) for text in text_data]\n", 247 | "print(preprocessed_text[:3])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "d9be009a", 253 | "metadata": {}, 254 | "source": [ 255 | "## From Scratch" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 86, 261 | "id": "2f324ffc", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "all_tokens =[]\n", 266 | "for lst_tokens in preprocessed_text:\n", 267 | " all_tokens.extend(lst_tokens)\n", 268 | "vocab = sorted(set(all_tokens)) # Unique Words\n", 269 | "\n", 270 | "\n", 271 | "def TermFrequency(term: str, doc: list[str]) :\n", 272 | " tf = 0\n", 273 | " if term not in doc:\n", 274 | " return tf\n", 275 | " for t in doc:\n", 276 | " if t == term:\n", 277 | " tf+=1\n", 278 | " return tf \n", 279 | " \n", 280 | "\n", 281 | "\n", 282 | "def BagOfWords(vocab: list, preprocessed_text: list[list]) -> np.ndarray :\n", 283 | "\n", 284 | " n_docs = len(preprocessed_text)\n", 285 | " n_vocab = len(vocab)\n", 286 | " \n", 287 | " bow_matrix = np.zeros(shape=(n_docs, n_vocab)) # (# documents, # vocabulary words)\n", 288 | "\n", 289 | " for doc_idx, doc in enumerate(preprocessed_text): # For Each Document \n", 290 | "\n", 291 | " bow_vec = np.zeros(shape=n_vocab)\n", 292 | " # For each word in the vocabulary, calculate its term frequency in this document\n", 293 | " for term_idx,term in enumerate(vocab): \n", 294 | " tf = TermFrequency(term=term, doc=doc)\n", 295 | " bow_vec[term_idx] = tf\n", 296 | "\n", 297 | " bow_matrix[doc_idx] = bow_vec\n", 298 | " return bow_matrix\n", 299 | "bag_of_word_matrix = BagOfWords(vocab, preprocessed_text)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 77, 305 | "id": "7c69b3af", 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 313 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.\n", 314 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.\n", 315 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 316 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 317 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 318 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 319 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 320 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 321 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 322 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 323 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 324 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 
0.]\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "print(bag_of_word_matrix[3][400:700])" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "ef5600dc", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "id": "f0d4bf46", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "\n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "id": "6c005e57", 353 | "metadata": {}, 354 | "source": [ 355 | "## Built in" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "33e86300", 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "Vocabulary: ['abbot' 'abbreviated' 'abetted' ... 'zoo' 'zoom' 'zwick']\n", 369 | "Bag of Words Matrix:\n", 370 | " [[0 0 0 ... 0 0 0]\n", 371 | " [0 0 0 ... 0 0 0]\n", 372 | " [0 0 0 ... 0 0 0]\n", 373 | " ...\n", 374 | " [0 0 0 ... 0 0 0]\n", 375 | " [0 0 0 ... 0 0 0]\n", 376 | " [0 0 0 ... 0 0 0]]\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "from sklearn.feature_extraction.text import CountVectorizer\n", 382 | "\n", 383 | "corpus = []\n", 384 | "for p in preprocessed_text:\n", 385 | " corpus.append(\" \".join(p))\n", 386 | "\n", 387 | "vectorizer = CountVectorizer()\n", 388 | "bow_matrix = vectorizer.fit_transform(corpus)\n", 389 | "bow_dense = bow_matrix.toarray()\n", 390 | "\n", 391 | "vocab = vectorizer.get_feature_names_out()\n", 392 | "print(\"Vocabulary:\", vocab)\n", 393 | "print(\"Bag of Words Matrix:\\n\", bow_dense)\n" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 89, 399 | "id": "57eaacd7", 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "(100, 4438)" 406 | ] 407 | }, 408 | "execution_count": 89, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "bow_dense.shape" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "2f6ca6d9", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "myenv", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.12.6" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 5 447 | } 448 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.1-Label Encoder and One Hot Encoder/2.1-label_and_oneHot_Encoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a4382364", 6 | "metadata": {}, 7 | "source": [ 8 | "# Libraries" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "167272d1", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading 
package wordnet to\n", 25 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import re\n", 33 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 34 | "import nltk\n", 35 | "from nltk.corpus import stopwords\n", 36 | "from nltk.tokenize import word_tokenize\n", 37 | "import contractions\n", 38 | "nltk.download('stopwords')\n", 39 | "nltk.download('wordnet')\n", 40 | "\n", 41 | "from typing import List" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "2d651715", 47 | "metadata": {}, 48 | "source": [ 49 | "# Data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "id": "cc53d23d", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | "
user_idbusiness_idtextdatecompliment_count
0AGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210
1NBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100
2-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080
3FjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_JwVery decent fried chicken2017-06-27 23:05:380
4ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5wwAppetizers.. platter special for lunch2012-10-06 19:43:090
\n", 129 | "
" 130 | ], 131 | "text/plain": [ 132 | " user_id business_id \\\n", 133 | "0 AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", 134 | "1 NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", 135 | "2 -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", 136 | "3 FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", 137 | "4 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", 138 | "\n", 139 | " text date \\\n", 140 | "0 Avengers time with the ladies. 2012-05-18 02:17:21 \n", 141 | "1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n", 142 | "2 It's open even when you think it isn't 2013-08-18 00:56:08 \n", 143 | "3 Very decent fried chicken 2017-06-27 23:05:38 \n", 144 | "4 Appetizers.. platter special for lunch 2012-10-06 19:43:09 \n", 145 | "\n", 146 | " compliment_count \n", 147 | "0 0 \n", 148 | "1 0 \n", 149 | "2 0 \n", 150 | "3 0 \n", 151 | "4 0 " 152 | ] 153 | }, 154 | "execution_count": 2, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "json_file_path = r\"E:\\DATA SCIENCE\\NLP-Tea\\Data\\yelp_academic_dataset_tip.json\\yelp_academic_dataset_tip.json\"\n", 161 | "df = pd.read_json(json_file_path, lines=True)\n", 162 | "\n", 163 | "df.head()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 3, 169 | "id": "7eeec26e", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "['Avengers time with the ladies.',\n", 176 | " 'They have lots of good deserts and tasty cuban sandwiches',\n", 177 | " \"It's open even when you think it isn't\",\n", 178 | " 'Very decent fried chicken',\n", 179 | " 'Appetizers.. platter special for lunch']" 180 | ] 181 | }, 182 | "execution_count": 3, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "text_data = list(df[\"text\"][:1000]) # First 1000 Row Only \n", 189 | "text_data[:5]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "806e1a12", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "['mohamad', 'fawzy', 'jfhbf', 'dvhbfehyv']\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "word_tokenize" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 180, 213 | "id": "5dc3e634", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ['open', 'even', 'think'], ['decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'lunch']]\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "def preprocessing(text: str) -> list[list[str]]:\n", 226 | " \"\"\"\n", 227 | " Preprocesses a given text:\n", 228 | " - Lowercases text\n", 229 | " - Contraction Handling\n", 230 | " - Removes punctuation and digits\n", 231 | " - Removes stopwords\n", 232 | " - Tokenizes into words\n", 233 | " - Applies lemmatization or stemming\n", 234 | "\n", 235 | " Args:\n", 236 | " document (str): The raw input text\n", 237 | "\n", 238 | " Returns:\n", 239 | " List of str: Cleaned and preprocessed text\n", 240 | "\n", 241 | " Example:\n", 242 | " >>> preprocessing(\"It's open even when you think it isn't\")\n", 243 | " [\"'s\", 'open', 'even', 'think', \"n't\"]\n", 244 | " \"\"\"\n", 245 | "\n", 246 | " stop_words = set(stopwords.words('english'))\n", 247 | " lemmatizer = WordNetLemmatizer()\n", 248 | 
"\n", 249 | " # Convert Text to Lowercase (Normalization)\n", 250 | " text_lower = text.lower()\n", 251 | "\n", 252 | " # Contraction Handling\n", 253 | " text_lower = contractions.fix(text_lower)\n", 254 | "\n", 255 | " # Removing Punctuation\n", 256 | " text_no_punct = re.sub(r'[^a-zA-Z\\s\\']', '', text_lower) # \\' for keep apostrophes (e.g. don't, it's)\n", 257 | "\n", 258 | " # 3. Tokens\n", 259 | " # tokens = word_tokenize(text_no_punct)\n", 260 | " tokens = re.split(r\"\\s+\", text_no_punct)\n", 261 | " \n", 262 | "\n", 263 | " # 4. Stop word removal\n", 264 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 265 | "\n", 266 | " # 5. Lemmatization \n", 267 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 268 | " \n", 269 | "\n", 270 | " return lemma_tokens\n", 271 | "\n", 272 | "text_data = list(df[\"text\"][:10]) # First 1000 Row Only\n", 273 | "preprocessed_text = [preprocessing(text) for text in text_data]\n", 274 | "print(preprocessed_text[:5])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 181, 280 | "id": "33d8ab7d", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "['appetizer', 'area', 'avenger', 'best', 'boring', 'center', 'cheeseburger', 'chicken', 'chili', 'city', 'cocacolaso', 'cool', 'cuban', 'cup', 'dec', 'decent', 'decorated', 'desert', 'downtown', 'eat', 'elf', 'even', 'far', 'fried', 'game', 'good', 'great', 'kid', 'lady', 'leave', 'lindenwold', 'lot', 'lunch', 'make', 'never', 'onion', 'open', \"patco's\", 'pickle', 'place', 'platter', 'pm', 'probably', 'relish', 'ride', 'sandwich', 'santa', 'saturday', 'silver', 'single', 'sleigh', 'special', 'spring', 'starbucks', 'stop', 'substitute', 'taco', 'tampa', 'tasty', 'th', 'think', 'time', 'train', 'ugh', 'vanilla', 'w', 'watch']\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "all_tokens =[]\n", 293 | "for lst_tokens in preprocessed_text:\n", 294 | " all_tokens.extend(lst_tokens)\n", 295 | "\n", 296 | "vocab = sorted(set(all_tokens)) # Unique Words\n", 297 | "print(vocab)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "id": "93c7f85b", 303 | "metadata": {}, 304 | "source": [ 305 | "# Label Encoder" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "7d94450f", 311 | "metadata": {}, 312 | "source": [ 313 | "## From Scratch" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 200, 319 | "id": "7edba0a2", 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 
'watch': 66}\n", 327 | "[[2, 61, 28], [31, 25, 17, 58, 12, 45], [36, 21, 60]]\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "def LabelEncoder(vocab: list) -> dict:\n", 333 | " \"\"\"\n", 334 | " Creates a label encoder that maps each unique word to a unique integer index.\n", 335 | "\n", 336 | " Args:\n", 337 | " vocab (list): A sorted list of unique words (vocabulary).\n", 338 | "\n", 339 | " Returns:\n", 340 | " dict: A dictionary mapping words to their corresponding index.\n", 341 | " \"\"\"\n", 342 | " word_to_index = {token: idx for idx, token in enumerate(vocab)}\n", 343 | " return word_to_index\n", 344 | "\n", 345 | "\n", 346 | "def Transform (preprocessed_text: list[list[str]], word_to_idx: dict) -> list[list[int]] :\n", 347 | " \"\"\"\n", 348 | " Transforms a list of tokenized text into lists of integer-encoded words.\n", 349 | "\n", 350 | " Args:\n", 351 | " preprocessed_text (list[list[str]]): A list of lists, where each sublist contains tokens from one sentence.\n", 352 | " word_to_idx (dict): A dictionary mapping words to unique integer indices.\n", 353 | "\n", 354 | " Returns:\n", 355 | " list: A list of lists, where each sublist contains the integer-encoded words for a sentence.\n", 356 | " \"\"\"\n", 357 | " data=[]\n", 358 | " for sentence in preprocessed_text :\n", 359 | " encoded_sentence = []\n", 360 | " for word in sentence:\n", 361 | " encoded_sentence.append(word_to_idx[word])\n", 362 | " data.append(encoded_sentence)\n", 363 | " return data\n", 364 | "\n", 365 | "\n", 366 | "word_to_idx = LabelEncoder(vocab= vocab)\n", 367 | "transformed_txt = Transform(preprocessed_text=preprocessed_text, word_to_idx=word_to_idx)\n", 368 | "print(word_to_idx)\n", 369 | "print(transformed_txt[:3])\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "1b2d2df5", 375 | "metadata": {}, 376 | "source": [ 377 | "## Built in" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 197, 383 | "id": "db04548e", 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "Encoded labels: [ 2 61 28 31 25 17 58 12 45 36 21 60 15 23 7 0 40 51 32 8 13 49 6 35\n", 391 | " 38 43 64 10 22 47 14 59 44 37 48 50 65 46 20 16 62 5 9 62 29 30 41 33\n", 392 | " 54 26 27 42 3 39 11 52 1 66 24 19 56 53 55 4 18 57 63 34]\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "from sklearn.preprocessing import LabelEncoder\n", 398 | "\n", 399 | "all_tokens =[]\n", 400 | "for lst_tokens in preprocessed_text:\n", 401 | " all_tokens.extend(lst_tokens) # All Words\n", 402 | "\n", 403 | "\n", 404 | "label_encoder = LabelEncoder()\n", 405 | "encoded_labels = label_encoder.fit_transform(all_tokens)\n", 406 | "\n", 407 | "# Output encoded labels and the mapping\n", 408 | "print(\"Encoded labels:\", encoded_labels)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 199, 414 | "id": "105c2030", 415 | "metadata": {}, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "[array([ 2, 61, 28]),\n", 421 | " array([31, 25, 17, 58, 12, 45]),\n", 422 | " array([36, 21, 60]),\n", 423 | " array([15, 23, 7]),\n", 424 | " array([ 0, 40, 51, 32]),\n", 425 | " array([ 8, 13, 49, 6, 35, 38, 43, 64, 10, 22]),\n", 426 | " array([47, 14, 59, 44, 37, 48, 50, 65, 46, 20, 16, 62, 5, 9, 62, 29, 30,\n", 427 | " 41, 33, 54, 26, 27]),\n", 428 | " array([42, 3, 39, 11, 52, 1, 66, 24, 19]),\n", 429 | " array([56]),\n", 430 | " array([53, 55, 4, 18, 57, 63, 34])]" 431 | ] 432 | }, 433 | 
"execution_count": 199, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "encoded_sentences = [label_encoder.transform(sentence) for sentence in preprocessed_text]\n", 440 | "encoded_sentences" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "95f03b91", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "efff3800", 454 | "metadata": {}, 455 | "source": [ 456 | "# One hot Encoding" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 282, 462 | "id": "f9f691ca", 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "[array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 470 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 471 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 472 | " 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 473 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 474 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", 475 | " 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 476 | " 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 477 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 478 | " 0])]\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "def OneHotEncoder(vocab: list) -> dict :\n", 484 | " \"\"\"\n", 485 | " Creates one-hot encoded vectors for each unique word in the vocabulary.\n", 486 | "\n", 487 | " Args:\n", 488 | " vocab (list): A sorted list of unique tokens.\n", 489 | "\n", 490 | " Returns:\n", 491 | " dict: A mapping from word to its one-hot encoded numpy array.\n", 492 | " \"\"\"\n", 493 | " word_to_idx = {word: idx for idx, word in enumerate(vocab)}\n", 494 | " vocab_size = len(word_to_idx)\n", 495 | " one_hot_dict = {}\n", 496 | "\n", 497 | " for word, idx in word_to_idx.items() :\n", 498 | " # print(word, idx)\n", 499 | " vec = np.zeros(shape=vocab_size, dtype=int)\n", 500 | " vec[idx] = 1\n", 501 | " one_hot_dict[word] = vec\n", 502 | "\n", 503 | " return one_hot_dict\n", 504 | "\n", 505 | "\n", 506 | "def TransformOneHot(preprocessed_text: list[list[str]], word_to_vec: dict) -> list[list[np.ndarray]]:\n", 507 | " \"\"\"\n", 508 | " Transforms a list of tokenized sentences into one-hot encoded vectors.\n", 509 | "\n", 510 | " Args:\n", 511 | " preprocessed_text (list of list of str): Tokenized sentences.\n", 512 | " word_to_vec (dict): A mapping from word to one-hot vector.\n", 513 | "\n", 514 | " Returns:\n", 515 | " list of list of np.ndarray: One-hot encoded representation of sentences.\n", 516 | " \"\"\"\n", 517 | " data = []\n", 518 | " for sentence in preprocessed_text:\n", 519 | " encoded_sentence = []\n", 520 | " for word in sentence:\n", 521 | " encoded_sentence.append(word_to_vec[word])\n", 522 | " data.append(encoded_sentence)\n", 523 | " return data\n", 524 | "\n", 525 | "\n", 526 | "\n", 527 | "\n", 528 | "one_hot_dict = OneHotEncoder(vocab)\n", 529 | "transformed_txt = TransformOneHot(preprocessed_text=preprocessed_text, word_to_vec=one_hot_dict)\n", 530 | "# print(one_hot_dict)\n", 531 | "print(transformed_txt[0])" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "id": "aa185195", 537 | "metadata": {}, 538 | "source": [ 539 | "## 
Built in" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 398, 545 | "id": "44c34f7a", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "from sklearn.preprocessing import LabelEncoder\n", 550 | "from sklearn.preprocessing import OneHotEncoder\n", 551 | "\n", 552 | "from numpy import array" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 418, 558 | "id": "9d58a0ac", 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "68\n", 566 | "67\n", 567 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "all_tokens =[]\n", 573 | "for lst_tokens in preprocessed_text:\n", 574 | " all_tokens.extend(lst_tokens) # All Words\n", 575 | "vocab = sorted(set(all_tokens)) # Unique Words\n", 576 | "\n", 577 | "print(len(all_tokens))\n", 578 | "print(len(vocab))\n", 579 | "w_idx = {w:i for i,w in enumerate(vocab)}\n", 580 | "print(w_idx)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 419, 586 | "id": "0c7cffee", 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "# Label Encode\n", 591 | "label_encoder = LabelEncoder()\n", 592 | "integer_encoded = label_encoder.fit_transform(array(vocab)) \n" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 420, 598 | "id": "9b16dccb", 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}\n", 606 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 
17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}\n" 607 | ] 608 | }, 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "([2, 61, 28], [2, 61, 28])" 613 | ] 614 | }, 615 | "execution_count": 420, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "# print(integer_encoded)\n", 622 | "# print(vocab)\n", 623 | "# print(preprocessed_text )\n", 624 | "word2id = dict(zip(vocab, integer_encoded))\n", 625 | "print(word2id)\n", 626 | "print(w_idx)\n", 627 | "\n", 628 | "datamodel=[]\n", 629 | "data_me=[]\n", 630 | "\n", 631 | "for sentence in preprocessed_text:\n", 632 | " lmodel=[]\n", 633 | " lme=[]\n", 634 | "\n", 635 | " for w in sentence:\n", 636 | " lmodel.append(word2id[w])\n", 637 | " lme.append(w_idx[w])\n", 638 | " datamodel.append(lmodel)\n", 639 | " data_me.append(lme)\n", 640 | "#--------------------------------------------*************----------------\n", 641 | "data_me[0], datamodel[0]" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "id": "4d4633d2", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 421, 655 | "id": "42bbccf7", 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "from numpy import array, reshape\n", 660 | "\n", 661 | "integer_encoded = integer_encoded.reshape(-1, 1)\n", 662 | "\n", 663 | "onehot_encoder = OneHotEncoder(sparse_output=False)\n", 664 | "onehot_encoded = onehot_encoder.fit_transform(integer_encoded)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 423, 670 | "id": "b78d1f5c", 671 | "metadata": {}, 672 | "outputs": [ 673 | { 674 | "name": "stdout", 675 | "output_type": "stream", 676 | "text": [ 677 | "[array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 678 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 679 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 680 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 681 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 682 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 683 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 684 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n", 685 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 686 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "word2onehot = dict(zip(vocab, onehot_encoded))\n", 692 | "\n", 693 | "data=[]\n", 694 | "for sentence in preprocessed_text:\n", 695 | " vec = []\n", 696 | " for w in sentence:\n", 
697 | " vec.append(word2onehot[w])\n", 698 | " data.append(vec)\n", 699 | "print(data[0])" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "5d3dc3b2", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [] 709 | } 710 | ], 711 | "metadata": { 712 | "kernelspec": { 713 | "display_name": "myenv", 714 | "language": "python", 715 | "name": "python3" 716 | }, 717 | "language_info": { 718 | "codemirror_mode": { 719 | "name": "ipython", 720 | "version": 3 721 | }, 722 | "file_extension": ".py", 723 | "mimetype": "text/x-python", 724 | "name": "python", 725 | "nbconvert_exporter": "python", 726 | "pygments_lexer": "ipython3", 727 | "version": "3.12.6" 728 | } 729 | }, 730 | "nbformat": 4, 731 | "nbformat_minor": 5 732 | } 733 | --------------------------------------------------------------------------------
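The label/one-hot notebook above builds each encoder twice: first by hand (a word-to-index dict for label encoding, a zero vector with a single 1 for one-hot), then with scikit-learn. A condensed, self-contained sketch of that pipeline is given below for reference; it is illustrative only. The three-sentence corpus and the `simple_preprocess` helper are stand-ins for the Yelp/IMDB data and the NLTK stopword-plus-lemmatizer pipeline used in the notebooks, and the scikit-learn calls assume a version recent enough for `OneHotEncoder(sparse_output=False)`, which the notebook itself already uses.

```python
# Minimal sketch (assumptions: tiny inline corpus instead of the Yelp/IMDB files,
# whitespace tokenization instead of the NLTK pipeline, so no downloads are needed).
import re

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

corpus = [
    "Avengers time with the ladies.",
    "They have lots of good deserts and tasty cuban sandwiches",
    "Very decent fried chicken",
]

def simple_preprocess(text: str) -> list[str]:
    """Lowercase, keep only letters/apostrophes/whitespace, split on whitespace."""
    text = text.lower()
    text = re.sub(r"[^a-z\s']", "", text)                # letters, apostrophes, whitespace only
    return [tok for tok in re.split(r"\s+", text) if tok]

docs = [simple_preprocess(t) for t in corpus]
vocab = sorted({tok for doc in docs for tok in doc})      # unique tokens, sorted for stable ids

# --- From scratch: word -> integer id, and id -> one-hot row ----------------------
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
identity = np.eye(len(vocab), dtype=int)                  # row i is the one-hot vector for id i
encoded_docs = [[word_to_idx[w] for w in doc] for doc in docs]
one_hot_docs = [identity[ids] for ids in encoded_docs]    # shape (len(doc), len(vocab)) per doc

# --- Same thing with scikit-learn --------------------------------------------------
label_encoder = LabelEncoder().fit(vocab)
sk_encoded_docs = [label_encoder.transform(doc) for doc in docs]

onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(np.array(vocab).reshape(-1, 1))
sk_one_hot_docs = [onehot_encoder.transform(np.array(doc).reshape(-1, 1)) for doc in docs]

print(word_to_idx)
print(encoded_docs[0], list(sk_encoded_docs[0]))          # the two integer encodings agree
print(np.array_equal(one_hot_docs[0], sk_one_hot_docs[0]))
```

Fitting `LabelEncoder` on the sorted vocabulary is what makes its integer ids coincide with the hand-built `word_to_idx` mapping; this is also why the notebook's `data_me` and `datamodel` encodings come out identical.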