├── global_requirements.txt ├── 03-Deep Learning NLP (Models) ├── 3.5-Transformers │ ├── Theory.md │ └── Theory │ │ ├── 3.5.2-attention.md │ │ ├── 3.5.1-seq2seq.md │ │ └── 3.5.3-self attention.md ├── Deep Learning NLP (Models).md ├── 3.1-CNNS │ └── Theory.md ├── 3.4-LSTM │ └── Theory.md ├── 3.3-GRU │ └── Theory.md └── 3.2-RNNs │ ├── 3.2-RNNs.ipynb │ └── Theory.md ├── 04-crewai-agents ├── 4.2-Multi Agent Systems (CrewAI) │ └── README.md └── 4.1-AI Agents using CrewAI ( Abu Bakr Soliman) │ ├── crewai-agents │ ├── crewai_agents │ │ ├── __init__.py │ │ ├── agents │ │ │ ├── __init__.py │ │ │ ├── a1_search_queries_agent.py │ │ │ ├── a4_procurement_report.py │ │ │ ├── a2_search_engine_agent.py │ │ │ └── a3_scraping_agent.py │ │ ├── tasks │ │ │ ├── __init__.py │ │ │ ├── t2_search_engine_task.py │ │ │ ├── t4_procurement_report_task.py │ │ │ ├── t1_search_queries_task.py │ │ │ └── t3_scraping_task.py │ │ ├── config.py │ │ └── utilis.py │ ├── outputs │ │ └── ai-agent-output │ │ │ ├── step_3_scraping_results.json │ │ │ ├── step_4_procurement_report.html │ │ │ ├── step_1_suggested_search_queries.json │ │ │ └── step_2_search_results.json │ ├── requirements.txt │ ├── .gitignore │ ├── tests │ │ └── test.py │ ├── examples │ │ ├── ex2_run_search_engine_agent.py │ │ ├── ex1_run_search_queries_agent.py │ │ └── ex3_run_procurement_report_agent.py │ └── README.md │ └── README.MD ├── 01-Text-Preprocessing ├── Text-Preprocessing.md ├── requirements.txt └── 1.1-Text-Preprocessing │ ├── Theory.md │ └── preprocessing.ipynb ├── Questions └── assets │ ├── image.png │ ├── image1.png │ ├── image10.png │ ├── image11.png │ ├── image12.png │ ├── image13.png │ ├── image14.png │ ├── image15.png │ ├── image16.png │ ├── image17.png │ ├── image18.png │ ├── image19.png │ ├── image2.png │ ├── image20.png │ ├── image21.png │ ├── image22.png │ ├── image23.png │ ├── image24.png │ ├── image25.png │ ├── image26.png │ ├── image27.png │ ├── image28.png │ ├── image29.png │ ├── image3.png │ ├── image30.png │ ├── image31.png │ ├── image32.png │ ├── image33.png │ ├── image34.png │ ├── image4.png │ ├── image5.png │ ├── image6.png │ ├── image7.png │ ├── image8.png │ └── image9.png ├── 02-Word Embeddings ├── requirements.txt ├── 2.5-FastText │ ├── Theory.md │ └── 2.5-fast_text.ipynb ├── Word Embeddings.md ├── 2.2-BOW │ ├── Theory.md │ └── 2.2-BOW.ipynb ├── 2.3-TF_IDF │ ├── Theory.md │ └── 2.3-TF-IDF.ipynb ├── 2.1-Label Encoder and One Hot Encoder │ ├── Theory.md │ └── 2.1-label_and_oneHot_Encoder.ipynb └── 2.4-Word2Vec │ └── Theory.md ├── Data └── data.md ├── _config.yaml ├── LICENSE └── README.md /global_requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/Deep Learning NLP (Models).md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /04-crewai-agents/4.2-Multi Agent Systems (CrewAI)/README.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /01-Text-Preprocessing/Text-Preprocessing.md: 
-------------------------------------------------------------------------------- 1 | # 01-Text-Preprocessing 2 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Questions/assets/image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image.png -------------------------------------------------------------------------------- /01-Text-Preprocessing/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas version>=2.2.3 2 | nltk version>=3.9.1 3 | emoji version>=2.14.1 4 | -------------------------------------------------------------------------------- /Questions/assets/image1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image1.png -------------------------------------------------------------------------------- /Questions/assets/image10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image10.png -------------------------------------------------------------------------------- /Questions/assets/image11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image11.png -------------------------------------------------------------------------------- /Questions/assets/image12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image12.png -------------------------------------------------------------------------------- /Questions/assets/image13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image13.png -------------------------------------------------------------------------------- /Questions/assets/image14.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image14.png -------------------------------------------------------------------------------- /Questions/assets/image15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image15.png 
-------------------------------------------------------------------------------- /Questions/assets/image16.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image16.png -------------------------------------------------------------------------------- /Questions/assets/image17.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image17.png -------------------------------------------------------------------------------- /Questions/assets/image18.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image18.png -------------------------------------------------------------------------------- /Questions/assets/image19.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image19.png -------------------------------------------------------------------------------- /Questions/assets/image2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image2.png -------------------------------------------------------------------------------- /Questions/assets/image20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image20.png -------------------------------------------------------------------------------- /Questions/assets/image21.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image21.png -------------------------------------------------------------------------------- /Questions/assets/image22.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image22.png -------------------------------------------------------------------------------- /Questions/assets/image23.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image23.png -------------------------------------------------------------------------------- /Questions/assets/image24.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image24.png -------------------------------------------------------------------------------- /Questions/assets/image25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image25.png -------------------------------------------------------------------------------- /Questions/assets/image26.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image26.png -------------------------------------------------------------------------------- /Questions/assets/image27.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image27.png -------------------------------------------------------------------------------- /Questions/assets/image28.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image28.png -------------------------------------------------------------------------------- /Questions/assets/image29.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image29.png -------------------------------------------------------------------------------- /Questions/assets/image3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image3.png -------------------------------------------------------------------------------- /Questions/assets/image30.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image30.png -------------------------------------------------------------------------------- /Questions/assets/image31.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image31.png -------------------------------------------------------------------------------- /Questions/assets/image32.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image32.png -------------------------------------------------------------------------------- /Questions/assets/image33.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image33.png -------------------------------------------------------------------------------- /Questions/assets/image34.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image34.png -------------------------------------------------------------------------------- /Questions/assets/image4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image4.png -------------------------------------------------------------------------------- /Questions/assets/image5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image5.png -------------------------------------------------------------------------------- /Questions/assets/image6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image6.png -------------------------------------------------------------------------------- /Questions/assets/image7.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image7.png -------------------------------------------------------------------------------- /Questions/assets/image8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image8.png -------------------------------------------------------------------------------- /Questions/assets/image9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Fawzy-AI-Explorer/NLP-Tea/HEAD/Questions/assets/image9.png -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_3_scraping_results.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_4_procurement_report.html: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/requirements.txt: -------------------------------------------------------------------------------- 1 | crewai 2 | agentops 3 | tavily-python 4 | scrapegraph-py 5 | langchain -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .venv 3 | .env 4 | __pycache__/ 5 | *.pyc 6 | *.log 7 | agentops.log 8 | agentops-tmp.log -------------------------------------------------------------------------------- /02-Word Embeddings/requirements.txt: -------------------------------------------------------------------------------- 1 | pandas version>=2.2.3 2 | nltk version>=3.9.1 3 | emoji version>=2.14.1 4 | contractions>=0.1.73 5 | scikit-learn>=1.5.2 6 | numpy>=1.26.3 7 | gensim>=4.3.2 8 | fasttext>=0.9.2 9 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_1_suggested_search_queries.json: -------------------------------------------------------------------------------- 1 | { 2 | "queries": [ 3 | "Professional Development Book Egypt | Best Prices", 4 | "Professional Development Book Egypt | Compare Prices Across Sites" 5 | ] 6 | } -------------------------------------------------------------------------------- /Data/data.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | ## 01- Text Preprocessing 4 | [yelp_academic_dataset_tip](https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset/data) 5 | 6 | 7 | ## 02- Word Embeddings 8 | 9 | [IMDB Dataset Movie Reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/config.py: 
-------------------------------------------------------------------------------- 1 | from crewai import LLM 2 | 3 | 4 | # set the output directory for the agent 5 | output_dir= r"outputs/ai-agent-output" 6 | 7 | 8 | # Initialize LLM 9 | llm = LLM( 10 | model="ollama/deepseek-r1", 11 | base_url="http://localhost:11434", 12 | temperature=0.5 13 | ) 14 | 15 | 16 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.5-FastText/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 |
5 | 6 | [02 - BOW](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 7 |
8 | 9 | [03 - TF-IDF](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.3-TF_IDF) 10 |
11 | 12 | [04 - Word2Vec](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.4-Word2Vec) 13 |
14 | 15 | ## 05 - FastText 16 | 17 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a1_search_queries_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai_agents.config import llm 3 | 4 | 5 | search_queries_recommendation_agent = Agent( 6 | role="Search Queries Recommendation Agent", 7 | goal="\n".join([ 8 | "To provide a list of suggested search queries to be passed to the search engine.", 9 | "The queries must be varied and looking for specific items." 10 | ]), 11 | backstory="The agent is designed to help in looking for products by providing a list of suggested search queries to be passed to the search engine based on the context provided.", 12 | llm=llm, 13 | verbose=True, 14 | ) -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a4_procurement_report.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai_agents.config import llm 3 | 4 | 5 | procurement_report_author_agent = Agent( 6 | role="Procurement Report Author Agent", 7 | goal="To generate a professional, dynamic HTML page for the procurement report that incorporates product data, price comparisons, and company-specific insights.", 8 | backstory=( 9 | "The agent is designed to assist in generating a professional HTML page for a procurement report. " 10 | "It gathers data from various websites, compares product prices, and structures the report according to the company's specific requirements. " 11 | "The agent should tailor the report by considering the company's procurement goals, budget constraints, and preferred suppliers." 12 | ), 13 | llm=llm, 14 | verbose=True, 15 | ) -------------------------------------------------------------------------------- /_config.yaml: -------------------------------------------------------------------------------- 1 | title: NLP-Tea 2 | author: Mohammad Fawzy 3 | description: my journey learning Natural Language Processing. It includes theory notes, code examples, and useful resources for understanding and applying NLP concepts. 
4 | remote_theme: daattali/beautiful-jekyll@6.0.1 5 | 6 | ############################################### 7 | # --- List of links in the navigation bar --- # 8 | ############################################### 9 | 10 | navbar-links: 11 | About Me: https://www.linkedin.com/in/mohammad-fawzy-438b05261/ 12 | 13 | ################ 14 | # --- Logo --- # 15 | ################ 16 | 17 | avatar: "/assets/img/avatar-icon.png" 18 | round-avatar: true 19 | 20 | 21 | social-network-links: 22 | email: "moha.fawzy63@gmail.com" 23 | linkedin: mohammad-fawzy-438b05261 24 | rss: true # remove this line if you don't want to show an RSS link at the bottom 25 | github: Fawzy-AI-Explorer 26 | kaggle: mohammadfawzy 27 | youtube: "@kiloeducation360" 28 | telegram: mohammad_fawzy_m 29 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t2_search_engine_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | import os 5 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 6 | from crewai_agents.config import output_dir 7 | 8 | class SignleSearchResult(BaseModel): 9 | title: str 10 | url: str 11 | content: str 12 | score: float 13 | search_query: str 14 | 15 | class AllSearchResults(BaseModel): 16 | results: List[SignleSearchResult] 17 | 18 | search_engine_task = Task( 19 | description="\n".join([ 20 | "The task is to search for products based on the suggested search queries.", 21 | ]), 22 | expected_output="A JSON object containing the search results.", 23 | output_json=AllSearchResults, 24 | output_file=os.path.join(output_dir, "step_2_search_results.json"), 25 | agent=search_engine_agent, 26 | ) 27 | # {queries} → pulls the list generated by Task #1. 28 | # after Task #2 completes, the context will contain a Python list under the key results. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Mohmmad Fawzy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/utilis.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # Load environment variables from a .env file 4 | from dotenv import load_dotenv 5 | load_dotenv() 6 | 7 | 8 | def get_agentops_api_key() -> str: 9 | """ 10 | Returns AgentOps API key from the environment. 11 | """ 12 | key = os.getenv("AGENTOPS_API_KEY") 13 | if not key: 14 | raise RuntimeError("AGENTOPS_API_KEY not found in environment") 15 | return key 16 | 17 | def set_agentops_api_key(api_key): 18 | """ 19 | Sets AgentOps API key as an environment variable. 20 | """ 21 | os.environ["AGENTOPS_API_KEY"] = api_key 22 | 23 | 24 | def get_tavily_api_key() -> str: 25 | """ 26 | Returns TAVILY_API_KEY from the environment. 27 | """ 28 | key = os.getenv("TAVILY_API_KEY") 29 | if not key: 30 | raise RuntimeError("TAVILY_API_KEY not found in environment") 31 | return key 32 | 33 | def get_scrap_api_key() -> str: 34 | """ 35 | Returns SCRAP_API_KEY from the environment. 36 | """ 37 | key = os.getenv("SCRAP_API_KEY") 38 | if not key: 39 | raise RuntimeError("SCRAP_API_KEY not found in environment") 40 | return key -------------------------------------------------------------------------------- /02-Word Embeddings/Word Embeddings.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | ## What is Word Embedding ? 4 | 5 | Inputs to Machine learning algorithms are Numbers (Scalars, Vectors).
6 | Text must be converted into vectors.
7 | 8 | Word embedding is a way of representing words as vectors in a multi-dimensional space, where the distance between vectors reflects the similarity and relationships between the words.
9 | 10 | In other words, it represents words in a way that machines can understand.
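As a quick illustration of the idea, the sketch below turns a tiny toy corpus into count vectors with scikit-learn (already listed in this section's requirements). It is only a sketch, not code from this repo's notebooks, and the sentences and variable names are made up for the example:

```python
# Sketch: converting raw text into numeric vectors (assumes scikit-learn is installed)
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["i love nlp", "nlp loves vectors"]   # toy documents, for illustration only

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)           # one row per document, one column per vocabulary word

print(vectorizer.get_feature_names_out())      # the learned vocabulary
print(X.toarray())                             # each document represented as a vector of counts
```

Each approach listed below is essentially a different rule for filling in those numbers.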
11 | 12 | There are two main Approaches for word embedding: 13 | - Frequency Based Embedding 14 | - [Label (integer) Encoding](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 15 | - [One-Hot encoded vector](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 16 | - [Bag of Word (BOW) Count Vector](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 17 | - [Term Frequency- Inverse Document frequency (TF-IDF) Vector](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.3-TF_IDF) 18 | - Prediction Based Embedding 19 | - Word2Vec 20 | - CBOW 21 | - Skip Gram 22 | - Negative Sampling 23 | - Fast Text 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a2_search_engine_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai.tools import tool 3 | 4 | from tavily import TavilyClient 5 | 6 | from crewai_agents.config import llm 7 | from crewai_agents.utilis import get_tavily_api_key 8 | 9 | tavily_api_key = get_tavily_api_key() 10 | tavily_client = TavilyClient(tavily_api_key) 11 | 12 | 13 | @tool # Decorator indicating this function interacts with an external tool (Tavily) 14 | def search_engine_tool(query: str): 15 | """Useful for search-based queries. Use this to find current information about any query related pages using a search engine""" 16 | print(f"[DEBUG] Searching with query: {query}") 17 | return tavily_client.search(query) 18 | 19 | search_engine_agent = Agent( 20 | role="Search Engine Agent", 21 | goal=( 22 | "You are a web search expert. \n" 23 | "When you need to look up a product, call the tool **search_engine_tool**. \n" 24 | "Format your tool call exactly as:\n\n" 25 | "Action: search_engine_tool\n" 26 | "Action Input: {\"query\": \"\"}\n\n" 27 | "Then wait for the Observation before proceeding." 28 | ), 29 | # goal="To search for products based on the suggested search query", 30 | backstory="The agent is designed to help in looking for products by searching for products based on the suggested search queries.", 31 | llm=llm, 32 | verbose=True, 33 | tools=[search_engine_tool] # New 34 | ) -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t4_procurement_report_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | import os 3 | from crewai_agents.agents.a4_procurement_report import procurement_report_author_agent 4 | from crewai_agents.config import output_dir 5 | 6 | 7 | 8 | procurement_report_author_task = Task( 9 | description="\n".join([ 10 | "The task is to generate a professional HTML page for the procurement report with the following structure:", 11 | "1. Executive Summary: A brief overview of the procurement process and key findings.", 12 | "2. Introduction: An introduction to the purpose and scope of the report, including company-specific insights.", 13 | "3. Methodology: A detailed description of the methods used to gather and compare prices from different sources.", 14 | "4. 
Findings: A dynamic table displaying product data (title, price, capacity, material) at least 5 products sourced from multiple websites.", 15 | "5. Analysis: In-depth analysis of the findings, highlighting significant trends, price discrepancies, and recommendations for suppliers.", 16 | "6. Recommendations: Actionable procurement recommendations based on the analysis, including potential supplier choices.", 17 | "7. Conclusion: A concise summary of the report with key takeaways and next steps.", 18 | "8. Appendices: Any supplementary data, charts, or raw product data." 19 | ]), 20 | 21 | expected_output="A professional, fully formatted HTML procurement report with dynamic content based on provided product data.", 22 | output_file=os.path.join(output_dir, "step_4_procurement_report.html"), 23 | agent=procurement_report_author_agent, 24 | ) -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory/3.5.2-attention.md: -------------------------------------------------------------------------------- 1 | ## Attention Mechanism 2 | To solve the bottleneck issue, the **Attention mechanism** was introduced. Instead of relying on a single context vector, Attention assigns different weights to different parts of the input sequence, allowing the decoder to focus on relevant words at each step. 3 | - enabling the decoder to look at all encoder outputs (Weighted). 4 | - Reduces the reliance on a single context vector. 5 | 6 | **Benefits of Attention** 7 | - **Improves performance on long sequences** by dynamically selecting relevant parts of the input. 8 | - **Eliminates the fixed-size bottleneck** by allowing the decoder to access all hidden states of the encoder. 9 | 10 | ![image](https://github.com/user-attachments/assets/78f2ca58-ddb5-4d22-9a82-4b95f37f6cb0) 11 | 12 | ![image](https://github.com/user-attachments/assets/9332af03-e0dd-48a3-ae13-dbdd7d8942f4) 13 | 14 | ### Attention Block Calculations 15 | 16 | - Inputs : (S(i), h1,h2,h3,.....,hn) 17 | - Output : S(i)~ 18 | 19 | 1. Calc Score 20 | Score (a,b) = a.b or f(W.a + W.b) 21 | 22 | - Score (s0, h1) = α1 23 | - Score (s0, h2) = α2 24 | - Score (s0, h3) = α3 25 | 3. Soft max over Scores 26 | - (α1 + α2 + α3 = 1) 27 | 4. context vector =>> 28 | - c(0) = α1.h1 + α2.h2 + α3.h3 29 | 5. Combine Context and Decoder State 30 | - s(0)~ = tanh (s0, c0) 31 | --- 32 | - α(i) = Score (s0, hi) 33 | - softmax (α) 34 | - C(0) = SUM (α(i).h(i)) 35 | - s(0)~ = tanh (s0, c0) 36 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.2-BOW/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 |
5 | 6 | # 02 - Bag Of Words 7 | 8 | ## What is Bag of Words (BoW)? 9 | 10 | convert text into numerical. It treats a document as an unordered collection (or "bag") of words, ignoring word order and structure. Each document is represented as a vector where each dimension corresponds to the frequency (or presence) of a word from a vocabulary(Unique Words). 11 | 12 | 13 | 14 | ## Steps 15 | 16 | 1. Prepare your corpus 17 | 2. Preprocessing (corpus) 18 | 3. Create Vocabulary (unique words in the corpus) 19 | 4. Calculate count of vocab words (histogram) in each document 20 | - For Each Doc: create a vector of word counts 21 | - Calculate the count of each vocab word 22 | - Each position in the vector corresponds to a word in the vocabulary (number of times that word appears in the document) 23 | 24 | 25 | For documents not considered during Vocab design , they may contain some words not in vocabulary (Out of Vocab). Those words are ignored. 26 | 27 | 28 | ## Limitations: 29 | - No context 30 | - Ignores word order, syntax, and semantic relationships 31 | - High dimensionality 32 | - large vocabulary (large Number of Unique Words) => 33 | - Sparse data 34 | - Most values are zeros 35 | - BoW is designed for representing entire documents (or sentences) as vectors, not individual words 36 | 37 | 38 | 39 | W1 W2 W3 W4 ...............Wv ==> Vocab (Unique Words) 40 | Doc1 [ ] => len = len(vocab) = len (Unique words) 41 | Doc2 [ ] 42 | Doc3 [ ] 43 | 44 | --- 45 | --- 46 | --- 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t1_search_queries_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | import json 5 | import os 6 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 7 | from crewai_agents.config import output_dir 8 | 9 | # no_keywords=10 10 | class SuggestedSearchQueries(BaseModel): 11 | queries: List[str] = Field(..., title="Suggested search queries to be passed to the search engine", 12 | min_items=1, max_items=3) 13 | 14 | search_queries_recommendation_task = Task( 15 | description="\n".join([ 16 | "Rankyx is looking to buy {product_name} at the best prices (value for a price strategy)", 17 | "The campany target any of these websites to buy from: {websites_list}", 18 | "The company wants to reach all available proucts on the internet to be compared later in another stage.", 19 | "The stores must sell the product in {country_name}", 20 | "Generate at maximum {no_keywords} queries.", 21 | "The search keywords must be in {language} language.", 22 | "Search keywords must contains specific brands, types or technologies. Avoid general keywords.", 23 | "The search query must reach an ecommerce webpage for product, and not a blog or listing page." 24 | ]), 25 | expected_output="A JSON object containing a list of suggested search queries.", 26 | output_json=SuggestedSearchQueries, 27 | output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json"), 28 | agent=search_queries_recommendation_agent 29 | ) 30 | 31 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 32 | # CrewAI will automatically make that available to 33 | # Task #2 under the name of the field—here, queries. 
34 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT IMPORTANT IMPORTANT IMPORTANT 35 | 36 | # In sequential mode, CrewAI will automatically merge the fields of Task #1’s output_json model into the next task’s context -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.1-CNNS/Theory.md: -------------------------------------------------------------------------------- 1 | # 1D CNN 2 | 1D CNN (1-dimensional convolutional neural network) is a type of neural network that learns patterns in 1D data. It’s often used for: 3 | - Time series data 4 | - Text or word sequences 5 | - Sensor data 6 | - Audio signals 7 | 8 | 9 | Imagine you have a row of numbers (Word embeddings) 10 | - A 1D CNN uses a small filter (like a window) that slides over the row and detects patterns 11 | 12 | 13 | Benefits of 1D CNN 14 | - Fast and efficient 15 | - Good at finding local patterns 16 | - Needs fewer parameters than RNNs or LSTMs 17 | - Can handle long sequences if combined with pooling 18 | 19 | 20 | Shape of Input 21 | A 1D CNN expects input like this: 22 | - (samples, sequence_length, channels) 23 | - (200, 100, 30) => 200 Sentences , each one 100 word, each word vec 30 24 | 25 | 26 | 27 | (10, 3) → 10 words per sentence, 3 features per word 28 | - ![image](https://github.com/user-attachments/assets/d9ef5745-7585-44b9-9bcb-81839013731a) 29 | 30 | Conv1D layer: 1 filter, kernel size = 3 (3*3(As number of features = 3)) 31 | - ![image](https://github.com/user-attachments/assets/2c1799dd-becc-496b-a01c-d61d985556a1) 32 | 33 | - output shape = 10-3+1=8 => (8, 1) 34 | 35 | Conv1D layer 2 filters, kernel size = 3 (3*3(As number of features = 3)) 36 | - output shape = 10-3+1=8 => (8, 2) 37 | - ![image](https://github.com/user-attachments/assets/cfd84896-5c1c-4b7d-a835-7fcc34d4e959) 38 | 39 | 40 | 41 | - Input shape (99, 30) 42 | - Conv1D (10 Filter, Shape = 3, padding = "Valid", stride = 1) 43 | - (99 - 3)/1 + 1 = 97 44 | - Shape = (97, 10) 45 | - Conv1D (20 Filter, Shape = 3, padding = "Valid", stride = 2) 46 | - (97 - 3)/2 + 1 = 48 47 | - Shape = (48, 20) 48 | - MaxPooling1D layer: pool size = 2 49 | - output shape = (24, 20) 50 | 51 | ![image](https://github.com/user-attachments/assets/b33793be-3f17-43aa-a655-30221d9e43cf) 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory/3.5.1-seq2seq.md: -------------------------------------------------------------------------------- 1 | ## Encoder-Decoder Sequence-to-Sequence Model 2 | The **Encoder-Decoder** architecture commonly used for tasks that involve transforming one sequence into another, such as **machine translation, text summarization. 3 | 4 | ### two main components: 5 | 1. **Encoder**: Processes the input sequence and converts it into a fixed-length context vector (hidden state). This vector captures the Whole of the input. 6 | compressed summary of the entire input sequence, capturing its meaning and structure. This vector is then passed to the **decoder**, which generates the output sequence. 7 | a(t) = F(Wxa * X + Waa * a(t-1) + ba) 8 | size of hidden = number of nodes in RNN 9 | 2. Decoder: Takes the context vector and generates the output sequence, step by step. 
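To make the two components concrete, here is a minimal runnable sketch. It assumes PyTorch and GRU cells purely for illustration; it is not code from this repository, and the notebooks here may use a different framework:

```python
# Minimal encoder-decoder sketch (assumption: PyTorch, GRU cells)
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=32, hid_dim=64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)

    def forward(self, src):                    # src: (batch, src_len) of token ids
        _, hidden = self.rnn(self.emb(src))    # hidden: (1, batch, hid_dim) = context vector
        return hidden

class Decoder(nn.Module):
    def __init__(self, vocab_size, emb_dim=32, hid_dim=64):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, batch_first=True)
        self.out = nn.Linear(hid_dim, vocab_size)

    def forward(self, prev_token, hidden):     # prev_token: (batch, 1) = previously emitted token
        output, hidden = self.rnn(self.emb(prev_token), hidden)
        return self.out(output), hidden        # logits over the vocabulary + updated state

# Toy usage: the whole source sentence is squeezed into `context`,
# then the decoder generates one token at a time from that single vector.
enc, dec = Encoder(vocab_size=100), Decoder(vocab_size=100)
src = torch.randint(0, 100, (2, 7))            # 2 sentences, 7 tokens each
context = enc(src)
logits, state = dec(torch.randint(0, 100, (2, 1)), context)
print(logits.shape)                            # torch.Size([2, 1, 100])
```

That single `context` tensor is exactly the fixed-size bottleneck discussed in the cons below.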
10 | 11 | This architecture was originally built using **Recurrent Neural Networks (RNNs)**, specifically **Long Short-Term Memory (LSTM)** and **Gated Recurrent Unit (GRU)** networks. 12 | 13 | --- 14 | **Cons:** 15 | - **Bottleneck issue**: 16 | - Single, fixed-size context vector Capture the meaning of Entire Input Sequence. 17 | - Single, fixed-size context vector limits the ability to store long Sequencies. 18 | - **Sequential Processing (NO parallelize)**: Since RNNs process sequences step-by-step, they cannot be easily parallelized. 19 | - **Struggles with very long sequences**: LSTMs and GRUs still struggle with very long dependencies, even though they improve over simple RNNs. 20 | 21 | ``` 22 | Encoder : 23 | h0 = 0 24 | h1 = f (Wxh.X1 + Whh.h0) 25 | h2 = f (Wxh.X2 + Whh.h1) 26 | h3 = f (Wxh.X3 + Whh.h2) 27 | Decoder : 28 | s0 = h3 || y0 = 29 | s1 = f (Wys.Y0 + Wss.S0) || Y1 = softmax (Wsy.S1) 30 | s2 = f (Wys.Y1 + Wss.S1) || Y2 = softmax (Wsy.S2) 31 | s3 = f (Wys.Y2 + Wss.S2) || Y3 = softmax (Wsy.S3) 32 | ``` 33 | 34 | ![image](https://github.com/user-attachments/assets/ea62fdc0-7289-4aa6-bd03-1ba27ced51c4) 35 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/README.MD: -------------------------------------------------------------------------------- 1 | # CrewAI 2 | 3 | ## Agent & Crew 4 | 5 | *What Is an AI Agent?* 6 | 7 | - An **AI agent** is a system or program designed to autonomously perform tasks on behalf نيابةً of a 8 | user or another system. It perceives its environment through inputs, takes actions based on its 9 | reasoning and planning capabilities, and works to achieve predefined goals 10 | - Human Delegate يُوَكل Agent to do some thing 11 | - **Autonomy**: الاستقلاليه Agents operate with a degree of independence, deciding actions without 12 | continuous human prompts 13 | 14 | *What Is an AI Crew?* 15 | 16 | - An **AI Crew** is a structured, multi-agent system where each individual agents with a defined 17 | role collaborate to perform a complex tasks that single agents cannot handle alone 18 | - **Role-Based Agents** : Each agent in the crew has a specific function 19 | - **Collaborative Workflows**: Agents share intermediate results, delegate sub-tasks, and 20 | iteratively refine outputs based on peer feedback 21 | - **Tool and API Integration**: Crews can use external tools—databases, ML models, web services 22 | 23 | ## Native skills or External Tools ? 24 | 25 | - **Native Skills** (internal skills): 26 | - These are the built-in abilities that an AI agent already has (Built-in knowledge) 27 | - Example: An AI agent that can read text, summarize it, and write responses all using its own programming or model. 28 | - No extra tools needed. 29 | - Faster, but sometimes limited in what it can do. 30 | - **External Help** (tool use or API integration) 31 | - AI agent uses outside tools or services to do tasks (Asking other tools to help) 32 | - Example: An AI agent that calls Google Translate to Translate or Use calendar to host a meeting. 33 | - More powerful and flexible, but sometimes slower or needs internet access. 34 | 35 | ## Sequential Flow VS Hierarchical Flow 36 | [DOC](https://docs.crewai.com/concepts/processes) 37 | - Sequential Flow : **Agents work one after another**, passing results 38 | - A → B → C → D 39 | - Hierarchical Flow : **One "manager agent" controls or coordinates other agents**, giving them tasks and combining results. 
40 | - A → B,C || B → F || 41 | - Ensure to provide a manager_llm or manager_agent 42 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/tasks/t3_scraping_task.py: -------------------------------------------------------------------------------- 1 | from crewai import Task 2 | from pydantic import BaseModel, Field 3 | from typing import List 4 | import os 5 | from crewai_agents.agents.a3_scraping_agent import scraping_agent 6 | from crewai_agents.config import output_dir 7 | 8 | 9 | class ProductSpec(BaseModel): 10 | specification_name: str 11 | specification_value: str 12 | 13 | class SingleExtractedProduct(BaseModel): 14 | page_url: str = Field(..., title="The original url of the product page") 15 | product_title: str = Field(..., title="The title of the product") 16 | product_image_url: str = Field(..., title="The url of the product image") 17 | product_url: str = Field(..., title="The url of the product") 18 | product_current_price: float = Field(..., title="The current price of the product") 19 | product_original_price: float = Field(title="The original price of the product before discount. Set to None if no discount", default=None) 20 | product_discount_percentage: float = Field(title="The discount percentage of the product. Set to None if no discount", default=None) 21 | 22 | product_specs: List[ProductSpec] = Field(..., title="The specifications of the product. Focus on the most important specs to compare.", min_items=1, max_items=5) 23 | 24 | agent_recommendation_rank: int = Field(..., title="The rank of the product to be considered in the final procurement report. (out of 5, Higher is Better) in the recommendation list ordering from the best to the worst") 25 | agent_recommendation_notes: List[str] = Field(..., title="A set of notes why would you recommend or not recommend this product to the company, compared to other products.") 26 | 27 | 28 | class AllExtractedProducts(BaseModel): 29 | products: List[SingleExtractedProduct] 30 | 31 | scraping_task = Task( 32 | description="\n".join([ 33 | "The task is to extract product details from any ecommerce store page url.", 34 | "The task has to collect results from multiple pages urls.", 35 | "Collect the best {top_recommendations_no} products from the search results.", 36 | "When you return your final JSON, it MUST use the top‑level key `products` (plural).", 37 | ]), 38 | expected_output="A JSON object containing products details", 39 | output_json=AllExtractedProducts, 40 | output_file=os.path.join(output_dir, "step_3_search_results.json"), 41 | agent=scraping_agent 42 | ) -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.4-LSTM/Theory.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 3 | 4 | ``` 5 | GRU 6 | ---- 7 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 8 | Gr = Sig ( Wxr.X(t) + Wcr.C(t-1) + br ) 9 | 10 | C~(t) = g (Wax.X(t) + Gr[Waa.C(t-1)] + ba) 11 | C(t) = Gu.C~(t) + (1-Gu).C(t-1) 12 | 13 | Y(t) = g (Wcy.a(t) + by) 14 | ``` 15 | 16 | ## LSTM 17 | 18 | 1. Removing Relevance Gate Gr 19 | ``` 20 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 21 | 22 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 23 | C(t) = Gu.C~(t) + (1-Gu).C(t-1) 24 | 25 | Y(t) = g (Wcy.C(t) + by) 26 | ``` 27 | 28 | 2. 
Split “Update Gate” into two gates: “Update Gate”, “Forget Gate” 29 | - Why Apply Constrain 30 | - C(t) = Gu.C~(t) + (1-Gu).C(t-1) 31 | - if take 40% from C~(t) must take 60% from C(t-1) 32 | - what if you nedd to take 70% and 70% 33 | ``` 34 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 35 | Gf = Sig ( Wxf.X(t) + Wcf.C(t-1) + bf ) 36 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 37 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 38 | 39 | Y(t) = g (Wcy.C(t) + by) 40 | ``` 41 | C~(t) => [-1, +1] 42 | C(t-1) => [-1, +1] 43 | if you do C~(t) + C(t-1) Range Not Bounded 44 | if you do 60% . C~(t) + 40% . C(t-1) Range Bounded from [-1, +1] 45 | 46 | 47 | 3. Bounded a 48 | 49 | ``` 50 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 51 | Gf = Sig ( Wxf.X(t) + Wcf.C(t-1) + bf ) 52 | 53 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 54 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 55 | a(t) = tanh (C(t)) => Bounded from [-1, +1] 56 | 57 | Y(t) = g (Wcy.a(t) + by) 58 | ``` 59 | 60 | 4. Output Gate (Go) 61 | ``` 62 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 63 | Gf = Sig ( Wxf.X(t) + Wcf.C(t-1) + bf ) 64 | Go = Sig ( Wxo.X(t) + Wco.C(t-1) + bo ) 65 | 66 | C~(t) = tanh (Wax.X(t) + Waa.C(t-1) + ba) 67 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 68 | a(t) = Go( tanh (C(t)) ) => Bounded from [-1, +1] 69 | 70 | Y(t) = g (Wcy.a(t) + by) 71 | ``` 72 | 5. Input to Gates will be a(t-1) NOT C(t-1) As a is bounded 73 | ``` 74 | Gu = Sig ( Wxu.X(t) + Wau.a(t-1) + bu ) 75 | Gf = Sig ( Wxf.X(t) + Waf.a(t-1) + bf ) 76 | Go = Sig ( Wxo.X(t) + Wao.a(t-1) + bo ) 77 | 78 | C~(t) = tanh (Wax.X(t) + Waa.a(t-1) + ba) 79 | C(t) = Gu.C~(t) + Gf.C(t-1) => Range Not Bounded 80 | a(t) = Go( tanh (C(t)) ) => Bounded from [-1, +1] 81 | 82 | Y(t) = g (Wcy.a(t) + by) 83 | ``` 84 | 85 | LSTM : 86 | - 3 Inputs 87 | - 1. C(t-1) 88 | - 2. a(t-1) 89 | - 3. X(t) 90 | - 3 Outputs 91 | - 1. C(t) 92 | - 2. a(t) 93 | - 3. 
y(t) 94 | 95 | ![image](https://github.com/user-attachments/assets/0412a582-44f5-4c49-97fa-beafb49fa610) 96 | 97 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/tests/test.py: -------------------------------------------------------------------------------- 1 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 2 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 3 | 4 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 5 | from crewai_agents.tasks.t2_search_engine_task import search_engine_task 6 | 7 | from crewai_agents.agents.a3_scraping_agent import scraping_agent 8 | from crewai_agents.tasks.t3_scraping_task import scraping_task 9 | 10 | from crewai_agents.agents.a4_procurement_report import procurement_report_author_agent 11 | from crewai_agents.tasks.t4_procurement_report_task import procurement_report_author_task 12 | 13 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 14 | 15 | from crewai import Crew, Process 16 | 17 | 18 | 19 | def run_search_engine_agent(): 20 | """Run the search engine agent and return the results.""" 21 | print("Running search engine agent...") 22 | # Set the AgentOps API key 23 | api_key = get_agentops_api_key() 24 | set_agentops_api_key(api_key) 25 | print("AgentOps API key set successfully.") 26 | 27 | crew = Crew( 28 | agents=[ 29 | search_queries_recommendation_agent, 30 | search_engine_agent, 31 | scraping_agent, 32 | procurement_report_author_agent 33 | ], 34 | 35 | tasks=[ 36 | search_queries_recommendation_task, 37 | search_engine_task, 38 | scraping_task, 39 | procurement_report_author_task 40 | ], 41 | verbose=True, 42 | process=Process.sequential 43 | ) 44 | print("Crew initialized successfully.") 45 | 46 | results = crew.kickoff( 47 | inputs={ 48 | "product_name": "book for professional development", 49 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 50 | "country_name": "Egypt", 51 | "no_keywords": 3, 52 | "language":"english", 53 | "score_th":0.1, 54 | "top_recommendations_no": 5, 55 | } 56 | ) 57 | print("Crew kickoff completed successfully.") 58 | return results 59 | 60 | if __name__ == "__main__": 61 | results = run_search_engine_agent() 62 | print("Search queries recommendation task completed successfully.") 63 | print(f"Results: {results}") 64 | 65 | 66 | # To Run This Script: 67 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 68 | # python -m tests.test -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/examples/ex2_run_search_engine_agent.py: -------------------------------------------------------------------------------- 1 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 2 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 3 | 4 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 5 | from crewai_agents.tasks.t2_search_engine_task import search_engine_task 6 | 7 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 8 | from crewai import Crew, Process 9 | 10 | 11 | 12 | 13 | 14 | def run_search_engine_agent(): 15 | """Run the search engine agent and return the results.""" 16 | print("Running search engine agent...") 17 | # Set the 
AgentOps API key 18 | api_key = get_agentops_api_key() 19 | set_agentops_api_key(api_key) 20 | print("AgentOps API key set successfully.") 21 | 22 | crew = Crew( 23 | agents=[ 24 | search_queries_recommendation_agent, 25 | search_engine_agent 26 | ], 27 | 28 | tasks=[ 29 | search_queries_recommendation_task, 30 | search_engine_task 31 | ], 32 | verbose=True, 33 | process=Process.sequential 34 | ) 35 | print("Crew initialized successfully.") 36 | 37 | results = crew.kickoff( 38 | inputs={ 39 | "product_name": "book for professional development", 40 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 41 | "country_name": "Egypt", 42 | "no_keywords": 10, 43 | "language":"english", 44 | "score_th":0.1 45 | } 46 | ) 47 | 48 | print("Crew kickoff completed successfully.") 49 | return results 50 | 51 | if __name__ == "__main__": 52 | results = run_search_engine_agent() 53 | print("Search queries recommendation task completed successfully.") 54 | print(f"Results: {results}") 55 | 56 | # To Run This Script: 57 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 58 | # python -m examples.ex2_run_search_engine_agent 59 | 60 | 61 | ''' 62 | Task Execution Flow 63 | 1. prompt 64 | - replaces each {…} with the value from inputs in Task description : 65 | - prompt sent to your search_queries_recommendation_agent 66 | 2. Agent → LLM 67 | - LLM generates a response based on the prompt 68 | - LLM response is a almost string of JSON object 69 | 3. Validation (output_json=SuggestedSearchQueries) 70 | 4. save the output to a file (output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json")) 71 | - dict of List of strings (queries) in JSON format 72 | 73 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 74 | # CrewAI will automatically make that available to 75 | # Task #2 under the name of the field—here, queries. 
76 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT 77 | 78 | ''' -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/examples/ex1_run_search_queries_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Crew, Process 2 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 3 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 4 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 5 | 6 | 7 | def run_search_queries_agent(): 8 | """Run the search queries recommendation agent and return the results.""" 9 | 10 | api_key = get_agentops_api_key() 11 | set_agentops_api_key(api_key) 12 | 13 | crew = Crew( 14 | agents=[search_queries_recommendation_agent], 15 | tasks=[search_queries_recommendation_task], 16 | verbose=True, 17 | process=Process.sequential 18 | ) 19 | 20 | results = crew.kickoff( 21 | inputs={ # if the Task doesn't include any variables, you wouldn't need to include the inputs argument 22 | "product_name": "coffee machine for the office", 23 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 24 | "country_name": "Egypt", 25 | "no_keywords": 10, 26 | "language":"english" 27 | } 28 | ) 29 | return results 30 | 31 | def print_json(): 32 | import json 33 | with open(r"E:\DATA SCIENCE\projects\crewai-agents22\outputs\ai-agent-output\step_1_suggested_search_queries.json") as f: 34 | data = json.load(f) 35 | print("type of data: ", type(data)) # 36 | print("type of data[queries]: ", type(data["queries"])) # 37 | 38 | print(data["queries"], "\n") 39 | for q in data["queries"]: 40 | print(type(q), q) # 41 | 42 | 43 | if __name__ == "__main__": 44 | # results = run_search_queries_agent() 45 | print("Search queries recommendation task completed successfully.") 46 | # print(f"Results: {results}") # Pydantic object 47 | print("*"*90, "\n") 48 | print_json() 49 | 50 | # To Run This Script: 51 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 52 | # python -m examples.ex1_run_search_queries_agent 53 | 54 | 55 | ''' 56 | Task Execution Flow 57 | 1. prompt 58 | - replaces each {…} with the value from inputs in Task description : 59 | - prompt sent to your search_queries_recommendation_agent 60 | 2. Agent → LLM 61 | - LLM generates a response based on the prompt 62 | - LLM response is a almost string of JSON object 63 | 3. Validation (output_json=SuggestedSearchQueries) 64 | 4. save the output to a file (output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json")) 65 | - dict of List of strings (queries) in JSON format 66 | 67 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 68 | # CrewAI will automatically make that available to 69 | # Task #2 under the name of the field—here, queries. 70 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT 71 | 72 | ''' -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.3-GRU/Theory.md: -------------------------------------------------------------------------------- 1 | # GRU 2 | 3 | GRUs are an improved version of Recurrent Neural Networks (RNNs) designed to better capture long-term dependencies in sequential data. 
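In code, a GRU layer is used exactly like a plain RNN layer; the gates derived in the rest of this note live inside the cell. A small sketch, assuming PyTorch for illustration only (not code from this repository):

```python
# Sketch: a GRU layer consumes (batch, time, features) input like a vanilla RNN,
# but learns the update/relevance gates described below inside the cell
# (PyTorch calls the relevance gate the reset gate).
import torch
import torch.nn as nn

x = torch.randn(8, 50, 30)        # 8 sequences, 50 time steps, 30 features per step
gru = nn.GRU(input_size=30, hidden_size=64, batch_first=True)

outputs, last_hidden = gru(x)     # outputs: (8, 50, 64), last_hidden: (1, 8, 64)
print(outputs.shape, last_hidden.shape)
```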
4 | **RNNs** 5 | - RNNs maintain a hidden state a(t)​ that is updated at each time step t based on the input x(t)​ and the previous hidden state h(t-1)​. 6 | - a(t​) = g(W. x(t)​ + W​ ⋅ h(t−1) ​+ b ) 7 | - Challenges with Long-Term Dependencies: 8 | - Poor memory of long-term dependencies in sequences. 9 | - Vanishing & Exploding Gradient Problem 10 | - always update a if u work on videos 50 frames (ads appear from t=5 to t=8) network doesn't want to take these frames in history 11 | مش عايزة تاخدها معاها ملهاش لازمة يعني time steps وخلاص النتورك ممكن تبقا فيه Update انا مش عايز كل مرة اعمل 12 | 13 | GRUs introduce gates to control the flow of information, solving the vanishing gradient problem and improving long-term dependency handling. 14 | 15 | 16 | 17 | ``` 18 | RNNs 19 | 20 | a(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 21 | --------------------------- 22 | GRU 23 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 24 | a(t) = Gu.a~(t) + (1-Gu).a(t-1) 25 | 26 | if Gu=1 27 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 28 | a(t) = Gu.a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) ==> RNN || Update History with current input 29 | 30 | if Gu=0 31 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 32 | a(t) = a(t-1) ==> do not Update History || Drop the current input 33 | 34 | if Gu = 0.6 35 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 36 | a(t) = 0.6.a~(t) + 0.4.a(t-1) ==> take 60% from a~(t) and 40% from a(t-1) 37 | 38 | 39 | The update gate U decides how much of the previous hidden state (a(t−1) needs to be retained and how much of the new candidate hidden state a~(t) should replace it. 40 | 41 | ``` 42 | Gu ===> you will take the current time step in history or not ? 43 | if u need to forget All history and start from current time step 44 | 45 | ``` 46 | GRU 47 | Gu = Sig ( Wxu.X(t) + Wau.a(t-1) + bu ) 48 | Gr = Sig ( Wxr.X(t) + War.a(t-1) + br ) 49 | 50 | a~(t) = g (Wax.X(t) + Gr[Waa.a(t-1)] + ba) 51 | a(t) = Gu.a~(t) + (1-Gu).a(t-1) 52 | --------------- 53 | if Gr = 0, Gu = 1 ==> Traditional NN 54 | a~(t) = g (Wax.X(t) + ba) 55 | a(t) = a~(t) 56 | 57 | if Gr = 1, Gu = 1 ==> RNN 58 | a~(t) = g (Wax.X(t) + Waa.a(t-1) + ba) 59 | a(t) = Gu.a~(t) 60 | 61 | ``` 62 | 63 | GRUs use two gates: 64 | - Update Gate (Gu​): Decides how much of the new information to use. 65 | - Balances **new information** a~(t) and **past information** a(t−1). 66 | - Gu = Sig ( Wxu.X(t) + Wau.a(t-1) + bu ) 67 | - Relevance Gate (Rt): Decides how much of the past information to forget. 68 | - Controls **how much past information to forget** while computing the new candidate activation. 
69 | - Gr = Sig ( Wxr.X(t) + War.a(t-1) + br ) 70 | 71 | 72 | ``` 73 | Gu = Sig ( Wxu.X(t) + Wcu.C(t-1) + bu ) 74 | Gr = Sig ( Wxr.X(t) + Wcr.C(t-1) + br ) 75 | 76 | C~(t) = g (Wax.X(t) + Gr[Waa.C(t-1)] + ba) 77 | C(t) = Gu.C~(t) + (1-Gu).C(t-1) 78 | 79 | Y(t) = g (Wcy.a(t) + by) 80 | ``` 81 | 82 | 83 | ![image](https://github.com/user-attachments/assets/95737e76-f42a-4389-996e-2d662509f5f3) 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/crewai_agents/agents/a3_scraping_agent.py: -------------------------------------------------------------------------------- 1 | from crewai import Agent 2 | from crewai.tools import tool 3 | from scrapegraph_py import Client 4 | from pydantic import BaseModel, Field 5 | from typing import List 6 | 7 | from crewai_agents.config import llm 8 | from crewai_agents.utilis import get_scrap_api_key 9 | 10 | 11 | scrap_key = get_scrap_api_key() 12 | scrap_client = Client(api_key=scrap_key) 13 | 14 | 15 | 16 | 17 | 18 | class ProductSpec(BaseModel): 19 | specification_name: str 20 | specification_value: str 21 | 22 | class SingleExtractedProduct(BaseModel): 23 | page_url: str = Field(..., title="The original url of the product page") 24 | product_title: str = Field(..., title="The title of the product") 25 | product_image_url: str = Field(..., title="The url of the product image") 26 | product_url: str = Field(..., title="The url of the product") 27 | product_current_price: float = Field(..., title="The current price of the product") 28 | product_original_price: float = Field(title="The original price of the product before discount. Set to None if no discount", default=None) 29 | product_discount_percentage: float = Field(title="The discount percentage of the product. Set to None if no discount", default=None) 30 | 31 | product_specs: List[ProductSpec] = Field(..., title="The specifications of the product. Focus on the most important specs to compare.", min_items=1, max_items=5) 32 | 33 | agent_recommendation_rank: int = Field(..., title="The rank of the product to be considered in the final procurement report. 
(out of 5, Higher is Better) in the recommendation list ordering from the best to the worst") 34 | agent_recommendation_notes: List[str] = Field(..., title="A set of notes why would you recommend or not recommend this product to the company, compared to other products.") 35 | 36 | 37 | class AllExtractedProducts(BaseModel): 38 | products: List[SingleExtractedProduct] 39 | 40 | @tool 41 | def web_scraping_tool(page_url: str): 42 | """ 43 | An AI Tool to help an agent to scrape a web page 44 | 45 | Example: 46 | web_scraping_tool( 47 | page_url="https://www.noon.com/egypt-en/15-bar-fully-automatic-espresso-machine-1-8-l-1500" 48 | ) 49 | """ 50 | details = scrap_client.smartscraper( 51 | website_url=page_url, 52 | user_prompt="Extract ```json\n" + SingleExtractedProduct.schema_json() + "```\n From the web page" 53 | ) 54 | 55 | return { 56 | "page_url": page_url, 57 | "details": details 58 | } 59 | 60 | 61 | scraping_agent = Agent( 62 | role="Web scraping agent", 63 | # goal="To extract details from any website", 64 | goal="\n".join([ 65 | "To extract details from any website", 66 | "When you return your final JSON, it MUST use the top‑level key `products` (plural).", 67 | "Example:", 68 | "Final Answer:", 69 | "{", 70 | ' "products": [ { … }, { … } ]', 71 | "}" 72 | ]), 73 | backstory="The agent is designed to help in looking for required values from any website url. These details will be used to decide which best product to buy.", 74 | llm=llm, 75 | tools=[web_scraping_tool], 76 | verbose=True, 77 | ) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NLP-Tea 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![Python 3.11](https://img.shields.io/badge/python-3.11-blue.svg)](https://www.python.org/downloads/) 4 | [![GitHub stars](https://img.shields.io/github/stars/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/stargazers) 5 | [![GitHub forks](https://img.shields.io/github/forks/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/network/members) 6 | [![GitHub watchers](https://img.shields.io/github/watchers/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/watchers) 7 | [![GitHub](https://img.shields.io/badge/GitHub-View_Project-blue?logo=GitHub)](https://github.com/Fawzy-AI-Explorer/NLP-Tea) 8 | 9 | ## Contents 10 | 11 | - [Introduction](#introduction) 12 | - [Content](#content) 13 | - [01-Text Preprocessing](#01-text-preprocessing) 14 | - [02-Embeddings](#02-embeddings) 15 | - [03-Models](#03-models) 16 | - [Installation](#installation) 17 | - [Templates](#templates) 18 | - [License](#license) 19 | - [Contributing](#contributing) 20 | 21 | ## Introduction 22 | 23 | This repository documents my journey learning Natural Language Processing. 24 | It includes theory notes, code examples, and useful resources for understanding and applying NLP concepts. 25 | 26 | ## Content 27 | 28 | This repository is divided into clear sections. 29 | Each section teaches something important about NLP. 30 | 31 | --- 32 | 33 | ## 01-Text Preprocessing 34 | 35 | Learn how to clean and prepare text for NLP. 
36 | Includes: 37 | - Removing stop words 38 | - Lowercasing 39 | - Tokenizing 40 | - And more 41 | 42 | --- 43 | 44 | ## 02-Embeddings 45 | 46 | Understand how to represent words as numbers (vectors) so machines can understand them. 47 | Includes: 48 | 49 | - **Label & One Hot Encoder** 50 | - **Bag Of Words** 51 | - **TF-IDF** 52 | - **Word2Vec** 53 | - **CBOW** 54 | - **Skip Gram** 55 | - **Negative Sampling** 56 | - **Fast Text** 57 | 58 | --- 59 | 60 | ## 03-Models 61 | Explore different models used in NLP. 62 | Includes: 63 | 64 | - **1D-CNN** 65 | - **RNN (Recurrent Neural Network)** 66 | - **LSTM (Long Short-Term Memory)** 67 | - **GRU (Gated Recurrent Unit)** 68 | - **Transformers** 69 | 70 | --- 71 | 72 | ## Installation 73 | 74 | To install NLP-Tea, clone the repository and install the required dependencies: 75 | 76 | ```sh 77 | git clone https://github.com/Fawzy-AI-Explorer/NLP-Tea.git 78 | cd NLP-Tea 79 | pip install -r requirements.txt 80 | ``` 81 | 82 | ## Templates 83 | 84 | Templates will be added here soon. 85 | 86 | --- 87 | 88 | ## License 89 | 90 | This project is licensed under the MIT License. See the LICENSE file for more details. 91 | 92 | ## Contributing 93 | 94 | Contributions are welcome! 95 | If you find something that can be improved, feel free to open an issue or submit a pull request. 96 | 97 | --- 98 | 99 | ## Future Topics 100 | 101 | Here are some topics planned for future inclusion: 102 | 103 | - LLMs Fine-Tuning 104 | - RAG 105 | - AI Agents using CrewAI 106 | 107 | --- 108 | 109 | Thank you for using NLP-Tea! If you have any questions or feedback, feel free to open an issue on GitHub. 110 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/examples/ex3_run_procurement_report_agent.py: -------------------------------------------------------------------------------- 1 | from crewai_agents.agents.a1_search_queries_agent import search_queries_recommendation_agent 2 | from crewai_agents.tasks.t1_search_queries_task import search_queries_recommendation_task 3 | 4 | from crewai_agents.agents.a2_search_engine_agent import search_engine_agent 5 | from crewai_agents.tasks.t2_search_engine_task import search_engine_task 6 | 7 | from crewai_agents.agents.a3_scraping_agent import scraping_agent 8 | from crewai_agents.tasks.t3_scraping_task import scraping_task 9 | 10 | from crewai_agents.agents.a4_procurement_report import procurement_report_author_agent 11 | from crewai_agents.tasks.t4_procurement_report_task import procurement_report_author_task 12 | 13 | from crewai_agents.utilis import get_agentops_api_key, set_agentops_api_key 14 | 15 | from crewai import Crew, Process 16 | 17 | 18 | 19 | def run_search_engine_agent(): 20 | """Run the search engine agent and return the results.""" 21 | print("Running search engine agent...") 22 | # Set the AgentOps API key 23 | api_key = get_agentops_api_key() 24 | set_agentops_api_key(api_key) 25 | print("AgentOps API key set successfully.") 26 | 27 | crew = Crew( 28 | agents=[ 29 | search_queries_recommendation_agent, 30 | search_engine_agent, 31 | scraping_agent, 32 | procurement_report_author_agent 33 | ], 34 | 35 | tasks=[ 36 | search_queries_recommendation_task, 37 | search_engine_task, 38 | scraping_task, 39 | procurement_report_author_task 40 | ], 41 | verbose=True, 42 | process=Process.sequential 43 | ) 44 | print("Crew initialized successfully.") 45 | 46 | results = crew.kickoff( 47 | inputs={ 48 | 
"product_name": "book for professional development", 49 | "websites_list": ["amazon.eg", "jumia.com.eg", "noon.com"], 50 | "country_name": "Egypt", 51 | "no_keywords": 3, 52 | "language":"english", 53 | "score_th":0.1, 54 | "top_recommendations_no": 5, 55 | } 56 | ) 57 | print("Crew kickoff completed successfully.") 58 | return results 59 | 60 | if __name__ == "__main__": 61 | results = run_search_engine_agent() 62 | print("Search queries recommendation task completed successfully.") 63 | print(f"Results: {results}") 64 | 65 | 66 | # To Run This Script: 67 | # cd E:\DATA SCIENCE\projects\crewai-agents22> 68 | # python -m examples.ex3_run_procurement_report_agent 69 | 70 | 71 | 72 | ''' 73 | Task Execution Flow 74 | 1. prompt 75 | - replaces each {…} with the value from inputs in Task description : 76 | - prompt sent to your search_queries_recommendation_agent 77 | 2. Agent → LLM 78 | - LLM generates a response based on the prompt 79 | - LLM response is a almost string of JSON object 80 | 3. Validation (output_json=SuggestedSearchQueries) 81 | 4. save the output to a file (output_file=os.path.join(output_dir, "step_1_suggested_search_queries.json")) 82 | - dict of List of strings (queries) in JSON format 83 | 84 | # once it finishes, you get a Pydantic object SuggestedSearchQueries(queries=[…]) 85 | # CrewAI will automatically make that available to 86 | # Task #2 under the name of the field—here, queries. 87 | # Task #2 can refer to {queries} in its own prompt # IMPORTANT 88 | 89 | ''' -------------------------------------------------------------------------------- /02-Word Embeddings/2.3-TF_IDF/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 | 5 | [02 - BOW](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 6 | 7 | 8 | # 03 - TF-IDF 9 | 10 | ## What is TF-IDF? 11 | TF-IDF (Term Frequency–Inverse Document Frequency) is a numerical statistic that reflects how important a word is to a document in a collection (corpus). 12 | 13 | Terms like ` “the”, “on”, “at” ` may appear many times in the documents. their large counts means LOW discrimination power between documents. 14 | 15 | 16 | 17 | 18 | - Term Frequency (TF): 19 | how often a given word appears within a document, term importance within a single document. 20 | - `TF(t,d) = f(t,d) / SUM(f(t,d))` 21 | - `f(t,d)` =>> Number of times term t appears in document d 22 | - `SUM(f(t,d))` =>>> total number of terms in d = len(d) 23 | Same term has different TF values in different documents accoarding to How many times appears in this document. 24 | Term `t` appears 5 times in doc1 and 90 in doc2 25 | 26 | - Document Frequency (DF): 27 | - How many documents that a given term appear in it 28 | - `DF(t) = number of documents where the term "t" appears` 29 | term `t` appears in 10 DOCS 30 | - Inverse Document Frequency (IDF): 31 | - down scales words that appear a lot across documents. 
32 | - `IDF(t) = N/n` 33 | - `N` =>> total number of documents 34 | - `n` =>> number of documents where the term "t" appears 35 | term `t` appears in 1 DOCS over 80 Docs `DF = 1` `IDF = 80/1 = 80` 36 | - Low DF => High IDF 37 | term `t` appears in 80 DOCS over 80 Docs `DF = 80` `IDF = 80/80 = 1` 38 | - High DF => Low IDF 39 | 40 | - TF-IDF 41 | - highlight words that are `Frequent in a document` (High TF(t,d)) and `Less frequent across documents` (High IDF(t) = Low DF(t)) 42 | - `TF-IDF = TF * IDF` 43 | 44 | A high weight in tf–idf is reached by: 45 | - a high term frequency (in the given document) 46 | - a low document frequency of the term in the whole collection of documents (high Inverse 47 | Document Frequency) 48 | 49 | ## Steps 50 | 1. Corpus : A list of text documents. 51 | 2. Vocabulary : Unique Words 52 | 3. Calculate Term Frequency (TF) (BOW) 53 | - For Each Word (t) in Vocab : 54 | - For Each Doc (d) in Corpus 55 | - Calc TF(t,d) 56 | 4. Calculate Inverse Document Frequency (IDF) 57 | - For Each Word (t) in Vocab : 58 | - Clac IDF(t) = Log(N/n) 59 | 5. Construct TF-IDF Matrix 60 | - Rows : Docs 61 | - Cols : Vocab terms 62 | 63 | - For Each Word (t) in Vocab : 64 | - Clac IDF(t) 65 | - For Each Doc (d) in Corpus 66 | - Calc TF(t,d) 67 | - Calc TF-IDF(t,d) = TF(t,d) * IDF(t) 68 | 69 | 70 | 71 | ![image](https://github.com/user-attachments/assets/0be29ba1-2fc2-4fce-8aea-4600a827fcdd) 72 | ![image](https://github.com/user-attachments/assets/5f2f708e-6090-4198-827e-0019315d7b45) 73 | ![image](https://github.com/user-attachments/assets/7c1fc473-17c3-4755-80b4-20d5eb9a5301) 74 | ![image](https://github.com/user-attachments/assets/cfe56ff0-3cfa-4185-85fa-489b00008f2a) 75 | ![image](https://github.com/user-attachments/assets/248c97ce-197b-4974-9900-7bdf3f97d9b7) 76 | ![image](https://github.com/user-attachments/assets/5001a6ed-f0a6-48f7-814f-9bbee9bf3301) 77 | 78 | 79 | 80 | 81 | 82 | ## Limitations of TF-IDF 83 | 84 | - No semantic 85 | - TF-IDF treats words independently, so it doesn't capture meaning or word order. 86 | - Sparse 87 | - For large vocabularies, TF-IDF creates large sparse matrices. 88 | - OOV 89 | - Can't handle words that weren’t seen during training 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.1-Label Encoder and One Hot Encoder/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | ## 01- Label Encoder 4 | 5 | ### What It Is 6 | 7 | Represent text data as an integr values (mapping each unique Word to a unique integer (scalar) ) 8 | 9 | Used more in classical ML Algorithms (Structure Data) Features with Ordinal Characteristics ("small", "medium", "large") 10 | 11 | Suitable for tree-based models (e.g., Decision Trees, Random Forests) that do not assume ordinal relationships. 12 | 13 | Limitations : 14 | 15 | - Lack of Semantic Information: 16 | 17 | - Since each word is mapped to a single integer, the numeric distance between two encoded words depends olely on the integer values of the two given words, not on the semantic similarity between the words. 18 | 19 | - Unsuitable for NLP 20 | - we want representations that capture the meaning and relationships between words. Label encoding fails to capture semantic and contextual information because it encodes each word independently as a scalar. 21 | 22 | 23 | ### Steps 24 | 25 | 1. create corpus = list of all Dos [Doc1, Doc2, Doc3, ......] 26 | 2. 
preprocessing (take Doc as an input, out Tokens) List[List[str]] 27 | 3. Build a Vocabulary (Unique Words) 28 | - Combine tokens from all documents and create a set of unique words 29 | 4. Integer Mapping 30 | - Map each unique word to a unique integer 31 | 32 | 5. Transform the Documents 33 | - Replace each word in each document with its corresponding integer according to the mapping. 34 | 6. Post-Processing 35 | - Pad sequences: Ensure all sequences have the same length. 36 | - add Padding to ensure that all Docs has the same Lenght. 37 | 38 | 39 | 40 | 41 | 42 | ## 02- One Hot Encoder 43 | 44 | Represent text as an Binary vectors. 45 | - The vector’s length equals the number of unique categories. 46 | - All elements of the vector are 0 except for one element, which is set to 1 to indicate the presence of that category. 47 | 48 | - Distance between two vectors of two words that are One-Hot Encoded is the same (either "2" for different words and "0" for same words) 49 | 50 | - High Dimensionality , length of Each Vector equal lenght voab (Unique words) (e.g. Unique=10000) 51 | 52 | - the Vector is a binaly (All 0 except one position only is 1) 53 | 54 | 55 | 56 | ### Steps 57 | 58 | 1. Apply Label Encoder (Mapp Each Unique Word to Integer Value) 59 | 2. Create Binary Vectors 60 | - For each unique Word, create a binary vector all 0 except the index of the integer, (lenght = len(Vocab) = len(Unique_Words)). 61 | 3. Transform 62 | - Replace each Word with its corresponding binary vector. 63 | 64 | ''' 65 | 66 | Doc1: "cat sat on the mat" 67 | Doc2: "dog barked at the cat" 68 | --------------------- 69 | corpus = ["cat sat on the mat", "dog barked at the cat" ] 70 | processed_corpus = [ [ "cat", "sat", "on", "the", "mat" ], [ "dog", "barked", "at", "the", "cat" ]] 71 | Vocabulary (Unique Words) = ["at", "barked", "cat", "dog", "mat", "on", "sat", "the"] 72 | Label Encoding : 73 | ["at":0, "barked":1, "cat":2, "dog":3, "mat":4, "on":5, "sat":6, "the":7] 74 | One-Hot Encoding : 75 | "at" (index 0) : [1, 0, 0, 0, 0, 0, 0, 0] 76 | "barked" (index 1): [0, 1, 0, 0, 0, 0, 0, 0] 77 | "cat" (index 2) : [0, 0, 1, 0, 0, 0, 0, 0] 78 | "dog" (index 3) : [0, 0, 0, 1, 0, 0, 0, 0] 79 | "mat" (index 4) : [0, 0, 0, 0, 1, 0, 0, 0] 80 | "on" (index 5) : [0, 0, 0, 0, 0, 1, 0, 0] 81 | "sat" (index 6) : [0, 0, 0, 0, 0, 0, 1, 0] 82 | "the" (index 7) : [0, 0, 0, 0, 0, 0, 0, 1] 83 | 84 | Doc1 : "cat sat on the mat" 85 | = [ 86 | [0, 0, 1, 0, 0, 0, 0, 0], 87 | [0, 0, 0, 0, 0, 0, 1, 0] , 88 | [0, 0, 0, 0, 0, 1, 0, 0], 89 | [0, 0, 0, 0, 0, 0, 0, 1], 90 | [0, 0, 0, 0, 1, 0, 0, 0] 91 | ] 92 | 93 | 94 | ''' 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/outputs/ai-agent-output/step_2_search_results.json: -------------------------------------------------------------------------------- 1 | { 2 | "results": [ 3 | { 4 | "title": "Business Strategies for Dummies", 5 | "url": "https://www.amazon.com/Business-Strategies-Dummies-Peter-McMullen/dp/1234567890", 6 | "content": "This book provides a comprehensive guide to business strategies, covering essential topics such as market analysis, financial planning, and leadership development.", 7 | "score": 0.9, 8 | "search_query": "best business strategies books Amazon Egypt" 9 | }, 10 | { 11 | "title": "Time Management Techniques for Students", 12 | "url": "https://www.jumia.com/eg/study-products/time-management-techniques-for-students", 13 | 
"content": "Learn effective time management techniques tailored for students, including prioritization, productivity tools, and study strategies.", 14 | "score": 0.85, 15 | "search_query": "time management techniques Jumia Egypt" 16 | }, 17 | { 18 | "title": "Project Management Guide for Startups", 19 | "url": "https://www.noon.com/eg/project-management-guide-for-startups", 20 | "content": "A step-by-step guide to managing projects effectively, focusing on small and medium-sized startups.", 21 | "score": 0.88, 22 | "search_query": "project management guide Noon Egypt" 23 | }, 24 | { 25 | "title": "Six Sigma Principles for Business Improvement", 26 | "url": "https://www.amazon.com/Six-Sigma-Principles-Business-Improvement/dp/9876543210", 27 | "content": "Essential Six Sigma principles and methodologies for improving business processes and quality.", 28 | "score": 0.92, 29 | "search_query": "Six Sigma principles Amazon Egypt" 30 | }, 31 | { 32 | "title": "Effective Communication Skills for Professionals", 33 | "url": "https://www.jumia.com/eg/professionals-communication-skills", 34 | "content": "Master effective communication skills to enhance professionalism and build stronger relationships in the workplace.", 35 | "score": 0.89, 36 | "search_query": "effective communication skills Jumia Egypt" 37 | }, 38 | { 39 | "title": "Digital Marketing Strategies for Small Businesses", 40 | "url": "https://www.noon.com/eg/digital-marketing-strategies-small-businesses", 41 | "content": "Learn how to use digital marketing tools and strategies to grow your small business online.", 42 | "score": 0.87, 43 | "search_query": "digital marketing strategies Noon Egypt" 44 | }, 45 | { 46 | "title": "Lean Manufacturing Techniques for Small Manufacturers", 47 | "url": "https://www.amazon.com/Learn-Lean-Manufacturing-Techniques/dp/1234567890", 48 | "content": "Essential lean manufacturing techniques to improve efficiency and reduce waste in small manufacturers.", 49 | "score": 0.91, 50 | "search_query": "lean manufacturing techniques Amazon Egypt" 51 | }, 52 | { 53 | "title": "Leadership and Negotiation Skills for Professionals", 54 | "url": "https://www.jumia.com/eg/leadership-negotiation-skills", 55 | "content": "Master leadership and negotiation skills to achieve better outcomes in professional relationships.", 56 | "score": 0.86, 57 | "search_query": "leadership and negotiation Jumia Egypt" 58 | }, 59 | { 60 | "title": "Financial Planning Guide for Entrepreneurs", 61 | "url": "https://www.noon.com/eg-financial-planning-entrepreneurship", 62 | "content": "A detailed guide to financial planning, helping entrepreneurs manage their finances effectively.", 63 | "score": 0.84, 64 | "search_query": "financial planning guide Noon Egypt" 65 | }, 66 | { 67 | "title": "Risk Management Strategies for Businesses", 68 | "url": "https://www.amazon.com/Risk-Management-Strategies-Businesses/dp/1234567890", 69 | "content": "Learn effective risk management strategies to protect your business from potential challenges.", 70 | "score": 0.88, 71 | "search_query": "risk management strategies Amazon Egypt" 72 | } 73 | ] 74 | } -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.2-RNNs/3.2-RNNs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 6, 6 | "id": "4834e2df", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import torch\n", 11 | "import 
torch.nn as nn\n", 12 | "import torch.nn.functional as F" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 7, 18 | "id": "3d439041", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "\n", 26 | "Final output keys: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import torch\n", 32 | "import torch.nn as nn\n", 33 | "import torch.nn.functional as F\n", 34 | "\n", 35 | "\n", 36 | "class SimpleRNN(nn.Module):\n", 37 | " def __init__(self, input_size=9, hidden_size=4, output_size=3):\n", 38 | " super(SimpleRNN, self).__init__()\n", 39 | "\n", 40 | " self.hidden_size = hidden_size\n", 41 | "\n", 42 | " self.input_to_hidden = nn.Linear(input_size, hidden_size) # 9x4\n", 43 | " self.hidden_to_hidden = nn.Linear(hidden_size, hidden_size) # 4x4\n", 44 | " self.hidden_to_output = nn.Linear(hidden_size, output_size) # 4x3\n", 45 | "\n", 46 | "\n", 47 | " def forward(self, inputs): # (10,9)\n", 48 | " steps_output, hidden_states = {}, {}\n", 49 | "\n", 50 | " hidden_states[-1] = torch.zeros((1, self.hidden_size)) # (1,4)\n", 51 | "\n", 52 | "\n", 53 | " for t in range(len(inputs)):\n", 54 | " x = inputs[t].reshape(1,9) # (1,9)\n", 55 | "\n", 56 | " hidden_cur = self.input_to_hidden(x) # (1,9) * (9,4) = (1,4)\n", 57 | " h_prev = self.hidden_to_hidden(hidden_states[t - 1]) # (1,4) * (4,4) = (1,4)\n", 58 | " hidden_states[t] = torch.tanh(hidden_cur + h_prev) # (1,4) + (1,4) = (1,4)\n", 59 | "\n", 60 | " y_t = self.hidden_to_output(hidden_states[t]) # (1,4) * (4,3) = (1,3)\n", 61 | "\n", 62 | " steps_output[t] = y_t\n", 63 | "\n", 64 | " return steps_output, hidden_states\n", 65 | "\n", 66 | "\n", 67 | "if __name__ == '__main__':\n", 68 | " sequence_length = 10\n", 69 | " input_size = 9\n", 70 | " hidden_size = 4\n", 71 | " output_size = 3\n", 72 | "\n", 73 | " model = SimpleRNN(input_size, hidden_size, output_size)\n", 74 | "\n", 75 | " inputs = [torch.randn(input_size) for _ in range(sequence_length)]\n", 76 | "\n", 77 | " output, hidden_states = model(inputs)\n", 78 | " print(\"\\nFinal output keys:\", output.keys())" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 8, 84 | "id": "de18d764", 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "\n", 92 | "Final output keys: dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "sequence_length = 10\n", 98 | "input_size = 9\n", 99 | "hidden_size = 4\n", 100 | "output_size = 3\n", 101 | "\n", 102 | "model = SimpleRNN(input_size, hidden_size, output_size)\n", 103 | "\n", 104 | "inputs = [torch.randn(input_size) for _ in range(sequence_length)] # 10 sequences, each 9 features\n", 105 | "\n", 106 | "output, hidden_states = model(inputs)\n", 107 | "print(\"\\nFinal output keys:\", output.keys())" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "id": "46534bb0", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "myenv", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.12.6" 136 | 
} 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 5 140 | } 141 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.5-Transformers/Theory/3.5.3-self attention.md: -------------------------------------------------------------------------------- 1 | # Self Attention 2 | 3 | ## Goal 4 | - Each Word computes a new Embedding by attending to all other Wordss, weighted similarity. 5 | - to compute a weighted representation of a sequence by allowing each token to focus on ("attend to") other tokens in the sequence. 6 | In other words: 7 | - "How much should this word pay attention to other words ?" 8 | 9 | 10 | ## How 11 | 12 | ### Replace RNNs with Attention Blocks 13 | - In traditional RNNs, we process words one by one. 14 | - In self-attention, we process all words at once, using attention blocks instead of RNN cells. 15 | 16 | - For a sentence with 4 words → we use 4 attention blocks (one per word). 17 | - Each block 18 | - Input : word embedding Xi 19 | - outputs : new Wrd embeding Yi 20 | ### Each Block = One Word's Attention Processing 21 | - Each word Xi updates itself by looking at other words in the sentence and deciding how each one are important. 22 | ![image](https://github.com/user-attachments/assets/ad44df6e-b54c-40f8-b349-f5b3ab84ae6a) 23 | 24 | 25 | --- 26 | ### HOW 27 | e.g. we are on block 2, So Embedding of X2 Will Update 28 | 1. Similarity (Attention Scores) 29 | - W21 = cos Sim (X1,X2) = X1.X2 / |X1|*|X2| ,,,Range [-1, +1] 30 | - W22 = cos Sim (X2,X2) = X2.X2 / |X2|*|X2| ,,,will be max number X2 is Similar to X2 31 | - W23 = cos Sim (X3,X2) = X3.X2 / |X3|*|X2| 32 | - W24 = cos Sim (X4,X2) = X4.X2 / |X4|*|X2| 33 | 34 | scores SHowing how each Word Similar to X2 35 | scores showing how much attention X2 should give to each word 36 | COS Similarity Range from [1-, +1] Bounded But i want Probability Sum to ONE, So i will apply Sofe Max 37 | 38 | - Softmax (1,2,3,4) = 0.03, 0.08, 0.23, 0.64 39 | - Softmax (5,10,15,20) = 0, 0, 0.0067, 0.99 40 | - Softmax (10,20,30,40) = 0, 0, 0, 1 41 | We Want to Normalize the Scores 42 | 43 | ![image](https://github.com/user-attachments/assets/4d87739a-567e-41fc-9cc1-3d5e9b5fd6bc) 44 | 45 | 46 | 2. Normalize the Scores 47 | - W2j = W2j / SQRT(d) 48 | - d => Dim of Embedding NOT seq len 49 | 3. Softmax of Scores 50 | - Softmax (W2j) Range [0, +1] 51 | - e.g. W25 => tells us how X2, X5 are Similar and how much weight to assign to X5 52 | 53 | ![image](https://github.com/user-attachments/assets/2afdfb71-8811-4dff-ad2f-970a5c4a4075) 54 | 55 | 56 | 4. Weighted Sum = New Word Embedding 57 | - Y2 = SUM (W2j * Xj) 58 | the new representation for word 2 is a weighted SUM of all the words in the sentence, based on how much attention it gave to each one. 59 | 60 | ![image](https://github.com/user-attachments/assets/83e5b92e-446a-4c0c-951c-b1f452d1d129) 61 | 62 | 63 | ``` 64 | Wij = Xi.T * Xj / |Xi| * |Xj| 65 | ``` 66 | The attention score between word i and word j is the dot product between their embeddings. 67 | This works well — but we want more flexibility. 
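The boxed recap of these four steps follows right below, and the learned Q/K/V version comes after it. As a runnable reference, here is a minimal NumPy sketch of the plain version just described (similarity scores, scaling by sqrt(d), softmax, weighted sum); NumPy and the toy shapes are assumptions, and for simplicity it uses a scaled dot product without the explicit cosine normalization.

```python
# Minimal sketch of self-attention with no learned weights:
# dot-product scores, scaled by sqrt(d), softmax over each row, then a weighted sum.
import numpy as np

def simple_self_attention(X: np.ndarray) -> np.ndarray:
    d = X.shape[1]
    scores = X @ X.T / np.sqrt(d)                               # Wij = Xi . Xj / sqrt(d)
    weights = np.exp(scores - scores.max(axis=1, keepdims=True))
    weights = weights / weights.sum(axis=1, keepdims=True)      # softmax, each row sums to 1
    return weights @ X                                          # Yi = SUM_j (Wij * Xj)

X = np.random.randn(4, 8)       # toy input: 4 words, embedding size 8
Y = simple_self_attention(X)    # new embeddings, same shape (4, 8)
```

Each row of `Y` is the new embedding of one word, built as a weighted sum of all the word embeddings in the sentence.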
68 | ``` 69 | Wij = Xi.T * Xj / |Xi| * |Xj| 70 | Wij = Wij / SQRT (d) 71 | Wij = Softmax (Wij) 72 | Yi = SUM (Wij) * Xj 73 | ``` 74 | ![image](https://github.com/user-attachments/assets/2d706875-8b55-438a-bd8e-769d68726ac4) 75 | ``` 76 | select salary from t 77 | where Age = 20 >>>>>>>>>> 2000 78 | 79 | select salary from t 80 | where Age = 35 >>>>>>>>>> Not Found 81 | what if there is away to find it 82 | Similarity between Query and All Keys to find Age 35 similar to each keys 83 | Age = 35 , it it between (Similar) Age = 20 , 35 84 | We can say Salary = 0.5 * 2000 + 0.5 * 20000 85 | Salary = 0*200 + 0.5*2000 + 0.5*20000 + 0*200 = SUM (W*V) => Weighted Sum of values 86 | ``` 87 | ------------- 88 | ------------- 89 | 90 | - Query ===> 91 | - X we Work on it (Xi) (X2) Will Update 92 | - Keys ====> 93 | - Wij = Xi.T * Xj / |Xi| * |Xj| Here Xj are the Keys 94 | - Value ===> 95 | - Y2 = SUM (W2j * Xj) Here Xj are the Values 96 | 97 | 98 | Old 99 | ``` 100 | Wij = Xi.T * Xj / |Xi| * |Xj| 101 | Wij = Wij / SQRT (d) 102 | Wij = Softmax (Wij) 103 | Yi = SUM (Wij) * Xj 104 | ``` 105 | New 106 | ``` 107 | Q = Wq * Xi 108 | K = Wq * Xj 109 | V = Wq * Xj 110 | 111 | Wij = Q.T * K / |Q| * |K| 112 | Wij = Wij / SQRT (d) 113 | Wij = Softmax (Wij) 114 | Yi = SUM (Wij) * V 115 | ``` 116 | ![image](https://github.com/user-attachments/assets/063f17ad-1b82-4c04-8fd3-5ed566bdab42) 117 | 118 | -------------------------------------------------------------------------------- /03-Deep Learning NLP (Models)/3.2-RNNs/Theory.md: -------------------------------------------------------------------------------- 1 | # RNNs (Recurrent Neural Networks) 2 | 3 | ## Sequential Data 4 | - Sequential data is any data where the order of elements matters. 5 | - Examples: Text, Videos, Speech 6 | - Sequence Models are designed for sequential data 7 | - Examples: RNNs, LSTMs, GRUs, Transformers 8 | 9 | ## Temporal vs. Spatial Data 10 | - Temporal Data: sequences that changr over time (related to **time**) 11 | - Time Series : Stock prices, Weather data 12 | - Speech signals: (audio changes over time) 13 | - Video: sequence of frames over time 14 | - Text: (sequence of words or characters) 15 | - Spatial Data: Refers to data associated with spatial locations.(structure in space(2-D grid)) 16 | - Images: Pixels arranged in a grid. 17 | - Video: Each video frame is essentially an image (Pixels) 18 | - maps: Geographic data for specific areas. 19 | 20 | ## What is RNNs ? 21 | - RNNs are neural networks specially designed for sequential data. 22 | - They remember past information using a "history vector". 23 | - Great for tasks where order and context matter (e.g. language, time series) 24 | 25 | ## Why RNN Not FC? 26 | - FC networks: 27 | - Expect fixed-size input and output 28 | - Can’t handle variable-length sequences well 29 | - Don’t remember previous inputs 30 | - Doesn’t retain info from earlier words/time steps. 31 | - Ignore the order of inputs 32 | - No temporal structure. 33 | - Traditional neural networks process input data without considering sequence or time-based dependencies. 34 | 35 | - RNNs 36 | - Sequential Data Handling (where the order of inputs matters) 37 | - RNNs retain information from previous steps, making them suitable for tasks that require understanding context or history 38 | - Efficiency with Variable-Length Inputs: RNNs can handle variable-length sequences naturally. FCNs require fixed-length inputs 39 | - Translation 40 | - "This is good" vs. "I can't say this is good." 
41 | - FC treats "good" the same in both sentences. 42 | - RNN understands the context around "good". 43 | 44 | ### How FC and RNN Process Video or Text: 45 | 46 | - FC 47 | - dataset (Videos) 48 | - For video with 100 frames, each 256x256: 49 | - Whole video = 3D tensor (100 x 256 x 256) fed at once 50 | - dataset (sentences) 51 | - For text: 52 | - Join all words into one long vector and feed at once 53 | - FC doesn’t see relationships between frames or words 54 | FC diesn't take relation Between Pixels or Words , Video or Sentence Feed to the Network 55 | - RNNs 56 | - Processes data step-by-step (Sequential): 57 | - For video: frame-by-frame 58 | - For text: word-by-word 59 | - At each step (t) Take 2 vecctor as an input: 60 | 1. Takes current input vector : Represent Cur Frame (t) 61 | 2. history vector (summary of all previous inputs) : Represent Frame 0 to Frame t-1) 62 | - Updates the history after each step 63 | 64 | ### How RNNs Handle Different Sizes Input. 65 | Sentences With n Words , Each Word Represented as vector ( len = 90 ) 66 | - Input Layer >> Number of Neurons : 90 Nodes 67 | - Hidden Layer (RNN) >> Number of Neurons: براحتك = History Vector Size 68 | - Output Layer >> Number of Neurons :Depends on the Task 69 | لي بقا مش مهم كل فيديو فيه كام فريم ؟ لانك مثلا اول فيديو فيه 25 فريم ف انت هتخش ع النتورك اول مرة باول فريم وتاني مرة ب (تاني فريم وهستوري) وهكذا مش فارقه معاك عدد الفريمز لانه كدا كدا مش هتدخلهم كلهم مرة واحده انت شغال فريم فريم انظر للكود اللي تحت لمزيد من التفاصيل . 70 | فكر ف الموضوع انه معاك فيديو هتدخله ل RNN Layer فريم فريم ومش هتخش ع الفيديو اللي بعده غير لما يخلص خالص كل الفريمات . 71 | - Each step depends on ALL the previous steps 72 | - (The k-th frame depends on all previous k-1 frames !!!! ) 73 | - at each step we have 2 inputs only : 74 | - The k-th feature vector for the k-th frame 75 | - History vector representing the frames from 1 to k-1 76 | - After each k-th step, the History vector will be updated to represent inputs from 1 to k !!!!! 77 | 78 | 79 | 80 | ## RNN Architecture 81 | 82 | ال RNN Layer هيشتغل على كل Frame لوحده ويعرف يستغل ال History او ال Frames السابقة. 83 | لو معاك 100 frame يعني السامبل الواحد فيه 100 Frame ==> 84 | هيبقى معاك 100 Vec هيدخله واحد واحد و مع كل vec داخل للنتورك بتدخل معاه vec 1 بيعبر عن ملخص كل اللي فات History 85 | لو انت عند ال Frame K هتدخل للنتورك ===> vec K بيعبر عن الحالى و Vec from 1 to K-1 ده ال History وطبعا كل مرة هتعمل Update لل History ده تضيف عليه ال frame الحالي 86 | ![image](https://github.com/user-attachments/assets/6af25a3e-68de-4f79-b16e-a7a3f9fa0db8) 87 | 88 | ``` 89 | in FC : Input x(Video/Sentence) fed at once 90 | 91 | a = g ( Wax . X + ba ) 92 | y = g ( Wya . a + by ) 93 | ---------------------------------------- 94 | ---------------------------------------- 95 | 96 | in RNNs : Work on Steps (X[0], X[1], ......., X[t]) 97 | 98 | a(t) = g ( Wax . X(t) + Waa . a(t-1) + ba) 99 | y(t) = g ( Wya . 
a(t) + by) 100 | ----------- 101 | a(0) = 0 ===> this is the History >>>>> History Vector Size = Number of Hidden State on RNN Layer 102 | ***** 103 | a(0) = 0 104 | a1 = g(Wax * X1 + Waa * a0 + ba) 105 | y1 = g(Wya * a1 + by) 106 | ***** 107 | a2 = g(Wax * X2 + Waa * a1 + ba) 108 | y2 = g(Wya * a2 + by) 109 | ***** 110 | a3 = g(Wax * X3 + Waa * a2 + ba) 111 | = g(Wax * X3 + Waa * (g(Wax * X2 + Waa * (g(Wax * X1 + Waa * a0 + ba)) + ba)) + ba) 112 | Wax ==> Shared Weights Through Time 113 | Waa ==> Shared Weights Through Time 114 | 115 | y3 = g(Way * a3 + b) 116 | y3 = g(X3, X2, X1 117 | 118 | ``` 119 | ![image](https://github.com/user-attachments/assets/42614b7e-1e26-4a48-868f-9307161879c1) 120 | 121 | 122 | ## RNNS Types 123 | - Ont to One 124 | - One to Many 125 | - Image Caption 126 | - Many to One 127 | - Sentiement Analysis 128 | - Many to Many 129 | - Translation 130 | 131 | -------------------------------------------------------------------------------- /01-Text-Preprocessing/1.1-Text-Preprocessing/Theory.md: -------------------------------------------------------------------------------- 1 | # Text Preprocessing 2 | 3 | # What ... ? 4 | - the process of cleaning and transforming raw text into a format suitable for NLP tasks 5 | - first step of NLP projects 6 | 7 | # Why ... ? 8 | - Text data often contains noise such as punctuation, special characters, and irrelevant symbols. Preprocessing helps remove these elements 9 | - Different forms of words (e.g., “run,” “running,” “ran”) can convey the same meaning but appear in different forms. Preprocessing techniques like stemming and lemmatization help standardize these variations 10 | - raw text has Mixed cases ("Hello" , "hello") Models treat "Hello" and "hello" as different words 11 | and more... 12 | 13 | # How ... ? 14 | 15 | ## Lowercase 16 | Converts text to lowercase ("Hello WORLD" =>>> "hello world") 17 | #### Apply & Avoid for: 18 | - apply If the case (Capital or lower) does not contain information 19 | - Search engines (to normalize queries) 20 | - If your goal is just to classify 21 | - Sentiment analysis, Spam Detection, Topic Classification (NLP, nlp) are Same 22 | - Avoid : 23 | - Machine translation 24 | - POS (Parts-of-speech tagging (like noun, verb, adjective)) 25 | 26 | Chat GPT Said:
27 | If you're not sure, just ask: 28 | || “Does capitalization change the meaning in my task?” || 29 | If no, lowercase away. If yes, preserve it 30 | 31 | ## Remove URLs, mentions, hashtags 32 | Deletes symbols like !@#,. and urls 33 | ### Apply & Avoid for: 34 | - Apply for : Social media analysis, Topic modeling 35 | - Avoid for: If URLs/hashtags carry meaning (trend analysis) 36 | 37 | ## Remove punctuation & numbers & White Spaces 38 | - Deletes noise like . , ! ? ) : " 123 39 | #### Apply & Avoid for: 40 | - Apply for : Sentiment analysis (if numbers are irrelevant), Document classification 41 | - Avoid : If punctuation carries emotion, number-sensitive 42 | - emotion detection : "Sad :(" 43 | - math problems 44 | - Financial/medical texts ("COVID-19") 45 | 46 | ## Tokenize 47 | Splits text into words or tokens ("I love NLP" → ["I", "love", "NLP"]) 48 | 49 | ## Remove stopwords 50 | Deletes (Stop Words) common words ("is", "the", "and"). 51 | #### Apply & Avoid for: 52 | - Apply for : Topic modeling 53 | - Avoid : If stop words carries Informations 54 | - Sentiment analysis ("not", "never" are stopwords but means negation) 55 | - Machine translation (stopwords are Important) 56 | 57 | 58 | ## Stemming & Lemmatization 59 | - return Word Base ("playing" => Play) 60 | #### Apply & Avoid for: 61 | - Apply for : Spam detection, Search engines, Sentiment analysis 62 | - Avoid for : generative tasks (Summarization or translation) 63 | 64 | ## Custom Rules 65 | - replace emojis with text "🙂" → "[smile]") Social media sentiment, reviews analysis 66 | 67 | 68 | 69 | 70 | --- 71 | Text preprocessing is task-specific
72 | The preprocessing steps you choose should always depend on:
73 | 74 | - NLP Task 75 | - Sentiment Analysis 76 | - Lowercasing, remove URLs, convert emojis to text 77 | - Avoid removing negations ("not") or emojis 78 | - Topic Classification 79 | - Lowercasing, stopword removal, stemming/lemmatization 80 | - Machine Translation 81 | - Keep sentence structure 82 | - Avoid removing punctuation and stopwords 83 | - Text Generation (GPT) 84 | - Avoid changing the text 85 | 86 | - Model 87 | - Traditional ML (SVM, Regression) 88 | - Heavier cleaning: lowercasing, stopword removal, stemming 89 | - Transformers (BERT) 90 | - Minimal cleaning 91 | 92 | - Dataset 93 | - Tweets 94 | - Product reviews 95 | - Scientific texts 96 | 97 | --- 98 | Chat GPT Said:
99 | Always Ask Yourself
100 | Before preprocessing, ask:
101 | - What is the goal of my task? 102 | - Will this step remove or distort useful information? 103 | - What model am I using, and does it need clean or natural text? 104 | --- 105 | 106 | 107 | ## Stemming & Lemmatization 108 | 109 | The goal of both stemming and lemmatization is to reduce: 110 | 111 | - inflectional forms and derivationally related forms of a word to a common base form 112 | 113 | 114 | ![image](https://github.com/user-attachments/assets/5e647b23-f61d-4a14-b1b4-da60ca14137c) 115 | 116 | #### Stemming 117 | 118 | - the process of reducing inflected words to their stem (removing common affixes (prefixes, suffixes) from words) 119 | - the process of removing the last few characters of a given word to obtain a shorter form, even if that form has no meaning on its own 120 | - Rule-based algorithm 121 | 122 | ![image](https://github.com/user-attachments/assets/8594aa9d-4acb-4930-8ca0-3e3c5b59e3e9) 123 | 124 | 125 | 126 | 127 | #### Lemmatization 128 | 129 | The purpose of lemmatization is the same as that of stemming, but it overcomes the drawbacks of stemming through the
130 | use of a vocabulary and morphological analysis of words.
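A quick runnable contrast of the two (NLTK is already listed in this section's requirements.txt; the word list is just illustrative), before the classic "saw" example below:

```python
# Rule-based stemmer vs. dictionary/POS-aware lemmatizer, using NLTK.
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download("wordnet", quiet=True)  # resource needed by the lemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

for word in ["running", "studies", "better", "saw"]:
    print(
        word,
        "| stem:", stemmer.stem(word),
        "| lemma (as verb):", lemmatizer.lemmatize(word, pos="v"),
    )
# The Porter stemmer keeps "saw" as "saw"; the lemmatizer maps it to "see"
# only when told the token is used as a verb (pos="v").
```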
131 | 132 | the token saw
133 | - stemming might return just s, (remove aw) 134 | - lemmatization would attempt to return either see or saw 135 | - depending on whether the use of the token was as a verb or a noun. 136 | 137 | 138 | ![image](https://github.com/user-attachments/assets/faca7b47-8096-45e8-8b11-0b7025c81bbe) 139 | 140 | 141 | - Tokenization : 142 | - POS Tagging: Parts-of-speech tagging (like noun, verb, adjective, etc.) 143 | - Lemmatization: 144 | - Simple dictionary lookup. This works well for straightforward inflected forms, 145 | - Hand-crafted rule based system 146 | - Rules learned automatically from an annotated corpus. 147 | 148 | 149 | 150 | 151 | 152 | 153 | - Stemming: Faster, but may create Wrong root for words and lose meaning. This is known as "over stemming." 154 | 155 | - Lemmatization: slower, More accurate, preserves meaning and grammatical function. 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | -------------------------------------------------------------------------------- /04-crewai-agents/4.1-AI Agents using CrewAI ( Abu Bakr Soliman)/crewai-agents/README.md: -------------------------------------------------------------------------------- 1 | # CrewAI Procurement Agents 🤖 2 | 3 | [![Python](https://img.shields.io/badge/Python-3.10+-blue.svg)](https://www.python.org/downloads/) 4 | [![CrewAI](https://img.shields.io/badge/CrewAI-0.1.30+-orange.svg)](https://github.com/joaomdmoura/crewAI) 5 | [![LangChain](https://img.shields.io/badge/LangChain-0.0.335+-green.svg)](https://github.com/langchain-ai/langchain) 6 | [![License](https://img.shields.io/badge/License-MIT-green.svg)](LICENSE) 7 | [![Contributions](https://img.shields.io/badge/Contributions-Welcome-brightgreen.svg)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/issues) 8 | [![Stars](https://img.shields.io/github/stars/Fawzy-AI-Explorer/NLP-Tea?style=social)](https://github.com/Fawzy-AI-Explorer/NLP-Tea/stargazers) 9 | 10 | A modular AI agent system built with CrewAI for product research, procurement, and analysis in e-commerce environments. 11 | 12 | ## Project Overview 🔍 13 | 14 | This project implements a multi-agent system using CrewAI to automate the process of researching products, searching e-commerce websites, scraping relevant information, and generating procurement reports. The system is designed to help businesses make informed purchasing decisions by collecting and analyzing product data from various online sources. 15 | 16 | ## Features ✨ 17 | 18 | - **Search Query Generation**: AI agent that generates optimized search queries for product research 19 | - **Search Engine Processing**: Agent that queries e-commerce sites and extracts relevant results 20 | - **Web Scraping**: Agent that collects detailed product information from search results 21 | - **Procurement Reports**: Agent that analyzes scraped data and creates comprehensive procurement reports 22 | 23 | ## Installation 💻 24 | 25 | 1. Clone the repository: 26 | ```bash 27 | git clone https://github.com/Fawzy-AI-Explorer/NLP-Tea.git 28 | cd NLP-Tea/04-crewai-agents/4.1-AI\ Agents\ using\ CrewAI\ \(\ Abu\ Bakr\ Soliman\)/crewai-agents 29 | ``` 30 | 31 | 2. Create and activate a virtual environment (recommended): 32 | ```bash 33 | # For Windows 34 | python -m venv venv 35 | source venv\Scripts\activate 36 | ``` 37 | 38 | 3. Install required packages: 39 | ```bash 40 | pip install -r requirements.txt 41 | ``` 42 | 43 | 4. 
Set up your environment variables: 44 | ```bash 45 | # Create a .env file with your API keys 46 | OPENAI_API_KEY=your_openai_api_key 47 | AGENTOPS_API_KEY=your_agentops_api_key 48 | # Add other API keys as needed 49 | ``` 50 | 51 | ## Project Structure 📂 52 | 53 | ``` 54 | crewai-agents/ 55 | │ 56 | │ 57 | ├── crewai_agents/ - Core module containing all agent definitions 58 | │ ├── __init__.py - Package initialization 59 | │ ├── config.py - Configuration settings 60 | │ ├── utilis.py - Utility functions 61 | │ │ 62 | │ ├── agents/ - Individual agent implementations 63 | │ │ ├── __init__.py 64 | │ │ ├── a1_search_queries_agent.py - Search query generation agent 65 | │ │ ├── a2_search_engine_agent.py - Search engine processing agent 66 | │ │ ├── a3_scraping_agent.py - Web scraping agent 67 | │ │ └── a4_procurement_report.py - Procurement report generation agent 68 | │ │ 69 | │ │ 70 | │ └── tasks/ - Task definitions for each agent 71 | │ ├── __init__.py 72 | │ ├── t1_search_queries_task.py - Search query generation task 73 | │ ├── t2_search_engine_task.py - Search engine task 74 | │ ├── t3_scraping_task.py - Web scraping task 75 | │ └── t4_procurement_report_task.py - Procurement report generation task 76 | │ 77 | ├── examples/ - Example scripts to run individual agents or full workflows 78 | │ ├── ex1_run_search_queries_agent.py - Run search queries agent 79 | │ ├── ex2_run_search_engine_agent.py - Run search engine agent 80 | │ └── ex3_run_procurement_report_agent.py - Run procurement report agent 81 | | 82 | │ 83 | ├── outputs/ - Output directory for agent results 84 | │ └── ai-agent-output/ - JSON outputs from agent runs 85 | │ ├── step_1_suggested_search_queries.json - Output from search queries agent 86 | │ ├── step_2_search_results.json - Output from search engine agent 87 | │ ├── step_3_scraping_results.json - Output from web scraping agent 88 | │ └── step_4_procurement_report.html - Final procurement report output 89 | │ 90 | ├── tests/ - Unit and integration tests 91 | │ └── test.py - Test script 92 | │ 93 | ├── requirements.txt - Project dependencies 94 | └── README.md - Project documentation 95 | ``` 96 | 97 | ## Output Files 📁 98 | 99 | The agents produce the following output files during execution: 100 | 101 | ``` 102 | outputs/ 103 | └── ai-agent-output/ 104 | ├── step_1_suggested_search_queries.json - Output from search queries agent 105 | ├── step_2_search_results.json - Output from search engine agent 106 | ├── step_3_scraping_results.json - Output from web scraping agent 107 | └── step_4_procurement_report.html - Final procurement report output (HTML format) 108 | ``` 109 | 110 | ## Usage 🚀 111 | 112 | ### 1. Generate Search Queries 🔎 113 | 114 | ```python 115 | from examples.ex1_run_search_queries_agent import run_search_queries_agent 116 | 117 | results = run_search_queries_agent() 118 | print(results) 119 | ``` 120 | 121 | ### 2. Run Search Engine Agent 🌐 122 | 123 | ```python 124 | from examples.ex2_run_search_engine_agent import run_search_engine_agent 125 | 126 | results = run_search_engine_agent() 127 | print(results) 128 | ``` 129 | 130 | ### 3. Generate Procurement Report 📊 131 | 132 | ```python 133 | from examples.ex3_run_procurement_report_agent import run_procurement_report_agent 134 | 135 | results = run_procurement_report_agent() 136 | print(results) 137 | ``` 138 | 139 | ## Complete Workflow 🔄 140 | 141 | workflow: 142 | 1. Generates optimized search queries for your product requirements 143 | 2. Searches e-commerce sites using these queries 144 | 3. 
Scrapes detailed product information from search results 145 | 4. Produces a comprehensive procurement report with recommendations 146 | 147 | ## License 📜 148 | 149 | ## Contributing 🤝 150 | 151 | Contributions are welcome! Please feel free to submit a Pull Request. 152 | 153 | ## Acknowledgments 🙏 154 | 155 | - Thanks to the [Abu Bakr Soliman](https://www.linkedin.com/in/bakrianoo/) for this [crash course](https://www.youtube.com/watch?v=DDR4A8-MLQs&t=1s) -------------------------------------------------------------------------------- /02-Word Embeddings/2.4-Word2Vec/Theory.md: -------------------------------------------------------------------------------- 1 | # Word Embeddings 2 | 3 | [01- Label Encoder & One Hot Encoder](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.1-Label%20Encoder%20and%20One%20Hot%20Encoder) 4 | 5 | [02 - BOW](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.2-BOW) 6 | 7 | [03 - TF-IDF](https://github.com/Fawzy-AI-Explorer/NLP-Tea/tree/main/02-Word%20Embeddings/2.3-TF_IDF) 8 | 9 | ## 04 - Word2Vec 10 | 11 | Word2Vec is a neural network-based method that learns to represent words as vectors in a continuous vector (word embeddings). where words with similar meanings have similar vectors. Word2Vec provides a way to capture the semantic relationships between words through neural networks. 12 | 13 | Map each word to a dense vector such that words with common contexts in the corpus have similar vector representations. 14 | 15 | 16 | - Dimensionality Reduction: 17 | - Instead of representing words as one-hot vectors (High Dimension len vec = len(Vocab) = Number of unique words and sparse), Word2Vec produces dense vectors, low-dimensional vectors. 18 | - Semantic: 19 | - By Using `Context Words` Based on `window size` 20 | - Computational Efficiency: 21 | - By Using `Negative Sampling Method` 22 | 23 | ## Goal 24 | 25 | Not predict Context But to Leaern Vector representation of target words 26 | By predicting or using the context Words, Word2Vec `learns` the structure of the language. 27 | The main objective of Word2Vec is to learn word embeddings that: 28 | - Capture Semantic Relationships: Words with similar meanings are represented by similar vectors. 29 | 30 | ## Target, Context 31 | ![image](https://github.com/user-attachments/assets/425a0cf4-68c1-476c-9061-fa13ae335f18) 32 | 33 | - Target Word: 34 | - in a particular step: It is the `center` word that the model wants to learn a good representation for. 35 | - Each word will be a target in a specific step 36 | - Context Words: 37 | - The words surrounding the target word within window size. 38 | ![image](https://github.com/user-attachments/assets/d2155ea2-aadc-41f1-9d31-ab0e32ab1a47) 39 | 40 | `my name is mohammad fawzy` 41 | window size = 1 42 | `Target` => `Context` 43 | `my` => `name` 44 | `name` => `my, is` 45 | `is` => `name, mohammad` 46 | `mohammad` => `is, fawzy` 47 | `fawzy` => `mohammad` 48 | window size = 2 49 | `my` => `name , is` 50 | `name` => `my, is, mohammad` 51 | `is` => `my, name, mohammad, fawzy` 52 | `mohammad` => `name, is, fawzy` 53 | `fawzy` => `is, mohammad` 54 | 55 | 56 | 57 | 58 | 59 | ## How Does Word2Vec Work? 60 | 61 | Word2Vec Uses a shallow neural network that consists of an input layer, a hidden layer, and an output layer. 
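The bullets below spell this out layer by layer, and the Skip-Gram walkthrough later in this file goes deeper. As a compact reference, here is a minimal sketch of that three-layer setup in its skip-gram form; PyTorch, the toy sizes, and the names are assumptions, not training code from this repo. `nn.Embedding` plays the role of the one-hot input vector multiplied by the hidden-layer weight matrix W.

```python
# Minimal sketch of the input -> hidden (embedding) -> output architecture,
# skip-gram style: given a target word id, score every vocabulary word as a context word.
import torch
import torch.nn as nn

vocab_size, embedding_dim = 100, 16  # toy sizes

class SkipGramSketch(nn.Module):
    def __init__(self):
        super().__init__()
        # hidden layer W (vocab_size x embedding_dim): its rows become the word embeddings
        self.in_embed = nn.Embedding(vocab_size, embedding_dim)
        # output layer W' (embedding_dim x vocab_size): one logit per vocabulary word
        self.out_proj = nn.Linear(embedding_dim, vocab_size, bias=False)

    def forward(self, target_ids):
        return self.out_proj(self.in_embed(target_ids))  # logits over the vocabulary

model = SkipGramSketch()
logits = model(torch.tensor([3]))                               # target word with id 3
loss = nn.functional.cross_entropy(logits, torch.tensor([2]))   # id 2 stands in for one context word
loss.backward()  # training repeats this over all (target, context) pairs
# After training, model.in_embed.weight (the hidden-layer matrix W) holds the word vectors.
```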
62 | 63 | - Use One Hot Encoding 64 | - defining target words and for each one define it's Context words 65 | 66 | - One Input Layer (Number of Neurons = Number of unique words = len Vocab) 67 | - One Hidden Layer (Embedding Size) 68 | - One output Layer (Number of Neurons = Number of unique words = len Vocab) 69 | 70 | 71 | ## Types of Word2Vec 72 | 1. Continuous Bag of Words (CBOW): 73 | - Given the `Context` words, Predict `Target` word 74 | - works well with large datasets and It is computationally more efficient. 75 | - ![image](https://github.com/user-attachments/assets/9ef1f8ad-df42-4b6a-b7c1-068db91398d5) 76 | 77 | 2. Skip-Gram: 78 | - Given the `Target` word, Predict `Context` words 79 | - Works well with smaller datasets and is particularly good at capturing rare words. 80 | - ![image](https://github.com/user-attachments/assets/98926dee-066c-4c9a-8566-a54724f4058e) 81 | 82 | 83 | 84 | 85 | 86 | 87 | # Skip-Gram 88 | 89 | - Input Layer: 90 | - Number of Neurons: Equal to the number of unique words (vocabulary size). 91 | - Representation: One-hot encoded vector representing the target word. 92 | 93 | - Hidden Layer: 94 | - Number of Neurons: Equal to the chosen embedding size. 95 | - Purpose: This layer learns to project the one-hot vector into a lower-dimensional space. The learned weights of this layer become the word embeddings. 96 | 97 | - Output Layer: 98 | - Number of Neurons: Equal to the number of unique words (vocabulary size). 99 | - Representation: Produces a probability (SoftMax) of all words in the vocabulary to predict which words are context. 100 | - maximize the probability of all context words together, given a center word 101 | - goal is not to predict context words, but to learn vector representation of words, It just happens that predicting context words 102 | ![image](https://github.com/user-attachments/assets/24b44754-51d4-407e-af57-5936b4795840) 103 | 104 | 105 | 106 | Vocab Size = 100 , Window Size = 1 107 | Each Word represented in Binary Vector (Len = 100) All 0 except the index 108 | Suppose the 109 | - target 110 | - Has 1 in position 3 [0 0 0 1 0 0 0 0 0 0.......] 111 | - context 112 | - Has 1 in position 2 [0 0 1 0 0 0 0 0 0 0.......] 113 | - Has 1 in position 4 [0 0 0 0 1 0 0 0 0 0.......] 114 | 115 | 1. One-Hot Encoding (Neurons = Voc size) 116 | 117 | 2. Defining Target and Context Words 118 | 119 | 3. Input Layer 120 | - Feed the Target Word (one-hot encoded vector for the target word) 121 | 122 | 4. Hidden Layer (Embedding Layer) 123 | - The one-hot vector is multiplied by a weight matrix W (vocab size × embedding size). 124 | - Since only one element in the one-hot vector is 1, the output is simply the row of W corresponding to that word. This row becomes the word embedding for the target word. 125 | - The training process adjusts the weights in W so that similar words (appearing in similar contexts) end up with similar vectors. 126 | - ![image](https://github.com/user-attachments/assets/635aa4dd-c693-40a6-8894-6c2c76d5d004) 127 | 128 | 129 | 5. Output Layer 130 | - The hidden layer output (the word embedding) is then passed through another weight matrix W′(embedding size × vocab size) to produce logits for every word in the vocabulary. 131 | - A softmax function is applied to these Logits to get a probability distribution over all words. This distribution reflects the probability of each word being a context word for the given target word. 
132 |    - We want to maximize the probability of all context words together, given the center word.
133 |    - Compute the error between the predicted probabilities and the actual context words, represented as a single vector (the sum of the context words' one-hot vectors).
134 |    - [0 0 1 0 1 0 0 0 0 0 .......]
135 |    - The network uses backpropagation to adjust both weight matrices W and W′.
136 | 6. Extracting the embeddings
137 |    - Once training is complete, the weights of the hidden-layer matrix W are used as the word embeddings.
138 |    - These embeddings capture the relationships between words based on their contexts in the training text.
139 | 
140 | ## Negative Sampling
141 | 
142 | Problems with Skip-Gram:
143 | ![image](https://github.com/user-attachments/assets/5960d9ba-3be6-4486-b282-94d0283395d0)
144 | 
145 | Softmax is computationally very expensive: it requires scanning through the entire
146 | output layer to compute a probability distribution over all V words in the vocabulary, and the vocabulary size may be in the millions.
147 | 
148 | This is a multi-class classification problem where the number of classes = V (e.g., 10,000 classes).
149 | We want to convert it from multi-class classification (softmax) into binary classification (sigmoid).
150 | 
151 | Negative Sampling:
152 | For each training sample, define:
153 | - the Context Words
154 |   - the positive context samples (Cpos)
155 | - for each Context Word
156 |   - K words that do not appear in the context (the Negative Samples).
157 | The new objective is to predict, for any given (target, word) pair, whether the word belongs to the target's context or not.
158 | Give the network two words => it predicts 1 for a (target, context) pair and 0 for a (target, negative) pair.
159 | This is binary classification.
160 | 
161 | ![image](https://github.com/user-attachments/assets/a7e7b459-1160-43a7-ab3d-1b2fc7eac1dd)
162 | 
163 | 
164 | 
165 | 
166 | 
167 | 
168 | 
169 | 
--------------------------------------------------------------------------------
/02-Word Embeddings/2.3-TF_IDF/2.3-TF-IDF.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 8,
6 |    "id": "30fa1fe2",
7 |    "metadata": {},
8 |    "outputs": [
9 |     {
10 |      "name": "stderr",
11 |      "output_type": "stream",
12 |      "text": [
13 |       "[nltk_data] Downloading package stopwords to\n",
14 |       "[nltk_data]     C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n",
15 |       "[nltk_data]   Package stopwords is already up-to-date!\n",
16 |       "[nltk_data] Downloading package wordnet to\n",
17 |       "[nltk_data]     C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n",
18 |       "[nltk_data]   Package wordnet is already up-to-date!\n"
19 |      ]
20 |     }
21 |    ],
22 |    "source": [
23 |     "import pandas as pd\n",
24 |     "import numpy as np\n",
25 |     "import re\n",
26 |     "from nltk.stem import WordNetLemmatizer, PorterStemmer\n",
27 |     "import nltk\n",
28 |     "from nltk.corpus import stopwords\n",
29 |     "from nltk.tokenize import word_tokenize\n",
30 |     "import contractions\n",
31 |     "nltk.download('stopwords')\n",
32 |     "nltk.download('wordnet')\n",
33 |     "\n",
34 |     "from typing import List"
35 |    ]
36 |   },
37 |   {
38 |    "cell_type": "code",
39 |    "execution_count": 106,
40 |    "id": "6dcf6c88",
41 |    "metadata": {},
42 |    "outputs": [],
43 |    "source": [
44 |     "doc1 = \"Neural networks process data using deep learning algorithms in artificial intelligence.\"\n",
45 |     "doc2 = \"Artificial intelligence applies neural networks and deep learning to process large datasets.\"\n",
46 |     "\n",
47 |     "doc3 = \"Gasoline cars have combustion engines that power vehicles through fuel ignition.\"\n",
48
| "doc4 = \"Car engines burn gasoline in combustion chambers to move vehicles on the road.\"\n", 49 | "\n", 50 | "corpus = [doc1, doc2, doc3, doc4]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 107, 56 | "id": "f4af634d", 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "Neural networks process data using deep learning algorithms in artificial intelligence.\n", 64 | "Length of document: 87\n", 65 | "\n", 66 | "Artificial intelligence applies neural networks and deep learning to process large datasets.\n", 67 | "Length of document: 92\n", 68 | "\n", 69 | "Gasoline cars have combustion engines that power vehicles through fuel ignition.\n", 70 | "Length of document: 80\n", 71 | "\n", 72 | "Car engines burn gasoline in combustion chambers to move vehicles on the road.\n", 73 | "Length of document: 78\n", 74 | "\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "for d in corpus:\n", 80 | " print(d)\n", 81 | " print(\"Length of document:\", len(d))\n", 82 | " print()" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 108, 88 | "id": "c96b80e5", 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "[['neural', 'network', 'process', 'data', 'using', 'deep', 'learning', 'algorithm', 'artificial', 'intelligence'], ['artificial', 'intelligence', 'applies', 'neural', 'network', 'deep', 'learning', 'process', 'large', 'datasets'], ['gasoline', 'car', 'combustion', 'engine', 'power', 'vehicle', 'fuel', 'ignition'], ['car', 'engine', 'burn', 'gasoline', 'combustion', 'chamber', 'move', 'vehicle', 'road']]\n" 96 | ] 97 | } 98 | ], 99 | "source": [ 100 | "def preprocessing(text: str) -> list[str]:\n", 101 | "\n", 102 | " stop_words = set(stopwords.words('english'))\n", 103 | " lemmatizer = WordNetLemmatizer()\n", 104 | " stemmer = PorterStemmer()\n", 105 | "\n", 106 | " # Convert Text to Lowercase (Normalization)\n", 107 | " text_lower = text.lower()\n", 108 | " text_no_tags = re.sub(r'<[^>]+>', '', text_lower)\n", 109 | "\n", 110 | " # Contraction Handling\n", 111 | " text_no_tags = contractions.fix(text_no_tags)\n", 112 | "\n", 113 | " # Removing Punctuation\n", 114 | " text_no_punct = re.sub(r'[^a-zA-Z\\s]', '', text_no_tags) # \\' for keep apostrophes (e.g. don't, it's)\n", 115 | "\n", 116 | "\n", 117 | " # 3. Tokens\n", 118 | " tokens = re.split(r\"\\s+\", text_no_punct) \n", 119 | " tokens = [t for t in tokens if t]\n", 120 | " # or use nltk tokenizer\n", 121 | " tokens = word_tokenize(text_no_punct)\n", 122 | "\n", 123 | " # 4. Stop word removal\n", 124 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 125 | "\n", 126 | " # 5. 
Lemmatization \n", 127 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 128 | " # or stemmer\n", 129 | " stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]\n", 130 | "\n", 131 | " return lemma_tokens\n", 132 | "\n", 133 | "preprocessed_text = [preprocessing(doc) for doc in corpus]\n", 134 | "print(preprocessed_text)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 109, 140 | "id": "91ad8dd0", 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "term : learning\n", 148 | "doc :['neural', 'network', 'process', 'data', 'using', 'deep', 'learning', 'algorithm', 'artificial', 'intelligence']\n", 149 | "frequency : 1\n", 150 | "len doc : 10\n", 151 | "tf of 'learning' on doc 0: 0.1\n", 152 | "-----------------------\n", 153 | "term : learning\n", 154 | "doc :['artificial', 'intelligence', 'applies', 'neural', 'network', 'deep', 'learning', 'process', 'large', 'datasets']\n", 155 | "frequency : 1\n", 156 | "len doc : 10\n", 157 | "tf of 'learning' on doc 2: 0.1\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "all_tokens = [token for doc in preprocessed_text for token in doc]\n", 163 | "vocab = sorted(set(all_tokens))\n", 164 | "# print(len(all_tokens))\n", 165 | "\n", 166 | "\n", 167 | "def TF(term, doc) :\n", 168 | " term = term.lower()\n", 169 | " print(f\"term : {term}\")\n", 170 | " print(f\"doc :{doc}\")\n", 171 | " print(f\"frequency : {doc.count(term)}\")\n", 172 | " print(f\"len doc : {len(doc)}\")\n", 173 | " return doc.count(term) / len(doc)\n", 174 | "\n", 175 | "\n", 176 | "term = \"learning\"\n", 177 | "tf = TF(term, preprocessed_text[0])\n", 178 | "print(\"tf of 'learning' on doc 0: \", tf)\n", 179 | "print(\"-----------------------\")\n", 180 | "tf = TF(term, preprocessed_text[1])\n", 181 | "print(\"tf of 'learning' on doc 2: \", tf)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 110, 187 | "id": "ea9c103f", 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "term : learning \n", 195 | "number of documents : 4 \n", 196 | "number of documents containing term : 2\n", 197 | "idf of 'learning' : 1.3333333333333333\n", 198 | "-----------------------\n", 199 | "term : statistic \n", 200 | "number of documents : 4 \n", 201 | "number of documents containing term : 0\n", 202 | "idf of 'statistics' : 4.0\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "def IDF(term, corpus):\n", 208 | " term = term.lower()\n", 209 | " N = len(corpus)\n", 210 | " n = sum(1 for doc in corpus if term in doc)\n", 211 | " print(f\"term : {term} \\nnumber of documents : {N} \\nnumber of documents containing term : {n}\")\n", 212 | "\n", 213 | " return N/(n+1)\n", 214 | "idf = IDF(\"learning\", preprocessed_text)\n", 215 | "print(\"idf of 'learning' : \", idf)\n", 216 | "print(\"-----------------------\")\n", 217 | "idf = IDF(\"statistic\", preprocessed_text)\n", 218 | "print(\"idf of 'statistics' : \", idf)" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": 111, 224 | "id": "b9d49d9a", 225 | "metadata": {}, 226 | "outputs": [ 227 | { 228 | "name": "stdout", 229 | "output_type": "stream", 230 | "text": [ 231 | "37\n" 232 | ] 233 | }, 234 | { 235 | "data": { 236 | "text/plain": [ 237 | "(4, 25)" 238 | ] 239 | }, 240 | "execution_count": 111, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | 
"source": [ 246 | "all_tokens = [token for doc in preprocessed_text for token in doc]\n", 247 | "vocab = sorted(set(all_tokens))\n", 248 | "print(len(all_tokens))\n", 249 | "\n", 250 | "\n", 251 | "def TF(term: str, doc: list[str]) -> float:\n", 252 | " \"\"\"\n", 253 | " Calculate Term Frequency (TF) of a term in a document.\n", 254 | "\n", 255 | " Args:\n", 256 | " term (str): The term to calculate TF for.\n", 257 | " doc (list[str]): The document in which to calculate TF.\n", 258 | "\n", 259 | " Returns:\n", 260 | " float: The term frequency of the term in the document.\n", 261 | " \"\"\"\n", 262 | " term = term.lower()\n", 263 | " return doc.count(term) / len(doc)\n", 264 | " \n", 265 | "def IDF(term: str, corpus: List[list[str]]) -> float:\n", 266 | " \"\"\"\n", 267 | " Calculate Inverse Document Frequency (IDF) of a term in a corpus.\n", 268 | "\n", 269 | " Args:\n", 270 | " term (str): The term to calculate IDF for.\n", 271 | " corpus (List[list[str]]): The corpus in which to calculate IDF.\n", 272 | "\n", 273 | " Returns:\n", 274 | " float: The inverse document frequency of the term in the corpus.\n", 275 | " \"\"\"\n", 276 | " N = len(corpus)\n", 277 | " term = term.lower()\n", 278 | " num_docs_with_term = sum(1 for doc in corpus if term in doc)\n", 279 | " return N / (1 + num_docs_with_term)\n", 280 | "\n", 281 | "def TF_IDF(term: str, doc: list[str], corpus: List[list[str]]) -> float:\n", 282 | " \"\"\"\n", 283 | " Calculate TF-IDF of a term in a document within a corpus.\n", 284 | "\n", 285 | " Args:\n", 286 | " term (str): The term to calculate TF-IDF for.\n", 287 | " doc (list[str]): The document in which to calculate TF-IDF.\n", 288 | " corpus (List[list[str]]): The corpus in which to calculate TF-IDF.\n", 289 | "\n", 290 | " Returns:\n", 291 | " float: The TF-IDF score of the term in the document.\n", 292 | " \"\"\"\n", 293 | " tf = TF(term, doc)\n", 294 | " idf = IDF(term, corpus)\n", 295 | " return tf * idf\n", 296 | "\n", 297 | "\n", 298 | "\n", 299 | "tfidf_matrix = np.zeros((len(preprocessed_text), len(vocab)))\n", 300 | "for i, doc in enumerate(preprocessed_text):\n", 301 | " for j, term in enumerate(vocab):\n", 302 | " tfidf_matrix[i][j] = TF_IDF(term, doc, preprocessed_text)\n", 303 | "\n", 304 | "\n", 305 | "tfidf_matrix.shape" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 112, 311 | "id": "6afe47a3", 312 | "metadata": {}, 313 | "outputs": [ 314 | { 315 | "name": "stdout", 316 | "output_type": "stream", 317 | "text": [ 318 | "[[1. 0.50909091 0. 0. ]\n", 319 | " [0.50909091 1. 0. 0. ]\n", 320 | " [0. 0. 1. 0.38984059]\n", 321 | " [0. 0. 0.38984059 1. 
]]\n" 322 | ] 323 | } 324 | ], 325 | "source": [ 326 | "# calc similarity between documents\n", 327 | "from sklearn.metrics.pairwise import cosine_similarity\n", 328 | "similarity_matrix = cosine_similarity(tfidf_matrix)\n", 329 | "print(similarity_matrix)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "markdown", 334 | "id": "4239f7c3", 335 | "metadata": {}, 336 | "source": [ 337 | "# Built in" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 91, 343 | "id": "0459ffb9", 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "['advanced' 'ai' 'algorithms' 'allows' 'artificial' 'benefits' 'branch'\n", 351 | " 'cars' 'data' 'decisions' 'designed' 'efficiency' 'electric'\n", 352 | " 'environmental' 'features' 'include' 'intelligence' 'learn' 'learning'\n", 353 | " 'machine' 'machines' 'make' 'patterns' 'popular' 'safety'\n", 354 | " 'transportation' 'uses' 'vehicles']\n", 355 | "--------------\n", 356 | "[[0. 0. 0. 0.36222393 0.36222393 0.\n", 357 | " 0. 0. 0.2855815 0.36222393 0. 0.\n", 358 | " 0. 0. 0. 0. 0.36222393 0.36222393\n", 359 | " 0. 0. 0.36222393 0.36222393 0. 0.\n", 360 | " 0. 0. 0. 0. ]\n", 361 | " [0. 0.36222393 0.36222393 0. 0. 0.\n", 362 | " 0.36222393 0. 0.2855815 0. 0. 0.\n", 363 | " 0. 0. 0. 0. 0. 0.\n", 364 | " 0.36222393 0.36222393 0. 0. 0.36222393 0.\n", 365 | " 0. 0. 0.36222393 0. ]\n", 366 | " [0.37796447 0. 0. 0. 0. 0.\n", 367 | " 0. 0.37796447 0. 0. 0.37796447 0.\n", 368 | " 0. 0. 0.37796447 0.37796447 0. 0.\n", 369 | " 0. 0. 0. 0. 0. 0.\n", 370 | " 0.37796447 0.37796447 0. 0. ]\n", 371 | " [0. 0. 0. 0. 0. 0.40824829\n", 372 | " 0. 0. 0. 0. 0. 0.40824829\n", 373 | " 0.40824829 0.40824829 0. 0. 0. 0.\n", 374 | " 0. 0. 0. 0. 0. 0.40824829\n", 375 | " 0. 0. 0. 
0.40824829]]\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "from sklearn.feature_extraction.text import TfidfVectorizer\n", 381 | "\n", 382 | "documents = [doc1, doc2, doc3, doc4]\n", 383 | "\n", 384 | "vectorizer = TfidfVectorizer(stop_words='english')\n", 385 | "X = vectorizer.fit_transform(documents)\n", 386 | "\n", 387 | "print(vectorizer.get_feature_names_out())\n", 388 | "print(\"--------------\")\n", 389 | "# Convert TF-IDF matrix to array and view it\n", 390 | "print(X.toarray())" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": null, 396 | "id": "6c5ee47d", 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [] 400 | } 401 | ], 402 | "metadata": { 403 | "kernelspec": { 404 | "display_name": "myenv", 405 | "language": "python", 406 | "name": "python3" 407 | }, 408 | "language_info": { 409 | "codemirror_mode": { 410 | "name": "ipython", 411 | "version": 3 412 | }, 413 | "file_extension": ".py", 414 | "mimetype": "text/x-python", 415 | "name": "python", 416 | "nbconvert_exporter": "python", 417 | "pygments_lexer": "ipython3", 418 | "version": "3.12.6" 419 | } 420 | }, 421 | "nbformat": 4, 422 | "nbformat_minor": 5 423 | } 424 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.5-FastText/2.5-fast_text.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.11.11","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"},"kaggle":{"accelerator":"none","dataSources":[{"sourceId":3316532,"sourceType":"datasetVersion","datasetId":10100}],"dockerImageVersionId":31012,"isInternetEnabled":true,"language":"python","sourceType":"notebook","isGpuEnabled":false}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Libraries","metadata":{}},{"cell_type":"code","source":"import pandas as pd \n# preprocessing\nimport re\nimport nltk\nfrom nltk.stem import WordNetLemmatizer\nfrom nltk.corpus import stopwords\nnltk.download('stopwords')\nnltk.download('wordnet')","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:17:58.089901Z","iopub.execute_input":"2025-04-19T17:17:58.090490Z","iopub.status.idle":"2025-04-19T17:18:02.530545Z","shell.execute_reply.started":"2025-04-19T17:17:58.090456Z","shell.execute_reply":"2025-04-19T17:18:02.529578Z"}},"outputs":[{"name":"stderr","text":"[nltk_data] Downloading package stopwords to /usr/share/nltk_data...\n[nltk_data] Package stopwords is already up-to-date!\n[nltk_data] Downloading package wordnet to /usr/share/nltk_data...\n[nltk_data] Package wordnet is already up-to-date!\n","output_type":"stream"},{"execution_count":1,"output_type":"execute_result","data":{"text/plain":"True"},"metadata":{}}],"execution_count":1},{"cell_type":"markdown","source":"# Data","metadata":{}},{"cell_type":"code","source":"file_path = r\"/kaggle/input/yelp-dataset/yelp_academic_dataset_tip.json\"\ndf = pd.read_json(file_path, lines=True)\ndf = df[:1000]\n\ntext_column = 
df['text']\ntext_column.head()","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:02.531766Z","iopub.execute_input":"2025-04-19T17:18:02.532185Z","iopub.status.idle":"2025-04-19T17:18:10.677540Z","shell.execute_reply.started":"2025-04-19T17:18:02.532157Z","shell.execute_reply":"2025-04-19T17:18:10.676617Z"}},"outputs":[{"execution_count":2,"output_type":"execute_result","data":{"text/plain":"0 Avengers time with the ladies.\n1 They have lots of good deserts and tasty cuban...\n2 It's open even when you think it isn't\n3 Very decent fried chicken\n4 Appetizers.. platter special for lunch\nName: text, dtype: object"},"metadata":{}}],"execution_count":2},{"cell_type":"code","source":"df.shape","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:10.678316Z","iopub.execute_input":"2025-04-19T17:18:10.678558Z","iopub.status.idle":"2025-04-19T17:18:10.684053Z","shell.execute_reply.started":"2025-04-19T17:18:10.678538Z","shell.execute_reply":"2025-04-19T17:18:10.683203Z"}},"outputs":[{"execution_count":3,"output_type":"execute_result","data":{"text/plain":"(1000, 5)"},"metadata":{}}],"execution_count":3},{"cell_type":"markdown","source":"# Preprocessing","metadata":{}},{"cell_type":"code","source":"def preprocess(text: str) -> list :\n text = text.lower()\n text = re.sub(r'[^a-zA-Z\\s]', '', text) # Remove all non-alphabetic characters\n text = re.sub(r'\\s+[a-zA-Z]\\s+', ' ', text) # Remove all single characters\n\n tokens = text.split()\n tokens = [t for t in tokens if len(t)>3] # Keep words with length >= 3\n\n stop_words = set(stopwords.words('english'))\n lemmatizer = WordNetLemmatizer() \n\n tokens = [lemmatizer.lemmatize(t) for t in tokens if t not in stop_words]\n return tokens if tokens else [] # Return an empty list if nothing remains\n\nsentences = [preprocess(t) for t in text_column] # List[List[str]]\nprint(sentences[:3])\nlen(sentences)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:10.685927Z","iopub.execute_input":"2025-04-19T17:18:10.686209Z","iopub.status.idle":"2025-04-19T17:18:13.899032Z","shell.execute_reply.started":"2025-04-19T17:18:10.686179Z","shell.execute_reply":"2025-04-19T17:18:13.898150Z"}},"outputs":[{"name":"stdout","text":"[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ['open', 'even', 'think', 'isnt']]\n","output_type":"stream"},{"execution_count":4,"output_type":"execute_result","data":{"text/plain":"1000"},"metadata":{}}],"execution_count":4},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# Official FastText","metadata":{}},{"cell_type":"code","source":"from gensim.models import FastText","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:13.903216Z","iopub.execute_input":"2025-04-19T17:18:13.903959Z","iopub.status.idle":"2025-04-19T17:18:54.429621Z","shell.execute_reply.started":"2025-04-19T17:18:13.903934Z","shell.execute_reply":"2025-04-19T17:18:54.428969Z"}},"outputs":[],"execution_count":5},{"cell_type":"code","source":"# Train FastText model\nFastText_model = FastText(\n sentences=sentences,\n vector_size=100,\n window=3,\n min_count=1,\n epochs=500\n)\nprint(FastText_model)\n# Save the model\nFastText_model.save(\"fasttext_model.model\")\nprint(\"model 
saved.\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:18:54.430461Z","iopub.execute_input":"2025-04-19T17:18:54.430938Z","iopub.status.idle":"2025-04-19T17:19:12.717566Z","shell.execute_reply.started":"2025-04-19T17:18:54.430916Z","shell.execute_reply":"2025-04-19T17:19:12.716625Z"}},"outputs":[{"name":"stdout","text":"FastText\nmodel saved.\n","output_type":"stream"}],"execution_count":6},{"cell_type":"code","source":"vocab_size = len(FastText_model.wv)\nembedding_size = FastText_model.vector_size\n\n# Print vocabulary and embedding size\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.719832Z","iopub.execute_input":"2025-04-19T17:19:12.720076Z","iopub.status.idle":"2025-04-19T17:19:12.725155Z","shell.execute_reply.started":"2025-04-19T17:19:12.720058Z","shell.execute_reply":"2025-04-19T17:19:12.724125Z"}},"outputs":[{"name":"stdout","text":"Vocabulary Size: 2121\nEmbedding Size: 100\n","output_type":"stream"}],"execution_count":7},{"cell_type":"code","source":"similar_words = FastText_model.wv.most_similar('good', topn=10)\nprint(\"\\nSimilar\")\nprint(similar_words)\nprint(\"-\"*30) \nopposite_words = FastText_model.wv.most_similar(negative= 'good', topn=10)\nprint(\"\\n\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.726119Z","iopub.execute_input":"2025-04-19T17:19:12.726446Z","iopub.status.idle":"2025-04-19T17:19:12.750329Z","shell.execute_reply.started":"2025-04-19T17:19:12.726416Z","shell.execute_reply":"2025-04-19T17:19:12.749248Z"}},"outputs":[{"name":"stdout","text":"\nSimilar\n[('goodi', 0.8452088832855225), ('food', 0.5561390519142151), ('deliciously', 0.5237860083580017), ('neighborhood', 0.4961780607700348), ('deliciousness', 0.4901498854160309), ('deliciousthen', 0.4899592697620392), ('ipod', 0.4890640377998352), ('foodgreat', 0.47072115540504456), ('delicious', 0.46029138565063477), ('bollywood', 0.4556906819343567)]\n------------------------------\n\n [('postage', 0.3698478043079376), ('hermitage', 0.3362504243850708), ('cinco', 0.3282019793987274), ('lurk', 0.3278462290763855), ('professionalism', 0.31859180331230164), ('postal', 0.31132882833480835), ('prepaid', 0.30549004673957825), ('trip', 0.30125343799591064), ('trap', 0.2973553240299225), ('ease', 0.2962052822113037)]\n","output_type":"stream"}],"execution_count":8},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"markdown","source":"# pretrained FastText model","metadata":{}},{"cell_type":"code","source":"# Download Model\nimport urllib.request\nimport gzip\nimport os\nimport shutil","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.751634Z","iopub.execute_input":"2025-04-19T17:19:12.751965Z","iopub.status.idle":"2025-04-19T17:19:12.756263Z","shell.execute_reply.started":"2025-04-19T17:19:12.751942Z","shell.execute_reply":"2025-04-19T17:19:12.755380Z"}},"outputs":[],"execution_count":9},{"cell_type":"code","source":"# Download pretrained FastText model\nurl = \"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\"\noutput_gz = \"cc.en.300.bin.gz\"\noutput_bin = \"cc.en.300.bin\"\n\n# Download the .gz file\nprint(\"Downloading pretrained FastText model...\")\nurllib.request.urlretrieve(url, output_gz)\n\n# Unzip the .gz file\nprint(\"Unzipping the model...\")\nwith gzip.open(output_gz, 'rb') as f_in:\n 
with open(output_bin, 'wb') as f_out:\n shutil.copyfileobj(f_in, f_out)\nprint(\"model saved\")\n# Remove the .gz file to save space\nos.remove(output_gz)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:19:12.757106Z","iopub.execute_input":"2025-04-19T17:19:12.757990Z","iopub.status.idle":"2025-04-19T17:20:21.869886Z","shell.execute_reply.started":"2025-04-19T17:19:12.757961Z","shell.execute_reply":"2025-04-19T17:20:21.869086Z"}},"outputs":[{"name":"stdout","text":"Downloading pretrained FastText model...\nUnzipping the model...\nmodel saved\n","output_type":"stream"}],"execution_count":10},{"cell_type":"code","source":"import fasttext","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:20:21.870774Z","iopub.execute_input":"2025-04-19T17:20:21.871056Z","iopub.status.idle":"2025-04-19T17:20:21.896035Z","shell.execute_reply.started":"2025-04-19T17:20:21.871036Z","shell.execute_reply":"2025-04-19T17:20:21.895138Z"}},"outputs":[],"execution_count":11},{"cell_type":"code","source":"print(\"Loading the model...\")\npretrained = fasttext.load_model(output_bin)\nprint(pretrained)\nvocab_size = len(pretrained.words)\nembedding_size = pretrained.get_dimension()\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")\n# --------------------\nsimilar_words = pretrained.get_nearest_neighbors(\"good\", k=10)\nprint(\"similar words\",similar_words)\nopposite_words = pretrained.get_nearest_neighbors(negative=[\"learning\"], k=10)\nprint(\"opposite words\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:20:21.897103Z","iopub.execute_input":"2025-04-19T17:20:21.897399Z","iopub.status.idle":"2025-04-19T17:20:44.757918Z","shell.execute_reply.started":"2025-04-19T17:20:21.897371Z","shell.execute_reply":"2025-04-19T17:20:44.756852Z"}},"outputs":[{"name":"stdout","text":"Loading the model...\n\nVocabulary Size: 2000000\nEmbedding Size: 300\nsimilar words [(0.7517593502998352, 'bad'), (0.7426098585128784, 'great'), (0.7299689054489136, 'decent'), (0.7123614549636841, 'nice'), (0.6796907186508179, 'Good'), (0.6737031936645508, 'excellent'), (0.669592022895813, 'goood'), (0.6602178812026978, 'ggod'), (0.6479219794273376, 'semi-good'), (0.6417751908302307, 'good.Good')]\n","output_type":"stream"},{"traceback":["\u001b[0;31m---------------------------------------------------------------------------\u001b[0m","\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)","\u001b[0;32m/tmp/ipykernel_31/297213840.py\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0msimilar_words\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpretrained\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_nearest_neighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"good\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"similar words\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0msimilar_words\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 11\u001b[0;31m \u001b[0mopposite_words\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mpretrained\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_nearest_neighbors\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnegative\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m\"learning\"\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mk\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 12\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"opposite words\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mopposite_words\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n","\u001b[0;31mTypeError\u001b[0m: _FastText.get_nearest_neighbors() got an unexpected keyword argument 'negative'"],"ename":"TypeError","evalue":"_FastText.get_nearest_neighbors() got an unexpected keyword argument 'negative'","output_type":"error"}],"execution_count":12},{"cell_type":"markdown","source":"- import fasttext\n - Facebook's original FastText package.\n - Faster and more memory efficient\n - Limited API (e.g., doesn't support negative sampling like Gensim does).\n - get_nearest_neighbors(negative) doesn’t exist in official fasttext\n - Used in real Prijects (Production)","metadata":{}},{"cell_type":"markdown","source":"- Gensim FastText\n - You can use: (positive, negative, most_similar, similarity, .....)\n - Slightly slower\n - For production embedding lookup, not as efficient as the original FastText.","metadata":{}},{"cell_type":"code","source":"from gensim.models.fasttext import load_facebook_model\n\npretrained = load_facebook_model(\"cc.en.300.bin\")\n# model = load_facebook_model(output_bin)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:22:21.519417Z","iopub.execute_input":"2025-04-19T17:22:21.519877Z","iopub.status.idle":"2025-04-19T17:24:27.142626Z","shell.execute_reply.started":"2025-04-19T17:22:21.519849Z","shell.execute_reply":"2025-04-19T17:24:27.141885Z"}},"outputs":[],"execution_count":13},{"cell_type":"code","source":"vocab_size = len(pretrained.wv)\nembedding_size = pretrained.wv.vector_size\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")\nsimilar = pretrained.wv.most_similar(\"learning\", topn=10)\nprint(\"similar words :\", similar)\n\nopposite_words = pretrained.wv.most_similar(negative=[\"learning\"],topn=10)\nprint(\"\\n\\nopposite words :\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:24:27.146361Z","iopub.execute_input":"2025-04-19T17:24:27.146631Z","iopub.status.idle":"2025-04-19T17:24:28.242978Z","shell.execute_reply.started":"2025-04-19T17:24:27.146609Z","shell.execute_reply":"2025-04-19T17:24:28.242127Z"}},"outputs":[{"name":"stdout","text":"Vocabulary Size: 2000000\nEmbedding Size: 300\nsimilar words : [('learing', 0.7456762194633484), ('Learning', 0.6895480751991272), ('learning.This', 0.687819242477417), ('learning.The', 0.6796228289604187), ('learning.It', 0.6753032207489014), ('learning.So', 0.6706693768501282), ('learning.What', 0.6673311591148376), ('learning.But', 0.6648256778717041), ('learning-', 0.6643092036247253), ('learning.As', 0.6633589267730713)]\n\n\nopposite words : [('19555', 0.2533474564552307), ('12291', 0.23999808728694916), ('10264', 0.2394980639219284), ('13107', 0.23354505002498627), ('8504', 0.23330195248126984), ('13223', 0.23251304030418396), ('7242', 0.23047803342342377), ('13466', 0.2299567013978958), ('10494', 0.22803275287151337), ('14138', 
0.2278987020254135)]\n","output_type":"stream"}],"execution_count":14},{"cell_type":"code","source":"pretrained.build_vocab(sentences, update=True)\npretrained.train(\n sentences,\n total_examples=len(sentences),\n epochs=10\n)\n# Print vocabulary and embedding size\nvocab_size = len(pretrained.wv)\nembedding_size = pretrained.vector_size\nprint(f\"Vocabulary Size: {vocab_size}\")\nprint(f\"Embedding Size: {embedding_size}\")","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:24:28.244081Z","iopub.execute_input":"2025-04-19T17:24:28.244314Z","iopub.status.idle":"2025-04-19T17:26:12.710272Z","shell.execute_reply.started":"2025-04-19T17:24:28.244297Z","shell.execute_reply":"2025-04-19T17:26:12.709490Z"}},"outputs":[{"name":"stdout","text":"Vocabulary Size: 2000000\nEmbedding Size: 300\n","output_type":"stream"}],"execution_count":15},{"cell_type":"code","source":"similar_words = pretrained.wv.most_similar(\"learn\", topn=10)\nopposite_words = pretrained.wv.most_similar(negative=\"learn\", topn=10)\nprint(similar_words, \"\\n\\n\", opposite_words)","metadata":{"trusted":true,"execution":{"iopub.status.busy":"2025-04-19T17:26:12.711506Z","iopub.execute_input":"2025-04-19T17:26:12.711753Z","iopub.status.idle":"2025-04-19T17:26:13.760293Z","shell.execute_reply.started":"2025-04-19T17:26:12.711729Z","shell.execute_reply":"2025-04-19T17:26:13.759226Z"}},"outputs":[{"name":"stdout","text":"[('teach', 0.716772198677063), ('Learn', 0.7041028738021851), ('learned', 0.6968039274215698), ('learm', 0.6521831750869751), ('re-learn', 0.6518067717552185), ('discover', 0.6409897208213806), ('learn.If', 0.6341798901557922), ('relearn', 0.6159347295761108), ('leanr', 0.6142886877059937), ('understand', 0.6114104390144348)] \n\n [('.Rear', 0.22274798154830933), ('3.825', 0.20031915605068207), ('1.638', 0.19616979360580444), ('W52', 0.19612562656402588), ('3.725', 0.19571073353290558), ('9,677', 0.1925133764743805), ('2.101', 0.19243070483207703), ('2.675', 0.1889045089483261), ('3.425', 0.1883799433708191), ('2.76m', 0.1873113363981247)]\n","output_type":"stream"}],"execution_count":16},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null},{"cell_type":"code","source":"","metadata":{"trusted":true},"outputs":[],"execution_count":null}]} -------------------------------------------------------------------------------- /01-Text-Preprocessing/1.1-Text-Preprocessing/preprocessing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "03384981", 6 | "metadata": {}, 7 | "source": [ 8 | "# Libraries" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 53, 14 | "id": "5baf0687", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading package wordnet to\n", 25 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import re\n", 33 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 34 | "import nltk\n", 35 | "from nltk.corpus import stopwords\n", 36 | "from nltk.tokenize import word_tokenize\n", 37 | 
"nltk.download('stopwords')\n", 38 | "nltk.download('wordnet')\n", 39 | "\n", 40 | "from typing import List" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "id": "89132fd3", 46 | "metadata": {}, 47 | "source": [ 48 | "# Data" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 3, 54 | "id": "1f1a8e5a", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/html": [ 60 | "
\n", 61 | "\n", 74 | "\n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | "
user_idbusiness_idtextdatecompliment_count
0AGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210
1NBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100
2-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080
3FjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_JwVery decent fried chicken2017-06-27 23:05:380
4ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5wwAppetizers.. platter special for lunch2012-10-06 19:43:090
\n", 128 | "
" 129 | ], 130 | "text/plain": [ 131 | " user_id business_id \\\n", 132 | "0 AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", 133 | "1 NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", 134 | "2 -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", 135 | "3 FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", 136 | "4 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", 137 | "\n", 138 | " text date \\\n", 139 | "0 Avengers time with the ladies. 2012-05-18 02:17:21 \n", 140 | "1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n", 141 | "2 It's open even when you think it isn't 2013-08-18 00:56:08 \n", 142 | "3 Very decent fried chicken 2017-06-27 23:05:38 \n", 143 | "4 Appetizers.. platter special for lunch 2012-10-06 19:43:09 \n", 144 | "\n", 145 | " compliment_count \n", 146 | "0 0 \n", 147 | "1 0 \n", 148 | "2 0 \n", 149 | "3 0 \n", 150 | "4 0 " 151 | ] 152 | }, 153 | "execution_count": 3, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "json_file_path = r\"E:\\DATA SCIENCE\\NLP-Tea\\Data\\yelp_academic_dataset_tip.json\\yelp_academic_dataset_tip.json\"\n", 160 | "df = pd.read_json(json_file_path, lines=True)\n", 161 | "\n", 162 | "df.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 4, 168 | "id": "e99b254f", 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "(908915, 5)\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "print(df.shape)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 5, 186 | "id": "d8dfe82f", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "['Avengers time with the ladies.',\n", 193 | " 'They have lots of good deserts and tasty cuban sandwiches',\n", 194 | " \"It's open even when you think it isn't\",\n", 195 | " 'Very decent fried chicken',\n", 196 | " 'Appetizers.. platter special for lunch']" 197 | ] 198 | }, 199 | "execution_count": 5, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "text_data = list(df[\"text\"][:1000]) # First 1000 Row Only \n", 206 | "text_data[:5]" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "id": "5cf06943", 212 | "metadata": {}, 213 | "source": [ 214 | "# Preprocessing" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 54, 220 | "id": "02269239", 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "name": "stdout", 225 | "output_type": "stream", 226 | "text": [ 227 | "Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n" 228 | ] 229 | } 230 | ], 231 | "source": [ 232 | "test_text = text_data[101]\n", 233 | "print(test_text)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "id": "a13434b4", 239 | "metadata": {}, 240 | "source": [ 241 | "## Case Normalization (lowercase)\n" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 55, 247 | "id": "29303df8", 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 255 | "lowercase text: self serve onions, relish, mayo? and free caramelized onions? 
yes!\n" 256 | ] 257 | } 258 | ], 259 | "source": [ 260 | "text_lower = test_text.lower()\n", 261 | "print(f\"original text : {test_text}\")\n", 262 | "print(f\"lowercase text: {text_lower}\")\n" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "id": "1d864693", 268 | "metadata": {}, 269 | "source": [ 270 | "## Removes punctuation and digits" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 56, 276 | "id": "1b2e7d89", 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 284 | "preprocessed : self serve onions relish mayo and free caramelized onions yes\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "text_lower = test_text.lower()\n", 290 | "text_no_punct = re.sub(r'[^a-zA-z\\s]', '', text_lower) # keep only letters and space\n", 291 | "text_no_punct = re.sub(r'[^a-zA-z\\s0-9]', '', text_lower) # Keep numbers \n", 292 | "\n", 293 | "\n", 294 | "print(f\"original text : {test_text}\")\n", 295 | "print(f\"preprocessed : {text_no_punct}\")" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 66, 301 | "id": "9f6006e9", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "original text : don't\n", 309 | "with \\' : don't\n", 310 | "with out \\' : dont\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "text_no_punct1 = re.sub(r'[^a-zA-z\\s\\']', '', \"don't\")\n", 316 | "text_no_punct2 = re.sub(r'[^a-zA-z\\s]', '', \"don't\")\n", 317 | "\n", 318 | "print(f\"original text : don't\")\n", 319 | "print(f\"with \\\\' : {text_no_punct1}\")\n", 320 | "print(f\"with out \\\\' : {text_no_punct2}\")" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "id": "36ec81bf", 326 | "metadata": {}, 327 | "source": [ 328 | "## Tokens" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 57, 334 | "id": "a727391a", 335 | "metadata": {}, 336 | "outputs": [ 337 | { 338 | "name": "stdout", 339 | "output_type": "stream", 340 | "text": [ 341 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 342 | "preprocessed : ['self', 'serve', 'onions', 'relish', 'mayo', 'and', 'free', 'caramelized', 'onions', 'yes']\n", 343 | "preprocessed_1: ['self', 'serve', 'onions', 'relish', 'mayo', 'and', 'free', 'caramelized', 'onions', 'yes']\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "text_lower = test_text.lower()\n", 349 | "text_no_punct = re.sub(r'[^a-zA-z\\s]', '', text_lower) \n", 350 | "tokens = re.split(r\"\\s+\", text_no_punct) \n", 351 | "\n", 352 | "# or \n", 353 | "tokens_v1 = word_tokenize(text_no_punct)\n", 354 | "\n", 355 | "print(f\"original text : {test_text}\")\n", 356 | "print(f\"preprocessed : {tokens}\")\n", 357 | "print(f\"preprocessed_1: {tokens_v1}\")" 358 | ] 359 | }, 360 | { 361 | "cell_type": "markdown", 362 | "id": "0f69d1da", 363 | "metadata": {}, 364 | "source": [ 365 | "## Removes stopwords\n" 366 | ] 367 | }, 368 | { 369 | "cell_type": "code", 370 | "execution_count": null, 371 | "id": "98b95e15", 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? 
Yes!\n", 379 | "preprocessed : ['self', 'serve', 'onions', 'relish', 'mayo', 'free', 'caramelized', 'onions', 'yes']\n" 380 | ] 381 | } 382 | ], 383 | "source": [ 384 | "text_lower = test_text.lower()\n", 385 | "text_no_punct = re.sub(r'[^a-zA-z\\s]', '', text_lower)\n", 386 | "tokens = re.split(r\"\\s+\", text_no_punct) \n", 387 | "\n", 388 | "stop_words = set(stopwords.words('english'))\n", 389 | "tokens = [token for token in tokens if token not in stop_words]\n", 390 | "\n", 391 | "\n", 392 | "\n", 393 | "print(f\"original text : {test_text}\")\n", 394 | "print(f\"preprocessed : {tokens}\")" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "id": "d221f8d2", 400 | "metadata": {}, 401 | "source": [ 402 | "## Stemming and lemmatization" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "id": "822ea890", 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "name": "stdout", 413 | "output_type": "stream", 414 | "text": [ 415 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? Yes!\n", 416 | "preprocessed : ['self', 'serv', 'onion', 'relish', 'mayo', 'free', 'caramel', 'onion', 'ye']\n" 417 | ] 418 | } 419 | ], 420 | "source": [ 421 | "# Initialize stemmer \n", 422 | "stemmer = PorterStemmer()\n", 423 | "stem_tokens = [stemmer.stem(token) for token in tokens]\n", 424 | "\n", 425 | "print(f\"original text : {test_text}\")\n", 426 | "print(f\"preprocessed : {stem_tokens}\")\n", 427 | "\n", 428 | "#server =>> serv\n", 429 | "# yes =>> ye" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "id": "a0eb5608", 436 | "metadata": {}, 437 | "outputs": [ 438 | { 439 | "name": "stdout", 440 | "output_type": "stream", 441 | "text": [ 442 | "original text : Self serve onions, relish, mayo? And FREE caramelized onions? 
Yes!\n", 443 | "preprocessed : ['self', 'serve', 'onion', 'relish', 'mayo', 'free', 'caramelized', 'onion', 'yes']\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "# Initialize lemmatizer\n", 449 | "lemmatizer = WordNetLemmatizer()\n", 450 | "lemma_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n", 451 | "\n", 452 | "print(f\"original text : {test_text}\")\n", 453 | "print(f\"preprocessed : {lemma_tokens}\")\n" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 50, 459 | "id": "7d25b322", 460 | "metadata": {}, 461 | "outputs": [ 462 | { 463 | "name": "stdout", 464 | "output_type": "stream", 465 | "text": [ 466 | "Word: running\n", 467 | " Stemmed: run\n", 468 | " Lemmatized: running\n", 469 | "\n", 470 | "Word: better\n", 471 | " Stemmed: better\n", 472 | " Lemmatized: better\n", 473 | "\n", 474 | "Word: flies\n", 475 | " Stemmed: fli\n", 476 | " Lemmatized: fly\n", 477 | "\n", 478 | "Word: cities\n", 479 | " Stemmed: citi\n", 480 | " Lemmatized: city\n", 481 | "\n", 482 | "Word: served\n", 483 | " Stemmed: serv\n", 484 | " Lemmatized: served\n", 485 | "\n", 486 | "Word: children\n", 487 | " Stemmed: children\n", 488 | " Lemmatized: child\n", 489 | "\n" 490 | ] 491 | } 492 | ], 493 | "source": [ 494 | "words = [\"running\", \"better\", \"flies\", \"cities\", \"served\", \"children\"]\n", 495 | "\n", 496 | "for word in words:\n", 497 | " print(f\"Word: {word}\")\n", 498 | " print(f\" Stemmed: {stemmer.stem(word)}\")\n", 499 | " print(f\" Lemmatized: {lemmatizer.lemmatize(word)}\")\n", 500 | " print()" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "id": "bdf159a4", 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "id": "a274ddb0", 514 | "metadata": {}, 515 | "source": [ 516 | "## ALL" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "id": "95ed33f8", 523 | "metadata": {}, 524 | "outputs": [], 525 | "source": [ 526 | "import pandas as pd\n", 527 | "import re\n", 528 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 529 | "import nltk\n", 530 | "import emoji\n", 531 | "from nltk.corpus import stopwords\n", 532 | "from nltk.tokenize import word_tokenize\n", 533 | "nltk.download('stopwords')\n", 534 | "nltk.download('wordnet')\n", 535 | "\n", 536 | "from typing import List" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": null, 542 | "id": "685cadf7", 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], [\"'s\", 'open', 'even', 'think', \"n't\"], ['decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'lunch']]\n" 550 | ] 551 | } 552 | ], 553 | "source": [ 554 | "def preprocessing(text: str) -> list[str]:\n", 555 | " \"\"\"\n", 556 | " Preprocesses a given text:\n", 557 | " - Lowercases text\n", 558 | " - Removes punctuation and digits\n", 559 | " - Removes stopwords\n", 560 | " - Tokenizes into words\n", 561 | " - Applies lemmatization or stemming\n", 562 | "\n", 563 | " Args:\n", 564 | " document (str): The raw input text\n", 565 | "\n", 566 | " Returns:\n", 567 | " List of str: Cleaned and preprocessed text\n", 568 | "\n", 569 | " Example:\n", 570 | " >>> preprocess(\"I love Python! 
😊 It's awesome 👍\")\n", 571 | " ['love', 'python', 'smiling_face', 'awesome', 'thumbs_up']\n", 572 | " \"\"\"\n", 573 | "\n", 574 | " stop_words = set(stopwords.words('english'))\n", 575 | " lemmatizer = WordNetLemmatizer()\n", 576 | " stemmer = PorterStemmer()\n", 577 | "\n", 578 | " # Convert Text to Lowercase (Normalization)\n", 579 | " text_lower = text.lower()\n", 580 | "\n", 581 | " # Removing Punctuation\n", 582 | " text_no_punct = re.sub(r'[^a-zA-Z\\s\\']', '', text_lower) # \\' for keep apostrophes (e.g. don't, it's)\n", 583 | "\n", 584 | "\n", 585 | " # 3. Tokens\n", 586 | " tokens = re.split(r\"\\s+\", text_no_punct) \n", 587 | " tokens = [t for t in tokens if t]\n", 588 | " # or use nltk tokenizer\n", 589 | " tokens = word_tokenize(text_no_punct)\n", 590 | "\n", 591 | " # 4. Stop word removal\n", 592 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 593 | "\n", 594 | " # 5. Lemmatization \n", 595 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 596 | " # or stemmer\n", 597 | " stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]\n", 598 | "\n", 599 | " return lemma_tokens\n", 600 | "\n", 601 | "text_data = list(df[\"text\"][:100]) # First 1000 Row Only\n", 602 | "preprocessed_text = [preprocessing(text) for text in text_data]\n", 603 | "print(preprocessed_text[:5])" 604 | ] 605 | }, 606 | { 607 | "cell_type": "code", 608 | "execution_count": null, 609 | "id": "1016ea24", 610 | "metadata": {}, 611 | "outputs": [ 612 | { 613 | "data": { 614 | "text/plain": [ 615 | "'I love pizza and grinning face with smiling eyes !'" 616 | ] 617 | }, 618 | "execution_count": 86, 619 | "metadata": {}, 620 | "output_type": "execute_result" 621 | } 622 | ], 623 | "source": [ 624 | "print(preprocessing(\"I love Python! 
😊 It's awesome 👍\"))\n", 625 | "text = emoji.demojize(\"I love 🍕 and 😄!\", delimiters=(\" \", \" \")) \n", 626 | "text = re.sub(r'_', ' ', text)\n", 627 | "text" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "id": "e8531a7c", 634 | "metadata": {}, 635 | "outputs": [], 636 | "source": [ 637 | "# Remove URLs, emails, and Twitter mentions\n", 638 | "text = re.sub(r'(https?://\\S+|www\\.\\S+)', ' ', text) # URLs\n", 639 | "text = re.sub(r'\\S+@\\S+', ' ', text) # Email addresses\n", 640 | "text = re.sub(r'@\\w+', ' ', text) # Mentions" 641 | ] 642 | } 643 | ], 644 | "metadata": { 645 | "kernelspec": { 646 | "display_name": "myenv", 647 | "language": "python", 648 | "name": "python3" 649 | }, 650 | "language_info": { 651 | "codemirror_mode": { 652 | "name": "ipython", 653 | "version": 3 654 | }, 655 | "file_extension": ".py", 656 | "mimetype": "text/x-python", 657 | "name": "python", 658 | "nbconvert_exporter": "python", 659 | "pygments_lexer": "ipython3", 660 | "version": "3.12.6" 661 | } 662 | }, 663 | "nbformat": 4, 664 | "nbformat_minor": 5 665 | } 666 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.2-BOW/2.2-BOW.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "aaef07ca", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "[nltk_data] Downloading package stopwords to\n", 14 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 15 | "[nltk_data] Package stopwords is already up-to-date!\n", 16 | "[nltk_data] Downloading package wordnet to\n", 17 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 18 | "[nltk_data] Package wordnet is already up-to-date!\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "import pandas as pd\n", 24 | "import numpy as np\n", 25 | "import re\n", 26 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 27 | "import nltk\n", 28 | "from nltk.corpus import stopwords\n", 29 | "from nltk.tokenize import word_tokenize\n", 30 | "import contractions\n", 31 | "nltk.download('stopwords')\n", 32 | "nltk.download('wordnet')\n", 33 | "\n", 34 | "from typing import List" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "id": "3b88e66d", 40 | "metadata": {}, 41 | "source": [ 42 | "## Data" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 2, 48 | "id": "d27f3fbc", 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "data": { 53 | "text/html": [ 54 | "
\n", 55 | "\n", 68 | "\n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | "
reviewsentiment
0One of the other reviewers has mentioned that ...positive
1A wonderful little production. <br /><br />The...positive
2I thought this was a wonderful way to spend ti...positive
3Basically there's a family where a little boy ...negative
4Petter Mattei's \"Love in the Time of Money\" is...positive
\n", 104 | "
" 105 | ], 106 | "text/plain": [ 107 | " review sentiment\n", 108 | "0 One of the other reviewers has mentioned that ... positive\n", 109 | "1 A wonderful little production.

The... positive\n", 110 | "2 I thought this was a wonderful way to spend ti... positive\n", 111 | "3 Basically there's a family where a little boy ... negative\n", 112 | "4 Petter Mattei's \"Love in the Time of Money\" is... positive" 113 | ] 114 | }, 115 | "execution_count": 2, 116 | "metadata": {}, 117 | "output_type": "execute_result" 118 | } 119 | ], 120 | "source": [ 121 | "file_path = r\"E:\\DATA SCIENCE\\NLP-Tea\\Data\\IMDB Dataset Movie Reviews\\IMDB Dataset.csv\"\n", 122 | "df = pd.read_csv(file_path)\n", 123 | "\n", 124 | "df.head() # (50000, 2)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 3, 130 | "id": "e7e24d28", 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "[\"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.

The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.

It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.

I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare. Forget pretty pictures painted for mainstream audiences, forget charm, forget romance...OZ doesn't mess around. The first episode I ever saw struck me as so nasty it was surreal, I couldn't say I was ready for it, but as I watched more, I developed a taste for Oz, and got accustomed to the high levels of graphic violence. Not just violence, but injustice (crooked guards who'll be sold out for a nickel, inmates who'll kill on order and get away with it, well mannered, middle class inmates being turned into prison bitches due to their lack of street skills or prison experience) Watching Oz, you may become comfortable with what is uncomfortable viewing....thats if you can get in touch with your darker side.\",\n", 137 | " 'A wonderful little production.

The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece.

The actors are extremely well chosen- Michael Sheen not only \"has got all the polari\" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\\'s of comedy and his life.

The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \\'dream\\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\\'s murals decorating every surface) are terribly well done.',\n", 138 | " 'I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.

This was the most I\\'d laughed at one of Woody\\'s comedies in years (dare I say a decade?). While I\\'ve never been impressed with Scarlet Johanson, in this she managed to tone down her \"sexy\" image and jumped right into a average, but spirited young woman.

This may not be the crown jewel of his career, but it was wittier than \"Devil Wears Prada\" and more interesting than \"Superman\" a great comedy to go see with friends.',\n", 139 | " \"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.

This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.

OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.

3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.\",\n", 140 | " 'Petter Mattei\\'s \"Love in the Time of Money\" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter.

This being a variation on the Arthur Schnitzler\\'s play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.

The only thing one gets out of all these souls in the picture is the different stages of loneliness each one inhabits. A big city is not exactly the best place in which human relations find sincere fulfillment, as one discerns is the case with most of the people we encounter.

The acting is good under Mr. Mattei\\'s direction. Steve Buscemi, Rosario Dawson, Carol Kane, Michael Imperioli, Adrian Grenier, and the rest of the talented cast, make these characters come alive.

We wish Mr. Mattei good luck and await anxiously for his next work.']" 141 | ] 142 | }, 143 | "execution_count": 3, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "text_data = list(df[\"review\"][:1000]) # First 1000 Row Only \n", 150 | "text_data[:5]" 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 4, 156 | "id": "6677f1b3", 157 | "metadata": {}, 158 | "outputs": [ 159 | { 160 | "name": "stdout", 161 | "output_type": "stream", 162 | "text": [ 163 | "'a wonderful little production. the filming technique is very unassuming-\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "html_text = \"'A wonderful little production.

The filming technique is very unassuming-\"\n", 169 | "html_text = html_text.lower()\n", 170 | "clean_text = re.sub(r'<[^>]+>', '', html_text)\n", 171 | "print(clean_text)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "6f628691", 177 | "metadata": {}, 178 | "source": [ 179 | "## Preprocessing" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 42, 185 | "id": "a662d33a", 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "[['one', 'reviewer', 'mentioned', 'watching', 'oz', 'episode', 'hooked', 'right', 'exactly', 'happened', 'methe', 'first', 'thing', 'struck', 'oz', 'brutality', 'unflinching', 'scene', 'violence', 'set', 'right', 'word', 'go', 'trust', 'show', 'faint', 'hearted', 'timid', 'show', 'pull', 'punch', 'regard', 'drug', 'sex', 'violence', 'hardcore', 'classic', 'use', 'wordit', 'called', 'oz', 'nickname', 'given', 'oswald', 'maximum', 'security', 'state', 'penitentary', 'focus', 'mainly', 'emerald', 'city', 'experimental', 'section', 'prison', 'cell', 'glass', 'front', 'face', 'inwards', 'privacy', 'high', 'agenda', 'city', 'home', 'manyaryans', 'muslim', 'gangsta', 'latino', 'christian', 'italian', 'irish', 'moreso', 'scuffle', 'death', 'stare', 'dodgy', 'dealing', 'shady', 'agreement', 'never', 'far', 'awayi', 'would', 'say', 'main', 'appeal', 'show', 'due', 'fact', 'go', 'show', 'would', 'dare', 'forget', 'pretty', 'picture', 'painted', 'mainstream', 'audience', 'forget', 'charm', 'forget', 'romanceoz', 'mess', 'around', 'first', 'episode', 'ever', 'saw', 'struck', 'nasty', 'surreal', 'could', 'say', 'ready', 'watched', 'developed', 'taste', 'oz', 'got', 'accustomed', 'high', 'level', 'graphic', 'violence', 'violence', 'injustice', 'crooked', 'guard', 'sold', 'nickel', 'inmate', 'kill', 'order', 'get', 'away', 'well', 'mannered', 'middle', 'class', 'inmate', 'turned', 'prison', 'bitch', 'due', 'lack', 'street', 'skill', 'prison', 'experience', 'watching', 'oz', 'may', 'become', 'comfortable', 'uncomfortable', 'viewingthat', 'get', 'touch', 'darker', 'side'], ['wonderful', 'little', 'production', 'filming', 'technique', 'unassuming', 'oldtimebbc', 'fashion', 'give', 'comforting', 'sometimes', 'discomforting', 'sense', 'realism', 'entire', 'piece', 'actor', 'extremely', 'well', 'chosen', 'michael', 'sheen', 'got', 'polari', 'voice', 'pat', 'truly', 'see', 'seamless', 'editing', 'guided', 'reference', 'williams', 'diary', 'entry', 'well', 'worth', 'watching', 'terrificly', 'written', 'performed', 'piece', 'masterful', 'production', 'one', 'great', 'master', 'comedy', 'life', 'realism', 'really', 'come', 'home', 'little', 'thing', 'fantasy', 'guard', 'rather', 'use', 'traditional', 'dream', 'technique', 'remains', 'solid', 'disappears', 'play', 'knowledge', 'sens', 'particularly', 'scene', 'concerning', 'orton', 'halliwell', 'set', 'particularly', 'flat', 'halliwells', 'mural', 'decorating', 'every', 'surface', 'terribly', 'well', 'done'], ['thought', 'wonderful', 'way', 'spend', 'time', 'hot', 'summer', 'weekend', 'sitting', 'air', 'conditioned', 'theater', 'watching', 'lighthearted', 'comedy', 'plot', 'simplistic', 'dialogue', 'witty', 'character', 'likable', 'even', 'well', 'bread', 'suspected', 'serial', 'killer', 'may', 'disappointed', 'realize', 'match', 'point', 'risk', 'addiction', 'thought', 'proof', 'woody', 'allen', 'still', 'fully', 'control', 'style', 'many', 'u', 'grown', 'lovethis', 'would', 'laughed', 'one', 'woodys', 'comedy', 'year', 
'dare', 'say', 'decade', 'never', 'impressed', 'scarlet', 'johanson', 'managed', 'tone', 'sexy', 'image', 'jumped', 'right', 'average', 'spirited', 'young', 'womanthis', 'may', 'crown', 'jewel', 'career', 'wittier', 'devil', 'wear', 'prada', 'interesting', 'superman', 'great', 'comedy', 'go', 'see', 'friend']]\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "def preprocessing(text: str) -> list[str]:\n", 198 | " \"\"\"\n", 199 | " Preprocesses a given text:\n", 200 | " - Lowercases text\n", 201 | " - Contraction Handling\n", 202 | " - Removes punctuation and digits\n", 203 | " - Removes stopwords\n", 204 | " - Tokenizes into words\n", 205 | " - Applies lemmatization or stemming\n", 206 | "\n", 207 | " Args:\n", 208 | " document (str): The raw input text\n", 209 | "\n", 210 | " Returns:\n", 211 | " List of str: Cleaned and preprocessed text\n", 212 | " \"\"\"\n", 213 | "\n", 214 | " stop_words = set(stopwords.words('english'))\n", 215 | " lemmatizer = WordNetLemmatizer()\n", 216 | " stemmer = PorterStemmer()\n", 217 | "\n", 218 | " # Convert Text to Lowercase (Normalization)\n", 219 | " text_lower = text.lower()\n", 220 | " text_no_tags = re.sub(r'<[^>]+>', '', text_lower)\n", 221 | "\n", 222 | " # Contraction Handling\n", 223 | " text_no_tags = contractions.fix(text_no_tags)\n", 224 | "\n", 225 | " # Removing Punctuation\n", 226 | " text_no_punct = re.sub(r'[^a-zA-Z\\s]', '', text_no_tags) # \\' for keep apostrophes (e.g. don't, it's)\n", 227 | "\n", 228 | "\n", 229 | " # 3. Tokens\n", 230 | " tokens = re.split(r\"\\s+\", text_no_punct) \n", 231 | " tokens = [t for t in tokens if t]\n", 232 | " # or use nltk tokenizer\n", 233 | " tokens = word_tokenize(text_no_punct)\n", 234 | "\n", 235 | " # 4. Stop word removal\n", 236 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 237 | "\n", 238 | " # 5. 
Lemmatization \n", 239 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 240 | " # or stemmer\n", 241 | " stemm_tokens = [stemmer.stem(token) for token in filtered_tokens ]\n", 242 | "\n", 243 | " return lemma_tokens\n", 244 | "\n", 245 | "text_data = list(df[\"review\"][:100]) # First 100 Row Only\n", 246 | "preprocessed_text = [preprocessing(text) for text in text_data]\n", 247 | "print(preprocessed_text[:3])" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "d9be009a", 253 | "metadata": {}, 254 | "source": [ 255 | "## From Scratch" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 86, 261 | "id": "2f324ffc", 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "all_tokens =[]\n", 266 | "for lst_tokens in preprocessed_text:\n", 267 | " all_tokens.extend(lst_tokens)\n", 268 | "vocab = sorted(set(all_tokens)) # Unique Words\n", 269 | "\n", 270 | "\n", 271 | "def TermFrequency(term: str, doc: list[str]) :\n", 272 | " tf = 0\n", 273 | " if term not in doc:\n", 274 | " return tf\n", 275 | " for t in doc:\n", 276 | " if t == term:\n", 277 | " tf+=1\n", 278 | " return tf \n", 279 | " \n", 280 | "\n", 281 | "\n", 282 | "def BagOfWords(vocab: list, preprocessed_text: list[list]) -> np.ndarray :\n", 283 | "\n", 284 | " n_docs = len(preprocessed_text)\n", 285 | " n_vocab = len(vocab)\n", 286 | " \n", 287 | " bow_matrix = np.zeros(shape=(n_docs, n_vocab)) # (# documents, # vocabulary words)\n", 288 | "\n", 289 | " for doc_idx, doc in enumerate(preprocessed_text): # For Each Document \n", 290 | "\n", 291 | " bow_vec = np.zeros(shape=n_vocab)\n", 292 | " # For each word in the vocabulary, calculate its term frequency in this document\n", 293 | " for term_idx,term in enumerate(vocab): \n", 294 | " tf = TermFrequency(term=term, doc=doc)\n", 295 | " bow_vec[term_idx] = tf\n", 296 | "\n", 297 | " bow_matrix[doc_idx] = bow_vec\n", 298 | " return bow_matrix\n", 299 | "bag_of_word_matrix = BagOfWords(vocab, preprocessed_text)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": 77, 305 | "id": "7c69b3af", 306 | "metadata": {}, 307 | "outputs": [ 308 | { 309 | "name": "stdout", 310 | "output_type": "stream", 311 | "text": [ 312 | "[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 313 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.\n", 314 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.\n", 315 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 316 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 317 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 318 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 319 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 320 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 321 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 322 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 323 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.\n", 324 | " 0. 0. 0. 0. 0. 0. 0. 0. 0. 2. 0. 
0.]\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "print(bag_of_word_matrix[3][400:700])" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "id": "ef5600dc", 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "id": "f0d4bf46", 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "\n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "id": "6c005e57", 353 | "metadata": {}, 354 | "source": [ 355 | "## Built in" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "33e86300", 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "Vocabulary: ['abbot' 'abbreviated' 'abetted' ... 'zoo' 'zoom' 'zwick']\n", 369 | "Bag of Words Matrix:\n", 370 | " [[0 0 0 ... 0 0 0]\n", 371 | " [0 0 0 ... 0 0 0]\n", 372 | " [0 0 0 ... 0 0 0]\n", 373 | " ...\n", 374 | " [0 0 0 ... 0 0 0]\n", 375 | " [0 0 0 ... 0 0 0]\n", 376 | " [0 0 0 ... 0 0 0]]\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "from sklearn.feature_extraction.text import CountVectorizer\n", 382 | "\n", 383 | "corpus = []\n", 384 | "for p in preprocessed_text:\n", 385 | " corpus.append(\" \".join(p))\n", 386 | "\n", 387 | "vectorizer = CountVectorizer()\n", 388 | "bow_matrix = vectorizer.fit_transform(corpus)\n", 389 | "bow_dense = bow_matrix.toarray()\n", 390 | "\n", 391 | "vocab = vectorizer.get_feature_names_out()\n", 392 | "print(\"Vocabulary:\", vocab)\n", 393 | "print(\"Bag of Words Matrix:\\n\", bow_dense)\n" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 89, 399 | "id": "57eaacd7", 400 | "metadata": {}, 401 | "outputs": [ 402 | { 403 | "data": { 404 | "text/plain": [ 405 | "(100, 4438)" 406 | ] 407 | }, 408 | "execution_count": 89, 409 | "metadata": {}, 410 | "output_type": "execute_result" 411 | } 412 | ], 413 | "source": [ 414 | "bow_dense.shape" 415 | ] 416 | }, 417 | { 418 | "cell_type": "code", 419 | "execution_count": null, 420 | "id": "2f6ca6d9", 421 | "metadata": {}, 422 | "outputs": [], 423 | "source": [] 424 | } 425 | ], 426 | "metadata": { 427 | "kernelspec": { 428 | "display_name": "myenv", 429 | "language": "python", 430 | "name": "python3" 431 | }, 432 | "language_info": { 433 | "codemirror_mode": { 434 | "name": "ipython", 435 | "version": 3 436 | }, 437 | "file_extension": ".py", 438 | "mimetype": "text/x-python", 439 | "name": "python", 440 | "nbconvert_exporter": "python", 441 | "pygments_lexer": "ipython3", 442 | "version": "3.12.6" 443 | } 444 | }, 445 | "nbformat": 4, 446 | "nbformat_minor": 5 447 | } 448 | -------------------------------------------------------------------------------- /02-Word Embeddings/2.1-Label Encoder and One Hot Encoder/2.1-label_and_oneHot_Encoder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "a4382364", 6 | "metadata": {}, 7 | "source": [ 8 | "# Libraries" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": null, 14 | "id": "167272d1", 15 | "metadata": {}, 16 | "outputs": [ 17 | { 18 | "name": "stderr", 19 | "output_type": "stream", 20 | "text": [ 21 | "[nltk_data] Downloading package stopwords to\n", 22 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 23 | "[nltk_data] Package stopwords is already up-to-date!\n", 24 | "[nltk_data] Downloading 
package wordnet to\n", 25 | "[nltk_data] C:\\Users\\htc\\AppData\\Roaming\\nltk_data...\n", 26 | "[nltk_data] Package wordnet is already up-to-date!\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import re\n", 33 | "from nltk.stem import WordNetLemmatizer, PorterStemmer\n", 34 | "import nltk\n", 35 | "from nltk.corpus import stopwords\n", 36 | "from nltk.tokenize import word_tokenize\n", 37 | "import contractions\n", 38 | "nltk.download('stopwords')\n", 39 | "nltk.download('wordnet')\n", 40 | "\n", 41 | "from typing import List" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "id": "2d651715", 47 | "metadata": {}, 48 | "source": [ 49 | "# Data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "id": "cc53d23d", 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "data": { 60 | "text/html": [ 61 | "
\n", 62 | "\n", 75 | "\n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | "
user_idbusiness_idtextdatecompliment_count
0AGNUgVwnZUey3gcPCJ76iw3uLgwr0qeCNMjKenHJwPGQAvengers time with the ladies.2012-05-18 02:17:210
1NBN4MgHP9D3cw--SnauTkAQoezRbYQncpRqyrLH6IqjgThey have lots of good deserts and tasty cuban...2013-02-05 18:35:100
2-copOvldyKh1qr-vzkDEvwMYoRNLb5chwjQe3c_k37GgIt's open even when you think it isn't2013-08-18 00:56:080
3FjMQVZjSqY8syIO-53KFKwhV-bABTK-glh5wj31ps_JwVery decent fried chicken2017-06-27 23:05:380
4ld0AperBXk1h6UbqmM80zw_uN0OudeJ3Zl_tf6nxg5wwAppetizers.. platter special for lunch2012-10-06 19:43:090
\n", 129 | "
" 130 | ], 131 | "text/plain": [ 132 | " user_id business_id \\\n", 133 | "0 AGNUgVwnZUey3gcPCJ76iw 3uLgwr0qeCNMjKenHJwPGQ \n", 134 | "1 NBN4MgHP9D3cw--SnauTkA QoezRbYQncpRqyrLH6Iqjg \n", 135 | "2 -copOvldyKh1qr-vzkDEvw MYoRNLb5chwjQe3c_k37Gg \n", 136 | "3 FjMQVZjSqY8syIO-53KFKw hV-bABTK-glh5wj31ps_Jw \n", 137 | "4 ld0AperBXk1h6UbqmM80zw _uN0OudeJ3Zl_tf6nxg5ww \n", 138 | "\n", 139 | " text date \\\n", 140 | "0 Avengers time with the ladies. 2012-05-18 02:17:21 \n", 141 | "1 They have lots of good deserts and tasty cuban... 2013-02-05 18:35:10 \n", 142 | "2 It's open even when you think it isn't 2013-08-18 00:56:08 \n", 143 | "3 Very decent fried chicken 2017-06-27 23:05:38 \n", 144 | "4 Appetizers.. platter special for lunch 2012-10-06 19:43:09 \n", 145 | "\n", 146 | " compliment_count \n", 147 | "0 0 \n", 148 | "1 0 \n", 149 | "2 0 \n", 150 | "3 0 \n", 151 | "4 0 " 152 | ] 153 | }, 154 | "execution_count": 2, 155 | "metadata": {}, 156 | "output_type": "execute_result" 157 | } 158 | ], 159 | "source": [ 160 | "json_file_path = r\"E:\\DATA SCIENCE\\NLP-Tea\\Data\\yelp_academic_dataset_tip.json\\yelp_academic_dataset_tip.json\"\n", 161 | "df = pd.read_json(json_file_path, lines=True)\n", 162 | "\n", 163 | "df.head()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 3, 169 | "id": "7eeec26e", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "data": { 174 | "text/plain": [ 175 | "['Avengers time with the ladies.',\n", 176 | " 'They have lots of good deserts and tasty cuban sandwiches',\n", 177 | " \"It's open even when you think it isn't\",\n", 178 | " 'Very decent fried chicken',\n", 179 | " 'Appetizers.. platter special for lunch']" 180 | ] 181 | }, 182 | "execution_count": 3, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "text_data = list(df[\"text\"][:1000]) # First 1000 Row Only \n", 189 | "text_data[:5]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "806e1a12", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "['mohamad', 'fawzy', 'jfhbf', 'dvhbfehyv']\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "word_tokenize" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 180, 213 | "id": "5dc3e634", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "[['avenger', 'time', 'lady'], ['lot', 'good', 'desert', 'tasty', 'cuban', 'sandwich'], ['open', 'even', 'think'], ['decent', 'fried', 'chicken'], ['appetizer', 'platter', 'special', 'lunch']]\n" 221 | ] 222 | } 223 | ], 224 | "source": [ 225 | "def preprocessing(text: str) -> list[list[str]]:\n", 226 | " \"\"\"\n", 227 | " Preprocesses a given text:\n", 228 | " - Lowercases text\n", 229 | " - Contraction Handling\n", 230 | " - Removes punctuation and digits\n", 231 | " - Removes stopwords\n", 232 | " - Tokenizes into words\n", 233 | " - Applies lemmatization or stemming\n", 234 | "\n", 235 | " Args:\n", 236 | " document (str): The raw input text\n", 237 | "\n", 238 | " Returns:\n", 239 | " List of str: Cleaned and preprocessed text\n", 240 | "\n", 241 | " Example:\n", 242 | " >>> preprocessing(\"It's open even when you think it isn't\")\n", 243 | " [\"'s\", 'open', 'even', 'think', \"n't\"]\n", 244 | " \"\"\"\n", 245 | "\n", 246 | " stop_words = set(stopwords.words('english'))\n", 247 | " lemmatizer = WordNetLemmatizer()\n", 248 | 
"\n", 249 | " # Convert Text to Lowercase (Normalization)\n", 250 | " text_lower = text.lower()\n", 251 | "\n", 252 | " # Contraction Handling\n", 253 | " text_lower = contractions.fix(text_lower)\n", 254 | "\n", 255 | " # Removing Punctuation\n", 256 | " text_no_punct = re.sub(r'[^a-zA-Z\\s\\']', '', text_lower) # \\' for keep apostrophes (e.g. don't, it's)\n", 257 | "\n", 258 | " # 3. Tokens\n", 259 | " # tokens = word_tokenize(text_no_punct)\n", 260 | " tokens = re.split(r\"\\s+\", text_no_punct)\n", 261 | " \n", 262 | "\n", 263 | " # 4. Stop word removal\n", 264 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 265 | "\n", 266 | " # 5. Lemmatization \n", 267 | " lemma_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens ]\n", 268 | " \n", 269 | "\n", 270 | " return lemma_tokens\n", 271 | "\n", 272 | "text_data = list(df[\"text\"][:10]) # First 1000 Row Only\n", 273 | "preprocessed_text = [preprocessing(text) for text in text_data]\n", 274 | "print(preprocessed_text[:5])" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 181, 280 | "id": "33d8ab7d", 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "['appetizer', 'area', 'avenger', 'best', 'boring', 'center', 'cheeseburger', 'chicken', 'chili', 'city', 'cocacolaso', 'cool', 'cuban', 'cup', 'dec', 'decent', 'decorated', 'desert', 'downtown', 'eat', 'elf', 'even', 'far', 'fried', 'game', 'good', 'great', 'kid', 'lady', 'leave', 'lindenwold', 'lot', 'lunch', 'make', 'never', 'onion', 'open', \"patco's\", 'pickle', 'place', 'platter', 'pm', 'probably', 'relish', 'ride', 'sandwich', 'santa', 'saturday', 'silver', 'single', 'sleigh', 'special', 'spring', 'starbucks', 'stop', 'substitute', 'taco', 'tampa', 'tasty', 'th', 'think', 'time', 'train', 'ugh', 'vanilla', 'w', 'watch']\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "all_tokens =[]\n", 293 | "for lst_tokens in preprocessed_text:\n", 294 | " all_tokens.extend(lst_tokens)\n", 295 | "\n", 296 | "vocab = sorted(set(all_tokens)) # Unique Words\n", 297 | "print(vocab)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "id": "93c7f85b", 303 | "metadata": {}, 304 | "source": [ 305 | "# Label Encoder" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "id": "7d94450f", 311 | "metadata": {}, 312 | "source": [ 313 | "## From Scratch" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 200, 319 | "id": "7edba0a2", 320 | "metadata": {}, 321 | "outputs": [ 322 | { 323 | "name": "stdout", 324 | "output_type": "stream", 325 | "text": [ 326 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 
'watch': 66}\n", 327 | "[[2, 61, 28], [31, 25, 17, 58, 12, 45], [36, 21, 60]]\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "def LabelEncoder(vocab: list) -> dict:\n", 333 | " \"\"\"\n", 334 | " Creates a label encoder that maps each unique word to a unique integer index.\n", 335 | "\n", 336 | " Args:\n", 337 | " vocab (list): A sorted list of unique words (vocabulary).\n", 338 | "\n", 339 | " Returns:\n", 340 | " dict: A dictionary mapping words to their corresponding index.\n", 341 | " \"\"\"\n", 342 | " word_to_index = {token: idx for idx, token in enumerate(vocab)}\n", 343 | " return word_to_index\n", 344 | "\n", 345 | "\n", 346 | "def Transform (preprocessed_text: list[list[str]], word_to_idx: dict) -> list[list[int]] :\n", 347 | " \"\"\"\n", 348 | " Transforms a list of tokenized text into lists of integer-encoded words.\n", 349 | "\n", 350 | " Args:\n", 351 | " preprocessed_text (list[list[str]]): A list of lists, where each sublist contains tokens from one sentence.\n", 352 | " word_to_idx (dict): A dictionary mapping words to unique integer indices.\n", 353 | "\n", 354 | " Returns:\n", 355 | " list: A list of lists, where each sublist contains the integer-encoded words for a sentence.\n", 356 | " \"\"\"\n", 357 | " data=[]\n", 358 | " for sentence in preprocessed_text :\n", 359 | " encoded_sentence = []\n", 360 | " for word in sentence:\n", 361 | " encoded_sentence.append(word_to_idx[word])\n", 362 | " data.append(encoded_sentence)\n", 363 | " return data\n", 364 | "\n", 365 | "\n", 366 | "word_to_idx = LabelEncoder(vocab= vocab)\n", 367 | "transformed_txt = Transform(preprocessed_text=preprocessed_text, word_to_idx=word_to_idx)\n", 368 | "print(word_to_idx)\n", 369 | "print(transformed_txt[:3])\n" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "1b2d2df5", 375 | "metadata": {}, 376 | "source": [ 377 | "## Built in" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 197, 383 | "id": "db04548e", 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "Encoded labels: [ 2 61 28 31 25 17 58 12 45 36 21 60 15 23 7 0 40 51 32 8 13 49 6 35\n", 391 | " 38 43 64 10 22 47 14 59 44 37 48 50 65 46 20 16 62 5 9 62 29 30 41 33\n", 392 | " 54 26 27 42 3 39 11 52 1 66 24 19 56 53 55 4 18 57 63 34]\n" 393 | ] 394 | } 395 | ], 396 | "source": [ 397 | "from sklearn.preprocessing import LabelEncoder\n", 398 | "\n", 399 | "all_tokens =[]\n", 400 | "for lst_tokens in preprocessed_text:\n", 401 | " all_tokens.extend(lst_tokens) # All Words\n", 402 | "\n", 403 | "\n", 404 | "label_encoder = LabelEncoder()\n", 405 | "encoded_labels = label_encoder.fit_transform(all_tokens)\n", 406 | "\n", 407 | "# Output encoded labels and the mapping\n", 408 | "print(\"Encoded labels:\", encoded_labels)" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 199, 414 | "id": "105c2030", 415 | "metadata": {}, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "[array([ 2, 61, 28]),\n", 421 | " array([31, 25, 17, 58, 12, 45]),\n", 422 | " array([36, 21, 60]),\n", 423 | " array([15, 23, 7]),\n", 424 | " array([ 0, 40, 51, 32]),\n", 425 | " array([ 8, 13, 49, 6, 35, 38, 43, 64, 10, 22]),\n", 426 | " array([47, 14, 59, 44, 37, 48, 50, 65, 46, 20, 16, 62, 5, 9, 62, 29, 30,\n", 427 | " 41, 33, 54, 26, 27]),\n", 428 | " array([42, 3, 39, 11, 52, 1, 66, 24, 19]),\n", 429 | " array([56]),\n", 430 | " array([53, 55, 4, 18, 57, 63, 34])]" 431 | ] 432 | }, 433 | 
"execution_count": 199, 434 | "metadata": {}, 435 | "output_type": "execute_result" 436 | } 437 | ], 438 | "source": [ 439 | "encoded_sentences = [label_encoder.transform(sentence) for sentence in preprocessed_text]\n", 440 | "encoded_sentences" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "id": "95f03b91", 447 | "metadata": {}, 448 | "outputs": [], 449 | "source": [] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "id": "efff3800", 454 | "metadata": {}, 455 | "source": [ 456 | "# One hot Encoding" 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": 282, 462 | "id": "f9f691ca", 463 | "metadata": {}, 464 | "outputs": [ 465 | { 466 | "name": "stdout", 467 | "output_type": "stream", 468 | "text": [ 469 | "[array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 470 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 471 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 472 | " 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 473 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 474 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,\n", 475 | " 0]), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 476 | " 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 477 | " 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", 478 | " 0])]\n" 479 | ] 480 | } 481 | ], 482 | "source": [ 483 | "def OneHotEncoder(vocab: list) -> dict :\n", 484 | " \"\"\"\n", 485 | " Creates one-hot encoded vectors for each unique word in the vocabulary.\n", 486 | "\n", 487 | " Args:\n", 488 | " vocab (list): A sorted list of unique tokens.\n", 489 | "\n", 490 | " Returns:\n", 491 | " dict: A mapping from word to its one-hot encoded numpy array.\n", 492 | " \"\"\"\n", 493 | " word_to_idx = {word: idx for idx, word in enumerate(vocab)}\n", 494 | " vocab_size = len(word_to_idx)\n", 495 | " one_hot_dict = {}\n", 496 | "\n", 497 | " for word, idx in word_to_idx.items() :\n", 498 | " # print(word, idx)\n", 499 | " vec = np.zeros(shape=vocab_size, dtype=int)\n", 500 | " vec[idx] = 1\n", 501 | " one_hot_dict[word] = vec\n", 502 | "\n", 503 | " return one_hot_dict\n", 504 | "\n", 505 | "\n", 506 | "def TransformOneHot(preprocessed_text: list[list[str]], word_to_vec: dict) -> list[list[np.ndarray]]:\n", 507 | " \"\"\"\n", 508 | " Transforms a list of tokenized sentences into one-hot encoded vectors.\n", 509 | "\n", 510 | " Args:\n", 511 | " preprocessed_text (list of list of str): Tokenized sentences.\n", 512 | " word_to_vec (dict): A mapping from word to one-hot vector.\n", 513 | "\n", 514 | " Returns:\n", 515 | " list of list of np.ndarray: One-hot encoded representation of sentences.\n", 516 | " \"\"\"\n", 517 | " data = []\n", 518 | " for sentence in preprocessed_text:\n", 519 | " encoded_sentence = []\n", 520 | " for word in sentence:\n", 521 | " encoded_sentence.append(word_to_vec[word])\n", 522 | " data.append(encoded_sentence)\n", 523 | " return data\n", 524 | "\n", 525 | "\n", 526 | "\n", 527 | "\n", 528 | "one_hot_dict = OneHotEncoder(vocab)\n", 529 | "transformed_txt = TransformOneHot(preprocessed_text=preprocessed_text, word_to_vec=one_hot_dict)\n", 530 | "# print(one_hot_dict)\n", 531 | "print(transformed_txt[0])" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "id": "aa185195", 537 | "metadata": {}, 538 | "source": [ 539 | "## 
Built in" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": 398, 545 | "id": "44c34f7a", 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "from sklearn.preprocessing import LabelEncoder\n", 550 | "from sklearn.preprocessing import OneHotEncoder\n", 551 | "\n", 552 | "from numpy import array" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": 418, 558 | "id": "9d58a0ac", 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "name": "stdout", 563 | "output_type": "stream", 564 | "text": [ 565 | "68\n", 566 | "67\n", 567 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "all_tokens =[]\n", 573 | "for lst_tokens in preprocessed_text:\n", 574 | " all_tokens.extend(lst_tokens) # All Words\n", 575 | "vocab = sorted(set(all_tokens)) # Unique Words\n", 576 | "\n", 577 | "print(len(all_tokens))\n", 578 | "print(len(vocab))\n", 579 | "w_idx = {w:i for i,w in enumerate(vocab)}\n", 580 | "print(w_idx)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 419, 586 | "id": "0c7cffee", 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "# Label Encode\n", 591 | "label_encoder = LabelEncoder()\n", 592 | "integer_encoded = label_encoder.fit_transform(array(vocab)) \n" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 420, 598 | "id": "9b16dccb", 599 | "metadata": {}, 600 | "outputs": [ 601 | { 602 | "name": "stdout", 603 | "output_type": "stream", 604 | "text": [ 605 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}\n", 606 | "{'appetizer': 0, 'area': 1, 'avenger': 2, 'best': 3, 'boring': 4, 'center': 5, 'cheeseburger': 6, 'chicken': 7, 'chili': 8, 'city': 9, 'cocacolaso': 10, 'cool': 11, 'cuban': 12, 'cup': 13, 'dec': 14, 'decent': 15, 'decorated': 16, 'desert': 
17, 'downtown': 18, 'eat': 19, 'elf': 20, 'even': 21, 'far': 22, 'fried': 23, 'game': 24, 'good': 25, 'great': 26, 'kid': 27, 'lady': 28, 'leave': 29, 'lindenwold': 30, 'lot': 31, 'lunch': 32, 'make': 33, 'never': 34, 'onion': 35, 'open': 36, \"patco's\": 37, 'pickle': 38, 'place': 39, 'platter': 40, 'pm': 41, 'probably': 42, 'relish': 43, 'ride': 44, 'sandwich': 45, 'santa': 46, 'saturday': 47, 'silver': 48, 'single': 49, 'sleigh': 50, 'special': 51, 'spring': 52, 'starbucks': 53, 'stop': 54, 'substitute': 55, 'taco': 56, 'tampa': 57, 'tasty': 58, 'th': 59, 'think': 60, 'time': 61, 'train': 62, 'ugh': 63, 'vanilla': 64, 'w': 65, 'watch': 66}\n" 607 | ] 608 | }, 609 | { 610 | "data": { 611 | "text/plain": [ 612 | "([2, 61, 28], [2, 61, 28])" 613 | ] 614 | }, 615 | "execution_count": 420, 616 | "metadata": {}, 617 | "output_type": "execute_result" 618 | } 619 | ], 620 | "source": [ 621 | "# print(integer_encoded)\n", 622 | "# print(vocab)\n", 623 | "# print(preprocessed_text )\n", 624 | "word2id = dict(zip(vocab, integer_encoded))\n", 625 | "print(word2id)\n", 626 | "print(w_idx)\n", 627 | "\n", 628 | "datamodel=[]\n", 629 | "data_me=[]\n", 630 | "\n", 631 | "for sentence in preprocessed_text:\n", 632 | " lmodel=[]\n", 633 | " lme=[]\n", 634 | "\n", 635 | " for w in sentence:\n", 636 | " lmodel.append(word2id[w])\n", 637 | " lme.append(w_idx[w])\n", 638 | " datamodel.append(lmodel)\n", 639 | " data_me.append(lme)\n", 640 | "#--------------------------------------------*************----------------\n", 641 | "data_me[0], datamodel[0]" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": null, 647 | "id": "4d4633d2", 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [] 651 | }, 652 | { 653 | "cell_type": "code", 654 | "execution_count": 421, 655 | "id": "42bbccf7", 656 | "metadata": {}, 657 | "outputs": [], 658 | "source": [ 659 | "from numpy import array, reshape\n", 660 | "\n", 661 | "integer_encoded = integer_encoded.reshape(-1, 1)\n", 662 | "\n", 663 | "onehot_encoder = OneHotEncoder(sparse_output=False)\n", 664 | "onehot_encoded = onehot_encoder.fit_transform(integer_encoded)" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": 423, 670 | "id": "b78d1f5c", 671 | "metadata": {}, 672 | "outputs": [ 673 | { 674 | "name": "stdout", 675 | "output_type": "stream", 676 | "text": [ 677 | "[array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 678 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 679 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 680 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 681 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 682 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 683 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 684 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,\n", 685 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n", 686 | " 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]\n" 687 | ] 688 | } 689 | ], 690 | "source": [ 691 | "word2onehot = dict(zip(vocab, onehot_encoded))\n", 692 | "\n", 693 | "data=[]\n", 694 | "for sentence in preprocessed_text:\n", 695 | " vec = []\n", 696 | " for w in sentence:\n", 
697 | " vec.append(word2onehot[w])\n", 698 | " data.append(vec)\n", 699 | "print(data[0])" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": null, 705 | "id": "5d3dc3b2", 706 | "metadata": {}, 707 | "outputs": [], 708 | "source": [] 709 | } 710 | ], 711 | "metadata": { 712 | "kernelspec": { 713 | "display_name": "myenv", 714 | "language": "python", 715 | "name": "python3" 716 | }, 717 | "language_info": { 718 | "codemirror_mode": { 719 | "name": "ipython", 720 | "version": 3 721 | }, 722 | "file_extension": ".py", 723 | "mimetype": "text/x-python", 724 | "name": "python", 725 | "nbconvert_exporter": "python", 726 | "pygments_lexer": "ipython3", 727 | "version": "3.12.6" 728 | } 729 | }, 730 | "nbformat": 4, 731 | "nbformat_minor": 5 732 | } 733 | --------------------------------------------------------------------------------
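The label/one-hot notebook above builds each encoder twice: first by hand (a word-to-index dict for label encoding, a zero vector with a single 1 for one-hot), then with scikit-learn. A condensed, self-contained sketch of that pipeline is given below for reference; it is illustrative only. The three-sentence corpus and the `simple_preprocess` helper are stand-ins for the Yelp/IMDB data and the NLTK stopword-plus-lemmatizer pipeline used in the notebooks, and the scikit-learn calls assume a version recent enough for `OneHotEncoder(sparse_output=False)`, which the notebook itself already uses.

```python
# Minimal sketch (assumptions: tiny inline corpus instead of the Yelp/IMDB files,
# whitespace tokenization instead of the NLTK pipeline, so no downloads are needed).
import re

import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

corpus = [
    "Avengers time with the ladies.",
    "They have lots of good deserts and tasty cuban sandwiches",
    "Very decent fried chicken",
]

def simple_preprocess(text: str) -> list[str]:
    """Lowercase, keep only letters/apostrophes/whitespace, split on whitespace."""
    text = text.lower()
    text = re.sub(r"[^a-z\s']", "", text)                # letters, apostrophes, whitespace only
    return [tok for tok in re.split(r"\s+", text) if tok]

docs = [simple_preprocess(t) for t in corpus]
vocab = sorted({tok for doc in docs for tok in doc})      # unique tokens, sorted for stable ids

# --- From scratch: word -> integer id, and id -> one-hot row ----------------------
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
identity = np.eye(len(vocab), dtype=int)                  # row i is the one-hot vector for id i
encoded_docs = [[word_to_idx[w] for w in doc] for doc in docs]
one_hot_docs = [identity[ids] for ids in encoded_docs]    # shape (len(doc), len(vocab)) per doc

# --- Same thing with scikit-learn --------------------------------------------------
label_encoder = LabelEncoder().fit(vocab)
sk_encoded_docs = [label_encoder.transform(doc) for doc in docs]

onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoder.fit(np.array(vocab).reshape(-1, 1))
sk_one_hot_docs = [onehot_encoder.transform(np.array(doc).reshape(-1, 1)) for doc in docs]

print(word_to_idx)
print(encoded_docs[0], list(sk_encoded_docs[0]))          # the two integer encodings agree
print(np.array_equal(one_hot_docs[0], sk_one_hot_docs[0]))
```

Fitting `LabelEncoder` on the sorted vocabulary is what makes its integer ids coincide with the hand-built `word_to_idx` mapping; this is also why the notebook's `data_me` and `datamodel` encodings come out identical.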