├── .clinerules ├── 01-project-context.md └── 02-development-guidelines.md ├── .env.example ├── .gitignore ├── .talismanrc ├── README.md ├── SETUP.md ├── advanced-rag ├── 01-advanced-rag-rerank.ipynb └── scifact │ ├── 01-data-indexing.ipynb │ ├── 02-advanced-rag.ipynb │ └── run-qdrant.sh ├── data ├── extended_wiki_articles │ ├── Algorithm.txt │ ├── Algorithm_metadata.json │ ├── Artificial_intelligence.txt │ ├── Artificial_intelligence_metadata.json │ ├── Artificial_neural_network.txt │ ├── Artificial_neural_network_metadata.json │ ├── BERT_language_model.txt │ ├── BERT_language_model_metadata.json │ ├── Bayesian_network.txt │ ├── Bayesian_network_metadata.json │ ├── Bioinformatics.txt │ ├── Bioinformatics_metadata.json │ ├── Biology.txt │ ├── Biology_metadata.json │ ├── Blog_GPT-4_Technical_Report.txt │ ├── Blog_GPT-4_Technical_Report_metadata.json │ ├── Class_biology.txt │ ├── Class_biology_metadata.json │ ├── Cluster_analysis.txt │ ├── Cluster_analysis_metadata.json │ ├── Cognitive_science.txt │ ├── Cognitive_science_metadata.json │ ├── Computational_biology.txt │ ├── Computational_biology_metadata.json │ ├── Computer_network.txt │ ├── Computer_network_metadata.json │ ├── Computer_science.txt │ ├── Computer_science_metadata.json │ ├── Control_theory.txt │ ├── Control_theory_metadata.json │ ├── Cryptography.txt │ ├── Cryptography_metadata.json │ ├── Cybernetics.txt │ ├── Cybernetics_metadata.json │ ├── Data_structure.txt │ ├── Data_structure_metadata.json │ ├── Database_management_system.txt │ ├── Database_management_system_metadata.json │ ├── Decision_tree_learning.txt │ ├── Decision_tree_learning_metadata.json │ ├── Deep_learning.txt │ ├── Deep_learning_metadata.json │ ├── Dimensionality_reduction.txt │ ├── Dimensionality_reduction_metadata.json │ ├── Electrical_engineering.txt │ ├── Electrical_engineering_metadata.json │ ├── Engineering.txt │ ├── Engineering_metadata.json │ ├── Ensemble_learning.txt │ ├── Ensemble_learning_metadata.json │ ├── Evolution.txt │ ├── Evolution_metadata.json │ ├── Evolutionary_computation.txt │ ├── Evolutionary_computation_metadata.json │ ├── Feature_recognition.txt │ ├── Feature_recognition_metadata.json │ ├── Feature_selection.txt │ ├── Feature_selection_metadata.json │ ├── Fuzzy_logic.txt │ ├── Fuzzy_logic_metadata.json │ ├── Generative_pre_trained_transformer.txt │ ├── Generative_pre_trained_transformer_metadata.json │ ├── Graph_theory.txt │ ├── Graph_theory_metadata.json │ ├── History_of_artificial_intelligence.txt │ ├── History_of_artificial_intelligence_metadata.json │ ├── Humancomputer_interaction.txt │ ├── Humancomputer_interaction_metadata.json │ ├── Information_theory.txt │ ├── Information_theory_metadata.json │ ├── John_von_Neumann.txt │ ├── John_von_Neumann_metadata.json │ ├── Kernel_computing.txt │ ├── Kernel_computing_metadata.json │ ├── Knowledge_representation_and_reasoning.txt │ ├── Knowledge_representation_and_reasoning_metadata.json │ ├── Layer.txt │ ├── Layer_metadata.json │ ├── LilianWeng_Contrastive_Representation_Learning.txt │ ├── LilianWeng_Contrastive_Representation_Learning_metadata.json │ ├── LilianWeng_Flow-based_Deep_Generative_Models.txt │ ├── LilianWeng_Flow-based_Deep_Generative_Models_metadata.json │ ├── LilianWeng_LLM_Powered_Autonomous_Agents.txt │ ├── LilianWeng_LLM_Powered_Autonomous_Agents_metadata.json │ ├── LilianWeng_Learning_with_not_Enough_Data_Part_3_Active_Learning.txt │ ├── LilianWeng_Learning_with_not_Enough_Data_Part_3_Active_Learning_metadata.json │ ├── LilianWeng_The_Transformer_Family_Version_20.txt │ ├── 
LilianWeng_The_Transformer_Family_Version_20_metadata.json │ ├── LilianWeng_What_are_Diffusion_Models.txt │ ├── LilianWeng_What_are_Diffusion_Models_metadata.json │ ├── Linear_algebra.txt │ ├── Linear_algebra_metadata.json │ ├── MORE_application.txt │ ├── MORE_application_metadata.json │ ├── Mathematical_optimization.txt │ ├── Mathematical_optimization_metadata.json │ ├── Mathematics.txt │ ├── Mathematics_metadata.json │ ├── Neuroscience.txt │ ├── Neuroscience_metadata.json │ ├── Node.txt │ ├── Node_computer_science.txt │ ├── Node_computer_science_metadata.json │ ├── Node_metadata.json │ ├── Operating_system.txt │ ├── Operating_system_metadata.json │ ├── Optimization_mathematics.txt │ ├── Optimization_mathematics_metadata.json │ ├── Overfitting.txt │ ├── Overfitting_metadata.json │ ├── Physical_examination.txt │ ├── Physical_examination_metadata.json │ ├── Portfolio_finance.txt │ ├── Portfolio_finance_metadata.json │ ├── Psychology.txt │ ├── Psychology_metadata.json │ ├── Quantitative_analysis_finance.txt │ ├── Quantitative_analysis_finance_metadata.json │ ├── Rise_The_Vieneo_Province.txt │ ├── Rise_The_Vieneo_Province_metadata.json │ ├── Signal_processing.txt │ ├── Signal_processing_metadata.json │ ├── Software.txt │ ├── Software_engineering.txt │ ├── Software_engineering_metadata.json │ ├── Software_metadata.json │ ├── Statistical_mechanics.txt │ ├── Statistical_mechanics_metadata.json │ ├── Statistics.txt │ ├── Statistics_metadata.json │ ├── Support_vector_machine.txt │ ├── Support_vector_machine_metadata.json │ ├── Transfer_learning.txt │ ├── Transfer_learning_metadata.json │ ├── Transformer_machine_learning_model.txt │ ├── Transformer_machine_learning_model_metadata.json │ ├── Tree_data_structure.txt │ ├── Tree_data_structure_metadata.json │ ├── Weight.txt │ ├── Weight_metadata.json │ ├── arXiv_1706.03762.txt │ ├── arXiv_1706.03762_metadata.json │ ├── arXiv_2005.14165.txt │ ├── arXiv_2005.14165_metadata.json │ ├── arXiv_2307.09288.txt │ ├── arXiv_2307.09288_metadata.json │ └── dataset_summary.json ├── ingestion_summary.json ├── wiki_articles │ ├── Artificial_neural_network.txt │ ├── BERT_(language_model).txt │ ├── Deep_learning.txt │ ├── Generative_pre-trained_transformer.txt │ ├── Overfitting.txt │ └── Transformer_(machine_learning_model).txt └── wiki_eval_dataset.json ├── default.sqlite ├── imgs ├── ingestion.png └── naive-rag.png ├── naive-rag ├── 01-naive-rag.ipynb ├── 02-naive-rag-challenges.ipynb └── rag_evaluator_v2.py ├── requirements.txt └── scripts ├── README.md ├── create_extended_dataset.py ├── fetch_additional_articles.py ├── ingest_to_qdrant_cloud.py └── wiki_article_loader.py /.clinerules/01-project-context.md: -------------------------------------------------------------------------------- 1 | # Project Context - RAG Workshop 2 | 3 | ## Repository Overview 4 | This is a workshop repository for building Retrieval-Augmented Generation (RAG) applications. The project contains educational materials and implementations for learning RAG concepts from basic to advanced levels. This is used for the O'Reilly training course: "Building Reliable RAG Applications: From PoC to Production". 5 | 6 | The structure of the workshop: 7 | 1. We start with naive RAG concepts and do the hands-on exercise with the naive-rag notebook. 8 | 2. Then we talk about the naive-rag-challenges notebook, where we discuss where naive RAG does not fare well and the need for advanced techniques. 9 | 3. Then we move to the advanced RAG section, where we first talk about reranking and show practically how the score improves with the reranker.
10 | 4. Then we show hybrid search and how the score improves further with hybrid search. 11 | -------------------------------------------------------------------------------- /.clinerules/02-development-guidelines.md: -------------------------------------------------------------------------------- 1 | # Development Guidelines 2 | 3 | ## Environment Setup 4 | - Always use virtual environments for Python development 5 | - Ensure `.env` file contains required API keys: OPENAI_API_KEY, COHERE_API_KEY 6 | - Use Python 3.11+ for compatibility with all dependencies 7 | - Install dependencies from requirements.txt 8 | 9 | ## Code Style for Notebooks 10 | - Use clear, descriptive cell comments explaining each step 11 | - Include markdown cells to explain concepts and methodology 12 | - Follow consistent variable naming: use descriptive names like `retriever`, `embeddings`, `vector_store` 13 | - Add error handling for API calls and external services 14 | - Include timing information for performance analysis 15 | 16 | ## Notebook Organization 17 | - Start each notebook with clear objectives and prerequisites 18 | - Include imports section with all required libraries 19 | - Add configuration section for API keys and parameters 20 | - Structure code in logical sections: Setup → Data Loading → Processing → Evaluation 21 | - End with summary of results and next steps 22 | 23 | ## Error Handling 24 | - Implement graceful handling of API rate limits 25 | - Include fallback options for different LLM providers 26 | - Add validation for required environment variables 27 | - Handle vector database connection issues 28 | - Provide clear error messages for common setup problems 29 | 30 | ## Performance Considerations 31 | - Monitor token usage and costs for LLM calls 32 | - Implement caching for embeddings and retrieved results 33 | - Use batch processing for large datasets 34 | - Include memory usage monitoring for large vector operations 35 | - Optimize chunk sizes based on content type 36 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | OPENAI_API_KEY= 2 | OPENAI_BASE_URL= 3 | COHERE_API_KEY= 4 | 5 | # Qdrant Cloud Configuration (for workshop) 6 | QDRANT_URL= 7 | QDRANT_API_KEY= 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | .env 3 | venv/* 4 | trulens/* 5 | __pycache__/ 6 | .ipynb_checkpoints/ -------------------------------------------------------------------------------- /.talismanrc: -------------------------------------------------------------------------------- 1 | threshold: medium 2 | fileignoreconfig: 3 | - filename: advanced-rag/01-data-indexing.ipynb 4 | checksum: 6e6bce37572ea4722f26ba90b34733bbed9b99d0514945bce7a793b6c2142821 5 | version: "" -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Building RAG Applications Workshop 2 | 3 | This repository contains materials for the Building RAG (Retrieval-Augmented Generation) Applications Workshop. The workshop covers both naive RAG implementations and advanced RAG techniques to help you understand and build effective RAG systems.
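As a rough sketch of the retrieve-then-generate loop the notebooks build toward (not the notebooks' actual code), the snippet below embeds a question, searches Qdrant, and asks the LLM to answer from the retrieved chunks. The collection name, the `text` payload field, and the `gpt-4o-mini` model are illustrative assumptions, not values taken from this repository:

```python
# Minimal naive-RAG loop: embed -> retrieve -> generate (illustrative sketch only).
import os

from dotenv import load_dotenv
from openai import OpenAI
from qdrant_client import QdrantClient

load_dotenv()  # reads OPENAI_API_KEY, QDRANT_URL, QDRANT_API_KEY from .env

openai_client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
qdrant = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.getenv("QDRANT_API_KEY"))

COLLECTION = "wiki_articles"  # hypothetical name; use the collection your ingestion run created
question = "What is overfitting in machine learning?"

# 1. Embed the question with the same model used at ingestion time.
query_vector = openai_client.embeddings.create(
    model="text-embedding-3-small", input=question
).data[0].embedding

# 2. Retrieve the most similar chunks from Qdrant.
hits = qdrant.search(collection_name=COLLECTION, query_vector=query_vector, limit=5)
context = "\n\n".join((hit.payload or {}).get("text", "") for hit in hits)

# 3. Generate an answer grounded in the retrieved context.
response = openai_client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "Answer using only the provided context."},
        {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}"},
    ],
)
print(response.choices[0].message.content)
```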
4 | 5 | This repository is used for the O'Reilly training course: [Building Reliable RAG Applications: From PoC to Production](https://learning.oreilly.com/live-events/building-reliable-rag-applications-from-poc-to-production/0642572012347/). 6 | 7 | ## Prerequisites 8 | 9 | - Python 3.11+ recommended 10 | - API keys (OpenAI, Cohere) 11 | - Jupyter notebook environment 12 | - Qdrant database (Cloud or Docker) 13 | 14 | ## Quick Start 15 | 16 | ### 1. Clone and Install 17 | 18 | ```bash 19 | git clone https://github.com/Sarangk90/building-rag-app-workshop.git 20 | cd building-rag-app-workshop 21 | 22 | # Create virtual environment 23 | python -m venv venv 24 | source venv/bin/activate # On Windows: venv\Scripts\activate 25 | 26 | # Install dependencies 27 | pip install --upgrade pip 28 | pip install -r requirements.txt 29 | ``` 30 | 31 | ### 2. Complete Workshop Setup 32 | 33 | 📖 **Follow the complete setup guide: [`SETUP.md`](SETUP.md)** 34 | 35 | The setup guide covers: 36 | - Qdrant database setup (Cloud or Docker options) 37 | - Environment variable configuration 38 | - Data ingestion process 39 | - Troubleshooting common issues 40 | 41 | **⚠️ You must complete the setup before running any notebooks!** 42 | 43 | ## Workshop Notebooks 44 | 45 | After completing the setup, run the notebooks in this order: 46 | 47 | ### 1. Naive RAG 48 | - **`naive-rag/01-naive-rag.ipynb`** - Basic RAG implementation 49 | - **`naive-rag/02-naive-rag-challenges.ipynb`** - RAG limitations and evaluation 50 | 51 | ### 2. Advanced RAG 52 | - **`advanced-rag/01-advanced-rag-rerank.ipynb`** - Advanced RAG with reranking 53 | 54 | ### 3. SciFact Dataset (Optional) 55 | - **`advanced-rag/scifact/01-data-indexing.ipynb`** - Data indexing techniques 56 | - **`advanced-rag/scifact/02-advanced-rag.ipynb`** - Advanced techniques 57 | 58 | **Note**: Each notebook automatically detects your setup (Cloud vs Docker) and connects appropriately. 59 | 60 | ## Workshop Content 61 | 62 | ### Naive RAG 63 | - Basic RAG implementation using OpenAI embeddings 64 | - Vector storage with Qdrant 65 | - Simple retrieval and generation pipeline 66 | 67 | ### Advanced RAG 68 | - Hybrid search combining dense and sparse embeddings 69 | - Reranking with cross-encoders for improved relevance 70 | - Evaluation using standard metrics and benchmarks 71 | 72 | ## Data 73 | 74 | The workshop uses: 75 | - Wikipedia articles on machine learning topics (Deep learning, Transformers, etc.) 76 | - BeIR SciFact dataset for demonstrations and evaluations 77 | 78 | ### Wikipedia Article Management 79 | 80 | The repository includes pre-downloaded Wikipedia articles in `data/wiki_articles/` to avoid repetitive API calls during workshops. 
Use the following scripts to manage articles: 81 | 82 | #### List Available Articles 83 | ```bash 84 | python scripts/fetch_additional_articles.py --list-available 85 | ``` 86 | 87 | #### Fetch Additional Articles 88 | ```bash 89 | # Fetch specific articles 90 | python scripts/fetch_additional_articles.py "Machine learning" "Computer vision" 91 | 92 | # Fetch from extended list (30+ ML/AI topics) 93 | python scripts/fetch_additional_articles.py 94 | 95 | # View the extended article list 96 | python scripts/fetch_additional_articles.py --list-extended 97 | ``` 98 | 99 | #### Force Re-fetch Existing Articles 100 | ```bash 101 | python scripts/fetch_additional_articles.py --force "Deep learning" 102 | ``` 103 | 104 | **Available Pre-downloaded Articles:** 105 | - Artificial neural network 106 | - BERT (language model) 107 | - Deep learning 108 | - Generative pre-trained transformer 109 | - Overfitting 110 | - Transformer (machine learning model) 111 | 112 | ## Key Dependencies 113 | 114 | - **openai**: For embeddings and completions 115 | - **qdrant-client**: For vector storage and retrieval 116 | - **wikipedia, beautifulsoup4**: For data collection and cleaning 117 | - **FlagEmbedding**: For reranking functionality 118 | - **cohere**: For additional reranking options 119 | - **ragas**: For comprehensive RAG evaluation 120 | - **Various utilities**: tqdm, python-dotenv, etc. 121 | 122 | ## Evaluation 123 | 124 | The workshop includes evaluation scripts using RAGAS metrics to assess the quality of RAG outputs across dimensions like relevance, faithfulness, and answer quality. 125 | -------------------------------------------------------------------------------- /SETUP.md: -------------------------------------------------------------------------------- 1 | # RAG Workshop Setup Guide 2 | 3 | This guide walks you through setting up your environment for the RAG workshop. You'll need to complete these steps before running any of the workshop notebooks. 4 | 5 | ## 🚀 Choose Your Setup Option 6 | 7 | You have **two options** to set up Qdrant and ingest the extended Wikipedia dataset. Choose the option that works best for your environment: 8 | 9 | --- 10 | 11 | ## Option A: Qdrant Cloud (Recommended) 12 | 13 | This is the easiest option and works on all platforms without requiring Docker. 14 | 15 | ### Step 1: Create Qdrant Cloud Account 16 | 1. Go to [Qdrant Cloud](https://cloud.qdrant.io/) 17 | 2. Sign up for a free account 18 | 3. Create a new cluster (free tier is sufficient) 19 | 4. 
Get your cluster URL and API key from the dashboard 20 | 21 | ### Step 2: Set Environment Variables 22 | Create a `.env` file in the project root with: 23 | ```bash 24 | OPENAI_API_KEY=your_openai_api_key_here 25 | QDRANT_URL=your_qdrant_cluster_url_here 26 | QDRANT_API_KEY=your_qdrant_api_key_here 27 | 28 | # Optional: For advanced RAG features 29 | COHERE_API_KEY=your_cohere_api_key_here 30 | ``` 31 | 32 | ### Step 3: Run Data Ingestion Script 33 | ```bash 34 | # From the project root directory 35 | python scripts/ingest_to_qdrant_cloud.py 36 | ``` 37 | 38 | --- 39 | 40 | ## Option B: Local Docker Setup 41 | 42 | If you prefer to run Qdrant locally using Docker, follow these steps: 43 | 44 | ### Step 1: Start Local Qdrant with Docker 45 | ```bash 46 | # Run Qdrant locally 47 | docker run -d -p 6333:6333 -p 6334:6334 qdrant/qdrant:v1.13.2 48 | ``` 49 | 50 | ### Step 2: Set Environment Variables 51 | Create a `.env` file in the project root with: 52 | ```bash 53 | OPENAI_API_KEY=your_openai_api_key_here 54 | QDRANT_URL=http://localhost:6333 55 | # Note: No QDRANT_API_KEY needed for local setup 56 | 57 | # Optional: For advanced RAG features 58 | COHERE_API_KEY=your_cohere_api_key_here 59 | ``` 60 | 61 | ### Step 3: Run Data Ingestion Script 62 | ```bash 63 | # From the project root directory 64 | python scripts/ingest_to_qdrant_cloud.py 65 | ``` 66 | 67 | **Note**: The same ingestion script automatically detects whether you're using cloud or local setup based on your QDRANT_URL! 68 | 69 | --- 70 | 71 | ## 📋 What the Ingestion Script Does 72 | 73 | Regardless of which option you choose, the ingestion script will: 74 | - ✅ Load the extended Wikipedia dataset (61 articles) 75 | - 🔪 Create 1,210 chunks with 300 character chunks, 50 character overlap 76 | - 🤖 Generate embeddings using OpenAI text-embedding-3-small 77 | - 📤 Upload everything to your Qdrant instance (cloud or local) 78 | - ⏱️ Takes approximately 5-10 minutes to complete 79 | 80 | --- 81 | 82 | ## 🔧 Troubleshooting 83 | 84 | ### Collection Not Found Error 85 | ```bash 86 | # Make sure you're in the project root directory 87 | cd path/to/building-rag-app-workshop 88 | 89 | # Run the ingestion script 90 | python scripts/ingest_to_qdrant_cloud.py 91 | ``` 92 | 93 | ### Docker Setup Issues (Option B) 94 | ```bash 95 | # Check if Qdrant container is running 96 | docker ps 97 | 98 | # If not running, start it again 99 | docker run -d -p 6333:6333 -p 6334:6334 qdrant/qdrant:v1.13.2 100 | 101 | # Test connection 102 | curl http://localhost:6333 103 | ``` 104 | 105 | ### Cloud Setup Issues (Option A) 106 | - Verify your Qdrant Cloud cluster is running in the dashboard 107 | - Double-check your cluster URL and API key 108 | - Make sure you're using the correct cluster region 109 | 110 | ### Environment Variables Issues 111 | - Double-check your `.env` file is in the project root 112 | - Restart your Jupyter kernel after creating/updating `.env` 113 | - For local setup: `QDRANT_URL=http://localhost:6333` (no API key needed) 114 | - For cloud setup: Both `QDRANT_URL` and `QDRANT_API_KEY` required 115 | 116 | ### OpenAI API Issues 117 | - Make sure you have credits in your OpenAI account 118 | - Verify your OpenAI API key is correct 119 | 120 | ### Cohere API Issues (for Advanced RAG) 121 | - Sign up for a free Cohere account at [cohere.ai](https://cohere.ai) 122 | - Get your API key from the dashboard 123 | - Add it to your `.env` file as `COHERE_API_KEY=your_key_here` 124 | 125 | ### Still Having Issues? 
126 | - Check the `data/ingestion_summary.json` file (created after successful ingestion) 127 | - Look at the terminal output from the ingestion script for error messages 128 | - For Docker: Check Docker logs with `docker logs ` 129 | 130 | --- 131 | 132 | ## ✅ Verification 133 | 134 | After completing the setup, you should see: 135 | - A `.env` file in your project root with the required API keys 136 | - Output from the ingestion script showing "🎉 INGESTION COMPLETED SUCCESSFULLY!" 137 | - A `data/ingestion_summary.json` file with ingestion details 138 | 139 | **You're ready to start the workshop once you see "Expected number of chunks found! Ingestion was successful." in any notebook!** 140 | 141 | --- 142 | 143 | ## 📚 Workshop Notebooks 144 | 145 | After completing this setup, you can run any of these notebooks: 146 | 147 | 1. **`naive-rag/01-naive-rag.ipynb`** - Basic RAG implementation 148 | 2. **`naive-rag/02-naive-rag-challenges.ipynb`** - Exploring RAG limitations 149 | 3. **`advanced-rag/01-advanced-rag-rerank.ipynb`** - Advanced RAG with reranking 150 | 4. **`advanced-rag/scifact/`** - SciFact dataset examples 151 | 152 | Each notebook will automatically detect your setup and connect appropriately. -------------------------------------------------------------------------------- /advanced-rag/scifact/run-qdrant.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | docker run -d -p 6333:6333 qdrant/qdrant:v1.10.0 -------------------------------------------------------------------------------- /data/extended_wiki_articles/Algorithm_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Algorithm", 3 | "url": "https://en.wikipedia.org/wiki/Algorithm", 4 | "fetched_at": "2025-07-16T22:50:11.221499", 5 | "content_file": "Algorithm.txt", 6 | "cleaned_length": 25827 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Artificial_intelligence_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Artificial intelligence", 3 | "url": "https://en.wikipedia.org/wiki/Artificial_intelligence", 4 | "fetched_at": "2025-07-16T23:32:09.977415", 5 | "content_file": "Artificial_intelligence.txt", 6 | "cleaned_length": 87457 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Artificial_neural_network_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Artificial neural network", 3 | "url": "https://en.wikipedia.org/wiki/Neural_network_(machine_learning)", 4 | "fetched_at": "2025-07-16T22:49:50.888489", 5 | "content_file": "Artificial_neural_network.txt", 6 | "cleaned_length": 57791 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/BERT_language_model_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "BERT (language model)", 3 | "url": "https://en.wikipedia.org/wiki/BERT_(language_model)", 4 | "fetched_at": "2025-07-16T22:49:54.888340", 5 | "content_file": "BERT_language_model.txt", 6 | "cleaned_length": 17112 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Bayesian_network_metadata.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "title": "Bayesian network", 3 | "url": "https://en.wikipedia.org/wiki/Bayesian_network", 4 | "fetched_at": "2025-07-16T22:50:40.349761", 5 | "content_file": "Bayesian_network.txt", 6 | "cleaned_length": 45832 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Bioinformatics_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Bioinformatics", 3 | "url": "https://en.wikipedia.org/wiki/Bioinformatics", 4 | "fetched_at": "2025-07-16T23:31:48.128076", 5 | "content_file": "Bioinformatics.txt", 6 | "cleaned_length": 37593 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Biology_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Biology", 3 | "url": "https://en.wikipedia.org/wiki/Geology", 4 | "fetched_at": "2025-07-16T23:31:38.991850", 5 | "content_file": "Biology.txt", 6 | "cleaned_length": 40696 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Blog_GPT-4_Technical_Report_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "GPT-4 Technical Report", 3 | "source": "Web", 4 | "url": "https://openai.com/research/gpt-4", 5 | "word_count": 3148, 6 | "character_count": 19974, 7 | "fetched_at": "2025-07-17T00:52:16.152699", 8 | "confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Class_biology.txt: -------------------------------------------------------------------------------- 1 | In biological classification, class (Latin: classis) is a taxonomic rank, as well as a taxonomic unit, a taxon, in that rank. It is a group of related taxonomic orders. Other well-known ranks in descending order of size are domain, kingdom, phylum, order, family, genus, and species, with class ranking between phylum and order. 2 | 3 | 4 | == History == 5 | The class as a distinct rank of biological classification having its own distinctive name – and not just called a top-level genus (genus summum) – was first introduced by French botanist Joseph Pitton de Tournefort in the classification of plants that appeared in his Eléments de botanique of 1694. 6 | Insofar as a general definition of a class is available, it has historically been conceived as embracing taxa that combine a distinct grade of organization—i.e. a 'level of complexity', measured in terms of how differentiated their organ systems are into distinct regions or sub-organs—with a distinct type of construction, which is to say a particular layout of organ systems. This said, the composition of each class is ultimately determined by the subjective judgment of taxonomists. 7 | In the first edition of his Systema Naturae (1735), Carl Linnaeus divided all three of his kingdoms of nature (minerals, plants, and animals) into classes. Only in the animal kingdom are Linnaeus's classes similar to the classes used today; his classes and orders of plants were never intended to represent natural groups, but rather to provide a convenient "artificial key" according to his Systema Sexuale, largely based on the arrangement of flowers. In botany, classes are now rarely discussed. 
Since the first publication of the APG system in 1998, which proposed a taxonomy of the flowering plants up to the level of orders, many sources have preferred to treat ranks higher than orders as informal clades. Where formal ranks have been assigned, the ranks have been reduced to a very much lower level, e.g. class Equisitopsida for the land plants, with the major divisions within the class assigned to subclasses and superorders. 8 | The class was considered the highest level of the taxonomic hierarchy until George Cuvier's embranchements, first called Phyla by Ernst Haeckel, were introduced in the early nineteenth century. 9 | 10 | 11 | == See also == 12 | 13 | Cladistics 14 | List of animal classes 15 | Phylogenetics 16 | Systematics 17 | Taxonomy 18 | 19 | 20 | == Explanatory notes == 21 | 22 | 23 | == References == -------------------------------------------------------------------------------- /data/extended_wiki_articles/Class_biology_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Class (biology)", 3 | "url": "https://en.wikipedia.org/wiki/Class_(biology)", 4 | "fetched_at": "2025-07-16T23:30:59.200460", 5 | "content_file": "Class_biology.txt", 6 | "cleaned_length": 2372 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Cluster_analysis_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Cluster analysis", 3 | "url": "https://en.wikipedia.org/wiki/Cluster_analysis", 4 | "fetched_at": "2025-07-16T22:51:10.199096", 5 | "content_file": "Cluster_analysis.txt", 6 | "cleaned_length": 57883 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Cognitive_science_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Cognitive science", 3 | "url": "https://en.wikipedia.org/wiki/Cognitive_science", 4 | "fetched_at": "2025-07-16T22:50:26.181378", 5 | "content_file": "Cognitive_science.txt", 6 | "cleaned_length": 38612 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Computational_biology_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Computational biology", 3 | "url": "https://en.wikipedia.org/wiki/Computational_biology", 4 | "fetched_at": "2025-07-16T23:31:50.242726", 5 | "content_file": "Computational_biology.txt", 6 | "cleaned_length": 23949 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Computer_network_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Computer network", 3 | "url": "https://en.wikipedia.org/wiki/Computer_network", 4 | "fetched_at": "2025-07-16T23:31:18.865630", 5 | "content_file": "Computer_network.txt", 6 | "cleaned_length": 54745 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Computer_science_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Computer science", 3 | "url": "https://en.wikipedia.org/wiki/Computer_science", 4 | "fetched_at": "2025-07-16T22:50:04.268865", 5 | "content_file": "Computer_science.txt", 6 | "cleaned_length": 29735 7 | } 
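The `*_metadata.json` files in `data/extended_wiki_articles/` share a small schema (`title`, `url`, `fetched_at`, `content_file`, `cleaned_length`), with a few blog/arXiv entries using slightly different fields. As a hedged sketch of how the metadata records could be paired with their article text (illustrative only, not the repository's `wiki_article_loader.py`):

```python
# Illustrative loader for the metadata/article pairs shown in this directory.
import json
from pathlib import Path

ARTICLES_DIR = Path("data/extended_wiki_articles")

def iter_articles(directory: Path = ARTICLES_DIR):
    """Yield (metadata, article_text) for each *_metadata.json / *.txt pair."""
    for meta_path in sorted(directory.glob("*_metadata.json")):
        metadata = json.loads(meta_path.read_text(encoding="utf-8"))
        # Most records name their text file in "content_file"; a few omit it,
        # so fall back to the naming convention visible in the listing above.
        content_name = metadata.get("content_file", meta_path.name.replace("_metadata.json", ".txt"))
        text = (directory / content_name).read_text(encoding="utf-8")
        yield metadata, text

for meta, text in iter_articles():
    print(meta["title"], len(text))
```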
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Control_theory_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Control theory", 3 | "url": "https://en.wikipedia.org/wiki/Control_theory", 4 | "fetched_at": "2025-07-16T22:50:51.072891", 5 | "content_file": "Control_theory.txt", 6 | "cleaned_length": 35117 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Cryptography_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Cryptography", 3 | "url": "https://en.wikipedia.org/wiki/Cryptography", 4 | "fetched_at": "2025-07-16T22:50:58.934057", 5 | "content_file": "Cryptography.txt", 6 | "cleaned_length": 53487 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Cybernetics_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Cybernetics", 3 | "url": "https://en.wikipedia.org/wiki/Cybernetics", 4 | "fetched_at": "2025-07-16T22:50:35.500008", 5 | "content_file": "Cybernetics.txt", 6 | "cleaned_length": 18997 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Data_structure.txt: -------------------------------------------------------------------------------- 1 | In computer science, a data structure is a data organization and storage format that is usually chosen for efficient access to data. More precisely, a data structure is a collection of data values, the relationships among them, and the functions or operations that can be applied to the data, i.e., it is an algebraic structure about data. 2 | 3 | 4 | == Usage == 5 | Data structures serve as the basis for abstract data types (ADT). The ADT defines the logical form of the data type. The data structure implements the physical form of the data type. 6 | Different types of data structures are suited to different kinds of applications, and some are highly specialized to specific tasks. For example, relational databases commonly use B-tree indexes for data retrieval, while compiler implementations usually use hash tables to look up identifiers. 7 | Data structures provide a means to manage large amounts of data efficiently for uses such as large databases and internet indexing services. Usually, efficient data structures are key to designing efficient algorithms. Some formal design methods and programming languages emphasize data structures, rather than algorithms, as the key organizing factor in software design. Data structures can be used to organize the storage and retrieval of information stored in both main memory and secondary memory. 8 | 9 | 10 | == Implementation == 11 | Data structures can be implemented using a variety of programming languages and techniques, but they all share the common goal of efficiently organizing and storing data. Data structures are generally based on the ability of a computer to fetch and store data at any place in its memory, specified by a pointer—a bit string, representing a memory address, that can be itself stored in memory and manipulated by the program. 
Thus, the array and record data structures are based on computing the addresses of data items with arithmetic operations, while the linked data structures are based on storing addresses of data items within the structure itself. This approach to data structuring has profound implications for the efficiency and scalability of algorithms. For instance, the contiguous memory allocation in arrays facilitates rapid access and modification operations, leading to optimized performance in sequential data processing scenarios. 12 | The implementation of a data structure usually requires writing a set of procedures that create and manipulate instances of that structure. The efficiency of a data structure cannot be analyzed separately from those operations. This observation motivates the theoretical concept of an abstract data type, a data structure that is defined indirectly by the operations that may be performed on it, and the mathematical properties of those operations (including their space and time cost). 13 | A landmark 1989 study showed how ordinary pointer-based structures can be transformed into persistent data structures – versions that preserve and share earlier states after updates without asymptotically increasing time or space costs. 14 | 15 | 16 | == Examples == 17 | 18 | There are numerous types of data structures, generally built upon simpler primitive data types. Well known examples are: 19 | 20 | An array is a number of elements in a specific order, typically all of the same type (depending on the language, individual elements may either all be forced to be the same type, or may be of almost any type). Elements are accessed using an integer index to specify which element is required. Typical implementations allocate contiguous memory words for the elements of arrays (but this is not always a necessity). Arrays may be fixed-length or resizable. 21 | A linked list (also just called list) is a linear collection of data elements of any type, called nodes, where each node has itself a value, and points to the next node in the linked list. The principal advantage of a linked list over an array is that values can always be efficiently inserted and removed without relocating the rest of the list. Certain other operations, such as random access to a certain element, are however slower on lists than on arrays. 22 | A record (also called tuple or struct) is an aggregate data structure. A record is a value that contains other values, typically in fixed number and sequence and typically indexed by names. The elements of records are usually called fields or members. In the context of object-oriented programming, records are known as plain old data structures to distinguish them from objects. 23 | Hash tables, also known as hash maps, are data structures that provide fast retrieval of values based on keys. They use a hashing function to map keys to indexes in an array, allowing for constant-time access in the average case. Hash tables are commonly used in dictionaries, caches, and database indexing. However, hash collisions can occur, which can impact their performance. Techniques like chaining and open addressing are employed to handle collisions. 24 | Graphs are collections of nodes connected by edges, representing relationships between entities. Graphs can be used to model social networks, computer networks, and transportation networks, among other things. They consist of vertices (nodes) and edges (connections between nodes). 
Graphs can be directed or undirected, and they can have cycles or be acyclic. Graph traversal algorithms include breadth-first search and depth-first search. 25 | Stacks and queues are abstract data types that can be implemented using arrays or linked lists. A stack has two primary operations: push (adds an element to the top of the stack) and pop (removes the topmost element from the stack), that follow the Last In, First Out (LIFO) principle. Queues have two main operations: enqueue (adds an element to the rear of the queue) and dequeue (removes an element from the front of the queue) that follow the First In, First Out (FIFO) principle. 26 | Trees represent a hierarchical organization of elements. A tree consists of nodes connected by edges, with one node being the root and all other nodes forming subtrees. Trees are widely used in various algorithms and data storage scenarios. Binary trees (particularly heaps), AVL trees, and B-trees are some popular types of trees. They enable efficient and optimal searching, sorting, and hierarchical representation of data. 27 | A trie, or prefix tree, is a special type of tree used to efficiently retrieve strings. In a trie, each node represents a character of a string, and the edges between nodes represent the characters that connect them. This structure is especially useful for tasks like autocomplete, spell-checking, and creating dictionaries. Tries allow for quick searches and operations based on string prefixes. 28 | 29 | 30 | == Language support == 31 | Most assembly languages and some low-level languages, such as BCPL (Basic Combined Programming Language), lack built-in support for data structures. On the other hand, many high-level programming languages and some higher-level assembly languages, such as MASM, have special syntax or other built-in support for certain data structures, such as records and arrays. For example, the C (a direct descendant of BCPL) and Pascal languages support structs and records, respectively, in addition to vectors (one-dimensional arrays) and multi-dimensional arrays. 32 | Most programming languages feature some sort of library mechanism that allows data structure implementations to be reused by different programs. Modern languages usually come with standard libraries that implement the most common data structures. Examples are the C++ Standard Template Library, the Java Collections Framework, and the Microsoft .NET Framework. 33 | Modern languages also generally support modular programming, the separation between the interface of a library module and its implementation. Some provide opaque data types that allow clients to hide implementation details. Object-oriented programming languages, such as C++, Java, and Smalltalk, typically use classes for this purpose. 34 | Many known data structures have concurrent versions which allow multiple computing threads to access a single concrete instance of a data structure simultaneously. 35 | 36 | 37 | == See also == 38 | 39 | 40 | == References == 41 | 42 | 43 | == Bibliography == 44 | Peter Brass, Advanced Data Structures, Cambridge University Press, 2008, ISBN 978-0521880374 45 | Donald Knuth, The Art of Computer Programming, vol. 1. 
Addison-Wesley, 3rd edition, 1997, ISBN 978-0201896831 46 | Dinesh Mehta and Sartaj Sahni, Handbook of Data Structures and Applications, Chapman and Hall/CRC Press, 2004, ISBN 1584884355 47 | Niklaus Wirth, Algorithms and Data Structures, Prentice Hall, 1985, ISBN 978-0130220059 48 | 49 | 50 | == Further reading == 51 | Open Data Structures by Pat Morin 52 | G. H. Gonnet and R. Baeza-Yates, Handbook of Algorithms and Data Structures - in Pascal and C, second edition, Addison-Wesley, 1991, ISBN 0-201-41607-7 53 | Ellis Horowitz and Sartaj Sahni, Fundamentals of Data Structures in Pascal, Computer Science Press, 1984, ISBN 0-914894-94-3 54 | 55 | 56 | == External links == 57 | 58 | Descriptions from the Dictionary of Algorithms and Data Structures 59 | Data structures course 60 | An Examination of Data Structures from .NET perspective 61 | Schaffer, C. Data Structures and Algorithm Analysis -------------------------------------------------------------------------------- /data/extended_wiki_articles/Data_structure_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Data structure", 3 | "url": "https://en.wikipedia.org/wiki/Data_structure", 4 | "fetched_at": "2025-07-16T23:32:13.680703", 5 | "content_file": "Data_structure.txt", 6 | "cleaned_length": 9104 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Database_management_system_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Database management system", 3 | "url": "https://en.wikipedia.org/wiki/Database", 4 | "fetched_at": "2025-07-16T22:50:54.571475", 5 | "content_file": "Database_management_system.txt", 6 | "cleaned_length": 58358 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Decision_tree_learning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Decision tree learning", 3 | "url": "https://en.wikipedia.org/wiki/Decision_tree_learning", 4 | "fetched_at": "2025-07-16T22:51:05.269790", 5 | "content_file": "Decision_tree_learning.txt", 6 | "cleaned_length": 51214 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Deep_learning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Deep learning", 3 | "url": "https://en.wikipedia.org/wiki/Deep_learning", 4 | "fetched_at": "2025-07-16T22:49:42.725813", 5 | "content_file": "Deep_learning.txt", 6 | "cleaned_length": 56760 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Dimensionality_reduction.txt: -------------------------------------------------------------------------------- 1 | Dimensionality reduction, or dimension reduction, is the transformation of data from a high-dimensional space into a low-dimensional space so that the low-dimensional representation retains some meaningful properties of the original data, ideally close to its intrinsic dimension. Working in high-dimensional spaces can be undesirable for many reasons; raw data are often sparse as a consequence of the curse of dimensionality, and analyzing the data is usually computationally intractable. 
Dimensionality reduction is common in fields that deal with large numbers of observations and/or large numbers of variables, such as signal processing, speech recognition, neuroinformatics, and bioinformatics. 2 | Methods are commonly divided into linear and nonlinear approaches. Linear approaches can be further divided into feature selection and feature extraction. Dimensionality reduction can be used for noise reduction, data visualization, cluster analysis, or as an intermediate step to facilitate other analyses. 3 | 4 | 5 | == Feature selection == 6 | 7 | The process of feature selection aims to find a suitable subset of the input variables (features, or attributes) for the task at hand. The three strategies are: the filter strategy (e.g., information gain), the wrapper strategy (e.g., accuracy-guided search), and the embedded strategy (features are added or removed while building the model based on prediction errors). 8 | Data analysis such as regression or classification can be done in the reduced space more accurately than in the original space. 9 | 10 | 11 | == Feature projection == 12 | 13 | Feature projection (also called feature extraction) transforms the data from the high-dimensional space to a space of fewer dimensions. The data transformation may be linear, as in principal component analysis (PCA), but many nonlinear dimensionality reduction techniques also exist. For multidimensional data, tensor representation can be used in dimensionality reduction through multilinear subspace learning. 14 | 15 | 16 | === Principal component analysis (PCA) === 17 | 18 | The main linear technique for dimensionality reduction, principal component analysis, performs a linear mapping of the data to a lower-dimensional space in such a way that the variance of the data in the low-dimensional representation is maximized. In practice, the covariance (and sometimes the correlation) matrix of the data is constructed and the eigenvectors on this matrix are computed. The eigenvectors that correspond to the largest eigenvalues (the principal components) can now be used to reconstruct a large fraction of the variance of the original data. Moreover, the first few eigenvectors can often be interpreted in terms of the large-scale physical behavior of the system, because they often contribute the vast majority of the system's energy, especially in low-dimensional systems. Still, this must be proved on a case-by-case basis as not all systems exhibit this behavior. The original space (with dimension of the number of points) has been reduced (with data loss, but hopefully retaining the most important variance) to the space spanned by a few eigenvectors. 19 | 20 | 21 | === Non-negative matrix factorization (NMF) === 22 | 23 | NMF decomposes a non-negative matrix to the product of two non-negative ones, which has been a promising tool in fields where only non-negative signals exist, such as astronomy. NMF is well known since the multiplicative update rule by Lee & Seung, which has been continuously developed: the inclusion of uncertainties, the consideration of missing data and parallel computation, sequential construction which leads to the stability and linearity of NMF, as well as other updates including handling missing data in digital image processing. 
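(Illustrative aside, not part of the original article: the non-negative factorization described above can be reproduced with scikit-learn's `NMF`; availability of scikit-learn is an assumption here.)

```python
# Hedged illustration of the non-negative factorization X ≈ W @ H.
import numpy as np
from sklearn.decomposition import NMF

rng = np.random.default_rng(0)
X = rng.random((100, 20))            # non-negative data matrix

nmf = NMF(n_components=5, init="nndsvda", max_iter=500, random_state=0)
W = nmf.fit_transform(X)             # (100, 5) non-negative coefficients
H = nmf.components_                  # (5, 20) non-negative basis components

print(np.linalg.norm(X - W @ H))     # reconstruction error of the low-rank approximation
```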
24 | With a stable component basis during construction, and a linear modeling process, sequential NMF is able to preserve the flux in direct imaging of circumstellar structures in astronomy, as one of the methods of detecting exoplanets, especially for the direct imaging of circumstellar discs. In comparison with PCA, NMF does not remove the mean of the matrices, which leads to physical non-negative fluxes; therefore NMF is able to preserve more information than PCA as demonstrated by Ren et al. 25 | 26 | 27 | === Kernel PCA === 28 | 29 | Principal component analysis can be employed in a nonlinear way by means of the kernel trick. The resulting technique is capable of constructing nonlinear mappings that maximize the variance in the data. The resulting technique is called kernel PCA. 30 | 31 | 32 | === Graph-based kernel PCA === 33 | Other prominent nonlinear techniques include manifold learning techniques such as Isomap, locally linear embedding (LLE), Hessian LLE, Laplacian eigenmaps, and methods based on tangent space analysis. These techniques construct a low-dimensional data representation using a cost function that retains local properties of the data, and can be viewed as defining a graph-based kernel for Kernel PCA. 34 | More recently, techniques have been proposed that, instead of defining a fixed kernel, try to learn the kernel using semidefinite programming. The most prominent example of such a technique is maximum variance unfolding (MVU). The central idea of MVU is to exactly preserve all pairwise distances between nearest neighbors (in the inner product space) while maximizing the distances between points that are not nearest neighbors. 35 | An alternative approach to neighborhood preservation is through the minimization of a cost function that measures differences between distances in the input and output spaces. Important examples of such techniques include: classical multidimensional scaling, which is identical to PCA; Isomap, which uses geodesic distances in the data space; diffusion maps, which use diffusion distances in the data space; t-distributed stochastic neighbor embedding (t-SNE), which minimizes the divergence between distributions over pairs of points; and curvilinear component analysis. 36 | A different approach to nonlinear dimensionality reduction is through the use of autoencoders, a special kind of feedforward neural networks with a bottleneck hidden layer. The training of deep encoders is typically performed using a greedy layer-wise pre-training (e.g., using a stack of restricted Boltzmann machines) that is followed by a finetuning stage based on backpropagation. 37 | 38 | 39 | === Linear discriminant analysis (LDA) === 40 | 41 | Linear discriminant analysis (LDA) is a generalization of Fisher's linear discriminant, a method used in statistics, pattern recognition, and machine learning to find a linear combination of features that characterizes or separates two or more classes of objects or events. 42 | 43 | 44 | === Generalized discriminant analysis (GDA) === 45 | GDA deals with nonlinear discriminant analysis using kernel function operator. The underlying theory is close to the support-vector machines (SVM) insofar as the GDA method provides a mapping of the input vectors into high-dimensional feature space. Similar to LDA, the objective of GDA is to find a projection for the features into a lower dimensional space by maximizing the ratio of between-class scatter to within-class scatter. 
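(Illustrative aside, not part of the original article: scikit-learn, assumed available here, ships the linear variant described above as `LinearDiscriminantAnalysis`; kernel/GDA variants are not shown.)

```python
# Hedged sketch: LDA used as supervised dimensionality reduction.
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_iris(return_X_y=True)                 # 4 features, 3 classes

# LDA can project onto at most (n_classes - 1) discriminant axes.
lda = LinearDiscriminantAnalysis(n_components=2)
X_projected = lda.fit_transform(X, y)             # shape: (150, 2)

print(X_projected.shape)
print(lda.explained_variance_ratio_)              # class-separating variance per axis
```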
46 | 47 | 48 | === Autoencoder === 49 | 50 | Autoencoders can be used to learn nonlinear dimension reduction functions and codings together with an inverse function from the coding to the original representation. 51 | 52 | 53 | === t-SNE === 54 | 55 | T-distributed Stochastic Neighbor Embedding (t-SNE) is a nonlinear dimensionality reduction technique useful for the visualization of high-dimensional datasets. It is not recommended for use in analysis such as clustering or outlier detection since it does not necessarily preserve densities or distances well. 56 | 57 | 58 | === UMAP === 59 | 60 | Uniform manifold approximation and projection (UMAP) is a nonlinear dimensionality reduction technique. Visually, it is similar to t-SNE, but it assumes that the data is uniformly distributed on a locally connected Riemannian manifold and that the Riemannian metric is locally constant or approximately locally constant. 61 | 62 | 63 | == Dimension reduction == 64 | For high-dimensional datasets, dimension reduction is usually performed prior to applying a k-nearest neighbors (k-NN) algorithm in order to mitigate the curse of dimensionality. 65 | Feature extraction and dimension reduction can be combined in one step, using principal component analysis (PCA), linear discriminant analysis (LDA), canonical correlation analysis (CCA), or non-negative matrix factorization (NMF) techniques to pre-process the data, followed by clustering via k-NN on feature vectors in a reduced-dimension space. In machine learning, this process is also called low-dimensional embedding. 66 | For high-dimensional datasets (e.g., when performing similarity search on live video streams, DNA data, or high-dimensional time series), running a fast approximate k-NN search using locality-sensitive hashing, random projection, "sketches", or other high-dimensional similarity search techniques from the VLDB conference toolbox may be the only feasible option. 67 | 68 | 69 | == Applications == 70 | A dimensionality reduction technique that is sometimes used in neuroscience is maximally informative dimensions, which finds a lower-dimensional representation of a dataset such that as much information as possible about the original data is preserved. 
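(Illustrative aside, not part of the original article: a hedged sketch of the reduce-then-classify workflow from the Dimension reduction section above, assuming scikit-learn; the digits dataset is used purely for illustration.)

```python
# Hedged sketch: reduce dimensionality with PCA, then classify with k-NN in the reduced space.
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

X, y = load_digits(return_X_y=True)               # 64-dimensional inputs
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

model = make_pipeline(PCA(n_components=16), KNeighborsClassifier(n_neighbors=5))
model.fit(X_train, y_train)
print(model.score(X_test, y_test))                # accuracy after mitigating the curse of dimensionality
```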
71 | 72 | 73 | == See also == 74 | 75 | 76 | == Notes == 77 | 78 | 79 | == References == 80 | 81 | 82 | == External links == 83 | JMLR Special Issue on Variable and Feature Selection 84 | ELastic MAPs Archived 2011-07-20 at the Wayback Machine 85 | Locally Linear Embedding 86 | Visual Comparison of various dimensionality reduction methods 87 | A Global Geometric Framework for Nonlinear Dimensionality Reduction -------------------------------------------------------------------------------- /data/extended_wiki_articles/Dimensionality_reduction_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Dimensionality reduction", 3 | "url": "https://en.wikipedia.org/wiki/Dimensionality_reduction", 4 | "fetched_at": "2025-07-16T22:51:11.989825", 5 | "content_file": "Dimensionality_reduction.txt", 6 | "cleaned_length": 9444 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Electrical_engineering_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Electrical engineering", 3 | "url": "https://en.wikipedia.org/wiki/Electrical_engineering", 4 | "fetched_at": "2025-07-16T22:50:44.894772", 5 | "content_file": "Electrical_engineering.txt", 6 | "cleaned_length": 43228 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Engineering_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Engineering", 3 | "url": "https://en.wikipedia.org/wiki/Engineering", 4 | "fetched_at": "2025-07-16T23:32:11.918992", 5 | "content_file": "Engineering.txt", 6 | "cleaned_length": 34441 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Ensemble_learning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Ensemble learning", 3 | "url": "https://en.wikipedia.org/wiki/Ensemble_learning", 4 | "fetched_at": "2025-07-16T22:51:16.253250", 5 | "content_file": "Ensemble_learning.txt", 6 | "cleaned_length": 28700 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Evolution_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Evolution", 3 | "url": "https://en.wikipedia.org/wiki/Revolution", 4 | "fetched_at": "2025-07-16T23:31:41.124764", 5 | "content_file": "Evolution.txt", 6 | "cleaned_length": 19731 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Evolutionary_computation_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Evolutionary computation", 3 | "url": "https://en.wikipedia.org/wiki/Evolutionary_computation", 4 | "fetched_at": "2025-07-16T22:51:00.955207", 5 | "content_file": "Evolutionary_computation.txt", 6 | "cleaned_length": 15902 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Feature_recognition.txt: -------------------------------------------------------------------------------- 1 | The term "feature" implies different meanings in different engineering disciplines. This has resulted in many ambiguous definitions for feature. 
A feature, in computer-aided design (CAD), usually refers to a region of a part with some interesting geometric or topological properties. These are more precisely called form features. Form features contain both shape information and parametric information of a region of interest. They are now ubiquitous in most current CAD software, where they are used as the primary means of creating 3D geometric models. Examples of form features are extruded boss, loft, etc. Form feature is not the only type of feature that is discussed in CAD literature. Sometimes a part's functional or manufacturing features of the subject of attention. Although it is quite possible to see form features and manufacturing features are called by the same name, they are not exactly the same concepts. For example, one may either use the name "pocket" to refer to a swept cut on the boundary of a part model, or to refer to a trace left on the part boundary by a specific machining operation. The former is exclusively concerned with a geometric shape whereas the latter is concerned with both the geometric shape and a manufacturing operation, needing more parameters in its definition. As such, a manufacturing feature can be minimally defined as a form feature (if it has a form that can uniquely represent it), but not necessarily vice versa (forms can be interpreted differently in different manufacturing domains). Machining features are an important subset of manufacturing features. A machining feature can be regarded as the volume swept by a "cutting" tool, which is always a negative (subtracted) volume. Finally, there is also the concept of assembly feature, which encodes the assembly method between connected components. 2 | Feature data in CAD can be specified either as a collection of surfaces or as volumes. Surface features can be used to describe manufacturing tolerances or locating surfaces in assembly design. Volumetric features on the other hand, can be used in tool path generation, etc. Manufacturing information (particularly in machining) is better portrayed by using volumetric features. 3 | The first published work on features was for the original boundary representation modelling system, BUILD, and was performed by Lyc Kyprianou. Soon other work followed based on different solid representations. Overviews on the work on features can be found in Shah et al.; Subrahmanyam and Wozny; Salomons et al. 4 | 5 | 6 | == Technology == 7 | Work on features (generally called feature technology) can be divided into two rough categories: Design-by-features and Feature recognition. In design-by-features, also known as feature-based design (FBD), feature structures are introduced directly into a model using particular operations or by sewing in shapes. On the other hand, the goal of feature recognition (FR) is to algorithmically extract higher level entities (e.g. manufacturing features) from lower level elements (e.g. surfaces, edges, etc.) of a CAD model. 8 | 9 | 10 | === Form feature generation model === 11 | Completeness of feature set is very subjective, domain dependence eludes a formal definition. Feature generation model proposed by Nalluri and Gurumoorthy attempts to define the completeness of a feature set. They define domain independent form feature as a set of faces with distinct topological and geometric characteristics. They have modelled creation of a form feature as addition/subtraction of feature-solid (exact minimum volume required) to/from based-solid. 
They define feature "Type" based on the local topology of participating base-solid faces and "shape" based on the shape of the feature-solid. Based on these definitions, they have enumerated and classified form features. For example, they have enumerated 94 sweep form feature types, with the possibility of each feature type having an unlimited number of shapes. They provided proof that those 94 types are complete for sweep feature-solids. They have modeled feature extraction as a reverse process of their feature generation model. They have developed a feature recognition algorithm based on the concept of computing the dynamic topological status of faces. They also defined a framework for mapping these domain-independent features to a specific domain of interest. 12 | 13 | 14 | === Design by features === 15 | By using features to build up shape models, the design process is made more efficient, because the shape of features can be pre-defined. Features in FBD can be directly associated with manufacturing information so that this information can be retrieved in downstream applications. In this way, an overall CAD/CAM system can be fully automated; however, the idea of using manufacturing features to design a part has its own shortcomings: The features used to design the part do not necessarily represent the best way to manufacture it. It is, therefore, the designer's responsibility to evaluate all methods that can produce the part. Furthermore, manufacturing features are not the most natural way of designing a part. 16 | 17 | 18 | === Feature recognition === 19 | The method proposed by Kyprianou was aimed at encoding parts for group technology (GT). The purpose of GT is to systematically classify objects based on their manufacturing method. Kyprianou's work involved classifying faces into primary and secondary groups and then identifying features according to patterns of these primary or secondary faces. A primary face is one with multiple boundaries (also called "hole-loops") or mixed concave and convex boundaries. A concave boundary is a set of concave edges, where the solid angle over the edge is more than 180 degrees. Secondary faces are all other faces. Kyprianou's work was continued and extended by Jared et al. to cover a number of important special cases where features interacted. 20 | Automatic Feature Recognition (AFR) is regarded as an ideal solution to automate design and manufacturing processes. Successful automation of CAD and CAM systems is a vital connection in building Computer Integrated Manufacturing (CIM) systems. This is the part of the FR research that has attracted much of the attention. Another important application of AFR is for manufacturability evaluation. The AFR system should be able to interpret the design differently based on alternative features and feed back the manufacturability and cost of those interpretations to the designer. 21 | A large number of different AFR techniques have been proposed for CAD/CAM integration and process planning. Han et al. provides a critical and detailed analysis of some of the existing approaches. The most common methods according to Han et al. range from graph-based algorithms to hint-based and volumetric decomposition techniques. In graph-based feature recognition, a graph showing the topology of the part (connection of faces) is created. The graph is often attributed; for example, the edges are marked as concave or convex. This graph is then analyzed to extract subsets of nodes and arcs that match with any predefined template.
This is done by a variety of techniques, including graph isomorphism algorithms. 22 | Graph-based approaches have been criticized for several shortcomings. They fail to account for manufacturability of the recognized features due to their strong reliance on topological patterns rather than geometry. The intersection of features causes an explosion in the number of possible feature patterns that spoils any attempt to formulate feature patterns. To address these difficulties, Vandenbrande and Requicha proposed to search for the "minimal indispensable portion of a feature's boundary", called hints, rather than complete feature patterns. For example, the presence of two opposing planar faces is a hint for the potential existence of a slot feature. Hints are not necessarily restricted to the part geometry. They can be extracted from tolerances and design attributes as well. For example, "a thread attribute may be taken as a hole hint". This approach has been more successful in recognizing intersecting features. However, the efficiency of the approach has been questioned, as there could be a huge number of traces that do not lead to valid features. Some authors have been in favor of using a hybrid of graph-based and hint-based FR to improve the efficiency of hint-based reasoning. In the hybrid approach, graph-based reasoning is used to find out those regions of the part that certainly lead to valid features when used by the hint-based reasoner. Other existing FR approaches are volumetric decomposition, Artificial Neural Networks, and expert systems; Babic et al. briefly introduce many of them. 23 | However, building feature recognition systems that function effectively on real industrial products has been elusive. A real product with hundreds of faces and edges brings almost all the above approaches to a halt due to computational complexity. Furthermore, the features studied in these approaches are usually oversimplified. The bulk of the feature recognition literature normally deals with 2.5D features (those made by sweeping a 2D profile along a linear axis). Graph representations, hint definitions or volume decompositions are much more difficult to define for 3D and free form features. The work done by Sundararajan is focused on free form surfaces, but again it is limited in application. Oversimplification is also evident even in the case of 2.5D features. For example, feature recognition algorithms usually assume sharp concave edges in the feature geometry. However, such edges are barely used in the real design of mechanical components due to manufacturing constraints. Some of these issues, such as the presence of filleted edges and free form surfaces in the model, have been studied by Rahmani and Arezoo. 24 | 25 | 26 | === Commercial feature recognition systems === 27 | A few commercial feature recognition systems are also available. Though feature recognition technology can be applied for various applications, commercial software has effectively adopted feature recognition technology for recreating the feature tree from imported models so that even imported models can be edited as if they were native solid models. Major 3D CAD modelers have feature recognition to convert imported 3-D models into native feature based models. CAM software and design for manufacturing software are also built using this feature recognition technology. Some CAD/CAM software packages use a commercially available third-party feature recognition library, which recognizes various features from 3-D B-Rep models.
Separate libraries are available for Design, Manufacturing and Sheet metal applications. The design feature recognition library can identify features such as holes of various types, split holes, hole-chains, fillets, chamfers, cut extrudes, boss extrudes, drafted extrudes, revolved cuts, revolved bosses, ribs, drafts, lofts and sweeps. The manufacturing feature recognition library provides recognition of manufacturing features such as simple holes, tapered holes, counter-bore holes, counter-sunk holes, counter-drilled holes, hole-chains, hole patterns such as linear, rectangular and circular patterns, fillets, chamfers, blind pockets, through pockets, drafted pockets, filleted and chamfered pockets, simple slots, drafted slots, filleted and chamfered slots, islands in pockets and slots, machinable volumes, machinable slabs, multiple intersecting features, axi-symmetric features such as external turned profiles, internal turned profiles, turned grooves such as vee and dovetail grooves, and mill-turn features such as slots and pockets in turned profiles. The sheet metal feature recognition library extracts features from a sheet metal perspective. Various features identified through this library include walls, bends, holes, cutouts, flanged holes, flanged cutouts, notches, open hems, closed hems, teardrop hems, rolled hems (curls), jog flanges, edge flanges, contour flanges, and stamps such as louver, lance, bridge, dimple, beads, embosses and ribs. Though such commercial systems can identify the variety of features listed above, further research is needed to identify feature types that are not recognized by such commercial systems. Recognition of manufacturing features such as 3-axis and 5-axis features is generally not available in such commercial systems.
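As an illustration of the graph-based recognition idea described above (an attributed face-adjacency graph matched against predefined feature templates), the following minimal Python sketch uses the networkx package. The part graph, the "convexity" edge attribute, and the slot template are hypothetical examples for this sketch, not code from any library or commercial system mentioned in this article.

```python
import networkx as nx
from networkx.algorithms import isomorphism

# Face-adjacency graph of a hypothetical part: nodes are faces,
# edges carry a 'convexity' attribute ('concave' or 'convex').
part = nx.Graph()
part.add_edge("f1", "f2", convexity="convex")
part.add_edge("f2", "f3", convexity="concave")   # slot wall meets slot floor
part.add_edge("f3", "f4", convexity="concave")   # slot floor meets other wall
part.add_edge("f4", "f5", convexity="convex")

# Template for a simple slot: two walls joined to a floor by concave edges.
slot = nx.Graph()
slot.add_edge("wall_a", "floor", convexity="concave")
slot.add_edge("floor", "wall_b", convexity="concave")

# Subgraph isomorphism with matching edge attributes finds the template.
matcher = isomorphism.GraphMatcher(
    part, slot,
    edge_match=isomorphism.categorical_edge_match("convexity", None),
)
print("slot-like pattern found:", matcher.subgraph_is_isomorphic())  # True
```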
28 | 29 | 30 | == See also == 31 | Parametric feature based modeler 32 | Space mapping 33 | STEP-NC 34 | 35 | 36 | == References == 37 | 38 | 39 | == External links == 40 | Feature Recognition—The Missing Link To Automated CAM 41 | Manufacturing feature recognition towards integration with process planning 42 | Manufacturing feature recognition: a status report 43 | hybrid hint-based and graph-based feature recognition -------------------------------------------------------------------------------- /data/extended_wiki_articles/Feature_recognition_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Feature recognition", 3 | "url": "https://en.wikipedia.org/wiki/Feature_recognition", 4 | "fetched_at": "2025-07-16T23:30:55.015122", 5 | "content_file": "Feature_recognition.txt", 6 | "cleaned_length": 12694 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Feature_selection_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Feature selection", 3 | "url": "https://en.wikipedia.org/wiki/Feature_selection", 4 | "fetched_at": "2025-07-16T22:51:14.227121", 5 | "content_file": "Feature_selection.txt", 6 | "cleaned_length": 54157 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Fuzzy_logic_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Fuzzy logic", 3 | "url": "https://en.wikipedia.org/wiki/Fuzzy_logic", 4 | "fetched_at": "2025-07-16T22:51:02.954704", 5 | "content_file": "Fuzzy_logic.txt", 6 | "cleaned_length": 29876 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Generative_pre_trained_transformer_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Generative pre-trained transformer", 3 | "url": "https://en.wikipedia.org/wiki/Generative_pre-trained_transformer", 4 | "fetched_at": "2025-07-16T22:49:52.753177", 5 | "content_file": "Generative_pre_trained_transformer.txt", 6 | "cleaned_length": 14466 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Graph_theory_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Graph theory", 3 | "url": "https://en.wikipedia.org/wiki/Graph_theory", 4 | "fetched_at": "2025-07-16T22:50:42.592294", 5 | "content_file": "Graph_theory.txt", 6 | "cleaned_length": 40795 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/History_of_artificial_intelligence_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "History of artificial intelligence", 3 | "url": "https://en.wikipedia.org/wiki/History_of_artificial_intelligence", 4 | "fetched_at": "2025-07-16T22:50:32.120371", 5 | "content_file": "History_of_artificial_intelligence.txt", 6 | "cleaned_length": 79930 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Humancomputer_interaction_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Human\u2013computer interaction", 3 | "url": 
"https://en.wikipedia.org/wiki/Human%E2%80%93computer_interaction", 4 | "fetched_at": "2025-07-16T22:50:24.354244", 5 | "content_file": "Humancomputer_interaction.txt", 6 | "cleaned_length": 30452 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Information_theory_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Information theory", 3 | "url": "https://en.wikipedia.org/wiki/Information_theory", 4 | "fetched_at": "2025-07-16T22:50:13.560576", 5 | "content_file": "Information_theory.txt", 6 | "cleaned_length": 51511 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/John_von_Neumann_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "John von Neumann", 3 | "url": "https://en.wikipedia.org/wiki/John_von_Neumann", 4 | "fetched_at": "2025-07-16T23:32:03.705010", 5 | "content_file": "John_von_Neumann.txt", 6 | "cleaned_length": 100075 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Kernel_computing.txt: -------------------------------------------------------------------------------- 1 | In computing, a compute kernel is a routine compiled for high throughput accelerators (such as graphics processing units (GPUs), digital signal processors (DSPs) or field-programmable gate arrays (FPGAs)), separate from but used by a main program (typically running on a central processing unit). They are sometimes called compute shaders, sharing execution units with vertex shaders and pixel shaders on GPUs, but are not limited to execution on one class of device, or graphics APIs. 2 | 3 | 4 | == Description == 5 | Compute kernels roughly correspond to inner loops when implementing algorithms in traditional languages (except there is no implied sequential operation), or to code passed to internal iterators. 6 | They may be specified by a separate programming language such as "OpenCL C" (managed by the OpenCL API), as "compute shaders" written in a shading language (managed by a graphics API such as OpenGL), or embedded directly in application code written in a high level language, as in the case of C++AMP. Microsoft support this as DirectCompute. 7 | 8 | 9 | == Vector processing == 10 | This programming paradigm maps well to vector processors: there is an assumption that each invocation of a kernel within a batch is independent, allowing for data parallel execution. However, atomic operations may sometimes be used for synchronization between elements (for interdependent work), in some scenarios. Individual invocations are given indices (in 1 or more dimensions) from which arbitrary addressing of buffer data may be performed (including scatter gather operations), so long as the non-overlapping assumption is respected. 11 | 12 | 13 | == Vulkan API == 14 | The Vulkan API provides the intermediate SPIR-V representation to describe both Graphical Shaders, and Compute Kernels, in a language independent and machine independent manner. The intention is to facilitate language evolution and provide a more natural ability to leverage GPU compute capabilities, in line with hardware developments such as Unified Memory Architecture and Heterogeneous System Architecture. This allows closer cooperation between a CPU and GPU. 
15 | 16 | 17 | == LLM Kernel Generation == 18 | Much work has been done in the field of kernel generation through LLMs as a means of optimizing code. KernelBench, created by the Scaling Intelligence Lab at Stanford, provides a framework to evaluate the ability of LLMs to generate efficient GPU kernels. 19 | Cognition has created Kevin 32-B to generate efficient CUDA kernels; it is currently the highest-performing model on KernelBench. 20 | 21 | 22 | == See also == 23 | 24 | 25 | == References == -------------------------------------------------------------------------------- /data/extended_wiki_articles/Kernel_computing_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Kernel (computing)", 3 | "url": "https://en.wikipedia.org/wiki/Compute_kernel", 4 | "fetched_at": "2025-07-16T23:30:44.437478", 5 | "content_file": "Kernel_computing.txt", 6 | "cleaned_length": 2518 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Knowledge_representation_and_reasoning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Knowledge representation and reasoning", 3 | "url": "https://en.wikipedia.org/wiki/Knowledge_representation_and_reasoning", 4 | "fetched_at": "2025-07-16T22:50:29.670985", 5 | "content_file": "Knowledge_representation_and_reasoning.txt", 6 | "cleaned_length": 28937 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Layer_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Layer", 3 | "url": "https://en.wikipedia.org/wiki/Lawyer", 4 | "fetched_at": "2025-07-16T23:30:48.448909", 5 | "content_file": "Layer.txt", 6 | "cleaned_length": 37718 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/LilianWeng_Contrastive_Representation_Learning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Contrastive Representation Learning", 3 | "source": "Lilian Weng's Blog", 4 | "url": "https://lilianweng.github.io/posts/2021-05-31-contrastive/", 5 | "word_count": 8395, 6 | "character_count": 61991, 7 | "fetched_at": "2025-07-17T00:51:55.062412", 8 | "confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/LilianWeng_Flow-based_Deep_Generative_Models_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Flow-based Deep Generative Models", 3 | "source": "Lilian Weng's Blog", 4 | "url": "https://lilianweng.github.io/posts/2018-10-13-flow-models/", 5 | "word_count": 4362, 6 | "character_count": 31458, 7 | "fetched_at": "2025-07-17T00:51:50.225518", 8 | "confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/LilianWeng_LLM_Powered_Autonomous_Agents_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "LLM Powered Autonomous Agents", 3 | "source": "Lilian Weng's Blog", 4 | "url": "https://lilianweng.github.io/posts/2023-06-23-agent/", 5 | "word_count": 5183, 6 | "character_count": 33160, 7 | "fetched_at": "2025-07-17T00:51:43.716804", 8 |
"confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/LilianWeng_Learning_with_not_Enough_Data_Part_3_Active_Learning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Learning with not Enough Data Part 3: Active Learning", 3 | "source": "Lilian Weng's Blog", 4 | "url": "https://lilianweng.github.io/posts/2022-02-20-active-learning/", 5 | "word_count": 4607, 6 | "character_count": 32693, 7 | "fetched_at": "2025-07-17T00:51:52.672731", 8 | "confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/LilianWeng_The_Transformer_Family_Version_20_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "The Transformer Family Version 2.0", 3 | "source": "Lilian Weng's Blog", 4 | "url": "https://lilianweng.github.io/posts/2023-01-27-the-transformer-family-v2/", 5 | "word_count": 9680, 6 | "character_count": 68139, 7 | "fetched_at": "2025-07-17T00:51:45.915002", 8 | "confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/LilianWeng_What_are_Diffusion_Models_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "What are Diffusion Models?", 3 | "source": "Lilian Weng's Blog", 4 | "url": "https://lilianweng.github.io/posts/2021-07-11-diffusion-models/", 5 | "word_count": 6691, 6 | "character_count": 54088, 7 | "fetched_at": "2025-07-17T00:51:48.069634", 8 | "confusion_article": true, 9 | "arxiv_id": "" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Linear_algebra_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Linear algebra", 3 | "url": "https://en.wikipedia.org/wiki/Linear_algebra", 4 | "fetched_at": "2025-07-16T22:50:02.167899", 5 | "content_file": "Linear_algebra.txt", 6 | "cleaned_length": 61084 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/MORE_application.txt: -------------------------------------------------------------------------------- 1 | MORE is an outline processor application that was created for the Macintosh in 1986 by software developer Dave Winer and that was not ported to any other platforms. An earlier outliner, ThinkTank, was developed by Winer, his brother Peter, and Doug Baron for Apple II, Apple III, IBM PC compatiblesl then ported by Peter to the Macintosh. 2 | MORE was the result of combining three planned products into one expanding around the outliner, and described by its author as an outline processor. In MORE, the outlines could be formatted with different layouts, colors, and shapes. Outline "nodes" could include pictures and graphics. 3 | The company that made these products, Living Videotext, merged with Symantec in July 1987. Around July 1999, with Symantec's permission, Mr. Winer released versions of the ThinkTank and MORE products on a Web site for free download. 
4 | 5 | 6 | == Functions == 7 | In 1987, MORE had evolved into a tool that was used to create presentations, a category that ultimately would be dominated by PowerPoint when Microsoft chose to pass up on acquiring Living Videotext. 8 | Functions in these outliners included: 9 | 10 | Appending notes, comments, rough drafts of sentences and paragraphs under some topics 11 | Assembling various low-level topics and creating a new topic to group them under 12 | Deleting duplicate topics 13 | Demoting a topic to become a subtopic under some other topic 14 | Disassembling a grouping that does not work, parceling its subtopics out among various other topics 15 | Dividing one topic into its component subtopics 16 | Dragging to rearrange the order of topics 17 | Making a hierarchical list of topics 18 | Merging related topics 19 | Promoting a subtopic to the level of a topic 20 | 21 | 22 | == Reception == 23 | Mick O'Neill of Personal Computer World in January 1987 praised Living Videotext for "having comprehensively responded to every major shortcoming" of ThinkTank with MORE. He cited the ability to open multiple files simultaneously, file export of outlines, detailed documentation, and lack of copy protection as among virtues, concluding that "No matter what your requirements, I suspect that you'll find MORE a remarkable package". 24 | MORE was selected for the 2nd Annual Editor's Choice awards of the magazine MacUser as Best Organizer and Best Product of 1986. 25 | 26 | 27 | == References == 28 | 29 | 30 | == External links == 31 | Outliners.com Archived information by Dave Winer 32 | How Electronic Outlining Can Help You Create Online Materials at The Communication Circle -------------------------------------------------------------------------------- /data/extended_wiki_articles/MORE_application_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "MORE (application)", 3 | "url": "https://en.wikipedia.org/wiki/MORE_(application)", 4 | "fetched_at": "2025-07-16T23:30:40.637139", 5 | "content_file": "MORE_application.txt", 6 | "cleaned_length": 2405 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Mathematical_optimization_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Mathematical optimization", 3 | "url": "https://en.wikipedia.org/wiki/Mathematical_optimization", 4 | "fetched_at": "2025-07-16T23:31:34.030362", 5 | "content_file": "Mathematical_optimization.txt", 6 | "cleaned_length": 37163 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Mathematics_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Mathematics", 3 | "url": "https://en.wikipedia.org/wiki/Mathematics", 4 | "fetched_at": "2025-07-16T23:31:58.322877", 5 | "content_file": "Mathematics.txt", 6 | "cleaned_length": 54781 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Neuroscience_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Neuroscience", 3 | "url": "https://en.wikipedia.org/wiki/Neuroscience", 4 | "fetched_at": "2025-07-16T22:50:06.334265", 5 | "content_file": "Neuroscience.txt", 6 | "cleaned_length": 26068 7 | } 
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Node.txt: -------------------------------------------------------------------------------- 1 | In communications and information processing, a code is a system of rules to convert information—such as a letter, word, sound, image, or gesture—into another form, sometimes shortened or secret, for communication through a communication channel or storage in a storage medium. An early example is the invention of language, which enabled a person, through speech, to communicate what they thought, saw, heard, or felt to others. But speech limits the range of communication to the distance a voice can carry and limits the audience to those present when the speech is uttered. The invention of writing, which converted spoken language into visual symbols, extended the range of communication across space and time. 2 | The process of encoding converts information from a source into symbols for communication or storage. Decoding is the reverse process, converting code symbols back into a form that the recipient understands, such as English, Spanish, etc. 3 | One reason for coding is to enable communication in places where ordinary plain language, spoken or written, is difficult or impossible. For example, semaphore, where the configuration of flags held by a signaler or the arms of a semaphore tower encodes parts of the message, typically individual letters, and numbers. Another person standing a great distance away can interpret the flags and reproduce the words sent. 4 | 5 | 6 | == Theory == 7 | 8 | In information theory and computer science, a code is usually considered as an algorithm that uniquely represents symbols from some source alphabet, by encoded strings, which may be in some other target alphabet. An extension of the code for representing sequences of symbols over the source alphabet is obtained by concatenating the encoded strings. 9 | Before giving a mathematically precise definition, this is a brief example. The mapping C = {a ↦ 0, b ↦ 01, c ↦ 011} is a code, whose source alphabet is the set {a, b, c} and whose target alphabet is the set {0, 1}. Using the extension of the code, the encoded string 0011001 can be grouped into codewords as 0 011 0 01, and these in turn can be decoded to the sequence of source symbols acab. 65 | Using terms from formal language theory, the precise mathematical definition of this concept is as follows: let S and T be two finite sets, called the source and target alphabets, respectively. A code C : S → T* is a total function mapping each symbol from S to a sequence of symbols over T.
The extension C′ of C is a homomorphism of S* into T*, which naturally maps each sequence of source symbols to a sequence of target symbols. 134 | 135 | 136 | === Variable-length codes === 137 | 138 | In this section, we consider codes that encode each source (clear text) character by a code word from some dictionary, and concatenation of such code words gives us an encoded string. Variable-length codes are especially useful when clear text characters have different probabilities; see also entropy encoding. 139 | A prefix code is a code with the "prefix property": there is no valid code word in the system that is a prefix (start) of any other valid code word in the set. Huffman coding is the best-known algorithm for deriving prefix codes. Prefix codes are widely referred to as "Huffman codes" even when the code was not produced by a Huffman algorithm. Other examples of prefix codes are telephone country codes, the country and publisher parts of ISBNs, and the Secondary Synchronization Codes used in the UMTS WCDMA 3G Wireless Standard. 140 | Kraft's inequality characterizes the sets of codeword lengths that are possible in a prefix code. Virtually any uniquely decodable one-to-many code, not necessarily a prefix one, must satisfy Kraft's inequality. 141 | 142 | 143 | === Error-correcting codes === 144 | 145 | Codes may also be used to represent data in a way more resistant to errors in transmission or storage. This so-called error-correcting code works by including carefully crafted redundancy with the stored (or transmitted) data. Examples include Hamming codes, Reed–Solomon, Reed–Muller, Walsh–Hadamard, Bose–Chaudhuri–Hocquenghem, Turbo, Golay, algebraic geometry codes, low-density parity-check codes, and space–time codes. 146 | Error detecting codes can be optimised to detect burst errors or random errors. 147 | 148 | 149 | == Examples == 150 | 151 | 152 | === Codes in communication used for brevity === 153 | 154 | A cable code replaces words (e.g. ship or invoice) with shorter words, allowing the same information to be sent with fewer characters, more quickly, and less expensively. 155 | Codes can be used for brevity. When telegraph messages were the state of the art in rapid long-distance communication, elaborate systems of commercial codes that encoded complete phrases into single words (commonly five-letter groups) were developed, so that telegraphers became conversant with such "words" as BYOXO ("Are you trying to weasel out of our deal?"), LIOUY ("Why do you not answer my question?"), BMULD ("You're a skunk!"), or AYYLU ("Not clearly coded, repeat more clearly."). Code words were chosen for various reasons: length, pronounceability, etc. Meanings were chosen to fit perceived needs: commercial negotiations, military terms for military codes, diplomatic terms for diplomatic codes, any and all of the preceding for espionage codes. Codebooks and codebook publishers proliferated, including one run as a front for the American Black Chamber run by Herbert Yardley between the First and Second World Wars. The purpose of most of these codes was to save on cable costs.
The use of data coding for data compression predates the computer era; an early example is the telegraph Morse code where more-frequently used characters have shorter representations. Techniques such as Huffman coding are now used by computer-based algorithms to compress large data files into a more compact form for storage or transmission. 156 | 157 | 158 | === Character encoding === 159 | 160 | A character encoding describes how character-based data (text) is encoded. Antiquated encoding systems used a fixed number of bits, ranging from 4 to 7, but modern systems use one or more 8-bit bytes for each character. ASCII, the dominant system for decades, uses one byte for each character and can therefore encode up to 256 different characters. To support natural languages with more characters, other systems were invented that use more than one byte or a variable number of bytes for each character. A writing system with a large character set such as Chinese, Japanese and Korean can be represented with a multibyte encoding. Early multibyte encodings were fixed-length, meaning that each character is represented by the same number of bytes, making them suitable for decoding via a lookup table. On the other hand, a variable-width encoding is more complex to decode since it cannot be decoded via a single lookup table and must be processed sequentially, but it supports a more efficient representation of a large character set by using a smaller representation for more commonly used characters. Today, UTF-8, an encoding of the Unicode character set, is the most common text encoding used on the Internet. 161 | 162 | 163 | === Genetic code === 164 | 165 | Biological organisms contain genetic material that is used to control their function and development. This is DNA, which contains units named genes from which messenger RNA is derived. This in turn produces proteins through a genetic code in which a series of triplets (codons) of four possible nucleotides can be translated into one of twenty possible amino acids. A sequence of codons results in a corresponding sequence of amino acids that form a protein molecule; a type of codon called a stop codon signals the end of the sequence. 166 | 167 | 168 | === Gödel code === 169 | In mathematics, a Gödel code is the basis for the proof of Gödel's incompleteness theorem. Here, the idea is to map mathematical notation to a natural number (using a Gödel numbering). 170 | 171 | 172 | === Other === 173 | There are codes using colors, like traffic lights, the color code employed to mark the nominal value of the electrical resistors or that of the trashcans devoted to specific types of garbage (paper, glass, organic, etc.). 174 | In marketing, coupon codes can be used for a financial discount or rebate when purchasing a product from a (usually internet) retailer. 175 | In military environments, specific sounds with the cornet are used for different uses: to mark some moments of the day, to command the infantry on the battlefield, etc. 176 | Communication systems for sensory impairments, such as sign language for deaf people and braille for blind people, are based on movement or tactile codes. 177 | Musical scores are the most common way to encode music. 178 | Specific games have their own code systems to record the matches, e.g. chess notation. 179 | 180 | 181 | === Cryptography === 182 | In the history of cryptography, codes were once common for ensuring the confidentiality of communications, although ciphers are now used instead.
183 | Secret codes intended to obscure the real messages, ranging from serious (mainly espionage in military, diplomacy, business, etc.) to trivial (romance, games), can be any kind of imaginative encoding: flowers, game cards, clothes, fans, hats, melodies, birds, etc., in which the sole requirement is the pre-agreement on the meaning by both the sender and the receiver. 184 | 185 | 186 | == Other examples == 187 | Other examples of encoding include: 188 | 189 | Encoding (in cognition) - a basic perceptual process of interpreting incoming stimuli; technically speaking, it is a complex, multi-stage process of converting relatively objective sensory input (e.g., light, sound) into a subjectively meaningful experience. 190 | Text encoding uses a markup language to tag the structure and other features of a text to facilitate processing by computers. (See also Text Encoding Initiative.) 191 | Semantics encoding of formal language A in formal language B is a method of representing all terms (e.g. programs or descriptions) of language A using language B. 192 | Data compression transforms a signal into a code optimized for transmission or storage, generally done with a codec. 193 | Neural encoding - the way in which information is represented in neurons. 194 | Memory encoding - the process of converting sensations into memories. 195 | Television encoding: NTSC, PAL and SECAM 196 | Other examples of decoding include: 197 | 198 | Decoding (computer science) 199 | Decoding methods, methods in communication theory for decoding codewords sent over a noisy channel 200 | Digital signal processing, the study of signals in a digital representation and the processing methods of these signals 201 | Digital-to-analog converter, the use of analog circuits for decoding operations 202 | Word decoding, the use of phonics to decipher print patterns and translate them into the sounds of language 203 | 204 | 205 | == Codes and acronyms == 206 | Acronyms and abbreviations can be considered codes, and in a sense, all languages and writing systems are codes for human thought. 207 | International Air Transport Association airport codes are three-letter codes used to designate airports and used for bag tags. Station codes are similarly used on railways but are usually national, so the same code can be used for different stations if they are in different countries. 208 | Occasionally, a code word achieves an independent existence (and meaning) while the original equivalent phrase is forgotten or at least no longer has the precise meaning attributed to the code word. For example, '30' was widely used in journalism to mean "end of story", and has been used in other contexts to signify "the end". 209 | 210 | 211 | == See also == 212 | 213 | ADDML 214 | Asemic writing 215 | Cipher 216 | Code (semiotics) 217 | Cultural code 218 | Equipment codes 219 | Quantum error correction 220 | Semiotics 221 | Universal language 222 | 223 | 224 | == References == 225 | 226 | Chevance, Fabienne (2017). "Case for the genetic code as a triplet of triplets". Proceedings of the National Academy of Sciences of the United States of America. 114 (18): 4745–4750. Bibcode:2017PNAS..114.4745C. doi:10.1073/pnas.1614896114. PMC 5422812. PMID 28416671. 227 | 228 | 229 | == Further reading == 230 | Codes and Abbreviations for the Use of the International Telecommunication Services (2nd ed.). Geneva, Switzerland: International Telecommunication Union. 1963. OCLC 13677884.
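The worked example from the Theory section above (the code C = {a ↦ 0, b ↦ 01, c ↦ 011} and the encoded string 0011001 decoding to acab) can be checked with a short Python sketch. It exploits the fact that every codeword of this particular code is a 0 followed by some 1s, so each new 0 starts a new codeword; it is only an illustration of this example, not a general decoder for arbitrary uniquely decodable codes.

```python
import re

CODE = {"a": "0", "b": "01", "c": "011"}
DECODE = {codeword: symbol for symbol, codeword in CODE.items()}

def encode(message: str) -> str:
    # Extension of the code: concatenate the codeword of each source symbol.
    return "".join(CODE[symbol] for symbol in message)

def decode(encoded: str) -> str:
    # Each codeword is '0' followed by zero or more '1's, so every '0'
    # starts a new codeword; this recovers the grouping 0 011 0 01.
    codewords = re.findall(r"01*", encoded)
    return "".join(DECODE[w] for w in codewords)

assert encode("acab") == "0011001"
assert decode("0011001") == "acab"
print(decode("0011001"))  # -> acab
```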
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Node_computer_science.txt: -------------------------------------------------------------------------------- 1 | A node is a basic unit of a data structure, such as a linked list or tree data structure. Nodes contain data and also may link to other nodes. Links between nodes are often implemented by pointers. 2 | 3 | 4 | == Nodes and trees == 5 | 6 | Nodes are often arranged into tree structures. A node represents the information contained in a single data structure. These nodes may contain a value or condition, or possibly serve as another independent data structure. Nodes are represented by a single parent node. The highest point on a tree structure is called a root node, which does not have a parent node, but serves as the parent or 'grandparent' of all of the nodes below it in the tree. The height of a node is determined by the total number of edges on the path from that node to the furthest leaf node, and the height of the tree is equal to the height of the root node. Node depth is determined by the distance between that particular node and the root node. The root node is said to have a depth of zero. Data can be discovered along these network paths. 7 | An IP address uses this kind of system of nodes to define its location in a network. 8 | 9 | 10 | === Definitions === 11 | Child: A child node is a node extending from another node. For example, a computer with internet access could be considered a child node of a node representing the internet. The inverse relationship is that of a parent node. If node C is a child of node A, then A is the parent node of C. 12 | Degree: the degree of a node is the number of children of the node. 13 | Depth: the depth of node A is the length of the path from A to the root node. The root node is said to have depth 0. 14 | Edge: the connection between nodes. 15 | Forest: a set of trees. 16 | Height: the height of node A is the length of the longest path through children to a leaf node. 17 | Internal node: a node with at least one child. 18 | Leaf node: a node with no children. 19 | Root node: a node distinguished from the rest of the tree nodes. Usually, it is depicted as the highest node of the tree. 20 | Sibling nodes: these are nodes connected to the same parent node. 21 | 22 | 23 | == Markup languages == 24 | Another common use of node trees is in web development. In programming, XML is used to communicate information between computer programmers and computers alike. For this reason, XML is used to create common communication protocols used in office productivity software, and serves as the base for the development of modern web markup languages like XHTML. Though similar in how they are approached by a programmer, HTML and CSS are typically the languages used to develop website text and design. While XML, HTML and XHTML provide the language and expression, the DOM serves as a translator. 25 | 26 | 27 | === Node type === 28 | Different types of nodes in a tree are represented by specific interfaces. In other words, the node type is defined by how it communicates with other nodes. Each node has a node type property, which specifies the type of node, such as sibling or leaf. 29 | For example, the node type property is one of the constant properties for a node; it specifies the type of the node. So if a node's type property is the constant ELEMENT_NODE, one can know that this node object is an Element object.
This object uses the Element interface to define all the methods and properties of that particular node. 30 | Different W3C World Wide Web Consortium node types and descriptions: 31 | 32 | Document represents the entire document (the root-node of the DOM tree) 33 | DocumentFragment represents a "lightweight" Document object, which can hold a portion of a document 34 | DocumentType provides an interface to the entities defined for the document 35 | ProcessingInstruction represents a processing instruction 36 | EntityReference represents an entity reference 37 | Element represents an element 38 | Attr represents an attribute 39 | Text represents textual content in an element or attribute 40 | CDATASection represents a CDATA section in a document (text that will NOT be parsed by a parser) 41 | Comment represents a comment 42 | Entity represents an entity 43 | Notation represents a notation declared in the DTD 44 | 45 | 46 | === Node object === 47 | A node object is represented by a single node in a tree. It can be an element node, attribute node, text node, or any type that is described in section "node type". All objects can inherit properties and methods for dealing with parent and child nodes, but not all of the objects have parent or child nodes. For example, with text nodes that cannot have child nodes, trying to add child nodes results in a DOM error. 48 | Objects in the DOM tree may be addressed and manipulated by using methods on the objects. The public interface of a DOM is specified in its application programming interface (API). The history of the Document Object Model is intertwined with the history of the "browser wars" of the late 1990s between Netscape Navigator and Microsoft Internet Explorer, as well as with that of JavaScript and JScript, the first scripting languages to be widely implemented in the layout engines of web browsers. 49 | 50 | 51 | == See also == 52 | Vertex (graph theory) 53 | 54 | 55 | == References == 56 | 57 | 58 | == External links == 59 | Data Trees as a Means of Presenting Complex Data Analysis by Sally Knipe 60 | STL-like C++ tree class Archived 2020-11-26 at the Wayback Machine 61 | Description of tree data structures from ideainfo.8m.com 62 | WormWeb.org: Interactive Visualization of the C. elegans Cell Tree - Visualize the entire cell lineage tree of the nematode C. 
elegans (javascript) -------------------------------------------------------------------------------- /data/extended_wiki_articles/Node_computer_science_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Node (computer science)", 3 | "url": "https://en.wikipedia.org/wiki/Node_(computer_science)", 4 | "fetched_at": "2025-07-16T23:31:24.166358", 5 | "content_file": "Node_computer_science.txt", 6 | "cleaned_length": 5344 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Node_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Node", 3 | "url": "https://en.wikipedia.org/wiki/Code", 4 | "fetched_at": "2025-07-16T23:30:46.437900", 5 | "content_file": "Node.txt", 6 | "cleaned_length": 12933 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Operating_system_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Operating system", 3 | "url": "https://en.wikipedia.org/wiki/Operating_system", 4 | "fetched_at": "2025-07-16T23:31:20.992408", 5 | "content_file": "Operating_system.txt", 6 | "cleaned_length": 37723 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Optimization_mathematics_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Optimization (mathematics)", 3 | "url": "https://en.wikipedia.org/wiki/Mathematical_optimization", 4 | "fetched_at": "2025-07-16T22:50:16.884396", 5 | "content_file": "Optimization_mathematics.txt", 6 | "cleaned_length": 37163 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Overfitting_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Overfitting", 3 | "url": "https://en.wikipedia.org/wiki/Overfitting", 4 | "fetched_at": "2025-07-16T22:49:57.605991", 5 | "content_file": "Overfitting.txt", 6 | "cleaned_length": 15821 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Physical_examination_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Physical examination", 3 | "url": "https://en.wikipedia.org/wiki/Physical_examination", 4 | "fetched_at": "2025-07-16T23:31:55.538978", 5 | "content_file": "Physical_examination.txt", 6 | "cleaned_length": 14823 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Portfolio_finance.txt: -------------------------------------------------------------------------------- 1 | In finance, a portfolio is a collection of investments. 2 | 3 | 4 | == Definition == 5 | 6 | The term "portfolio" refers to any combination of financial assets such as stocks, bonds and cash. Portfolios may be held by individual investors or managed by financial professionals, hedge funds, banks and other financial institutions. It is a generally accepted principle that a portfolio is designed according to the investor's risk tolerance, time frame and investment objectives. The monetary value of each asset may influence the risk/reward ratio of the portfolio. 
7 | When determining asset allocation, the aim is to maximise the expected return and minimise the risk. This is an example of a multi-objective optimization problem: many efficient solutions are available and the preferred solution must be selected by considering a tradeoff between risk and return. In particular, a portfolio A is dominated by another portfolio A' if A' has a greater expected gain and a lesser risk than A. If no portfolio dominates A, A is a Pareto-optimal portfolio. 8 | The set of Pareto-optimal returns and risks is called the Pareto efficient frontier for the Markowitz portfolio selection problem. Recently, an alternative approach to portfolio diversification has been suggested in the literature that combines risk and return in the optimization problem. 9 | 10 | 11 | == Description == 12 | There are many types of portfolios including the market portfolio and the zero-investment portfolio. A portfolio's asset allocation may be managed utilizing any of the following investment approaches and principles: dividend weighting, equal weighting, capitalization-weighting, price-weighting, risk parity, the capital asset pricing model, arbitrage pricing theory, the Jensen Index, the Treynor ratio, the Sharpe diagonal (or index) model, the value at risk model, modern portfolio theory and others. 13 | There are several methods for calculating portfolio returns and performance. One traditional method is using quarterly or monthly money-weighted returns; however, the true time-weighted method is a method preferred by many investors in financial markets. There are also several models for measuring the performance attribution of a portfolio's returns when compared to an index or benchmark, partly viewed as investment strategy. 14 | 15 | 16 | == See also == 17 | Outline of finance § Portfolio theory 18 | Capital asset pricing model 19 | Hedge (finance) 20 | Infection ratio 21 | Investment management 22 | Portfolio investment 23 | Portfolio optimization 24 | Financial risk management § Investment management 25 | 26 | 27 | == References == 28 | 29 | 30 | == Bibliography == 31 | Baker, H. Kent; Filbeck, Greg (2015). Investment Risk Management. Oxford Academic. ISBN 978-0199331963. 32 | Grinold, Richard; Kahn, Ronald (1999). Active Portfolio Management: A Quantitative Approach for Producing Superior Returns and Controlling Risk (2nd ed.). McGraw Hill. ISBN 978-0070248823. 33 | Harvey, Campbell; Rattray, Sandy; Van Hemert, Otto (2021). Strategic Risk Management: Designing Portfolios and Managing Risk. Wiley Finance. ISBN 978-1119773917. 34 | Maginn, John L.; Tuttle, Donald L.; Pinto, Jerald E.; McLeavey, Dennis W. (2007). Managing Investment Portfolios: A Dynamic Process (3rd ed.). Springer. ISBN 978-0470080146. 35 | Paleologo, Giuseppe A. (2021). Advanced Portfolio Management: A Quant's Guide for Fundamental Investors (1st ed.). Wiley. ISBN 978-1119789796. 36 | Rasmussen, M. (2003). Quantitative Portfolio Optimisation, Asset Allocation and Risk Management. Palgrave Macmillan. ISBN 978-1403904584. 37 | Schulmerich, Marcus; Leporcher, Yves-Michel; Eu, Ching-Hwa (2015). Applied Asset and Risk Management. Springer. ISBN 978-3642554438.
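The dominance relation stated above (portfolio A is dominated by A' if A' has a greater expected gain and a lesser risk) translates directly into code. A minimal Python sketch with hypothetical (expected return, risk) figures; it only illustrates Pareto dominance and is not an implementation of the full Markowitz selection problem.

```python
def dominates(p_prime, p):
    """True if p_prime has greater expected return and lesser risk than p."""
    return p_prime["ret"] > p["ret"] and p_prime["risk"] < p["risk"]

def pareto_optimal(portfolios):
    # A portfolio is Pareto-optimal if no other portfolio dominates it.
    return [p for p in portfolios if not any(dominates(q, p) for q in portfolios)]

candidates = [
    {"name": "A", "ret": 0.06, "risk": 0.10},
    {"name": "B", "ret": 0.08, "risk": 0.09},   # dominates A
    {"name": "C", "ret": 0.11, "risk": 0.15},
]
print([p["name"] for p in pareto_optimal(candidates)])  # -> ['B', 'C']
```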
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Portfolio_finance_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Portfolio (finance)", 3 | "url": "https://en.wikipedia.org/wiki/Portfolio_(finance)", 4 | "fetched_at": "2025-07-16T23:31:31.840393", 5 | "content_file": "Portfolio_finance.txt", 6 | "cleaned_length": 3566 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Psychology_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Psychology", 3 | "url": "https://en.wikipedia.org/wiki/Psychology", 4 | "fetched_at": "2025-07-16T22:50:08.659043", 5 | "content_file": "Psychology.txt", 6 | "cleaned_length": 109387 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Quantitative_analysis_finance_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Quantitative analysis (finance)", 3 | "url": "https://en.wikipedia.org/wiki/Quantitative_analysis_(finance)", 4 | "fetched_at": "2025-07-16T23:31:36.357804", 5 | "content_file": "Quantitative_analysis_finance.txt", 6 | "cleaned_length": 24098 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Rise_The_Vieneo_Province.txt: -------------------------------------------------------------------------------- 1 | Rise: The Vieneo Province (abbreviated to Rise) is an MMO space simulator. The game is centered around an exomoon called Vieneo about 225 LY from Earth. 2 | While Rise is sometimes referred to as a game, in general it does not have points, scores, winners or losers, levels, an end-strategy, or most of the other characteristics of a typical game. Players, often called "citizens", can visit this virtual world almost as if it were a real place. They explore, meet other citizens, socialize, participate in individual and alliance activities, and buy and produce goods (virtual property) and services from one another. As they spend more time in the world, they learn new skills and mature socially, learning the culture and manners of a virtual environment. 3 | The game contains player-driven economics, a dynamic weather system, and a detailed terrain engine. Vieneo, the fictitious moon the game is based on, has all the typical attributes of a planet or moon and was made using StarGen. 4 | The aerodynamic and astrodynamic physics are of the highest fidelity and therefore closely mimic reality. One of the goals of many players is to fly up into orbit and look down at the large gaming arena. 5 | 6 | 7 | == History == 8 | 9 | Unistellar Industries was founded in the State of Missouri in August, 2002. 10 | The company originally developed software for a full-scale cockpit enclosure. 11 | The one and only prototype (called Audrey) was stolen and litigation followed. 12 | Due to budgetary restrictions, liability, and assembly space required, the firm has focused on the larger audience of online gaming. 13 | The initial alpha test version, code-named Planetfall, was made available in 2003 to players with initial release of the proprietary terrain engine. An alpha version of the game called Rise: The Vieneo Province was made publicly available in 2006. 
14 | Monthly subscriptions were offered starting April, 2006 for users wanting to extend beyond the 14-day free trial period. 15 | The Rise economy was initialized in August, 2006 and has been entirely user-driven since. 16 | Ongoing development and updates ceased on Rise in October, 2010. 17 | The company started production of a new full-scale cockpit enclosure in July, 2011 with estimated completion in July, 2014. 18 | A limited liability company was formed in December, 2012 (State of Kansas). 19 | The game went "Free-to-Play" on March 1, 2013. 20 | The game is back in development as of February 1, 2017. 21 | The game was listed as an upcoming title on Steam on July 1, 2019. 22 | The game was released as Early Access on Steam on August 1, 2019, and subsequently released February 4, 2020. 23 | An unrestricted demo version was posted on Steam on February 29, 2020, during the COVID-19 pandemic. 24 | 25 | 26 | == See also == 27 | 28 | List of space flight simulation games 29 | 30 | 31 | == References == 32 | 33 | 34 | == External links == 35 | Official website -------------------------------------------------------------------------------- /data/extended_wiki_articles/Rise_The_Vieneo_Province_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Rise: The Vieneo Province", 3 | "url": "https://en.wikipedia.org/wiki/Rise:_The_Vieneo_Province", 4 | "fetched_at": "2025-07-16T23:31:30.037877", 5 | "content_file": "Rise_The_Vieneo_Province.txt", 6 | "cleaned_length": 2770 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Signal_processing.txt: -------------------------------------------------------------------------------- 1 | Signal processing is an electrical engineering subfield that focuses on analyzing, modifying and synthesizing signals, such as sound, images, potential fields, seismic signals, altimetry processing, and scientific measurements. Signal processing techniques are used to optimize transmissions, improve digital storage efficiency, correct distorted signals, improve subjective video quality, and detect or pinpoint components of interest in a measured signal. 2 | 3 | 4 | == History == 5 | According to Alan V. Oppenheim and Ronald W. Schafer, the principles of signal processing can be found in the classical numerical analysis techniques of the 17th century. They further state that the digital refinement of these techniques can be found in the digital control systems of the 1940s and 1950s. 6 | In 1948, Claude Shannon wrote the influential paper "A Mathematical Theory of Communication" which was published in the Bell System Technical Journal. The paper laid the groundwork for later development of information communication systems and the processing of signals for transmission. 7 | Signal processing matured and flourished in the 1960s and 1970s, and digital signal processing became widely used with specialized digital signal processor chips in the 1980s.
8 | 9 | 10 | == Definition of a signal == 11 | A signal is a function {\displaystyle x(t)}, where this function is either 24 | 25 | deterministic (then one speaks of a deterministic signal) or 26 | a path {\displaystyle (x_{t})_{t\in T}}, a realization of a stochastic process {\displaystyle (X_{t})_{t\in T}} 72 | 73 | 74 | == Categories == 75 | 76 | 77 | === Analog === 78 | 79 | Analog signal processing is for signals that have not been digitized, as in most 20th-century radio, telephone, and television systems. This involves linear electronic circuits as well as nonlinear ones. The former are, for instance, passive filters, active filters, additive mixers, integrators, and delay lines. Nonlinear circuits include compandors, multipliers (frequency mixers, voltage-controlled amplifiers), voltage-controlled filters, voltage-controlled oscillators, and phase-locked loops. 80 | 81 | 82 | === Continuous time === 83 | Continuous-time signal processing is for signals that vary with the change of continuous domain (without considering some individual interrupted points). 84 | The methods of signal processing include time domain, frequency domain, and complex frequency domain. This technology mainly discusses the modeling of a linear time-invariant continuous system, integral of the system's zero-state response, setting up system function and the continuous time filtering of deterministic signals. For example, in time domain, a continuous-time signal {\displaystyle x(t)} passing through a linear time-invariant filter/system denoted as {\displaystyle h(t)}, can be expressed at the output as 143 | {\displaystyle y(t)=\int _{-\infty }^{\infty }h(\tau )x(t-\tau )\,d\tau } 144 | 145 | 146 | In some contexts, {\displaystyle h(t)} is referred to as the impulse response of the system. The above convolution operation is conducted between the input and the system. 159 | 160 | 161 | === Discrete time === 162 | Discrete-time signal processing is for sampled signals, defined only at discrete points in time, and as such are quantized in time, but not in magnitude. 163 | Analog discrete-time signal processing is a technology based on electronic devices such as sample and hold circuits, analog time-division multiplexers, analog delay lines and analog feedback shift registers. This technology was a predecessor of digital signal processing (see below), and is still used in advanced processing of gigahertz signals. 164 | The concept of discrete-time signal processing also refers to a theoretical discipline that establishes a mathematical basis for digital signal processing, without taking quantization error into consideration.
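To make the convolution relation above concrete, the following is a brief, illustrative Python sketch (editorial, not part of the article) of its discrete-time counterpart, y[n] = Σ_k h[k]·x[n − k], which is the operation a finite impulse response (FIR) filter performs on sampled signals. The array values and the helper name `discrete_convolution` are arbitrary choices for the example.

```python
# Discrete-time convolution, written out explicitly to mirror the integral above.
import numpy as np

def discrete_convolution(x, h):
    """Convolve input samples x with impulse response h (direct, nested-loop form)."""
    y = np.zeros(len(x) + len(h) - 1)
    for n in range(len(y)):
        for k in range(len(h)):
            if 0 <= n - k < len(x):
                y[n] += h[k] * x[n - k]
    return y

# Example: a 3-tap moving-average impulse response smooths a noisy step.
x = np.array([0.0, 0.0, 1.0, 1.2, 0.9, 1.1, 1.0])
h = np.array([1/3, 1/3, 1/3])
print(discrete_convolution(x, h))
print(np.convolve(x, h))  # optimized library routine, used here as a cross-check
```

The nested loops follow the summation term by term; in practice one would normally call an optimized routine such as numpy.convolve, which the last line uses to confirm the result.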
165 | 166 | 167 | === Digital === 168 | 169 | Digital signal processing is the processing of digitized discrete-time sampled signals. Processing is done by general-purpose computers or by digital circuits such as ASICs, field-programmable gate arrays or specialized digital signal processors. Typical arithmetical operations include fixed-point and floating-point, real-valued and complex-valued, multiplication and addition. Other typical operations supported by the hardware are circular buffers and lookup tables. Examples of algorithms are the fast Fourier transform (FFT), finite impulse response (FIR) filter, infinite impulse response (IIR) filter, and adaptive filters such as the Wiener and Kalman filters. 170 | 171 | 172 | === Nonlinear === 173 | Nonlinear signal processing involves the analysis and processing of signals produced from nonlinear systems and can be in the time, frequency, or spatiotemporal domains. Nonlinear systems can produce highly complex behaviors including bifurcations, chaos, harmonics, and subharmonics which cannot be produced or analyzed using linear methods. 174 | Polynomial signal processing is a type of non-linear signal processing, where polynomial systems may be interpreted as conceptually straightforward extensions of linear systems to the nonlinear case. 175 | 176 | 177 | === Statistical === 178 | Statistical signal processing is an approach which treats signals as stochastic processes, utilizing their statistical properties to perform signal processing tasks. Statistical techniques are widely used in signal processing applications. For example, one can model the probability distribution of noise incurred when photographing an image, and construct techniques based on this model to reduce the noise in the resulting image. 179 | 180 | 181 | === Graph === 182 | Graph signal processing generalizes signal processing tasks to signals living on non-Euclidean domains whose structure can be captured by a weighted graph. Graph signal processing presents several key points such as signal sampling techniques, recovery techniques and time-varying techniques. Graph signal processing has been applied with success in the fields of image processing, computer vision, and sound anomaly detection. 185 | 186 | 187 | == Application fields == 188 | 189 | Audio signal processing – for electrical signals representing sound, such as speech or music 190 | Image processing – in digital cameras, computers and various imaging systems 191 | Video processing – for interpreting moving pictures 192 | Wireless communication – waveform generation, demodulation, filtering, equalization 193 | Control systems 194 | Array processing – for processing signals from arrays of sensors 195 | Process control – a variety of signals are used, including the industry standard 4-20 mA current loop 196 | Seismology 197 | Feature extraction, such as image understanding, semantic audio and speech recognition. 198 | Quality improvement, such as noise reduction, image enhancement, and echo cancellation. 199 | Source coding including audio compression, image compression, and video compression. 200 | Genomic signal processing 201 | In geophysics, signal processing is used to amplify the signal vs the noise within time-series measurements of geophysical data. Processing is conducted within the time domain or frequency domain, or both.
202 | In communication systems, signal processing may occur at: 203 | 204 | OSI layer 1 in the seven-layer OSI model, the physical layer (modulation, equalization, multiplexing, etc.); 205 | OSI layer 2, the data link layer (forward error correction); 206 | OSI layer 6, the presentation layer (source coding, including analog-to-digital conversion and data compression). 207 | 208 | 209 | == Typical devices == 210 | Filters – for example analog (passive or active) or digital (FIR, IIR, frequency domain or stochastic filters, etc.) 211 | Samplers and analog-to-digital converters for signal acquisition and reconstruction, which involves measuring a physical signal, storing or transferring it as digital signal, and possibly later rebuilding the original signal or an approximation thereof. 212 | Digital signal processors (DSPs) 213 | 214 | 215 | == Mathematical methods applied == 216 | Differential equations – for modeling system behavior, connecting input and output relations in linear time-invariant systems. For instance, a low-pass filter such as an RC circuit can be modeled as a differential equation in signal processing, which allows one to compute the continuous output signal as a function of the input or initial conditions. 217 | Recurrence relations 218 | Transform theory 219 | Time-frequency analysis – for processing non-stationary signals 220 | Linear canonical transformation 221 | Spectral estimation – for determining the spectral content (i.e., the distribution of power over frequency) of a set of time series data points 222 | Statistical signal processing – analyzing and extracting information from signals and noise based on their stochastic properties 223 | Linear time-invariant system theory, and transform theory 224 | Polynomial signal processing – analysis of systems which relate input and output using polynomials 225 | System identification and classification 226 | Calculus 227 | Coding theory 228 | Complex analysis 229 | Vector spaces and Linear algebra 230 | Functional analysis 231 | Probability and stochastic processes 232 | Detection theory 233 | Estimation theory 234 | Optimization 235 | Numerical methods 236 | Data mining – for statistical analysis of relations between large quantities of variables (in this context representing many physical signals), to extract previously unknown interesting patterns 237 | 238 | 239 | == See also == 240 | Algebraic signal processing 241 | Audio filter 242 | Bounded variation 243 | Dynamic range compression 244 | Information theory 245 | Least-squares spectral analysis 246 | Non-local means 247 | Reverberation 248 | Sensitivity (electronics) 249 | Similarity (signal processing) 250 | 251 | 252 | == References == 253 | 254 | 255 | == Further reading == 256 | Byrne, Charles (2014). Signal Processing: A Mathematical Approach. Taylor & Francis. doi:10.1201/b17672. ISBN 9780429158711. 257 | P Stoica, R Moses (2005). Spectral Analysis of Signals (PDF). NJ: Prentice Hall. 258 | Papoulis, Athanasios (1991). Probability, Random Variables, and Stochastic Processes (third ed.). McGraw-Hill. ISBN 0-07-100870-5. 259 | Kainam Thomas Wong [1]: Statistical Signal Processing lecture notes at the University of Waterloo, Canada. 260 | Ali H. Sayed, Adaptive Filters, Wiley, NJ, 2008, ISBN 978-0-470-25388-5. 261 | Thomas Kailath, Ali H. Sayed, and Babak Hassibi, Linear Estimation, Prentice-Hall, NJ, 2000, ISBN 978-0-13-022464-4. 
262 | 263 | 264 | == External links == 265 | Signal Processing for Communications – free online textbook by Paolo Prandoni and Martin Vetterli (2008) 266 | Scientists and Engineers Guide to Digital Signal Processing – free online textbook by Stephen Smith 267 | Julius O. Smith III: Spectral Audio Signal Processing – free online textbook 268 | Graph Signal Processing Website – free online website by Thierry Bouwmans (2025)
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Signal_processing_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Signal processing", 3 | "url": "https://en.wikipedia.org/wiki/Signal_processing", 4 | "fetched_at": "2025-07-16T22:50:20.055180", 5 | "content_file": "Signal_processing.txt", 6 | "cleaned_length": 11338 7 | }
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Software.txt: -------------------------------------------------------------------------------- 1 | Software consists of computer programs that instruct the execution of a computer. Software also includes design documents and specifications. 2 | The history of software is closely tied to the development of digital computers in the mid-20th century. Early programs were written in the machine language specific to the hardware. The introduction of high-level programming languages in 1958 allowed for more human-readable instructions, making software development easier and more portable across different computer architectures. Software in a programming language is run through a compiler or interpreter to execute on the architecture's hardware. Over time, software has become complex, owing to developments in networking, operating systems, and databases. 3 | Software can generally be categorized into two main types: 4 | 5 | operating systems, which manage hardware resources and provide services for applications 6 | application software, which performs specific tasks for users 7 | The rise of cloud computing has introduced the new software delivery model Software as a Service (SaaS). In SaaS, applications are hosted by a provider and accessed over the Internet. 8 | The process of developing software involves several stages. The stages include software design, programming, testing, release, and maintenance. Software quality assurance and security are critical aspects of software development, as bugs and security vulnerabilities can lead to system failures and security breaches. Additionally, legal issues such as software licenses and intellectual property rights play a significant role in the distribution of software products. 9 | 10 | 11 | == History == 12 | 13 | The first use of the word software to describe computer programs is credited to mathematician John Wilder Tukey in 1958. 14 | The first programmable computers, which appeared at the end of the 1940s, were programmed in machine language. Machine language is difficult to debug and not portable across different computers. Initially, hardware resources were more expensive than human resources. As programs became complex, programmer productivity became the bottleneck. The introduction of high-level programming languages in 1958 hid the details of the hardware and expressed the underlying algorithms in code. Early languages include Fortran, Lisp, and COBOL.
15 | 16 | 17 | == Types == 18 | 19 | There are two main types of software: 20 | 21 | Operating systems are "the layer of software that manages a computer's resources for its users and their applications". There are three main purposes that an operating system fulfills: 22 | Allocating resources between different applications, deciding when they will receive central processing unit (CPU) time or space in memory. 23 | Providing an interface that abstracts the details of accessing hardware (like physical memory) to make things easier for programmers. 24 | Offering common services, such as an interface for accessing network and disk devices. This enables an application to be run on different hardware without needing to be rewritten. 25 | Application software runs on top of the operating system and uses the computer's resources to perform a task. There are many different types of application software because the range of tasks that can be performed with modern computers is so large. Applications account for most software and require the environment provided by an operating system, and often other applications, in order to function. 26 | 27 | Software can also be categorized by how it is deployed. Traditional applications are purchased with a perpetual license for a specific version of the software, downloaded, and run on hardware belonging to the purchaser. The rise of the Internet and cloud computing enabled a new model, software as a service (SaaS), in which the provider hosts the software (usually built on top of rented infrastructure or platforms) and provides the use of the software to customers, often in exchange for a subscription fee. By 2023, SaaS products—which are usually delivered via a web application—had become the primary method by which companies deliver applications. 28 | 29 | 30 | == Software development and maintenance == 31 | 32 | Software companies aim to deliver a high-quality product on time and under budget. A challenge is that software development effort estimation is often inaccurate. Software development begins by conceiving the project, evaluating its feasibility, analyzing the business requirements, and making a software design. Most software projects speed up their development by reusing or incorporating existing software, either in the form of commercial off-the-shelf (COTS) or open-source software. Software quality assurance is typically a combination of manual code review by other engineers and automated software testing. Due to time constraints, testing cannot cover all aspects of the software's intended functionality, so developers often focus on the most critical functionality. Formal methods are used in some safety-critical systems to prove the correctness of code, while user acceptance testing helps to ensure that the product meets customer expectations. There are a variety of software development methodologies, which vary from completing all steps in order to concurrent and iterative models. Software development is driven by requirements taken from prospective users, as opposed to maintenance, which is driven by events such as a change request. 33 | Frequently, software is released in an incomplete state when the development team runs out of time or funding. Despite testing and quality assurance, virtually all software contains bugs where the system does not work as intended. Post-release software maintenance is necessary to remediate these bugs when they are found and keep the software working as the environment changes over time.
New features are often added after the release. Over time, the level of maintenance becomes increasingly restricted before being cut off entirely when the product is withdrawn from the market. As software ages, it becomes known as legacy software and can remain in use for decades, even if there is no one left who knows how to fix it. Over the lifetime of the product, software maintenance is estimated to comprise 75 percent or more of the total development cost. 34 | Completing a software project involves various forms of expertise, not just in software programmers but also testing, documentation writing, project management, graphic design, user experience, user support, marketing, and fundraising. 35 | 36 | 37 | == Quality and security == 38 | 39 | Software quality is defined as meeting the stated requirements as well as customer expectations. Quality is an overarching term that can refer to a code's correct and efficient behavior, its reusability and portability, or the ease of modification. It is usually more cost-effective to build quality into the product from the beginning rather than try to add it later in the development process. Higher quality code will reduce lifetime cost to both suppliers and customers as it is more reliable and easier to maintain. Software failures in safety-critical systems can be very serious including death. By some estimates, the cost of poor quality software can be as high as 20 to 40 percent of sales. Despite developers' goal of delivering a product that works entirely as intended, virtually all software contains bugs. 40 | The rise of the Internet also greatly increased the need for computer security as it enabled malicious actors to conduct cyberattacks remotely. If a bug creates a security risk, it is called a vulnerability. Software patches are often released to fix identified vulnerabilities, but those that remain unknown (zero days) as well as those that have not been patched are still liable for exploitation. Vulnerabilities vary in their ability to be exploited by malicious actors, and the actual risk is dependent on the nature of the vulnerability as well as the value of the surrounding system. Although some vulnerabilities can only be used for denial of service attacks that compromise a system's availability, others allow the attacker to inject and run their own code (called malware), without the user being aware of it. To thwart cyberattacks, all software in the system must be designed to withstand and recover from external attack. Despite efforts to ensure security, a significant fraction of computers are infected with malware. 41 | 42 | 43 | == Encoding and execution == 44 | 45 | 46 | === Programming languages === 47 | 48 | Programming languages are the format in which software is written. Since the 1950s, thousands of different programming languages have been invented; some have been in use for decades, while others have fallen into disuse. Some definitions classify machine code—the exact instructions directly implemented by the hardware—and assembly language—a more human-readable alternative to machine code whose statements can be translated one-to-one into machine code—as programming languages. Programs written in the high-level programming languages used to create software share a few main characteristics: knowledge of machine code is not necessary to write them, they can be ported to other computer systems, and they are more concise and human-readable than machine code. 
They must be both human-readable and capable of being translated into unambiguous instructions for computer hardware. 49 | 50 | 51 | === Compilation, interpretation, and execution === 52 | The invention of high-level programming languages was simultaneous with the compilers needed to translate them automatically into machine code. Most programs do not contain all the resources needed to run them and rely on external libraries. Part of the compiler's function is to link these files in such a way that the program can be executed by the hardware. Once compiled, the program can be saved as an object file and the loader (part of the operating system) can take this saved file and execute it as a process on the computer hardware. Some programming languages use an interpreter instead of a compiler. An interpreter converts the program into machine code at run time, which makes interpreted programs 10 to 100 times slower than compiled programs. 53 | 54 | 55 | == Legal issues == 56 | 57 | 58 | === Liability === 59 | 60 | Software is often released with the knowledge that it is incomplete or contains bugs. Purchasers knowingly buy it in this state, which has led to a legal regime where liability for software products is significantly curtailed compared to other products. 61 | 62 | 63 | === Licenses === 64 | 65 | Since the mid-1970s, software and its source code have been protected by copyright law that vests the owner with the exclusive right to copy the code. The underlying ideas or algorithms are not protected by copyright law, but are sometimes treated as a trade secret and concealed by such methods as non-disclosure agreements. A software copyright is often owned by the person or company that financed or made the software (depending on their contracts with employees or contractors who helped to write it). Some software is in the public domain and has no restrictions on who can use it, copy or share it, or modify it; a notable example is software written by the United States Government. Free and open-source software also allow free use, sharing, and modification, perhaps with a few specified conditions. The use of some software is governed by an agreement (software license) written by the copyright holder and imposed on the user. Proprietary software is usually sold under a restrictive license that limits its use and sharing. Some free software licenses require that modified versions must be released under the same license, which prevents the software from being sold 66 | or distributed under proprietary restrictions. 67 | 68 | 69 | === Patents === 70 | 71 | Patents give an inventor an exclusive, time-limited license for a novel product or process. Ideas about what software could accomplish are not protected by law and concrete implementations are instead covered by copyright law. In some countries, a requirement for the claimed invention to have an effect on the physical world may also be part of the requirements for a software patent to be held valid. Software patents have been historically controversial. Before the 1998 case State Street Bank & Trust Co. v. Signature Financial Group, Inc., software patents were generally not recognized in the United States. In that case, the Court of Appeals for the Federal Circuit decided that business processes could be patented. Patent applications are complex and costly, and lawsuits involving patents can drive up the cost of products. Unlike copyrights, patents generally only apply in the jurisdiction where they were issued.
72 | 73 | 74 | == Impact == 75 | 76 | Engineer Capers Jones writes that "computers and software are making profound changes to every aspect of human life: education, work, warfare, entertainment, medicine, law, and everything else". It has become ubiquitous in everyday life in developed countries. In many cases, software augments the functionality of existing technologies such as household appliances and elevators. Software also spawned entirely new technologies such as the Internet, video games, mobile phones, and GPS. New methods of communication, including email, forums, blogs, microblogging, wikis, and social media, were enabled by the Internet. Massive amounts of knowledge exceeding any paper-based library are now available with a quick web search. Most creative professionals have switched to software-based tools such as computer-aided design, 3D modeling, digital image editing, and computer animation. Almost every complex device is controlled by software. 77 | 78 | 79 | == References == 80 | 81 | 82 | === Sources === -------------------------------------------------------------------------------- /data/extended_wiki_articles/Software_engineering_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Software engineering", 3 | "url": "https://en.wikipedia.org/wiki/Software_engineering", 4 | "fetched_at": "2025-07-16T22:50:22.217369", 5 | "content_file": "Software_engineering.txt", 6 | "cleaned_length": 25855 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Software_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Software", 3 | "url": "https://en.wikipedia.org/wiki/Software", 4 | "fetched_at": "2025-07-16T23:32:15.682718", 5 | "content_file": "Software.txt", 6 | "cleaned_length": 13471 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Statistical_mechanics_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Statistical mechanics", 3 | "url": "https://en.wikipedia.org/wiki/Statistical_mechanics", 4 | "fetched_at": "2025-07-16T23:32:00.068287", 5 | "content_file": "Statistical_mechanics.txt", 6 | "cleaned_length": 25685 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Statistics_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Statistics", 3 | "url": "https://en.wikipedia.org/wiki/Statistics", 4 | "fetched_at": "2025-07-16T22:49:59.901386", 5 | "content_file": "Statistics.txt", 6 | "cleaned_length": 45625 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Support_vector_machine_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Support vector machine", 3 | "url": "https://en.wikipedia.org/wiki/Support_vector_machine", 4 | "fetched_at": "2025-07-16T22:51:07.768274", 5 | "content_file": "Support_vector_machine.txt", 6 | "cleaned_length": 100225 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Transfer_learning.txt: -------------------------------------------------------------------------------- 1 | Transfer learning (TL) is a technique in 
machine learning (ML) in which knowledge learned from a task is re-used in order to boost performance on a related task. For example, for image classification, knowledge gained while learning to recognize cars could be applied when trying to recognize trucks. This topic is related to the psychological literature on transfer of learning, although practical ties between the two fields are limited. Reusing/transferring information from previously learned tasks to new tasks has the potential to significantly improve learning efficiency. 2 | Since transfer learning makes use of training with multiple objective functions, it is related to cost-sensitive machine learning and multi-objective optimization. 3 | 4 | 5 | == History == 6 | In 1976, Bozinovski and Fulgosi published a paper addressing transfer learning in neural network training. The paper gives a mathematical and geometrical model of the topic. In 1981, a report considered the application of transfer learning to a dataset of images representing letters of computer terminals, experimentally demonstrating positive and negative transfer learning. 7 | In 1992, Lorien Pratt formulated the discriminability-based transfer (DBT) algorithm. 8 | By 1998, the field had advanced to include multi-task learning, along with more formal theoretical foundations. Influential publications on transfer learning include the book Learning to Learn in 1998, a 2009 survey and a 2019 survey. 9 | Andrew Ng said in his NIPS 2016 tutorial that TL would become the next driver of machine learning commercial success after supervised learning. 10 | In the 2020 paper "Rethinking Pre-training and Self-training", Zoph et al. reported that pre-training can hurt accuracy, and advocated self-training instead. 11 | 12 | 13 | == Definition == 14 | The definition of transfer learning is given in terms of domains and tasks. A domain {\displaystyle {\mathcal {D}}} consists of: a feature space {\displaystyle {\mathcal {X}}} and a marginal probability distribution {\displaystyle P(X)}, where {\displaystyle X=\{x_{1},...,x_{n}\}\in {\mathcal {X}}}. Given a specific domain, {\displaystyle {\mathcal {D}}=\{{\mathcal {X}},P(X)\}}, a task consists of two components: a label space {\displaystyle {\mathcal {Y}}} and an objective predictive function {\displaystyle f:{\mathcal {X}}\rightarrow {\mathcal {Y}}}. The function {\displaystyle f} is used to predict the corresponding label {\displaystyle f(x)} of a new instance {\displaystyle x}. This task, denoted by {\displaystyle {\mathcal {T}}=\{{\mathcal {Y}},f(x)\}}, is learned from the training data consisting of pairs {\displaystyle \{x_{i},y_{i}\}}, where {\displaystyle x_{i}\in {\mathcal {X}}} and {\displaystyle y_{i}\in {\mathcal {Y}}}. 15 | Given a source domain {\displaystyle {\mathcal {D}}_{S}} and learning task {\displaystyle {\mathcal {T}}_{S}}, a target domain {\displaystyle {\mathcal {D}}_{T}} and learning task {\displaystyle {\mathcal {T}}_{T}}, where {\displaystyle {\mathcal {D}}_{S}\neq {\mathcal {D}}_{T}}, or {\displaystyle {\mathcal {T}}_{S}\neq {\mathcal {T}}_{T}}, transfer learning aims to help improve the learning of the target predictive function {\displaystyle f_{T}(\cdot )} in {\displaystyle {\mathcal {D}}_{T}} using the knowledge in {\displaystyle {\mathcal {D}}_{S}} and {\displaystyle {\mathcal {T}}_{S}}. 469 | 470 | 471 | == Applications == 472 | Algorithms for transfer learning are available in Markov logic networks and Bayesian networks. Transfer learning has been applied to cancer subtype discovery, building utilization, general game playing, text classification, digit recognition, medical imaging and spam filtering. 473 | In 2020, it was discovered that, due to their similar physical natures, transfer learning is possible between electromyographic (EMG) signals from the muscles and classifying the behaviors of electroencephalographic (EEG) brainwaves, from the gesture recognition domain to the mental state recognition domain. It was noted that this relationship worked in both directions, showing that EEG can likewise be used to classify EMG.
The experiments noted that the accuracy of neural networks and convolutional neural networks was improved through transfer learning both prior to any learning (compared to standard random weight distribution) and at the end of the learning process (asymptote). That is, results are improved by exposure to another domain. Moreover, the end-user of a pre-trained model can change the structure of fully-connected layers to improve performance. 474 | 475 | 476 | == See also == 477 | Crossover (genetic algorithm) 478 | Domain adaptation 479 | General game playing 480 | Multi-task learning 481 | Multitask optimization 482 | Transfer of learning in educational psychology 483 | Zero-shot learning 484 | Feature learning 485 | External validity 486 | 487 | 488 | == References == 489 | 490 | 491 | == Sources == 492 | Thrun, Sebastian; Pratt, Lorien (6 December 2012). Learning to Learn. Springer Science & Business Media. ISBN 978-1-4615-5529-2.
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Transfer_learning_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Transfer learning", 3 | "url": "https://en.wikipedia.org/wiki/Transfer_learning", 4 | "fetched_at": "2025-07-16T22:51:18.059049", 5 | "content_file": "Transfer_learning.txt", 6 | "cleaned_length": 8630 7 | }
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Transformer_machine_learning_model_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Transformer (machine learning model)", 3 | "url": "https://en.wikipedia.org/wiki/Transformer_(deep_learning_architecture)", 4 | "fetched_at": "2025-07-16T22:49:45.254026", 5 | "content_file": "Transformer_machine_learning_model.txt", 6 | "cleaned_length": 108813 7 | }
-------------------------------------------------------------------------------- /data/extended_wiki_articles/Tree_data_structure.txt: -------------------------------------------------------------------------------- 1 | In computer science, a tree is a widely used abstract data type that represents a hierarchical tree structure with a set of connected nodes. Each node in the tree can be connected to many children (depending on the type of tree), but must be connected to exactly one parent, except for the root node, which has no parent (i.e., the root node is the top-most node in the tree hierarchy). These constraints mean there are no cycles or "loops" (no node can be its own ancestor), and also that each child can be treated like the root node of its own subtree, making recursion a useful technique for tree traversal. In contrast to linear data structures, many trees cannot be represented by relationships between neighboring nodes (parent and children nodes of a node under consideration, if they exist) in a single straight line (called edge or link between two adjacent nodes). 2 | Binary trees are a commonly used type, which constrain the number of children for each parent to at most two. When the order of the children is specified, this data structure corresponds to an ordered tree in graph theory. A value or pointer to other data may be associated with every node in the tree, or sometimes only with the leaf nodes, which have no children nodes.
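As a quick illustration of the constraints just described (this sketch is editorial, not part of the article; the class and method names are arbitrary), a minimal binary tree node in Python can carry a value, a single parent reference, and at most two child subtrees:

```python
# Minimal binary tree node: every node has exactly one parent (None only for the
# root), and a binary tree allows at most two children per node.
class BinaryNode:
    def __init__(self, value):
        self.value = value
        self.parent = None   # None only for the root node
        self.left = None     # child subtrees, possibly empty
        self.right = None

    def add_left(self, value):
        child = BinaryNode(value)
        child.parent = self
        self.left = child
        return child

    def add_right(self, value):
        child = BinaryNode(value)
        child.parent = self
        self.right = child
        return child

# Build a small example tree rooted at "A".
root = BinaryNode("A")
b = root.add_left("B")
root.add_right("C")
b.add_left("D")   # each child is itself the root of its own subtree
```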
3 | The abstract data type (ADT) can be represented in a number of ways, including a list of parents with pointers to children, a list of children with pointers to parents, or a list of nodes and a separate list of parent-child relations (a specific type of adjacency list). Representations might also be more complicated, for example using indexes or ancestor lists for performance. 4 | Trees as used in computing are similar to but can be different from mathematical constructs of trees in graph theory, trees in set theory, and trees in descriptive set theory. 5 | 6 | 7 | == Terminology == 8 | A node is a structure which may contain data and connections to other nodes, sometimes called edges or links. Each node in a tree has zero or more child nodes, which are below it in the tree (by convention, trees are drawn with descendants going downwards). A node that has a child is called the child's parent node (or superior). All nodes have exactly one parent, except the topmost root node, which has none. A node might have many ancestor nodes, such as the parent's parent. Child nodes with the same parent are sibling nodes. Typically siblings have an order, with the first one conventionally drawn on the left. Some definitions allow a tree to have no nodes at all, in which case it is called empty. 9 | An internal node (also known as an inner node, inode for short, or branch node) is any node of a tree that has child nodes. Similarly, an external node (also known as an outer node, leaf node, or terminal node) is any node that does not have child nodes. 10 | The height of a node is the length of the longest downward path to a leaf from that node. The height of the root is the height of the tree. The depth of a node is the length of the path to its root (i.e., its root path). Thus the root node has depth zero, leaf nodes have height zero, and a tree with only a single node (hence both a root and leaf) has depth and height zero. Conventionally, an empty tree (tree with no nodes, if such are allowed) has height −1. 11 | Each non-root node can be treated as the root node of its own subtree, which includes that node and all its descendants. 12 | Other terms used with trees: 13 | 14 | Neighbor Parent or child. 15 | Ancestor A node reachable by repeated proceeding from child to parent. 16 | Descendant A node reachable by repeated proceeding from parent to child. Also known as subchild. 17 | Degree For a given node, its number of children. A leaf, by definition, has degree zero. 18 | Degree of tree The degree of a tree is the maximum degree of a node in the tree. 19 | Distance The number of edges along the shortest path between two nodes. 20 | Level The level of a node is the number of edges along the 21 | unique path between it and the root node. This is the same as depth. 22 | Width The number of nodes in a level. 23 | Breadth The number of leaves. 24 | Complete tree A tree with every level filled, except the last. 25 | Forest A set of one or more disjoint trees. 26 | Ordered tree A rooted tree in which an ordering is specified for the children of each vertex. 27 | Size of a tree Number of nodes in the tree. 
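The height and depth definitions above translate almost directly into code. The following hedged sketch assumes the BinaryNode class and example tree from the previous snippet and follows the stated conventions: the root has depth zero, a leaf has height zero, and an empty tree has height −1.

```python
# Depth: number of edges from a node up to the root.
def depth(node):
    d = 0
    while node.parent is not None:
        node = node.parent
        d += 1
    return d

# Height: length of the longest downward path to a leaf; an empty tree is -1.
def height(node):
    if node is None:
        return -1
    return 1 + max(height(node.left), height(node.right))

# For the example tree built above: depth(b) == 1 and height(root) == 2;
# a tree with only a single node has depth 0 and height 0.
```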
28 | 29 | 30 | == Common operations == 31 | Enumerating all the items 32 | Enumerating a section of a tree 33 | Searching for an item 34 | Adding a new item at a certain position on the tree 35 | Deleting an item 36 | Pruning: Removing a whole section of a tree 37 | Grafting: Adding a whole section to a tree 38 | Finding the root for any node 39 | Finding the lowest common ancestor of two nodes 40 | 41 | 42 | === Traversal and search methods === 43 | 44 | Stepping through the items of a tree, by means of the connections between parents and children, is called walking the tree, and the action is a walk of the tree. Often, an operation might be performed when a pointer arrives at a particular node. A walk in which each parent node is traversed before its children is called a pre-order walk; a walk in which the children are traversed before their respective parents are traversed is called a post-order walk; a walk in which a node's left subtree, then the node itself, and finally its right subtree are traversed is called an in-order traversal. (This last scenario, referring to exactly two subtrees, a left subtree and a right subtree, assumes specifically a binary tree.) A level-order walk effectively performs a breadth-first search over the entirety of a tree; nodes are traversed level by level, where the root node is visited first, followed by its direct child nodes and their siblings, followed by its grandchild nodes and their siblings, etc., until all nodes in the tree have been traversed. A sketch of these walks is given after the next sections. 45 | 46 | 47 | == Representations == 48 | There are many different ways to represent trees. In working memory, nodes are typically dynamically allocated records with pointers to their children, their parents, or both, as well as any associated data. If of a fixed size, the nodes might be stored in a list. Nodes and relationships between nodes might be stored in a separate special type of adjacency list. In relational databases, nodes are typically represented as table rows, with indexed row IDs facilitating pointers between parents and children. 49 | Nodes can also be stored as items in an array, with relationships between them determined by their positions in the array (as in a binary heap). 50 | A binary tree can be implemented as a list of lists: the head of a list (the value of the first term) is the left child (subtree), while the tail (the list of second and subsequent terms) is the right child (subtree). This can be modified to allow values as well, as in Lisp S-expressions, where the head (value of first term) is the value of the node, the head of the tail (value of second term) is the left child, and the tail of the tail (list of third and subsequent terms) is the right child. 51 | Ordered trees can be naturally encoded by finite sequences, for example with natural numbers. 52 | 53 | 54 | == Examples of trees and non-trees == 55 | 56 | 57 | == Type theory == 58 | 59 | As an abstract data type, the abstract tree type T with values of some type E is defined, using the abstract forest type F (list of trees), by the functions: 60 | 61 | value: T → E 62 | children: T → F 63 | nil: () → F 64 | node: E × F → T 65 | with the axioms: 66 | 67 | value(node(e, f)) = e 68 | children(node(e, f)) = f 69 | In terms of type theory, a tree is an inductive type defined by the constructors nil (empty forest) and node (tree with root node with given value and children).
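The walks described in the traversal section above can be sketched in a few lines of Python. This is an illustrative example building on the BinaryNode nodes defined earlier, not code from the article; the function names are arbitrary.

```python
# Pre-order, in-order, post-order, and level-order (breadth-first) walks.
from collections import deque

def preorder(node):
    if node is None:
        return []
    return [node.value] + preorder(node.left) + preorder(node.right)

def inorder(node):
    if node is None:
        return []
    return inorder(node.left) + [node.value] + inorder(node.right)

def postorder(node):
    if node is None:
        return []
    return postorder(node.left) + postorder(node.right) + [node.value]

def level_order(root):
    order, queue = [], deque([root])
    while queue:
        node = queue.popleft()
        if node is not None:
            order.append(node.value)
            queue.extend([node.left, node.right])
    return order

# For the example tree: preorder -> A B D C, inorder -> D B A C,
# postorder -> D B C A, level_order -> A B C D.
```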
70 | 71 | 72 | == Mathematical terminology == 73 | 74 | Viewed as a whole, a tree data structure is an ordered tree, generally with values attached to each node. Concretely, it is (if required to be non-empty): 75 | 76 | A rooted tree with the "away from root" direction (a more narrow term is an "arborescence"), meaning: 77 | A directed graph, 78 | whose underlying undirected graph is a tree (any two vertices are connected by exactly one simple path), 79 | with a distinguished root (one vertex is designated as the root), 80 | which determines the direction on the edges (arrows point away from the root; given an edge, the node that the edge points from is called the parent and the node that the edge points to is called the child), together with: 81 | an ordering on the child nodes of a given node, and 82 | a value (of some data type) at each node. 83 | Often trees have a fixed (more properly, bounded) branching factor (outdegree), particularly always having two child nodes (possibly empty, hence at most two non-empty child nodes), hence a "binary tree". 84 | Allowing empty trees makes some definitions simpler, some more complicated: a rooted tree must be non-empty, hence if empty trees are allowed the above definition instead becomes "an empty tree or a rooted tree such that ...". On the other hand, empty trees simplify defining fixed branching factor: with empty trees allowed, a binary tree is a tree such that every node has exactly two children, each of which is a tree (possibly empty). 85 | 86 | 87 | == Applications == 88 | Trees are commonly used to represent or manipulate hierarchical data in applications such as: 89 | 90 | File systems for: 91 | Directory structure used to organize subdirectories and files (symbolic links create non-tree graphs, as do multiple hard links to the same file or directory) 92 | The mechanism used to allocate and link blocks of data on the storage device 93 | Class hierarchy or "inheritance tree" showing the relationships among classes in object-oriented programming; multiple inheritance produces non-tree graphs 94 | Abstract syntax trees for computer languages 95 | Natural language processing: 96 | Parse trees 97 | Modeling utterances in a generative grammar 98 | Dialogue tree for generating conversations 99 | Document Object Models ("DOM tree") of XML and HTML documents 100 | Search trees store data in a way that makes an efficient search algorithm possible via tree traversal 101 | A binary search tree is a type of binary tree 102 | Representing sorted lists of data 103 | Computer-generated imagery: 104 | Space partitioning, including binary space partitioning 105 | Digital compositing 106 | Storing Barnes–Hut trees used to simulate galaxies 107 | Implementing heaps 108 | Nested set collections 109 | Hierarchical taxonomies such as the Dewey Decimal Classification with sections of increasing specificity. 
110 | Hierarchical temporal memory 111 | Genetic programming 112 | Hierarchical clustering 113 | Trees can be used to represent and manipulate various mathematical structures, such as: 114 | 115 | Paths through an arbitrary node-and-edge graph (including multigraphs), by making multiple nodes in the tree for each graph node used in multiple paths 116 | Any mathematical hierarchy 117 | Tree structures are often used for mapping the relationships between things, such as: 118 | 119 | Components and subcomponents which can be visualized in an exploded-view drawing 120 | Subroutine calls used to identify which subroutines in a program call other subroutines non recursively 121 | Inheritance of DNA among species by evolution, of source code by software projects (e.g. Linux distribution timeline), of designs in various types of cars, etc. 122 | The contents of hierarchical namespaces 123 | JSON and YAML documents can be thought of as trees, but are typically represented by nested lists and dictionaries. 124 | 125 | 126 | == See also == 127 | Distributed tree search 128 | Category:Trees (data structures) (catalogs types of computational trees) 129 | 130 | 131 | == Notes == 132 | 133 | 134 | == References == 135 | 136 | 137 | == Further reading == 138 | Donald Knuth. The Art of Computer Programming: Fundamental Algorithms, Third Edition. Addison-Wesley, 1997. ISBN 0-201-89683-4 . Section 2.3: Trees, pp. 308–423. 139 | Thomas H. Cormen, Charles E. Leiserson, Ronald L. Rivest, and Clifford Stein. Introduction to Algorithms, Second Edition. MIT Press and McGraw-Hill, 2001. ISBN 0-262-03293-7. Section 10.4: Representing rooted trees, pp. 214–217. Chapters 12–14 (Binary Search Trees, Red–Black Trees, Augmenting Data Structures), pp. 253–320. 140 | 141 | 142 | == External links == 143 | 144 | Description from the Dictionary of Algorithms and Data Structures -------------------------------------------------------------------------------- /data/extended_wiki_articles/Tree_data_structure_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Tree (data structure)", 3 | "url": "https://en.wikipedia.org/wiki/Tree_(abstract_data_type)", 4 | "fetched_at": "2025-07-16T23:30:42.549484", 5 | "content_file": "Tree_data_structure.txt", 6 | "cleaned_length": 11786 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/Weight.txt: -------------------------------------------------------------------------------- 1 | Height is measure of vertical distance, either vertical extent (how "tall" something or someone is) or vertical position (how "high" a point is). For an example of vertical extent, "This basketball player is 7 foot 1 inches in height." For an example of vertical position, "The height of an airplane in-flight is about 10,000 meters." 2 | When the term is used to describe vertical position (of, e.g., an airplane) from sea level, height is more often called altitude. 3 | Furthermore, if the point is attached to the Earth (e.g., a mountain peak), then altitude (height above sea level) is called elevation. 4 | In a two-dimensional Cartesian space, height is measured along the vertical axis (y) between a specific point and another that does not have the same y-value. If both points happen to have the same y-value, then their relative height is zero. In the case of three-dimensional space, height is measured along the vertical z axis, describing a distance from (or "above") the x-y plane. 
5 | 6 | 7 | == Etymology == 8 | The English-language word high is derived from Old English hēah, ultimately from Proto-Germanic *xauxa-z, from a PIE base *keuk-. The derived noun height, also the obsolete forms heighth and highth, is from Old English híehþo, later héahþu, as it were from Proto-Germanic *xaux-iþa. 9 | 10 | 11 | == In mathematics == 12 | In elementary models of space, height may indicate the third dimension, the other two being length and width. Height is normal to the plane formed by the length and width. 13 | Height is also used as a name for some more abstract definitions. These include: 14 | 15 | The height or altitude of a triangle, which is the length from a vertex of a triangle to the line formed by the opposite side; 16 | The height of a pyramid, which is the smallest distance from the apex to the base; 17 | A measurement in a circular segment of the distance from the midpoint of the arc of the circular segment to the midpoint of the line joining the endpoints of the arc (see diagram in circular segment); 18 | In a rooted tree, the height of a vertex is the length of the longest downward path to a leaf from that vertex; 19 | In algebraic number theory, a "height function" is a measurement related to the minimal polynomial of an algebraic number; among other uses in commutative algebra and representation theory; 20 | In ring theory, the height of a prime ideal is the supremum of the lengths of all chains of prime ideals contained in it. 21 | 22 | 23 | == In geosciences == 24 | 25 | Although height is normally relative to a plane of reference, most measurements of height in the physical world are based upon a zero surface, known as sea level. Both altitude and elevation, two synonyms for height, are usually defined as the position of a point above the mean sea level. One can extend the sea-level surface under the continents: naively, one can imagine a lot of narrow canals through the continents. In practice, the sea level under a continent has to be computed from gravity measurements, and slightly different computational methods exist; see Geodesy, heights. 26 | In addition to vertical position, the vertical extent of geographic landmarks can be defined in terms of topographic prominence. For example, the highest mountain (by elevation in reference to sea level) belongs to Mount Everest, located on the border of Nepal and Tibet, China; however the tallest mountain, by measurement of apex to base, is Mauna Kea in Hawaii, United States. 27 | 28 | 29 | === In geodesy === 30 | 31 | Geodesists formalize mean sea level (MSL) by means of the geoid, the equipotential surface that best fits MSL. Then various types of height (normal, dynamic, orthometric, etc.) can be defined, based on the assumption of density of topographic masses necessary in the continuation of MSL under the continents. 32 | A purely geometric quantity is the ellipsoidal height, reckoned from the surface of a reference ellipsoid, see Geodetic system, vertical datum. 33 | 34 | 35 | == In aviation == 36 | 37 | In aviation terminology, the terms height, altitude, and elevation are not synonyms. Usually, the altitude of an aircraft is measured from sea level, while its height is measured from ground level. Elevation is also measured from sea level, but is most often regarded as a property of the ground. Thus, elevation plus height can equal altitude, but the term altitude has several meanings in aviation. 38 | 39 | 40 | == In human culture == 41 | Human height is one of the areas of study within anthropometry. 
While environmental factors have some effect on variations in human height, these influences are insufficient to account for all differences between populations, suggesting that genetic factors are important for explaining variations between human populations. 42 | The United Nations uses height (among other statistics) to monitor changes in the nutrition of developing nations. In human populations, average height can distill down complex data about the group's birth, upbringing, social class, diet, and health care system. 43 | In their research, Baten, Stegl and van der Eng came to the conclusion that a change in the average height is a sign for a change in the economic development. With broad data of Indonesia, the researchers state that several incidents in the history of the country has led not only to a change in the economy but also to a change in the population's average height. 44 | 45 | 46 | == See also == 47 | 48 | 49 | == References == 50 | 51 | 52 | == External links == 53 | Media related to Height at Wikimedia Commons 54 | The dictionary definition of height at Wiktionary 55 | The dictionary definition of high at Wiktionary -------------------------------------------------------------------------------- /data/extended_wiki_articles/Weight_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Weight", 3 | "url": "https://en.wikipedia.org/wiki/Height", 4 | "fetched_at": "2025-07-16T23:30:50.322300", 5 | "content_file": "Weight.txt", 6 | "cleaned_length": 5425 7 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/arXiv_1706.03762_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Attention Is All You Need (arXiv:1706.03762)", 3 | "source": "arXiv", 4 | "url": "http://arxiv.org/pdf/1706.03762v7", 5 | "word_count": 4570, 6 | "character_count": 29993, 7 | "fetched_at": "2025-07-17T00:52:05.577684", 8 | "confusion_article": true, 9 | "arxiv_id": "1706.03762" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/arXiv_2005.14165_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Language Models are Few-Shot Learners (arXiv:2005.14165)", 3 | "source": "arXiv", 4 | "url": "http://arxiv.org/pdf/2005.14165v4", 5 | "word_count": 37995, 6 | "character_count": 236547, 7 | "fetched_at": "2025-07-17T00:52:09.715988", 8 | "confusion_article": true, 9 | "arxiv_id": "2005.14165" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/arXiv_2307.09288_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "Llama 2: Open Foundation and Fine-Tuned Chat Models (arXiv:2307.09288)", 3 | "source": "arXiv", 4 | "url": "http://arxiv.org/pdf/2307.09288v2", 5 | "word_count": 30533, 6 | "character_count": 249507, 7 | "fetched_at": "2025-07-17T00:52:01.770880", 8 | "confusion_article": true, 9 | "arxiv_id": "2307.09288" 10 | } -------------------------------------------------------------------------------- /data/extended_wiki_articles/dataset_summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "dataset_name": "Extended Wikipedia Articles for RAG Workshop", 3 | "created_at": "2025-07-16T23:32:16.846217", 4 | "total_articles": 
63, 5 | "articles": [ 6 | "BERT (language model)", 7 | "Node", 8 | "Electrical engineering", 9 | "Graph theory", 10 | "Control theory", 11 | "Software engineering", 12 | "Tree (data structure)", 13 | "Algorithm", 14 | "Feature selection", 15 | "Decision tree learning", 16 | "Deep learning", 17 | "Fuzzy logic", 18 | "Cognitive science", 19 | "Cluster analysis", 20 | "Generative pre-trained transformer", 21 | "Human\u2013computer interaction", 22 | "Feature recognition", 23 | "Support vector machine", 24 | "Weight", 25 | "Psychology", 26 | "Overfitting", 27 | "Layer", 28 | "Knowledge representation and reasoning", 29 | "Kernel (computing)", 30 | "Class (biology)", 31 | "Optimization (mathematics)", 32 | "Cybernetics", 33 | "Artificial neural network", 34 | "History of artificial intelligence", 35 | "Ensemble learning", 36 | "Bayesian network", 37 | "Computer science", 38 | "Neuroscience", 39 | "Transfer learning", 40 | "Linear algebra", 41 | "Statistics", 42 | "MORE (application)", 43 | "Information theory", 44 | "Evolutionary computation", 45 | "Dimensionality reduction", 46 | "Cryptography", 47 | "Transformer (machine learning model)", 48 | "Database management system", 49 | "Signal processing", 50 | "Computer network", 51 | "Operating system", 52 | "Node (computer science)", 53 | "Risk", 54 | "Portfolio (finance)", 55 | "Mathematical optimization", 56 | "Quantitative analysis (finance)", 57 | "Biology", 58 | "Evolution", 59 | "Bioinformatics", 60 | "Computational biology", 61 | "Physics", 62 | "Mathematics", 63 | "Statistical mechanics", 64 | "John von Neumann", 65 | "Artificial intelligence", 66 | "Engineering", 67 | "Data structure", 68 | "Software" 69 | ], 70 | "purpose": "Demonstrating naive RAG limitations with expanded knowledge base", 71 | "usage": "Used in naive-rag-challenges.ipynb for workshop demonstrations", 72 | "confusion_articles_added": { 73 | "timestamp": "2025-07-17T00:52:18.154965", 74 | "successful": [ 75 | "LLM Powered Autonomous Agents", 76 | "The Transformer Family Version 2.0", 77 | "What are Diffusion Models?", 78 | "Flow-based Deep Generative Models", 79 | "Learning with not Enough Data Part 3: Active Learning", 80 | "Contrastive Representation Learning", 81 | "Llama 2: Open Foundation and Fine-Tuned Chat Models", 82 | "Attention Is All You Need", 83 | "Language Models are Few-Shot Learners", 84 | "GPT-4 Technical Report" 85 | ], 86 | "failed": [ 87 | "Constitutional AI: Harmlessness from AI Feedback" 88 | ], 89 | "total_added": 10 90 | } 91 | } -------------------------------------------------------------------------------- /data/ingestion_summary.json: -------------------------------------------------------------------------------- 1 | { 2 | "ingestion_completed_at": "2025-07-17T00:56:04.438031", 3 | "collection_name": "workshop_wikipedia_extended", 4 | "embedding_model": "text-embedding-3-small", 5 | "embedding_dimension": 1536, 6 | "chunk_size": 300, 7 | "chunk_overlap": 50, 8 | "total_articles": 61, 9 | "total_chunks": 1210, 10 | "qdrant_url": "https://193ab6bf-6a0b-4687-9f5a-5c371f663592.eu-west-1-0.aws.cloud.qdrant.io", 11 | "usage_instructions": { 12 | "connection": "Use QDRANT_URL and QDRANT_API_KEY environment variables", 13 | "collection": "workshop_wikipedia_extended", 14 | "search_example": "client.query_points(collection_name, query=embedding, limit=5)" 15 | } 16 | } -------------------------------------------------------------------------------- /data/wiki_eval_dataset.json: 
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "question": "Who introduced the ReLU (rectified linear unit) activation function and in what year?", 4 | "ground_truth": "Kunihiko Fukushima in 1969.", 5 | "reference_context": "In 1969, Kunihiko Fukushima introduced the ReLU (rectified linear unit) activation function." 6 | }, 7 | { 8 | "question": "What was the first working deep learning algorithm and who published it?", 9 | "ground_truth": "The Group method of data handling, published by Alexey Ivakhnenko and Lapa in 1965.", 10 | "reference_context": "The first working deep learning algorithm was the Group method of data handling, a method to train arbitrarily deep neural networks, published by Alexey Ivakhnenko and Lapa in the Soviet Union (1965)." 11 | }, 12 | { 13 | "question": "Which CNN achieved superhuman performance in a visual pattern recognition contest for the first time in 2011?", 14 | "ground_truth": "DanNet by Dan Ciresan, Ueli Meier, Jonathan Masci, Luca Maria Gambardella, and Jürgen Schmidhuber.", 15 | "reference_context": "In 2011, a CNN named DanNet by Dan Ciresan, Ueli Meier, Jonathan Masci, Luca Maria Gambardella, and Jürgen Schmidhuber achieved for the first time superhuman performance in a visual pattern recognition contest, outperforming traditional methods by a factor of 3." 16 | }, 17 | { 18 | "question": "When was BERT introduced and by which organization?", 19 | "ground_truth": "October 2018 by researchers at Google.", 20 | "reference_context": "Bidirectional encoder representations from transformers (BERT) is a language model introduced in October 2018 by researchers at Google." 21 | }, 22 | { 23 | "question": "What are the two model sizes BERT was originally implemented in?", 24 | "ground_truth": "BERTBASE (110 million parameters) and BERTLARGE (340 million parameters).", 25 | "reference_context": "BERT was originally implemented in the English language at two model sizes, BERTBASE (110 million parameters) and BERTLARGE (340 million parameters)." 26 | }, 27 | { 28 | "question": "What percentage of tokens are randomly selected for the masked-prediction task in BERT's masked language modeling?", 29 | "ground_truth": "15% of tokens.", 30 | "reference_context": "In masked language modeling, 15% of tokens would be randomly selected for masked-prediction task, and the training objective was to predict the masked token given its context." 31 | }, 32 | { 33 | "question": "Who introduced the term 'deep learning' to the machine learning community and when?", 34 | "ground_truth": "Rina Dechter in 1986.", 35 | "reference_context": "The term deep learning was introduced to the machine learning community by Rina Dechter in 1986, and to artificial neural networks by Igor Aizenberg and colleagues in 2000, in the context of Boolean threshold neurons." 36 | }, 37 | { 38 | "question": "Which three researchers were awarded the 2018 Turing Award for their work on deep neural networks?", 39 | "ground_truth": "Yoshua Bengio, Geoffrey Hinton and Yann LeCun.", 40 | "reference_context": "Yoshua Bengio, Geoffrey Hinton and Yann LeCun were awarded the 2018 Turing Award for \"conceptual and engineering breakthroughs that have made deep neural networks a critical component of computing\"." 41 | }, 42 | { 43 | "question": "When was the first GPT introduced and by which organization?", 44 | "ground_truth": "2018 by OpenAI.", 45 | "reference_context": "The first GPT was introduced in 2018 by OpenAI." 
46 | }, 47 | { 48 | "question": "What were the three parameter sizes of the first versions of GPT-3 released in July 2020?", 49 | "ground_truth": "1B, 6.7B, and 175B parameters.", 50 | "reference_context": "Regarding more recent GPT foundation models, OpenAI published its first versions of GPT-3 in July 2020. There were three models, with 1B, 6.7B, 175B parameters, respectively named babbage, curie, and davinci (giving initials B, C, and D)." 51 | }, 52 | { 53 | "question": "What is the 'one in ten rule' in regression analysis?", 54 | "ground_truth": "The guideline of 10 observations per independent variable.", 55 | "reference_context": "For logistic regression or Cox proportional hazards models, there are a variety of rules of thumb (e.g. 5–9, 10 and 10–15 — the guideline of 10 observations per independent variable is known as the \"one in ten rule\")." 56 | }, 57 | { 58 | "question": "What is the essence of overfitting according to the article?", 59 | "ground_truth": "To have unknowingly extracted some of the residual variation (i.e., the noise) as if that variation represented underlying model structure.", 60 | "reference_context": "The essence of overfitting is to have unknowingly extracted some of the residual variation (i.e., the noise) as if that variation represented underlying model structure." 61 | }, 62 | { 63 | "question": "In which year and paper was the modern version of the transformer proposed?", 64 | "ground_truth": "2017 in the paper \"Attention Is All You Need\" by researchers at Google.", 65 | "reference_context": "The modern version of the transformer was proposed in the 2017 paper \"Attention Is All You Need\" by researchers at Google." 66 | }, 67 | { 68 | "question": "What value did the original Transformer paper use for the parameter N in positional encoding?", 69 | "ground_truth": "N = 10000.", 70 | "reference_context": "The original paper uses N = 10000." 71 | } 72 | ] -------------------------------------------------------------------------------- /default.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sarangk90/building-rag-app-workshop/0a781d4daef8de00305e46a0626a47dab0309cf3/default.sqlite -------------------------------------------------------------------------------- /imgs/ingestion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sarangk90/building-rag-app-workshop/0a781d4daef8de00305e46a0626a47dab0309cf3/imgs/ingestion.png -------------------------------------------------------------------------------- /imgs/naive-rag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Sarangk90/building-rag-app-workshop/0a781d4daef8de00305e46a0626a47dab0309cf3/imgs/naive-rag.png -------------------------------------------------------------------------------- /naive-rag/rag_evaluator_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simplified RAG Evaluator using modern RAGAS API 3 | Evaluates RAG systems using context-focused metrics. 
4 | """ 5 | 6 | import json 7 | import pandas as pd 8 | from pathlib import Path 9 | from typing import List, Dict, Any, Callable 10 | 11 | from ragas import evaluate 12 | from ragas.metrics import LLMContextRecall, LLMContextPrecisionWithReference, ContextRelevance 13 | from ragas.dataset_schema import SingleTurnSample, EvaluationDataset 14 | from ragas.llms import LangchainLLMWrapper 15 | from langchain_openai import ChatOpenAI 16 | 17 | 18 | def evaluate_naive_rag_v2( 19 | vector_search_func: Callable, 20 | generate_answer_func: Callable, 21 | show_detailed: bool = False, 22 | ) -> Dict[str, Any]: 23 | """ 24 | Evaluate RAG system using RAGAS context metrics. 25 | 26 | Args: 27 | vector_search_func: Function to perform vector search 28 | generate_answer_func: Function to generate answers 29 | show_detailed: Whether to show detailed individual question results 30 | 31 | Returns: 32 | Dictionary containing evaluation results 33 | """ 34 | # Load evaluation dataset 35 | base_dir = Path(__file__).resolve().parent.parent 36 | eval_dataset_path = base_dir / "data" / "wiki_eval_dataset.json" 37 | 38 | try: 39 | with eval_dataset_path.open("r", encoding="utf-8") as f: 40 | eval_data = json.load(f) 41 | print(f"✅ Loaded {len(eval_data)} questions from evaluation dataset") 42 | except FileNotFoundError: 43 | print(f"❌ Evaluation dataset not found at {eval_dataset_path}") 44 | return {"error": "Dataset not found"} 45 | 46 | # Initialize LLM for RAGAS metrics 47 | llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o", temperature=0)) 48 | 49 | # Initialize metrics 50 | metrics = [ 51 | LLMContextRecall(llm=llm), 52 | ] 53 | 54 | # Process each question 55 | samples = [] 56 | individual_results = [] 57 | print(f"\nEvaluating {len(eval_data)} questions...\n") 58 | 59 | for i, item in enumerate(eval_data): 60 | print(f"Question {i+1}/{len(eval_data)}: {item['question'][:50]}...") 61 | 62 | try: 63 | # Get RAG system outputs 64 | search_results = vector_search_func(item["question"]) 65 | retrieved_contexts = [result.get("text", "") for result in search_results] 66 | generated_answer = generate_answer_func(item["question"]) 67 | 68 | # Create RAGAS sample with reference context 69 | sample = SingleTurnSample( 70 | user_input=item["question"], 71 | retrieved_contexts=retrieved_contexts, 72 | response=generated_answer, 73 | reference=item["ground_truth"], 74 | reference_contexts=[item["reference_context"]] # Use reference_context! 
75 | ) 76 | samples.append(sample) 77 | 78 | # Store individual question data for later display 79 | individual_results.append({ 80 | "question": item["question"], 81 | "retrieved_contexts": retrieved_contexts, 82 | "generated_answer": generated_answer, 83 | "ground_truth": item["ground_truth"], 84 | "reference_context": item["reference_context"] 85 | }) 86 | 87 | except Exception as e: 88 | print(f" ⚠️ Error processing question: {e}") 89 | 90 | # Run RAGAS evaluation 91 | print("\n🔍 Running RAGAS evaluation...") 92 | dataset = EvaluationDataset(samples) 93 | results = evaluate(dataset, metrics=metrics) 94 | 95 | # Format and display results 96 | formatted_results = _format_results(results, individual_results) 97 | _print_results(formatted_results, show_detailed) 98 | 99 | return formatted_results 100 | 101 | 102 | def _format_results(results, individual_results: List[Dict] = None) -> Dict[str, Any]: 103 | """Format RAGAS results into a clean dictionary.""" 104 | formatted = { 105 | "metrics": {}, 106 | "aggregate_scores": {}, 107 | "individual_results": [] 108 | } 109 | 110 | # Extract scores from results DataFrame 111 | df = results.to_pandas() 112 | 113 | # Get metric columns (they might have different names) 114 | metric_mapping = { 115 | "context_recall": ["context_recall", "llm_context_recall"], 116 | } 117 | 118 | # Find the actual column name for context recall 119 | actual_recall_column = None 120 | for metric_name, possible_columns in metric_mapping.items(): 121 | for col in possible_columns: 122 | if col in df.columns: 123 | actual_recall_column = col 124 | score = df[col].mean() 125 | formatted["metrics"][metric_name] = float(score) 126 | formatted["aggregate_scores"][metric_name] = float(score) 127 | break 128 | if actual_recall_column: 129 | break 130 | 131 | # Calculate overall score 132 | if formatted["metrics"]: 133 | formatted["aggregate_scores"]["overall_context_score"] = float( 134 | sum(formatted["metrics"].values()) / len(formatted["metrics"]) 135 | ) 136 | 137 | # Add individual question results with their scores 138 | if individual_results and not df.empty and actual_recall_column: 139 | for i, individual_result in enumerate(individual_results): 140 | question_result = { 141 | "question": individual_result["question"], 142 | "generated_answer": individual_result["generated_answer"], 143 | "ground_truth": individual_result["ground_truth"], 144 | "scores": {} 145 | } 146 | 147 | # Extract individual score for this question 148 | if i < len(df): 149 | score = df.iloc[i][actual_recall_column] 150 | if not pd.isna(score): 151 | question_result["scores"]["context_recall"] = float(score) 152 | 153 | formatted["individual_results"].append(question_result) 154 | 155 | return formatted 156 | 157 | 158 | def _print_results(results: Dict[str, Any], show_detailed: bool = False): 159 | """Print formatted evaluation results.""" 160 | print("\n" + "=" * 60) 161 | print("RAGAS EVALUATION RESULTS") 162 | print("=" * 60) 163 | 164 | if "error" in results: 165 | print(f"❌ Error: {results['error']}") 166 | return 167 | 168 | # Display individual question results if requested or show summary 169 | individual_results = results.get("individual_results", []) 170 | if individual_results: 171 | if show_detailed: 172 | print("\n📋 DETAILED INDIVIDUAL QUESTION RESULTS:") 173 | print("-" * 60) 174 | 175 | for i, result in enumerate(individual_results, 1): 176 | print(f"\n{'='*40} Question {i} {'='*40}") 177 | print(f"❓ QUESTION: {result['question']}") 178 | 179 | # Display scores 180 | scores = 
result.get("scores", {}) 181 | if "context_recall" in scores: 182 | score = scores["context_recall"] 183 | icon = _get_score_icon(score) 184 | print(f"\n📊 {icon} Context Recall Score: {score:.3f}") 185 | 186 | # Display generated answer 187 | print(f"\n🤖 GENERATED ANSWER:") 188 | print(f"{result.get('generated_answer', 'No answer generated')}") 189 | 190 | # Display ground truth 191 | print(f"\n✅ GROUND TRUTH:") 192 | print(f"{result.get('ground_truth', 'No ground truth available')}") 193 | 194 | print(f"\n{'-'*80}") 195 | else: 196 | print("\n📋 INDIVIDUAL QUESTION SCORES:") 197 | print("-" * 60) 198 | 199 | individual_scores_shown = 0 200 | for i, result in enumerate(individual_results, 1): 201 | scores = result.get("scores", {}) 202 | if "context_recall" in scores: 203 | score = scores["context_recall"] 204 | icon = _get_score_icon(score) 205 | question_preview = result['question'][:60] + "..." if len(result['question']) > 60 else result['question'] 206 | print(f"{i:2d}. {icon} {score:.3f} - {question_preview}") 207 | individual_scores_shown += 1 208 | else: 209 | question_preview = result['question'][:60] + "..." if len(result['question']) > 60 else result['question'] 210 | print(f"{i:2d}. ❓ N/A - {question_preview}") 211 | 212 | if individual_scores_shown == 0: 213 | print("⚠️ Individual scores not available - showing aggregate only") 214 | 215 | # Display aggregate results 216 | scores = results.get("aggregate_scores", {}) 217 | 218 | print("\n" + "=" * 60) 219 | print("📊 AGGREGATE RESULTS") 220 | print("=" * 60) 221 | 222 | print("\nCONTEXT RECALL METRIC (0.0 - 1.0 scale):") 223 | if "context_recall" in scores: 224 | score = scores["context_recall"] 225 | icon = _get_score_icon(score) 226 | print(f" {icon} Context Recall: {score:.3f}") 227 | 228 | print("=" * 60) 229 | 230 | if not show_detailed and individual_results: 231 | print("\n💡 Tip: Add show_detailed=True to see full question details") 232 | 233 | 234 | def _get_score_icon(score: float) -> str: 235 | """Get icon based on score.""" 236 | if score >= 0.8: 237 | return "🟢" 238 | elif score >= 0.6: 239 | return "🟡" 240 | elif score >= 0.4: 241 | return "🟠" 242 | else: 243 | return "🔴" 244 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wikipedia==1.4.0 2 | mwparserfromhell==0.7.2 3 | beautifulsoup4==4.13.4 4 | openai==1.97.0 5 | tqdm==4.67.1 6 | python-dotenv==1.1.1 7 | ipykernel==6.29.5 8 | nest-asyncio==1.6.0 9 | ipywidgets==8.1.7 10 | spacy==3.8.7 11 | qdrant-client==1.15.0 12 | matplotlib==3.10.3 13 | plotly==6.2.0 14 | pandas==2.3.1 15 | scikit-learn==1.7.1 16 | FlagEmbedding==1.3.5 17 | cohere==5.16.1 18 | numpy>=1.24,<2.2 19 | ragas==0.3.0 20 | datasets==4.0.0 21 | Pillow==11.3.0 22 | fastembed==0.7.1 23 | ranx==0.3.20 24 | 25 | #force pinning dependencies for compatibility with umap-learn 26 | numba>=0.58.0 27 | llvmlite>=0.41.0 28 | umap-learn==0.5.9.post2 29 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | # RAG Workshop Scripts 2 | 3 | This directory contains automation scripts to streamline the workshop experience by pre-processing data and setting up cloud infrastructure. 
4 | 5 | ## Overview 6 | 7 | The scripts eliminate the need for students to repeatedly fetch, clean, and embed Wikipedia articles during the workshop, allowing them to focus on learning RAG concepts rather than data preparation. 8 | 9 | ## Scripts 10 | 11 | ### 1. `create_extended_dataset.py` 12 | 13 | **Purpose**: Fetches and processes Wikipedia articles for the extended dataset used in naive-rag-challenges. 14 | 15 | **Features**: 16 | - Fetches 60+ Wikipedia articles covering ML, AI, and related topics 17 | - Cleans text by removing wiki markup and citations 18 | - Saves articles as individual text files with metadata 19 | - Idempotent operation (skips already downloaded articles) 20 | - Robust error handling with retry logic 21 | 22 | **Usage**: 23 | ```bash 24 | python scripts/create_extended_dataset.py 25 | ``` 26 | 27 | **Output**: 28 | - Creates `data/extended_wiki_articles/` directory 29 | - Saves individual article files (e.g., `Deep_learning.txt`) 30 | - Creates metadata files (e.g., `Deep_learning_metadata.json`) 31 | - Generates `dataset_summary.json` with overview 32 | 33 | **Requirements**: 34 | - `wikipedia`, `mwparserfromhell`, `beautifulsoup4`, `tqdm` 35 | 36 | ### 2. `ingest_to_qdrant_cloud.py` 37 | 38 | **Purpose**: Ingests the extended dataset into Qdrant Cloud for workshop use. 39 | 40 | **Features**: 41 | - Loads processed articles from extended dataset 42 | - Chunks text using consistent parameters (300-character chunks, 50-character overlap; see `data/ingestion_summary.json`) 43 | - Creates embeddings using OpenAI text-embedding-3-small 44 | - Uploads to Qdrant Cloud with proper collection setup 45 | - Batch processing for efficiency and API rate limit management 46 | - Verification and testing of ingested data 47 | 48 | **Usage**: 49 | ```bash 50 | python scripts/ingest_to_qdrant_cloud.py 51 | ``` 52 | 53 | **Prerequisites**: 54 | 1. Run `create_extended_dataset.py` first 55 | 2. Set environment variables: 56 | ```bash 57 | OPENAI_API_KEY=your_openai_key 58 | QDRANT_URL=your_qdrant_cloud_url 59 | QDRANT_API_KEY=your_qdrant_api_key 60 | ``` 61 | 62 | **Output**: 63 | - Creates `workshop_wikipedia_extended` collection in Qdrant Cloud 64 | - Uploads ~1,200 chunks with embeddings (see `data/ingestion_summary.json`) 65 | - Generates `data/ingestion_summary.json` with details 66 | 67 | **Requirements**: 68 | - `openai`, `qdrant-client`, `python-dotenv`, `tqdm` 69 | 70 | ## Workflow 71 | 72 | ### For Workshop Instructors: 73 | 74 | 1. **Setup Environment**: 75 | ```bash 76 | cp .env.example .env 77 | # Edit .env with your API keys 78 | ``` 79 | 80 | 2. **Create Extended Dataset**: 81 | ```bash 82 | python scripts/create_extended_dataset.py 83 | ``` 84 | 85 | 3. **Ingest to Qdrant Cloud**: 86 | ```bash 87 | python scripts/ingest_to_qdrant_cloud.py 88 | ``` 89 | 90 | 4. **Distribute to Students**: 91 | - Provide students with Qdrant Cloud URL and API keys 92 | - Students use the streamlined notebook version 93 | 94 | ### For Students: 95 | 96 | Students only need: 97 | - OpenAI API key for query embeddings 98 | - Qdrant Cloud URL and API key (provided by instructor) 99 | - Use `naive-rag-challenges-streamlined.ipynb` 100 | 101 | ## Configuration 102 | 103 | ### Environment Variables 104 | 105 | ```bash 106 | # Required for both scripts 107 | OPENAI_API_KEY=sk-...
108 | 109 | # Required for ingestion script 110 | QDRANT_URL=https://your-cluster.qdrant.tech 111 | QDRANT_API_KEY=your-api-key 112 | ``` 113 | 114 | ### Collection Settings 115 | 116 | - **Collection Name**: `workshop_wikipedia_extended` 117 | - **Vector Dimension**: 1536 (text-embedding-3-small) 118 | - **Distance Metric**: COSINE 119 | - **Chunk Size**: 300 characters (as recorded in `data/ingestion_summary.json`) 120 | - **Chunk Overlap**: 50 characters 121 | 122 | ## Benefits 123 | 124 | ### Time Savings 125 | - **Data Fetching**: ~5-10 minutes saved per student 126 | - **Embedding Creation**: ~10-15 minutes saved per student 127 | - **Setup Complexity**: Reduced from a multi-step local pipeline to a simple cloud connection 128 | 129 | ### Consistency 130 | - All students work with identical, pre-processed data 131 | - Eliminates variations due to Wikipedia content changes 132 | - Standardized chunking and embedding parameters 133 | 134 | ### Scalability 135 | - Qdrant Cloud handles concurrent student access 136 | - No individual API rate limit issues during workshops 137 | - Reliable performance regardless of class size 138 | 139 | ### Focus 140 | - Students concentrate on RAG concepts, not infrastructure 141 | - More time for advanced techniques and evaluation 142 | - Reduced troubleshooting of data preparation issues 143 | 144 | ## Troubleshooting 145 | 146 | ### Common Issues 147 | 148 | 1. **Wikipedia API Errors**: 149 | - Script includes retry logic and disambiguation handling 150 | - Check internet connection and Wikipedia availability 151 | 152 | 2. **OpenAI API Rate Limits**: 153 | - Ingestion script includes batch processing and delays 154 | - Monitor API usage and adjust batch sizes if needed 155 | 156 | 3. **Qdrant Connection Issues**: 157 | - Verify QDRANT_URL and QDRANT_API_KEY are correct 158 | - Check Qdrant Cloud cluster status 159 | 160 | 4. **Missing Dependencies**: 161 | ```bash 162 | pip install -r requirements.txt 163 | ``` 164 | 165 | ### Verification 166 | 167 | After running scripts, verify: 168 | 169 | 1. **Dataset Creation**: 170 | ```bash 171 | ls data/extended_wiki_articles/ 172 | # Should show 60+ .txt files with matching metadata JSON files 173 | ``` 174 | 175 | 2. **Qdrant Ingestion**: 176 | ```python 177 | from qdrant_client import QdrantClient 178 | client = QdrantClient(url=..., api_key=...) 179 | info = client.get_collection("workshop_wikipedia_extended") 180 | print(f"Points: {info.points_count}") 181 | ``` 182 | 183 | ## File Structure 184 | 185 | ``` 186 | scripts/ 187 | ├── README.md # This file 188 | ├── create_extended_dataset.py # Dataset creation script 189 | └── ingest_to_qdrant_cloud.py # Qdrant ingestion script 190 | 191 | data/ 192 | ├── extended_wiki_articles/ # Created by dataset script 193 | │ ├── Deep_learning.txt 194 | │ ├── Deep_learning_metadata.json 195 | │ ├── ... 196 | │ └── dataset_summary.json 197 | └── ingestion_summary.json # Created by ingestion script 198 | ``` 199 | 200 | ## Support 201 | 202 | For issues or questions: 203 | 1. Check the troubleshooting section above 204 | 2. Verify environment variables are set correctly 205 | 3. Ensure all dependencies are installed 206 | 4. Check script output for specific error messages 207 | 208 | The scripts are designed to be robust and provide clear feedback about their progress and any issues encountered.
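## Example: Querying the Prepared Collection

The sketch below is a quick end-to-end sanity check, not part of the workshop notebooks. It assumes the environment variables described under Configuration are set, and it uses the collection name and embedding model recorded in `data/ingestion_summary.json`; the sample question and the payload printing are purely illustrative.

```python
import os

from openai import OpenAI
from qdrant_client import QdrantClient

# Assumes OPENAI_API_KEY, QDRANT_URL, and QDRANT_API_KEY are set (see Configuration above)
openai_client = OpenAI()
qdrant = QdrantClient(url=os.environ["QDRANT_URL"], api_key=os.environ["QDRANT_API_KEY"])

# Illustrative question; any workshop-style query works here
question = "What is overfitting in machine learning?"

# Embed the query with the same model used at ingestion time (text-embedding-3-small)
embedding = openai_client.embeddings.create(
    model="text-embedding-3-small",
    input=question,
).data[0].embedding

# Retrieve the top matching chunks from the pre-ingested collection
results = qdrant.query_points(
    collection_name="workshop_wikipedia_extended",
    query=embedding,
    limit=5,
)

for point in results.points:
    # Payload field names depend on the ingestion script; print the raw payload to inspect them
    print(f"{point.score:.3f}  {str(point.payload)[:100]}")
```

If the collection is healthy, the returned payloads should contain recognizable chunks from the articles listed in `dataset_summary.json`.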
209 | -------------------------------------------------------------------------------- /scripts/create_extended_dataset.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Extended Dataset Creation Script for RAG Workshop 4 | 5 | This script fetches and processes Wikipedia articles for the naive-rag-challenges notebook. 6 | It creates a consistent, reusable dataset that students can use without repeating the 7 | data fetching and cleaning process. 8 | 9 | Usage: 10 | python scripts/create_extended_dataset.py 11 | """ 12 | 13 | import os 14 | import json 15 | import time 16 | from pathlib import Path 17 | from datetime import datetime 18 | from typing import Dict, List, Optional 19 | 20 | import wikipedia 21 | import re 22 | from mwparserfromhell import parse 23 | from bs4 import BeautifulSoup 24 | from tqdm import tqdm 25 | 26 | # Extended article list for demonstrating naive RAG limitations 27 | EXTENDED_ARTICLE_TITLES = [ 28 | # Original core articles 29 | "Deep learning", "Transformer (machine learning model)", 30 | "Natural language processing", "Reinforcement learning", 31 | "Artificial neural network", "Generative pre-trained transformer", 32 | "BERT (language model)", "Overfitting", 33 | 34 | # Previous expansion pack (broad adjacents) 35 | "Statistics", "Linear algebra", "Computer science", 36 | "Neuroscience", "Psychology", "Algorithm", 37 | "Information theory", "Probability theory", 38 | "Optimization (mathematics)", "Pattern recognition", 39 | "Signal processing", "Software engineering", 40 | "Human–computer interaction", "Cognitive science", 41 | "Data mining", "Knowledge representation and reasoning", 42 | "History of artificial intelligence", "Expert system", 43 | "Cybernetics", "TensorFlow", 44 | 45 | # New additions (to highlight semantic limitations) 46 | "Machine learning", "Bayesian network", "Graph theory", 47 | "Electrical engineering", "Quantum computing", "Robotics", 48 | "Game theory", "Control theory", "Big data", 49 | "Database management system", "Cryptography", 50 | "Evolutionary computation", "Fuzzy logic", 51 | "Decision tree learning", "Support vector machine", 52 | "Cluster analysis", "Dimensionality reduction", 53 | "Feature selection", "Ensemble learning", "Transfer learning", 54 | 55 | # CONFUSING ARTICLES - Added to demonstrate naive RAG limitations 56 | # Ambiguous terms (same word, different contexts) - using exact Wikipedia titles 57 | "Computer network", "Operating system", "Database", 58 | "Tree (data structure)", "Node (computer science)", 59 | 60 | # Cross-domain vocabulary overlap (finance) 61 | "Finance", "Risk", "Portfolio (finance)", 62 | "Mathematical optimization", "Quantitative analysis (finance)", 63 | 64 | # Cross-domain vocabulary overlap (biology) 65 | "Biology", "Evolution", "Genetics", 66 | "Bioinformatics", "Computational biology", 67 | 68 | # Cross-domain vocabulary overlap (physics) 69 | "Physics", "Mathematics", "Statistical mechanics", 70 | 71 | # Biographical articles (will match AI queries but provide different context) 72 | "Alan Turing", "John von Neumann", "Claude Shannon", 73 | "Geoffrey Hinton", "Yann LeCun", 74 | 75 | # Broader topics that will dilute specific queries 76 | "Artificial intelligence", "Engineering", 77 | "Data structure", "Software", "Hardware" 78 | ] 79 | 80 | 81 | def setup_directories(): 82 | """Create necessary directories for the extended dataset.""" 83 | base_dir = Path(__file__).parent.parent 84 | extended_dir = base_dir / 
"data" / "extended_wiki_articles" 85 | extended_dir.mkdir(parents=True, exist_ok=True) 86 | return extended_dir 87 | 88 | 89 | def clean_text(text: str) -> str: 90 | """ 91 | Clean Wikipedia text by removing markup and citations. 92 | 93 | Args: 94 | text: Raw Wikipedia content 95 | 96 | Returns: 97 | Cleaned text string 98 | """ 99 | # Remove wiki markup and citation numbers 100 | text = ''.join(parse(text).strip_code()) 101 | soup = BeautifulSoup(text, 'html.parser') 102 | text = soup.get_text() 103 | return re.sub(r'\[\d+\]', '', text).strip() 104 | 105 | 106 | def fetch_wikipedia_article(title: str, max_retries: int = 3) -> Optional[Dict]: 107 | """ 108 | Fetch a Wikipedia article with retry logic. 109 | 110 | Args: 111 | title: Wikipedia article title 112 | max_retries: Maximum number of retry attempts 113 | 114 | Returns: 115 | Dictionary with article data or None if failed 116 | """ 117 | for attempt in range(max_retries): 118 | try: 119 | page = wikipedia.page(title) 120 | return { 121 | "title": title, 122 | "url": page.url, 123 | "raw_content": page.content, 124 | "fetched_at": datetime.now().isoformat() 125 | } 126 | except wikipedia.exceptions.DisambiguationError as e: 127 | print(f" Disambiguation for '{title}', using: {e.options[0]}") 128 | return fetch_wikipedia_article(e.options[0], max_retries - 1) 129 | except wikipedia.exceptions.PageError: 130 | print(f" Page not found: {title}") 131 | return None 132 | except Exception as e: 133 | print(f" Attempt {attempt + 1} failed for '{title}': {e}") 134 | if attempt < max_retries - 1: 135 | time.sleep(2 ** attempt) # Exponential backoff 136 | else: 137 | print(f" Failed to fetch '{title}' after {max_retries} attempts") 138 | return None 139 | 140 | return None 141 | 142 | 143 | def save_article(article_data: Dict, output_dir: Path) -> bool: 144 | """ 145 | Save article data to individual files. 146 | 147 | Args: 148 | article_data: Dictionary containing article information 149 | output_dir: Directory to save the article 150 | 151 | Returns: 152 | True if saved successfully, False otherwise 153 | """ 154 | try: 155 | # Clean filename 156 | safe_title = re.sub(r'[^\w\s-]', '', article_data['title']) 157 | safe_title = re.sub(r'[-\s]+', '_', safe_title) 158 | 159 | # Save raw content 160 | content_file = output_dir / f"{safe_title}.txt" 161 | with open(content_file, 'w', encoding='utf-8') as f: 162 | f.write(article_data['raw_content']) 163 | 164 | # Save metadata 165 | metadata = { 166 | "title": article_data['title'], 167 | "url": article_data['url'], 168 | "fetched_at": article_data['fetched_at'], 169 | "content_file": f"{safe_title}.txt", 170 | "cleaned_length": len(clean_text(article_data['raw_content'])) 171 | } 172 | 173 | metadata_file = output_dir / f"{safe_title}_metadata.json" 174 | with open(metadata_file, 'w', encoding='utf-8') as f: 175 | json.dump(metadata, f, indent=2) 176 | 177 | return True 178 | except Exception as e: 179 | print(f" Error saving article '{article_data['title']}': {e}") 180 | return False 181 | 182 | 183 | def load_existing_articles(output_dir: Path) -> set: 184 | """ 185 | Load list of already processed articles. 
186 | 187 | Args: 188 | output_dir: Directory containing existing articles 189 | 190 | Returns: 191 | Set of article titles already processed 192 | """ 193 | existing = set() 194 | for metadata_file in output_dir.glob("*_metadata.json"): 195 | try: 196 | with open(metadata_file, 'r', encoding='utf-8') as f: 197 | metadata = json.load(f) 198 | existing.add(metadata['title']) 199 | except Exception as e: 200 | print(f"Warning: Could not read {metadata_file}: {e}") 201 | 202 | return existing 203 | 204 | 205 | def create_dataset_summary(output_dir: Path, processed_articles: List[str]): 206 | """ 207 | Create a summary file for the extended dataset. 208 | 209 | Args: 210 | output_dir: Directory containing the articles 211 | processed_articles: List of successfully processed article titles 212 | """ 213 | summary = { 214 | "dataset_name": "Extended Wikipedia Articles for RAG Workshop", 215 | "created_at": datetime.now().isoformat(), 216 | "total_articles": len(processed_articles), 217 | "articles": processed_articles, 218 | "purpose": "Demonstrating naive RAG limitations with expanded knowledge base", 219 | "usage": "Used in naive-rag-challenges.ipynb for workshop demonstrations" 220 | } 221 | 222 | summary_file = output_dir / "dataset_summary.json" 223 | with open(summary_file, 'w', encoding='utf-8') as f: 224 | json.dump(summary, f, indent=2) 225 | 226 | print(f"\n📋 Dataset summary saved to: {summary_file}") 227 | 228 | 229 | def main(): 230 | """Main function to create the extended dataset.""" 231 | print("🚀 Creating Extended Wikipedia Dataset for RAG Workshop") 232 | print("=" * 60) 233 | 234 | # Setup 235 | output_dir = setup_directories() 236 | print(f"📁 Output directory: {output_dir}") 237 | 238 | # Check existing articles 239 | existing_articles = load_existing_articles(output_dir) 240 | articles_to_fetch = [title for title in EXTENDED_ARTICLE_TITLES 241 | if title not in existing_articles] 242 | 243 | if existing_articles: 244 | print(f"✅ Found {len(existing_articles)} existing articles") 245 | 246 | if not articles_to_fetch: 247 | print("🎉 All articles already exist! 
Dataset is up to date.") 248 | create_dataset_summary(output_dir, list(existing_articles)) 249 | return 250 | 251 | print(f"📥 Fetching {len(articles_to_fetch)} new articles...") 252 | 253 | # Fetch and process articles 254 | processed_articles = list(existing_articles) 255 | failed_articles = [] 256 | 257 | for title in tqdm(articles_to_fetch, desc="Fetching articles"): 258 | print(f"\n📖 Processing: {title}") 259 | 260 | article_data = fetch_wikipedia_article(title) 261 | if article_data: 262 | if save_article(article_data, output_dir): 263 | processed_articles.append(title) 264 | print(f" ✅ Saved successfully") 265 | else: 266 | failed_articles.append(title) 267 | else: 268 | failed_articles.append(title) 269 | 270 | # Results summary 271 | print("\n" + "=" * 60) 272 | print("📊 DATASET CREATION SUMMARY") 273 | print("=" * 60) 274 | print(f"✅ Successfully processed: {len(processed_articles)} articles") 275 | 276 | if failed_articles: 277 | print(f"❌ Failed to process: {len(failed_articles)} articles") 278 | print("Failed articles:", failed_articles) 279 | 280 | # Create dataset summary 281 | create_dataset_summary(output_dir, processed_articles) 282 | 283 | print(f"\n🎯 Extended dataset ready for workshop!") 284 | print(f"📍 Location: {output_dir}") 285 | print(f"📚 Total articles: {len(processed_articles)}") 286 | print("\n💡 Next step: Run the ingestion script to upload to Qdrant Cloud") 287 | 288 | 289 | if __name__ == "__main__": 290 | main() 291 | -------------------------------------------------------------------------------- /scripts/fetch_additional_articles.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Fetch Additional Wikipedia Articles 4 | 5 | Script to fetch additional Wikipedia articles that are not already 6 | in the data/wiki_articles directory. Useful for expanding the dataset. 7 | """ 8 | 9 | import argparse 10 | from pathlib import Path 11 | from typing import List 12 | from wiki_article_loader import ( 13 | fetch_wikipedia_article, 14 | save_article_to_file, 15 | get_available_article_titles 16 | ) 17 | 18 | 19 | def fetch_additional_articles( 20 | article_titles: List[str], 21 | data_dir: str = "data/wiki_articles", 22 | skip_existing: bool = True 23 | ) -> None: 24 | """ 25 | Fetch additional Wikipedia articles and save them. 
26 | 27 | Args: 28 | article_titles: List of article titles to fetch 29 | data_dir: Directory to save articles 30 | skip_existing: Whether to skip articles that already exist 31 | """ 32 | # Get existing articles if skip_existing is True 33 | existing_titles = set() 34 | if skip_existing: 35 | existing_titles = set(get_available_article_titles(data_dir)) 36 | 37 | # Create data directory if it doesn't exist 38 | Path(data_dir).mkdir(parents=True, exist_ok=True) 39 | 40 | print(f"Fetching articles to {data_dir}") 41 | print(f"Skip existing articles: {skip_existing}") 42 | 43 | successful = 0 44 | skipped = 0 45 | failed = 0 46 | 47 | for title in article_titles: 48 | if skip_existing and title in existing_titles: 49 | print(f"Skipping existing article: {title}") 50 | skipped += 1 51 | continue 52 | 53 | print(f"Fetching: {title}") 54 | article = fetch_wikipedia_article(title) 55 | 56 | if article and save_article_to_file(article, data_dir): 57 | successful += 1 58 | else: 59 | failed += 1 60 | 61 | print(f"\nSummary:") 62 | print(f" Successfully fetched: {successful}") 63 | print(f" Skipped (existing): {skipped}") 64 | print(f" Failed: {failed}") 65 | print(f" Total processed: {len(article_titles)}") 66 | 67 | 68 | # Extended list of ML/AI related articles for workshop expansion 69 | EXTENDED_ARTICLE_TITLES = [ 70 | # Core ML concepts 71 | "Machine learning", 72 | "Supervised learning", 73 | "Unsupervised learning", 74 | "Semi-supervised learning", 75 | 76 | # Neural Networks 77 | "Convolutional neural network", 78 | "Recurrent neural network", 79 | "Long short-term memory", 80 | "Attention mechanism", 81 | 82 | # NLP specific 83 | "Word embedding", 84 | "Named-entity recognition", 85 | "Part-of-speech tagging", 86 | "Sentiment analysis", 87 | 88 | # Computer Vision 89 | "Computer vision", 90 | "Image classification", 91 | "Object detection", 92 | "Image segmentation", 93 | 94 | # Advanced topics 95 | "Transfer learning", 96 | "Meta-learning", 97 | "Few-shot learning", 98 | "Zero-shot learning", 99 | 100 | # Optimization 101 | "Gradient descent", 102 | "Backpropagation", 103 | "Adam optimizer", 104 | "Learning rate", 105 | 106 | # Evaluation 107 | "Cross-validation", 108 | "Bias-variance tradeoff", 109 | "ROC curve", 110 | "Precision and recall" 111 | ] 112 | 113 | 114 | def main(): 115 | parser = argparse.ArgumentParser( 116 | description="Fetch additional Wikipedia articles for RAG workshop" 117 | ) 118 | parser.add_argument( 119 | "articles", 120 | nargs="*", 121 | help="Article titles to fetch (if not provided, uses extended list)" 122 | ) 123 | parser.add_argument( 124 | "--data-dir", 125 | default="data/wiki_articles", 126 | help="Directory to save articles (default: data/wiki_articles)" 127 | ) 128 | parser.add_argument( 129 | "--force", 130 | action="store_true", 131 | help="Fetch articles even if they already exist" 132 | ) 133 | parser.add_argument( 134 | "--list-available", 135 | action="store_true", 136 | help="List currently available articles and exit" 137 | ) 138 | parser.add_argument( 139 | "--list-extended", 140 | action="store_true", 141 | help="List extended article titles and exit" 142 | ) 143 | 144 | args = parser.parse_args() 145 | 146 | if args.list_available: 147 | print("Currently available articles:") 148 | titles = get_available_article_titles(args.data_dir) 149 | for i, title in enumerate(titles, 1): 150 | print(f"{i:2d}. 
{title}") 151 | print(f"\nTotal: {len(titles)} articles") 152 | return 153 | 154 | if args.list_extended: 155 | print("Extended article list:") 156 | for i, title in enumerate(EXTENDED_ARTICLE_TITLES, 1): 157 | print(f"{i:2d}. {title}") 158 | print(f"\nTotal: {len(EXTENDED_ARTICLE_TITLES)} articles") 159 | return 160 | 161 | # Determine which articles to fetch 162 | if args.articles: 163 | titles_to_fetch = args.articles 164 | print(f"Fetching user-specified articles: {titles_to_fetch}") 165 | else: 166 | titles_to_fetch = EXTENDED_ARTICLE_TITLES 167 | print(f"Fetching extended article list ({len(titles_to_fetch)} articles)") 168 | 169 | # Fetch the articles 170 | fetch_additional_articles( 171 | titles_to_fetch, 172 | data_dir=args.data_dir, 173 | skip_existing=not args.force 174 | ) 175 | 176 | 177 | if __name__ == "__main__": 178 | main() -------------------------------------------------------------------------------- /scripts/wiki_article_loader.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Wikipedia Article Loader Utilities 4 | 5 | This module provides functions to load existing wiki articles and 6 | fetch additional ones when needed for the RAG workshop. 7 | """ 8 | 9 | import os 10 | import re 11 | from pathlib import Path 12 | from typing import List, Dict, Optional 13 | import wikipedia 14 | from mwparserfromhell import parse 15 | from bs4 import BeautifulSoup 16 | 17 | 18 | def load_existing_wiki_articles( 19 | data_dir: str = "data/wiki_articles", 20 | article_titles: Optional[List[str]] = None 21 | ) -> List[Dict[str, str]]: 22 | """ 23 | Load existing wiki articles from the data directory. 24 | 25 | Args: 26 | data_dir: Directory containing wiki article text files 27 | article_titles: Optional list of specific article titles to load. 28 | If None, loads all available articles. 29 | 30 | Returns: 31 | List of dictionaries with title, content, and file_path 32 | """ 33 | articles = [] 34 | data_path = Path(data_dir) 35 | 36 | if not data_path.exists(): 37 | print(f"Warning: {data_dir} directory not found") 38 | return articles 39 | 40 | # Convert article titles to set for faster lookup if specified 41 | target_titles = set(article_titles) if article_titles else None 42 | found_titles = set() 43 | 44 | for file_path in data_path.glob("*.txt"): 45 | try: 46 | # Extract title from filename (remove .txt and replace underscores) 47 | title = file_path.stem.replace('_', ' ') 48 | 49 | # Skip if we have specific titles and this isn't one of them 50 | if target_titles and title not in target_titles: 51 | continue 52 | 53 | with open(file_path, 'r', encoding='utf-8') as f: 54 | content = f.read() 55 | 56 | articles.append({ 57 | "title": title, 58 | "content": content, 59 | "file_path": str(file_path), 60 | "url": f"https://en.wikipedia.org/wiki/{file_path.stem}" 61 | }) 62 | 63 | if target_titles: 64 | found_titles.add(title) 65 | 66 | except Exception as e: 67 | print(f"Error reading {file_path}: {e}") 68 | 69 | # Report any missing articles if specific titles were requested 70 | if target_titles: 71 | missing_titles = target_titles - found_titles 72 | if missing_titles: 73 | print(f"Warning: The following articles were not found: {', '.join(missing_titles)}") 74 | print(f"Available articles: {', '.join(get_available_article_titles(data_dir))}") 75 | 76 | return articles 77 | 78 | 79 | def clean_text(text: str) -> str: 80 | """ 81 | Clean Wikipedia text by removing markup and citation numbers. 
82 | 83 | Args: 84 | text: Raw Wikipedia text 85 | 86 | Returns: 87 | Cleaned text 88 | """ 89 | # Remove wiki markup and citation numbers 90 | text = ''.join(parse(text).strip_code()) 91 | soup = BeautifulSoup(text, 'html.parser') 92 | text = soup.get_text() 93 | return re.sub(r'\[\d+\]', '', text).strip() 94 | 95 | 96 | def fetch_wikipedia_article(title: str) -> Optional[Dict[str, str]]: 97 | """ 98 | Fetch a single Wikipedia article and clean it. 99 | 100 | Args: 101 | title: Wikipedia article title 102 | 103 | Returns: 104 | Dictionary with article data or None if failed 105 | """ 106 | try: 107 | page = wikipedia.page(title) 108 | return { 109 | "title": title, 110 | "url": page.url, 111 | "raw_content": page.content, 112 | "content": clean_text(page.content) 113 | } 114 | except wikipedia.exceptions.DisambiguationError as e: 115 | print(f"Disambiguation for '{title}', trying first option: {e.options[0]}") 116 | return fetch_wikipedia_article(e.options[0]) 117 | except wikipedia.exceptions.PageError: 118 | print(f"Page not found: {title}") 119 | return None 120 | except Exception as e: 121 | print(f"Error fetching {title}: {e}") 122 | return None 123 | 124 | 125 | def save_article_to_file(article: Dict[str, str], data_dir: str = "data/wiki_articles") -> bool: 126 | """ 127 | Save an article to a text file. 128 | 129 | Args: 130 | article: Dictionary with article data (must have 'title' and 'content') 131 | data_dir: Directory to save the article 132 | 133 | Returns: 134 | True if saved successfully, False otherwise 135 | """ 136 | try: 137 | data_path = Path(data_dir) 138 | data_path.mkdir(parents=True, exist_ok=True) 139 | 140 | # Create filename from title 141 | filename = article["title"].replace(' ', '_').replace('/', '_') + '.txt' 142 | file_path = data_path / filename 143 | 144 | with open(file_path, 'w', encoding='utf-8') as f: 145 | f.write(article["content"]) 146 | 147 | print(f"Saved: {filename}") 148 | return True 149 | except Exception as e: 150 | print(f"Error saving article '{article['title']}': {e}") 151 | return False 152 | 153 | 154 | def get_available_article_titles(data_dir: str = "data/wiki_articles") -> List[str]: 155 | """ 156 | Get list of available article titles from the data directory. 
157 | 158 | Args: 159 | data_dir: Directory containing wiki articles 160 | 161 | Returns: 162 | List of article titles 163 | """ 164 | data_path = Path(data_dir) 165 | if not data_path.exists(): 166 | return [] 167 | 168 | titles = [] 169 | for file_path in data_path.glob("*.txt"): 170 | title = file_path.stem.replace('_', ' ') 171 | titles.append(title) 172 | 173 | return sorted(titles) 174 | 175 | 176 | # Default article titles used in the workshop 177 | DEFAULT_ARTICLE_TITLES = [ 178 | "Deep learning", 179 | "Transformer (machine learning model)", 180 | "Natural language processing", 181 | "Reinforcement learning", 182 | "Artificial neural network", 183 | "Generative pre-trained transformer", 184 | "BERT (language model)", 185 | "Overfitting" 186 | ] 187 | 188 | # Predefined article combinations for different workshop scenarios 189 | ARTICLE_COMBINATIONS = { 190 | "minimal": ["Deep learning", "Artificial neural network"], 191 | "core_ml": ["Deep learning", "Artificial neural network", "Overfitting"], 192 | "transformers": ["Transformer (machine learning model)", "BERT (language model)", "Generative pre-trained transformer"], 193 | "all_neural": ["Deep learning", "Artificial neural network", "Transformer (machine learning model)", "BERT (language model)"], 194 | "default": DEFAULT_ARTICLE_TITLES 195 | } 196 | 197 | 198 | def load_article_combination( 199 | combination: str, 200 | data_dir: str = "data/wiki_articles" 201 | ) -> List[Dict[str, str]]: 202 | """ 203 | Load a predefined combination of articles. 204 | 205 | Args: 206 | combination: Name of the predefined combination 207 | data_dir: Directory containing wiki articles 208 | 209 | Returns: 210 | List of article dictionaries 211 | """ 212 | if combination not in ARTICLE_COMBINATIONS: 213 | available = ", ".join(ARTICLE_COMBINATIONS.keys()) 214 | raise ValueError(f"Unknown combination '{combination}'. Available: {available}") 215 | 216 | article_titles = ARTICLE_COMBINATIONS[combination] 217 | print(f"Loading '{combination}' combination: {', '.join(article_titles)}") 218 | 219 | return load_existing_wiki_articles(data_dir, article_titles) 220 | 221 | 222 | if __name__ == "__main__": 223 | # Example usage 224 | print("Available articles:") 225 | titles = get_available_article_titles() 226 | for title in titles: 227 | print(f" - {title}") 228 | 229 | print(f"\n=== Loading ALL articles (default behavior) ===") 230 | articles = load_existing_wiki_articles() # No params = load all 231 | print(f"Successfully loaded {len(articles)} articles") 232 | 233 | print(f"\n=== Loading SPECIFIC articles ===") 234 | specific_articles = load_existing_wiki_articles( 235 | article_titles=["Deep learning", "Artificial neural network"] 236 | ) 237 | print(f"Successfully loaded {len(specific_articles)} specific articles") 238 | 239 | print(f"\n=== Loading PREDEFINED combination ===") 240 | combo_articles = load_article_combination("minimal") 241 | print(f"Successfully loaded {len(combo_articles)} articles from 'minimal' combination") 242 | 243 | print(f"\n=== Available combinations ===") 244 | for combo_name, combo_titles in ARTICLE_COMBINATIONS.items(): 245 | print(f" - {combo_name}: {combo_titles}") 246 | 247 | # Show example content 248 | if articles: 249 | print(f"\nExample content from '{articles[0]['title']}':") 250 | print(f"{articles[0]['content'][:200]}...") --------------------------------------------------------------------------------