├── search
│   ├── indexing
│   │   ├── __init__.py
│   │   ├── simple_indexing.py
│   │   └── advanced_indexing.py
│   ├── serving
│   │   └── pagerank.py
│   ├── crawling
│   │   ├── simple_crawler.py
│   │   └── advanced_crawler.py
│   └── complete_examples
│       ├── simple_pagerank.py
│       └── advanced_pagerank.py
├── requirements.txt
├── client
│   ├── images
│   │   ├── sloth_search.png
│   │   ├── google_search_icon.svg
│   │   ├── google_mic.svg
│   │   └── google_camera.svg
│   ├── index.html
│   ├── search.html
│   └── styles.css
├── LICENSE
├── .gitignore
├── README.md
└── server
    └── google_search_api.py

/search/indexing/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | requests
3 | nltk
4 | flask
5 | flask-cors
--------------------------------------------------------------------------------
/client/images/sloth_search.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-CodingSloth/sloth-search/HEAD/client/images/sloth_search.png
--------------------------------------------------------------------------------
/client/images/google_search_icon.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/client/images/google_mic.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/client/images/google_camera.svg:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2024 The Coding Sloth
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /search/indexing/simple_indexing.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def simple_index_page(webpage, webpage_url): 4 | 5 | #Collect title and description 6 | title_tag = webpage.find('title') 7 | title = title_tag.get_text().strip() if title_tag else 'No Title' 8 | 9 | #Collect description 10 | description = '' 11 | meta_description = webpage.find('meta', attrs={'name': 'description'}) 12 | if meta_description and 'content' in meta_description.attrs: 13 | description = meta_description['content'] 14 | else: 15 | text_content = webpage.get_text(separator=" ", strip=True) 16 | description = text_content[:200] + "..." if len(text_content) > 200 else text_content 17 | 18 | #Grab ALL the words in the page 19 | #regex disgusting... 20 | words = re.findall(r'\b\w+\b', webpage.get_text(separator=" ", strip=True).lower()) 21 | 22 | #Double check and filter out any numbers, symbols, etc. 23 | #WE ONLY WANT WORDS 24 | words = [word for word in words if word.isalpha()] 25 | 26 | #Add the information to the index 27 | indexed_page = { 28 | "url": webpage_url, 29 | "title": title, 30 | "description": description, 31 | "words": words 32 | } 33 | print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(words)} \n") 34 | return indexed_page -------------------------------------------------------------------------------- /search/serving/pagerank.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | def compute_pagerank(graph, damping_factor=0.85, max_iterations=100, tol=1.0e-6): 4 | # Build the set of all URLs 5 | all_nodes = set(graph.keys()) 6 | for links in graph.values(): 7 | all_nodes.update(links) 8 | num_nodes = len(all_nodes) 9 | # Initialize PageRank scores 10 | pagerank = {url: 1.0 / num_nodes for url in all_nodes} 11 | # Identify dangling nodes (nodes with no outgoing links) 12 | dangling_nodes = [url for url in all_nodes if url not in graph or len(graph[url]) == 0] 13 | # Iterative computation 14 | for iteration in range(max_iterations): 15 | new_pagerank = {} 16 | # Sum of PageRank scores from dangling nodes 17 | dangling_sum = damping_factor * sum(pagerank[node] for node in dangling_nodes) / num_nodes 18 | for url in all_nodes: 19 | rank = (1.0 - damping_factor) / num_nodes 20 | rank += dangling_sum 21 | # Sum contributions from incoming links 22 | for node in graph: 23 | if url in graph[node]: 24 | out_degree = len(graph[node]) 25 | rank += damping_factor * pagerank[node] / out_degree 26 | new_pagerank[url] = rank 27 | # Check for convergence 28 | error = sum(abs(new_pagerank[url] - pagerank[url]) for url in all_nodes) 29 | if error < tol: 30 | break 31 | pagerank = new_pagerank 32 | for url in all_nodes: 33 | pagerank[url] = round(pagerank[url], 6) 34 | return pagerank 35 | -------------------------------------------------------------------------------- /search/crawling/simple_crawler.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import time 4 | import random 5 | 6 | def sloth_bot(): 7 | # our list of URLs to crawl 8 | urls = ["https://en.wikipedia.org/wiki/Google"] 9 | visited_urls = set() 10 | #timer to see how long it takes to crawl 11 | start = time.time() 12 | #Loops through the list of urls 13 | CRAWL_LIMIT = 15 14 | 
current_crawl_count = 0 15 | 16 | while urls and current_crawl_count < CRAWL_LIMIT: 17 | # grabs the next url 18 | current_url = urls.pop(0) 19 | print("time to crawl: " + current_url) 20 | time.sleep(random.uniform(1, 3)) 21 | try: 22 | response = requests.get(current_url) 23 | response.raise_for_status() 24 | except requests.RequestException as e: 25 | print(f"Failed to retrieve {current_url}: {e}") 26 | continue 27 | 28 | # grabbing the content of the page 29 | webpage = BeautifulSoup(response.content, "html.parser") 30 | 31 | # grabbing the links from the page 32 | hyperlinks = webpage.select("a[href]") 33 | # looping through the links and adding them to our list of urls 34 | for hyperlink in hyperlinks: 35 | url = hyperlink["href"] 36 | #Formats the url into a proper url (don't worry about this) 37 | if url.startswith("#"): 38 | continue 39 | if url.startswith("//"): 40 | url = "https:" + url 41 | elif url.startswith("/"): 42 | base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) 43 | url = base_url + url 44 | elif not url.startswith("http"): 45 | continue 46 | # 47 | url = url.split('#')[0] 48 | 49 | #if we haven't visited this url yet, add it to our list 50 | if url not in visited_urls: 51 | urls.append(url) 52 | visited_urls.add(url) 53 | 54 | current_crawl_count += 1 55 | 56 | 57 | def main(): 58 | # Start the crawling process 59 | sloth_bot() 60 | 61 | if __name__ == "__main__": 62 | main() 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /search/indexing/advanced_indexing.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | import ssl 3 | from nltk.corpus import stopwords 4 | from nltk.stem import PorterStemmer 5 | from nltk.tokenize import word_tokenize 6 | 7 | try: 8 | _create_unverified_https_context = ssl._create_unverified_context 9 | except AttributeError: 10 | pass 11 | else: 12 | ssl._create_default_https_context = _create_unverified_https_context 13 | nltk.download('stopwords') 14 | nltk.download('punkt_tab') 15 | try: 16 | _create_unverified_https_context = ssl._create_unverified_context 17 | except AttributeError: 18 | pass 19 | else: 20 | ssl._create_default_https_context = _create_unverified_https_context 21 | 22 | # Download NLTK data only if not already downloaded 23 | def download_nltk_resources(): 24 | try: 25 | stopwords.words('english') 26 | except LookupError: 27 | nltk.download('stopwords') 28 | try: 29 | word_tokenize('test') 30 | except LookupError: 31 | nltk.download('punkt') 32 | #Function that indexes the webpage 33 | def advanced_index_page(webpage, webpage_url): 34 | #Download NLTK data only if not already downloaded 35 | download_nltk_resources() 36 | 37 | # Initialize NLTK components 38 | stop_words = set(stopwords.words('english')) 39 | ps = PorterStemmer() 40 | #Collect title and description 41 | title_tag = webpage.find('title') 42 | title = title_tag.get_text().strip() if title_tag else 'No Title' 43 | 44 | #Collect description 45 | description = '' 46 | meta_description = webpage.find('meta', attrs={'name': 'description'}) 47 | if meta_description and 'content' in meta_description.attrs: 48 | description = meta_description['content'] 49 | else: 50 | text_content = webpage.get_text(separator=" ", strip=True) 51 | description = text_content[:200] + "..." if len(text_content) > 200 else text_content 52 | 53 | 54 | # Grab ALL the words in the page. 
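    # The steps below tokenize the page text, keep only alphabetic tokens, drop
    # English stop words such as "the" or "and", and stem what is left with the
    # Porter stemmer (so, for example, "running" and "runs" both become "run").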
55 | text_content = webpage.get_text(separator=' ', strip=True) 56 | #Splitting them into the individual words 57 | tokens = word_tokenize(text_content.lower()) 58 | #Big brain techniques 2 and 3 59 | #Stemming the words and removing stop words. 60 | filtered_words = [ 61 | ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words 62 | ] 63 | 64 | #Add the information to the index 65 | indexed_page = { 66 | "url": webpage_url, 67 | "title": title, 68 | "description": description, 69 | "words": filtered_words 70 | } 71 | #If you want to print the results 72 | #print(f"Indexed: {webpage_url}. \n Here's the info: \n title: {title} \n description: {description} \n number of words: {len(filtered_words)} \n") 73 | return indexed_page 74 | -------------------------------------------------------------------------------- /client/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | am real programmer 7 | 8 | 9 | 10 | 11 |
12 |
13 | About 14 | Store 15 |
16 |
17 | Gmail 18 | Images 19 | 23 | 27 |
28 |
29 |
30 | 31 | 32 | 33 |
40 |
41 | 45 | 46 | 47 | 48 |
49 |
50 | 53 | 54 |
55 |
56 |
57 |
58 |
59 | Advertising 60 | Business 61 | How Search works 64 |
65 |
66 | Carbon Neutral since 2007 69 |
70 |
71 | Privacy 72 | Terms 73 | Settings 74 |
75 |
76 | 77 | 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | 164 | .DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Sloth Search - A Google-like Search Engine Clone 2 | 3 | Sloth Search is a project that aims to recreate Google, including crawling, indexing, and serving results through a user-friendly front-end interface. The project consists of three main components: the Client, Search, and Server. 4 | [Check out the video for a full explanation here](https://youtu.be/WCpimlH0Kck?si=_zFzrb1cxZinWKo3) 5 | 6 | ## Project Structure 7 | 8 | The project is divided into the following folders: 9 | 10 | - **Client**: Contains the front-end code, providing a user interface similar to Google search, where users can enter queries and view search results. 11 | 12 | - **Search**: Contains the core components of Sloth Search, which replicate the three main parts of Google: 13 | 14 | - **Crawling**: The web crawler that collects information from the web. 15 | 16 | - **Indexing**: Processing and storing the content collected by the crawler for efficient searching. 17 | 18 | - **Serving (PageRank)**: Serving search results based on their relevance and PageRank algorithm. 19 | 20 | - **Server**: Contains the search API used to handle client requests and provide search results. 21 | 22 | ## Installation and Setup 23 | 24 | **1. Clone the Repository** 25 | 26 | ```sh 27 | git clone https://github.com/The-CodingSloth/sloth-search.git 28 | cd sloth-search 29 | ``` 30 | 31 | **2. Install the necessary Python dependencies** 32 | 33 | ```sh 34 | pip install -r requirements.txt 35 | ``` 36 | 37 | **3. Client Setup** 38 | 39 | - The client contains the HTML, CSS, and JavaScript code to run the front-end. 40 | 41 | - Open the `index.html` file in your browser, or use a static file server to serve the client code locally. 42 | 43 | - You can also use the live server extension. 44 | 45 | **4. Search Setup** 46 | 47 | - The `search` directory contains the code for crawling, indexing, and serving. 48 | 49 | - You can start the process by running: 50 | 51 | ```sh 52 | python search/complete_examples/advanced_pagerank.py 53 | ``` 54 | 55 | - This will crawl, index, and prepare the content for searching. 
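- The advanced pipeline writes its results to `advanced_pagerank_inverted_index.csv` and `advanced_pagerank.csv` in the directory you ran it from; the Flask API in `server/google_search_api.py` looks for them under `search/complete_examples/` first and then in the repository root.

- To serve results to the client, start the API from inside the `server` folder with `python google_search_api.py` (Flask's development server listens on `http://127.0.0.1:5000` by default).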
56 | 57 | - If you want to run any other files, do the same process: 58 | 59 | ```sh 60 | python search/ 61 | ``` 62 | 63 | ## How It Works 64 | 65 | **1. Crawling** 66 | 67 | - The crawler starts with a set of seed URLs and collects links and content from the web. 68 | 69 | - It respects `robots.txt` to avoid being blocked and to ensure ethical crawling. 70 | 71 | - Parsed data is stored in a format ready for indexing. 72 | 73 | **2. Indexing** 74 | 75 | - The indexing module processes the crawled pages. 76 | 77 | - The content is tokenized, cleaned, stemmed, and stop words are removed using the NLTK library. 78 | 79 | - The resulting indexed data is saved to be used by the search API. 80 | 81 | **3. Serving and PageRank** 82 | 83 | - The PageRank algorithm is used to rank pages based on their importance. 84 | 85 | - When a user searches for a query through the client, the server uses the indexed data and PageRank scores to return the most relevant pages. 86 | 87 | ## Important Notes 88 | 89 | - **Respecting Websites**: The crawler respects `robots.txt` rules. Please make sure not to overload any websites. 90 | 91 | - **PageRank Algorithm**: The implementation of the PageRank algorithm uses an iterative approach to rank pages based on the links. 92 | 93 | - **Data Storage**: The crawler and indexer use CSV files for data storage (`advanced_pagerank_inverted_index.csv` and `advanced_pagerank.csv`). Make sure these files are writable during execution. 94 | 95 | ## Contributing 96 | 97 | Contributions are welcome! If you'd like to contribute to the development of Sloth Search, feel free to fork the repository, make changes, and submit a pull request. 98 | 99 | ## License 100 | 101 | This project is open-source and available under the MIT License. 102 | 103 | If you have any questions or suggestions, feel free to contact me. 104 | 105 | Happy Searching with Sloth Search! 🦥🔍 106 | -------------------------------------------------------------------------------- /search/complete_examples/simple_pagerank.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import time 4 | import random 5 | import csv 6 | import sys 7 | import os 8 | # Add the root directory to sys.path 9 | # This is to be able to import modules from other directories (indexing and serving) idk why... 
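# (The reason: this script lives in search/complete_examples/, so the parent
# "search" directory has to be on sys.path before "indexing" and "serving"
# can be imported as packages.)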
10 | # any imports from indexing/serving need to happen under this 11 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 12 | from indexing.simple_indexing import simple_index_page 13 | from serving.pagerank import compute_pagerank 14 | 15 | def sloth_bot(): 16 | # Our list of URLs to crawl 17 | urls = ["https://en.wikipedia.org/wiki/Google"] 18 | visited_urls = set() 19 | 20 | # Create the index and graph 21 | index = {} # URL -> page contents 22 | pagerank_graph = {} # URL -> set of URLs it links to 23 | CRAWL_LIMIT = 5 24 | crawl_count = 0 25 | 26 | # Loops through the list of URLs 27 | while urls and crawl_count < CRAWL_LIMIT: 28 | # Grab the next URL 29 | current_url = urls.pop() 30 | if current_url in visited_urls: 31 | continue 32 | print("Time to crawl: " + current_url) 33 | time.sleep(random.uniform(1, 2)) 34 | try: 35 | response = requests.get(current_url) 36 | response.raise_for_status() 37 | except requests.RequestException as e: 38 | print(f"Failed to retrieve {current_url}: {e}") 39 | continue 40 | 41 | # Parse the content of the page 42 | webpage = BeautifulSoup(response.content, "html.parser") 43 | 44 | # Add the page to the index 45 | indexed_page = simple_index_page(webpage, current_url) 46 | index[current_url] = indexed_page 47 | visited_urls.add(current_url) 48 | 49 | # Grab the links from the page 50 | hyperlinks = webpage.select("a[href]") 51 | #This is where we store our connected pages 52 | hyperlink_connections = set() 53 | for hyperlink in hyperlinks: 54 | url = hyperlink["href"] 55 | # Format the URL into a proper URL 56 | if url.startswith("#"): 57 | continue 58 | if url.startswith("//"): 59 | url = "https:" + url 60 | elif url.startswith("/"): 61 | base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) 62 | url = base_url + url 63 | elif not url.startswith("http"): 64 | continue 65 | url = url.split('#')[0] 66 | #Add to the link connection 67 | hyperlink_connections.add(url) 68 | # If we haven't visited this URL yet, add it to our list 69 | if url not in visited_urls: 70 | urls.append(url) 71 | 72 | # Update the page's outgoing links 73 | index[current_url]['hyperlink_connections'] = hyperlink_connections 74 | pagerank_graph[current_url] = hyperlink_connections 75 | 76 | crawl_count += 1 77 | print(f"Crawled count: {crawl_count}, index size: {len(index)}, URLs left: {len(urls)}") 78 | 79 | # Compute PageRank 80 | pagerank_scores = compute_pagerank(pagerank_graph) 81 | 82 | """ This part is for saving the data to CSV files. 83 | However, if you don't want to save the data, you can remove/comment out this part. 84 | If you want to use a database, you can replace this part with a database connection. 
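    For a quick sanity check after a run, you can read the file back and print the
    top-ranked pages (the column names match the writer just below):

        import csv
        with open('simple_pagerank.csv', encoding='utf-8') as f:
            rows = sorted(csv.DictReader(f), key=lambda r: float(r['pagerank']), reverse=True)
        for row in rows[:5]:
            print(row['pagerank'], row['url'], row['title'])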
85 | """ 86 | 87 | with open('simple_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile: 88 | fieldnames = ["url", "title", "description", "pagerank", "words"] 89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 90 | writer.writeheader() 91 | for url, info in index.items(): 92 | writer.writerow({ 93 | 'url': url, 94 | 'title': info['title'], 95 | 'description': info['description'], 96 | 'pagerank': pagerank_scores.get(url, 0), 97 | 'words': ', '.join(info['words']) 98 | }) 99 | 100 | 101 | 102 | def main(): 103 | # Start the crawling process 104 | sloth_bot() 105 | 106 | if __name__ == "__main__": 107 | main() 108 | 109 | 110 | 111 | -------------------------------------------------------------------------------- /client/search.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Search Results - My Search Engine 7 | 8 | 9 | 10 |
11 | 12 | 13 | 14 | 15 |
22 |
23 | 24 | 25 | 26 | 30 |
31 |
32 |
33 |
34 | 35 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /client/styles.css: -------------------------------------------------------------------------------- 1 | * { 2 | margin: 0; 3 | padding: 0; 4 | 5 | font-family: 'Roboto', sans-serif; 6 | } 7 | 8 | body { 9 | display: flex; 10 | flex-direction: column; 11 | min-height: 100vh; 12 | /* ensures the body takes up at least the full viewport height */ 13 | } 14 | 15 | a { 16 | all: unset; 17 | text-decoration: none; 18 | /* no underline */ 19 | } 20 | 21 | .top-section { 22 | padding: 1rem; 23 | display: flex; 24 | justify-content: space-between; 25 | } 26 | 27 | .app-icon { 28 | width: 1.5rem; 29 | height: 1.5rem; 30 | } 31 | 32 | .profile-pic { 33 | width: 2rem; 34 | height: 2rem; 35 | border-radius: 100%; 36 | } 37 | 38 | .left-side { 39 | display: flex; 40 | gap: 1.5rem; 41 | } 42 | 43 | .right-side { 44 | display: flex; 45 | gap: 1.5rem; 46 | justify-content: center; 47 | align-items: center; 48 | } 49 | 50 | .left-side a, 51 | .right-side a { 52 | color: #202124; 53 | font-size: 0.8rem; 54 | } 55 | 56 | .middle-section { 57 | flex-grow: 1; 58 | display: flex; 59 | flex-direction: column; 60 | justify-content: center; 61 | align-items: center; 62 | padding: 1rem 0; 63 | gap: 1.2rem; 64 | } 65 | 66 | .search-label { 67 | display: none; 68 | } 69 | 70 | .search-form { 71 | display: flex; 72 | flex-direction: column; 73 | align-items: center; 74 | justify-content: center; 75 | gap: 2.5rem; 76 | } 77 | .result-search-form { 78 | flex-direction: column; 79 | align-items: center; 80 | justify-content: center; 81 | gap: 2.5rem; 82 | } 83 | 84 | .search-form-input { 85 | display: flex; 86 | align-items: center; 87 | justify-content: center; 88 | gap: 1rem; 89 | border: 1px solid #dfe1e5; 90 | border-radius: 30px; 91 | padding: 0.3rem 1.5rem; 92 | box-shadow: 0px 2px 4px rgba(0, 0, 0, 0.1); 93 | } 94 | 95 | .search-form input { 96 | width: 27rem; 97 | padding: 0.5rem; 98 | border: none; 99 | outline: none; 100 | } 101 | 102 | .buttons { 103 | display: flex; 104 | gap: 1rem; 105 | } 106 | 107 | .search-form button { 108 | border: 1px solid #f8f9fa; 109 | padding: 0.5rem 1rem; 110 | background-color: #f8f9fa; 111 | font-size: 0.9rem; 112 | } 113 | .search-icon-home { 114 | width: 1rem; 115 | height: 1rem; 116 | } 117 | .search-icon-result { 118 | width: 1.5rem; 119 | height: 1.5rem; 120 | } 121 | .mic, 122 | .camera { 123 | width: 1.5rem; 124 | height: 1.5rem; 125 | } 126 | 127 | .bottom-section { 128 | margin-top: 15rem; 129 | padding: 1rem; 130 | display: flex; 131 | justify-content: space-between; 132 | align-items: center; 133 | background-color: #f2f2f2; 134 | font-size: 0.9em; 135 | padding-left: 2rem; 136 | padding-right: 2rem; 137 | } 138 | 139 | .bottom-left, 140 | .bottom-right { 141 | display: flex; 142 | gap: 1.8rem; 143 | } 144 | 145 | .bottom-middle { 146 | padding-right: 10rem; 147 | } 148 | 149 | .bottom-section a { 150 | color: #70757a; 151 | } 152 | 153 | .search-form button { 154 | background-color: #f8f9fa; 155 | border: 1px solid #f8f9fa; 156 | border-radius: 4px; 157 | color: #3c4043; 158 | font-family: Roboto, arial, sans-serif; 159 | font-size: 14px; 160 | margin: 11px 4px; 161 | padding: 0 16px; 162 | line-height: 27px; 163 | height: 36px; 164 | min-width: 54px; 165 | text-align: center; 166 | cursor: pointer; 167 | user-select: none; 168 | } 169 | 170 | .bottom-section { 171 | display: flex; 172 | justify-content: space-between; 173 | align-items: center; 174 | 
background-color: #f2f2f2; 175 | padding: 1rem 1.5rem; 176 | margin-top: 15rem; 177 | } 178 | 179 | .bottom-section a { 180 | margin: 0 1rem; 181 | } 182 | 183 | .bottom-middle { 184 | margin-right: 8rem; 185 | } 186 | 187 | .search-result-area { 188 | display: flex; 189 | padding-left: 1rem; 190 | gap: 1rem; 191 | } 192 | .search-logo-home { 193 | width: 20rem; 194 | } 195 | .search-logo-result { 196 | width: 7rem; 197 | } 198 | 199 | #results { 200 | padding-top: 1rem; 201 | display: flex; 202 | flex-direction: column; 203 | gap: 1rem; 204 | padding-left: 2rem; 205 | padding-right: 2rem; 206 | } 207 | .result:hover { 208 | cursor: pointer; 209 | } 210 | 211 | .result-description { 212 | font-size: 0.8rem; 213 | width: 50%; 214 | color: #545454; 215 | } 216 | .result { 217 | margin-bottom: 20px; 218 | } 219 | .result-title { 220 | font-size: 18px; 221 | color: #1a0dab; 222 | text-decoration: none; 223 | } 224 | .result-title:hover { 225 | text-decoration: underline; 226 | } 227 | .result-url { 228 | font-size: 14px; 229 | color: #006621; 230 | } 231 | #pagination { 232 | display: flex; 233 | justify-content: center; 234 | align-items: center; 235 | gap: 1.5rem; 236 | padding: 2rem; 237 | font-size: 1.2rem; 238 | } 239 | 240 | #pagination a { 241 | color: #1a0dab; 242 | } 243 | 244 | #pagination a:hover { 245 | text-decoration: underline; 246 | cursor: pointer; 247 | } 248 | -------------------------------------------------------------------------------- /server/google_search_api.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, request, jsonify 2 | import csv 3 | import nltk 4 | from nltk.corpus import stopwords 5 | from nltk.stem import PorterStemmer 6 | from nltk.tokenize import word_tokenize 7 | import ssl 8 | from flask_cors import CORS 9 | app = Flask(__name__) 10 | 11 | 12 | CORS(app) 13 | 14 | # NLTK setup (handles SSL certificate issues) 15 | try: 16 | _create_unverified_https_context = ssl._create_unverified_context 17 | except AttributeError: 18 | pass 19 | else: 20 | ssl._create_default_https_context = _create_unverified_https_context 21 | 22 | # Download NLTK data only if not already downloaded 23 | def download_nltk_resources(): 24 | try: 25 | stopwords.words('english') 26 | except LookupError: 27 | nltk.download('stopwords') 28 | try: 29 | word_tokenize('test') 30 | except LookupError: 31 | nltk.download('punkt') 32 | 33 | # Initialize NLTK components 34 | download_nltk_resources() 35 | stop_words = set(stopwords.words('english')) 36 | ps = PorterStemmer() 37 | 38 | 39 | def load_inverted_index(file_path): 40 | inverted_index = {} 41 | with open(file_path, 'r', encoding='utf-8') as csvfile: 42 | reader = csv.DictReader(csvfile) 43 | for row in reader: 44 | word = row['word'] 45 | doc_ids_str = row['doc_ids'].strip("[]") # Remove brackets 46 | doc_ids_list = doc_ids_str.split(', ') if doc_ids_str else [] 47 | doc_ids = set(int(doc_id) for doc_id in doc_ids_list) 48 | inverted_index[word] = doc_ids 49 | return inverted_index 50 | 51 | def load_document_info(file_path): 52 | document_info = {} 53 | with open(file_path, 'r', encoding='utf-8') as csvfile: 54 | reader = csv.DictReader(csvfile) 55 | for row in reader: 56 | doc_id = int(row['doc_id']) 57 | document_info[doc_id] = { 58 | 'url': row['url'], 59 | 'title': row['title'], 60 | 'description': row['description'], 61 | 'pagerank': float(row['pagerank']) 62 | } 63 | return document_info 64 | 65 | def parse_query(query): 66 | # Tokenize the query 67 | tokens = 
word_tokenize(query.lower()) 68 | # Remove non-alphabetic tokens and stop words, then stem the words 69 | query_words = [ 70 | ps.stem(word) for word in tokens if word.isalpha() and word not in stop_words 71 | ] 72 | return query_words 73 | 74 | def search(query, inverted_index, document_info, num_results=10, page=1): 75 | query_words = parse_query(query) 76 | if not query_words: 77 | return [] 78 | # Find documents that contain any of the query words 79 | matched_doc_ids = set() 80 | for word in query_words: 81 | if word in inverted_index: 82 | matched_doc_ids.update(inverted_index[word]) 83 | if not matched_doc_ids: 84 | return [] 85 | # Retrieve documents and their PageRank scores 86 | results = [] 87 | for doc_id in matched_doc_ids: 88 | info = document_info[doc_id] 89 | results.append({ 90 | 'doc_id': doc_id, 91 | 'url': info['url'], 92 | 'title': info['title'], 93 | 'description': info['description'], 94 | 'pagerank': info['pagerank'] 95 | }) 96 | # Sort documents by PageRank score 97 | sorted_results = sorted(results, key=lambda x: x['pagerank'], reverse=True) 98 | # Pagination 99 | start = (page - 1) * num_results 100 | end = start + num_results 101 | paginated_results = sorted_results[start:end] 102 | return paginated_results 103 | 104 | # Load the inverted index and document info 105 | # If you are using a different file, replace the path with the path to your file 106 | #If you're using a database, replace this with the code to connect to your database 107 | try: 108 | inverted_index = load_inverted_index('../search/complete_examples/advanced_pagerank_inverted_index.csv') 109 | document_info = load_document_info('../search/complete_examples/advanced_pagerank.csv') 110 | except FileNotFoundError: 111 | try: 112 | inverted_index = load_inverted_index("../advanced_pagerank_inverted_index.csv") 113 | document_info = load_document_info("../advanced_pagerank.csv") 114 | except FileNotFoundError: 115 | print("Error: Files not found, run the advanced_pagerank.py file first") 116 | print("Exiting...") 117 | exit() 118 | 119 | 120 | @app.route('/search') 121 | def search_api(): 122 | query = request.args.get('q', '') 123 | num_results = int(request.args.get('num_results', 10)) 124 | page = int(request.args.get('page', 1)) 125 | if not query: 126 | return jsonify({'error': 'No query provided'}), 400 127 | results = search(query, inverted_index, document_info, num_results=num_results, page=page) 128 | return jsonify({ 129 | 'query': query, 130 | 'page': page, 131 | 'num_results': num_results, 132 | 'results': results 133 | }) 134 | 135 | if __name__ == '__main__': 136 | app.run(debug=True) -------------------------------------------------------------------------------- /search/crawling/advanced_crawler.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | import time 4 | import random 5 | from queue import Queue 6 | from concurrent.futures import ThreadPoolExecutor 7 | import threading 8 | from urllib.parse import urlparse 9 | import csv 10 | from indexing.advanced_indexing import index_page 11 | import sys 12 | import os 13 | # Add the root directory to sys.path 14 | # This is to be able to import modules from other directories (indexing and serving) idk why... 
15 | # any imports from indexing/serving need to happen under this 16 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 17 | 18 | # Function to check robots.txt for permission to crawl 19 | # If we don't do this, we could get blocked/banned 20 | # since we don't have permission to crawl. 21 | def can_crawl(url): 22 | parsed_url = urlparse(url) 23 | robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" 24 | print(f"Checking robots.txt for: {robots_url}") 25 | time.sleep(random.uniform(1, 3)) 26 | try: 27 | response = requests.get(robots_url, timeout=5) 28 | response.raise_for_status() 29 | disallowed_paths = [] 30 | for line in response.text.splitlines(): 31 | if line.startswith("Disallow"): 32 | parts = line.split() 33 | if len(parts) > 1: 34 | disallowed_paths.append(parts[1]) 35 | for path in disallowed_paths: 36 | if urlparse(url).path.startswith(path): 37 | print(f"Disallowed by robots.txt: {url}") 38 | return False 39 | return True 40 | except requests.RequestException: 41 | print(f"Failed to access robots.txt: {robots_url}") 42 | return False # If we can't access robots.txt, assume we can't crawl (we're being nice here) 43 | 44 | # Function to fetch and parse URL 45 | def crawl(args): 46 | queue = args['queue'] 47 | visited_urls = args['visited_urls'] 48 | crawl_count = args['crawl_count'] 49 | CRAWL_LIMIT = args['CRAWL_LIMIT'] 50 | lock = args['lock'] 51 | index = args['index'] 52 | webpage_info = args['webpage_info'] 53 | webpage_id_counter = args['webpage_id_counter'] 54 | stop_crawl = args['stop_crawl'] 55 | 56 | while not stop_crawl.is_set(): 57 | try: 58 | current_url = queue.get(timeout=5) 59 | print("Time to crawl: " + current_url) 60 | except Exception: 61 | break # Exit if no more URLs are available to crawl 62 | 63 | with lock: 64 | if crawl_count[0] >= CRAWL_LIMIT: 65 | queue.queue.clear() # Clear remaining URLs to stop processing 66 | print("Crawl limit reached. 
Exiting...") 67 | stop_crawl.set() 68 | break 69 | if current_url in visited_urls: 70 | queue.task_done() 71 | continue 72 | visited_urls.add(current_url) 73 | 74 | """ Checks for noindex directive in the page 75 | Comment this out if you don't care about noindex 76 | WARNING: websites could block/ban you if you don't have permission 77 | """ 78 | # if not can_crawl(current_url): 79 | # queue.task_done() 80 | # continue 81 | 82 | time.sleep(random.uniform(2, 5)) 83 | try: 84 | response = requests.get(current_url, timeout=5) 85 | response.raise_for_status() # Check for request errors 86 | content = response.content 87 | 88 | """ Checks for noindex directive in the page 89 | Comment this out if you don't care about noindex 90 | WARNING: websites could block/ban you if you don't have permission 91 | """ 92 | # if 'noindex' in content.decode('utf-8').lower(): 93 | # print(f"Noindex found, skipping: {current_url}") 94 | # queue.task_done() 95 | # continue 96 | 97 | 98 | # Parse the fetched content to find new URLs 99 | webpage = BeautifulSoup(content, "html.parser") 100 | 101 | # Index the webpage 102 | indexed_page = index_page(webpage, current_url) 103 | with lock: 104 | for word in indexed_page["words"]: 105 | if word not in index: 106 | index[word] = set() 107 | index[word].add(webpage_id_counter[0]) 108 | webpage_info[webpage_id_counter[0]] = indexed_page 109 | webpage_id_counter[0] += 1 110 | 111 | hyperlinks = webpage.select("a[href]") 112 | new_urls = parse_links(hyperlinks, current_url) 113 | 114 | with lock: 115 | for new_url in new_urls: 116 | if new_url not in visited_urls: 117 | queue.put(new_url) 118 | crawl_count[0] += 1 119 | 120 | except requests.RequestException as e: 121 | print(f"Failed to fetch {current_url}: {e}") 122 | finally: 123 | queue.task_done() 124 | 125 | # Function to parse links from HTML content 126 | def parse_links(hyperlinks, current_url): 127 | urls = [] 128 | for hyperlink in hyperlinks: 129 | url = hyperlink["href"] 130 | 131 | # Format the URL into a proper URL 132 | if url.startswith("#"): 133 | continue # Skip same-page anchors 134 | if url.startswith("//"): 135 | url = "https:" + url # Add scheme to protocol-relative URLs 136 | elif url.startswith("/"): 137 | # Construct full URL for relative links 138 | base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) 139 | url = base_url + url 140 | elif not url.startswith("http"): 141 | continue # Skip non-HTTP links 142 | url = url.split("#")[0] # Remove anchor 143 | urls.append(url) 144 | return urls 145 | 146 | # Main crawling function 147 | def sloth_bot(): 148 | # Start with the initial pages to crawl 149 | starting_urls = [ 150 | "https://www.wikipedia.org/wiki/Google", 151 | "https://www.bbc.com/news/world", 152 | "https://news.ycombinator.com/", 153 | ] 154 | 155 | urls_to_crawl = Queue() 156 | for seed_url in starting_urls: 157 | urls_to_crawl.put(seed_url) 158 | 159 | visited_urls = set() # URL tracking 160 | CRAWL_LIMIT = 20 # Set crawl limit 161 | crawl_count = [0] # Shared counter 162 | lock = threading.Lock() # Thread safety lock 163 | index = {} 164 | webpage_info = {} 165 | webpage_id_counter = [0] 166 | stop_crawl = threading.Event() 167 | 168 | # Start concurrent crawling with ThreadPoolExecutor 169 | #Concurrency = speed 170 | #Threads go BRRRRR 171 | #Increase this if you want more threads, but be careful with these. 
172 | NUM_WORKERS = 100 173 | #Setting up arguments for the crawl function 174 | args = { 175 | 'queue': urls_to_crawl, 176 | 'visited_urls': visited_urls, 177 | 'crawl_count': crawl_count, 178 | 'CRAWL_LIMIT': CRAWL_LIMIT, 179 | 'lock': lock, 180 | 'index': index, 181 | 'webpage_info': webpage_info, 182 | 'webpage_id_counter': webpage_id_counter, 183 | 'stop_crawl': stop_crawl 184 | } 185 | 186 | with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: 187 | for _ in range(NUM_WORKERS): 188 | executor.submit(crawl, args) 189 | 190 | print("All URLs have been crawled") 191 | 192 | 193 | """ This part is for saving the data to CSV files. 194 | However, if you don't want to save the data, you can remove/comment out this part. 195 | If you want to use a database, you can replace this part with a database connection. 196 | """ 197 | with open('advanced_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile: 198 | fieldnames = ['word', 'doc_ids'] 199 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 200 | writer.writeheader() 201 | for word, doc_ids in index.items(): 202 | writer.writerow({'word': word, 'doc_ids': list(doc_ids)}) 203 | 204 | with open('advanced_doc_info.csv', 'w', newline='', encoding='utf-8') as csvfile: 205 | fieldnames = ['doc_id', 'url', 'title', 'description'] 206 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 207 | writer.writeheader() 208 | for doc_id, info in webpage_info.items(): 209 | writer.writerow({ 210 | 'doc_id': doc_id, 211 | 'url': info['url'], 212 | 'title': info['title'], 213 | 'description': info['description'] 214 | }) 215 | 216 | def main(): 217 | # Start the crawling process 218 | sloth_bot() 219 | 220 | if __name__ == "__main__": 221 | main() 222 | 223 | 224 | 225 | -------------------------------------------------------------------------------- /search/complete_examples/advanced_pagerank.py: -------------------------------------------------------------------------------- 1 | 2 | from bs4 import BeautifulSoup 3 | import requests 4 | import time 5 | import random 6 | from queue import Queue 7 | from concurrent.futures import ThreadPoolExecutor 8 | import threading 9 | from urllib.parse import urlparse 10 | import csv 11 | import sys 12 | import os 13 | # Add the root directory to sys.path 14 | # This is to be able to import modules from other directories (indexing and serving) idk why... 15 | # any imports from indexing/serving need to happen under this 16 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 17 | from indexing.advanced_indexing import advanced_index_page 18 | from serving.pagerank import compute_pagerank 19 | 20 | 21 | # Function to check robots.txt for permission to crawl 22 | # If we don't do this, we could get blocked/banned 23 | # since we don't have permission to crawl. 
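# Note: the hand-rolled parser below reads Disallow lines from every User-agent
# section and ignores Allow rules. The standard library has a more faithful
# parser; a minimal sketch (not wired into this script) would be:
#     from urllib.parse import urlparse
#     from urllib.robotparser import RobotFileParser
#     parsed = urlparse(url)
#     rp = RobotFileParser()
#     rp.set_url(f"{parsed.scheme}://{parsed.netloc}/robots.txt")
#     rp.read()
#     allowed = rp.can_fetch("*", url)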
24 | def can_crawl(url): 25 | parsed_url = urlparse(url) 26 | robots_url = f"{parsed_url.scheme}://{parsed_url.netloc}/robots.txt" 27 | print(f"Checking robots.txt for: {robots_url}") 28 | time.sleep(random.uniform(1, 3)) 29 | try: 30 | response = requests.get(robots_url, timeout=5) 31 | response.raise_for_status() 32 | disallowed_paths = [] 33 | for line in response.text.splitlines(): 34 | if line.startswith("Disallow"): 35 | parts = line.split() 36 | if len(parts) > 1: 37 | disallowed_paths.append(parts[1]) 38 | for path in disallowed_paths: 39 | if urlparse(url).path.startswith(path): 40 | print(f"Disallowed by robots.txt: {url}") 41 | return False 42 | return True 43 | except requests.RequestException: 44 | print(f"Failed to access robots.txt: {robots_url}") 45 | return False # If we can't access robots.txt, assume we can't crawl (we're being nice here) 46 | 47 | # Function to fetch and parse URL 48 | def crawl(args): 49 | queue = args['queue'] 50 | visited_urls = args['visited_urls'] 51 | crawl_count = args['crawl_count'] 52 | CRAWL_LIMIT = args['CRAWL_LIMIT'] 53 | lock = args['lock'] 54 | index = args['index'] 55 | webpage_info = args['webpage_info'] 56 | webpage_id_counter = args['webpage_id_counter'] 57 | pagerank_graph = args['pagerank_graph'] 58 | stop_crawl = args['stop_crawl'] 59 | 60 | while not stop_crawl.is_set(): 61 | try: 62 | current_url = queue.get(timeout=5) 63 | print("Time to crawl: " + current_url) 64 | except Exception: 65 | break # Exit if no more URLs are available to crawl 66 | 67 | with lock: 68 | if crawl_count[0] >= CRAWL_LIMIT: 69 | queue.queue.clear() # Clear remaining URLs to stop processing 70 | print("Crawl limit reached. Exiting...") 71 | stop_crawl.set() 72 | break 73 | if current_url in visited_urls: 74 | queue.task_done() 75 | continue 76 | visited_urls.add(current_url) 77 | 78 | """ Checks for noindex directive in the page 79 | Comment this out if you don't care about noindex 80 | WARNING: websites could block/ban you if you don't have permission 81 | """ 82 | if not can_crawl(current_url): 83 | queue.task_done() 84 | continue 85 | 86 | time.sleep(random.uniform(2, 5)) 87 | try: 88 | response = requests.get(current_url, timeout=5) 89 | response.raise_for_status() # Check for request errors 90 | content = response.content 91 | 92 | """ Checks for noindex directive in the page 93 | Comment this out if you don't care about noindex 94 | WARNING: websites could block/ban you if you don't have permission 95 | """ 96 | if 'noindex' in content.decode('utf-8').lower(): 97 | print(f"Noindex found, skipping: {current_url}") 98 | queue.task_done() 99 | continue 100 | 101 | 102 | # Parse the fetched content to find new URLs 103 | webpage = BeautifulSoup(content, "html.parser") 104 | 105 | # Index the webpage 106 | indexed_page = advanced_index_page(webpage, current_url) 107 | with lock: 108 | for word in indexed_page["words"]: 109 | if word not in index: 110 | index[word] = set() 111 | index[word].add(webpage_id_counter[0]) 112 | webpage_info[webpage_id_counter[0]] = indexed_page 113 | webpage_id_counter[0] += 1 114 | 115 | hyperlinks = webpage.select("a[href]") 116 | #NEW: Add hyperlink connections for pagerank 117 | new_urls, hyperlink_connections = parse_links(hyperlinks, current_url) 118 | pagerank_graph[current_url] = hyperlink_connections 119 | 120 | with lock: 121 | for new_url in new_urls: 122 | if new_url not in visited_urls: 123 | queue.put(new_url) 124 | crawl_count[0] += 1 125 | 126 | except requests.RequestException as e: 127 | print(f"Failed to 
fetch {current_url}: {e}") 128 | finally: 129 | queue.task_done() 130 | 131 | # Function to parse links from HTML content 132 | def parse_links(hyperlinks, current_url): 133 | urls = [] 134 | #NEW: Add hyperlink connections for pagerank 135 | hyperlink_connections = set() 136 | for hyperlink in hyperlinks: 137 | url = hyperlink["href"] 138 | 139 | # Format the URL into a proper URL 140 | if url.startswith("#"): 141 | continue # Skip same-page anchors 142 | if url.startswith("//"): 143 | url = "https:" + url # Add scheme to protocol-relative URLs 144 | elif url.startswith("/"): 145 | # Construct full URL for relative links 146 | base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url)) 147 | url = base_url + url 148 | elif not url.startswith("http"): 149 | continue # Skip non-HTTP links 150 | url = url.split("#")[0] # Remove anchor 151 | 152 | hyperlink_connections.add(url) 153 | urls.append(url) 154 | return urls, hyperlink_connections 155 | 156 | # Main crawling function 157 | def sloth_bot(): 158 | # Start with the initial pages to crawl 159 | starting_urls = [ 160 | "https://www.wikipedia.org/wiki/Google", 161 | "https://www.bbc.com/news/world", 162 | "https://news.ycombinator.com/", 163 | ] 164 | 165 | urls_to_crawl = Queue() 166 | for seed_url in starting_urls: 167 | urls_to_crawl.put(seed_url) 168 | 169 | visited_urls = set() # URL tracking 170 | CRAWL_LIMIT = 20 # Set crawl limit 171 | crawl_count = [0] # Shared counter 172 | lock = threading.Lock() # Thread safety lock 173 | index = {} 174 | webpage_info = {} 175 | #NEW: pagerank graph for pagerank. 176 | # This will be used to store the connections between hyperlinks 177 | pagerank_graph = {} 178 | webpage_id_counter = [0] 179 | stop_crawl = threading.Event() 180 | 181 | # Start concurrent crawling with ThreadPoolExecutor 182 | #Concurrency = speed 183 | #Threads go BRRRRR 184 | #Increase this if you want more threads, but be careful with these. 185 | NUM_WORKERS = 100 186 | #Setting up arguments for the crawl function 187 | args = { 188 | 'queue': urls_to_crawl, 189 | 'visited_urls': visited_urls, 190 | 'crawl_count': crawl_count, 191 | 'CRAWL_LIMIT': CRAWL_LIMIT, 192 | 'lock': lock, 193 | 'index': index, 194 | 'webpage_info': webpage_info, 195 | 'webpage_id_counter': webpage_id_counter, 196 | 'pagerank_graph': pagerank_graph, 197 | 'stop_crawl': stop_crawl 198 | } 199 | 200 | with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor: 201 | for _ in range(NUM_WORKERS): 202 | executor.submit(crawl, args) 203 | 204 | print("All URLs have been crawled") 205 | 206 | #NEW: Computes pagerank 207 | pagerank_scores = compute_pagerank(pagerank_graph) 208 | 209 | 210 | """ This part is for saving the data to CSV files. 211 | However, if you don't want to save the data, you can remove/comment out this part. 212 | If you want to use a database, you can replace this part with a database connection. 
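    As one example, a minimal sqlite3 sketch (the database and table names here
    are only placeholders, not something the rest of the project expects):

        import sqlite3
        conn = sqlite3.connect('sloth_search.db')
        conn.execute('CREATE TABLE IF NOT EXISTS pages ('
                     'doc_id INTEGER PRIMARY KEY, url TEXT, title TEXT, '
                     'description TEXT, pagerank REAL)')
        for doc_id, info in webpage_info.items():
            conn.execute('INSERT OR REPLACE INTO pages VALUES (?, ?, ?, ?, ?)',
                         (doc_id, info['url'], info['title'], info['description'],
                          pagerank_scores.get(info['url'], 0)))
        conn.commit()
        conn.close()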
213 | """ 214 | with open('advanced_pagerank_inverted_index.csv', 'w', newline='', encoding='utf-8') as csvfile: 215 | fieldnames = ['word', 'doc_ids'] 216 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 217 | writer.writeheader() 218 | for word, doc_ids in index.items(): 219 | writer.writerow({'word': word, 'doc_ids': list(doc_ids)}) 220 | 221 | with open('advanced_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile: 222 | fieldnames = ['doc_id', 'url', 'title', 'description', 'pagerank'] 223 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames) 224 | writer.writeheader() 225 | for doc_id, info in webpage_info.items(): 226 | writer.writerow({ 227 | 'doc_id': doc_id, 228 | 'url': info['url'], 229 | 'title': info['title'], 230 | 'description': info['description'], 231 | 'pagerank': pagerank_scores.get(info['url'], 0) 232 | }) 233 | 234 | # Entry point for the script 235 | def main(): 236 | sloth_bot() 237 | 238 | if __name__ == "__main__": 239 | main() 240 | --------------------------------------------------------------------------------