76 |
77 |
78 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 |
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 |
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 |
121 | # SageMath parsed files
122 | *.sage.py
123 |
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 |
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 |
137 | # Rope project settings
138 | .ropeproject
139 |
140 | # mkdocs documentation
141 | /site
142 |
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 |
148 | # Pyre type checker
149 | .pyre/
150 |
151 | # pytype static type analyzer
152 | .pytype/
153 |
154 | # Cython debug symbols
155 | cython_debug/
156 |
157 | # PyCharm
158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | # and can be added to the global gitignore or merged into this file. For a more nuclear
161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 |
164 | .DS_Store
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Sloth Search - A Google-like Search Engine Clone
2 |
3 | Sloth Search is a project that aims to recreate a Google-like search engine, covering crawling, indexing, and serving results through a user-friendly front-end interface. The project consists of three main components: the Client, Search, and Server.
4 | [Check out the video for a full explanation here](https://youtu.be/WCpimlH0Kck?si=_zFzrb1cxZinWKo3)
5 |
6 | ## Project Structure
7 |
8 | The project is divided into the following folders:
9 |
10 | - **Client**: Contains the front-end code, providing a user interface similar to Google search, where users can enter queries and view search results.
11 |
12 | - **Search**: Contains the core components of Sloth Search, which replicate the three main parts of Google:
13 |
14 | - **Crawling**: The web crawler that collects information from the web.
15 |
16 | - **Indexing**: Processing and storing the content collected by the crawler for efficient searching.
17 |
18 | - **Serving (PageRank)**: Serving search results ranked by their relevance and PageRank scores.
19 |
20 | - **Server**: Contains the search API used to handle client requests and provide search results.
21 |
22 | ## Installation and Setup
23 |
24 | **1. Clone the Repository**
25 |
26 | ```sh
27 | git clone https://github.com/The-CodingSloth/sloth-search.git
28 | cd sloth-search
29 | ```
30 |
31 | **2. Install the necessary Python dependencies**
32 |
33 | ```sh
34 | pip install -r requirements.txt
35 | ```
36 |
37 | **3. Client Setup**
38 |
39 | - The client contains the HTML, CSS, and JavaScript code to run the front-end.
40 |
41 | - Open the `index.html` file in your browser, or use a static file server to serve the client code locally (see the example below).
42 | 
43 | - You can also use a live-server extension in your editor.
44 |
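If you don't already have a static file server, Python's built-in one is enough for local testing. This is only a suggestion; the `client` folder name matches the repo layout and port 8000 is an arbitrary choice:

```sh
# Serve the client folder at http://localhost:8000 (Ctrl+C to stop)
cd client
python -m http.server 8000
```
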
45 | **4. Search Setup**
46 |
47 | - The `search` directory contains the code for crawling, indexing, and serving.
48 |
49 | - You can start the process by running:
50 |
51 | ```sh
52 | python search/complete_examples/advanced_pagerank.py
53 | ```
54 |
55 | - This will crawl, index, and prepare the content for searching.
56 |
57 | - To run any of the other files, follow the same process:
58 |
59 | ```sh
60 | python search/
61 | ```
62 |
63 | ## How It Works
64 |
65 | **1. Crawling**
66 |
67 | - The crawler starts with a set of seed URLs and collects links and content from the web.
68 |
69 | - It respects `robots.txt` to avoid being blocked and to ensure ethical crawling (a minimal sketch of this check follows this list).
70 |
71 | - Parsed data is stored in a format ready for indexing.
72 |
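As a rough illustration of that `robots.txt` check, here is a minimal sketch using Python's standard `urllib.robotparser`. The `SlothBot` user agent and the helper names are made up for the example and are not the project's actual identifiers:

```python
from urllib.parse import urlparse
from urllib.robotparser import RobotFileParser

import requests

def can_fetch(url, user_agent="SlothBot"):
    """Check a site's robots.txt before crawling one of its URLs."""
    parts = urlparse(url)
    parser = RobotFileParser()
    parser.set_url(f"{parts.scheme}://{parts.netloc}/robots.txt")
    try:
        parser.read()  # download and parse robots.txt
    except Exception:
        return True    # no reachable robots.txt: assume crawling is allowed
    return parser.can_fetch(user_agent, url)

def polite_get(url):
    """Fetch a page only if robots.txt allows it."""
    if not can_fetch(url):
        print(f"Skipping {url}: disallowed by robots.txt")
        return None
    return requests.get(url, headers={"User-Agent": "SlothBot"}, timeout=10)
```
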
73 | **2. Indexing**
74 |
75 | - The indexing module processes the crawled pages.
76 |
77 | - The content is tokenized, cleaned, stripped of stop words, and stemmed using the NLTK library (see the sketch after this list).
78 |
79 | - The resulting indexed data is saved to be used by the search API.
80 |
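For illustration, a minimal sketch of that pipeline with NLTK is shown below. The actual `simple_index_page` in `search/indexing` may use a different tokenizer, stemmer, or field layout, so treat the names here as assumptions:

```python
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# One-time downloads of the NLTK data used below
# (punkt_tab is only needed by newer NLTK releases; older ones ignore it)
for pkg in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(pkg, quiet=True)

STOP_WORDS = set(stopwords.words("english"))
STEMMER = PorterStemmer()

def index_text(text):
    """Tokenize, clean, remove stop words, and stem a page's text."""
    tokens = nltk.word_tokenize(text.lower())                   # tokenize
    words = [t for t in tokens if re.fullmatch(r"[a-z]+", t)]   # keep alphabetic tokens
    words = [w for w in words if w not in STOP_WORDS]           # drop stop words
    return [STEMMER.stem(w) for w in words]                     # stem

# index_text("Sloths are searching the web") -> ['sloth', 'search', 'web']
```
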
81 | **3. Serving and PageRank**
82 |
83 | - The PageRank algorithm is used to rank pages based on their importance (an illustrative version is sketched after this list).
84 |
85 | - When a user searches for a query through the client, the server uses the indexed data and PageRank scores to return the most relevant pages.
86 |
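The project's `serving/pagerank.py` provides `compute_pagerank` for this step; since its internals aren't shown here, the following is only an illustrative version of the standard iterative algorithm (damping factor 0.85, links to pages outside the crawled graph ignored):

```python
def pagerank(graph, damping=0.85, iterations=20):
    """Iteratively compute PageRank for a graph of URL -> set of outgoing URLs."""
    if not graph:
        return {}
    pages = list(graph)
    n = len(pages)
    ranks = {page: 1.0 / n for page in pages}

    for _ in range(iterations):
        new_ranks = {}
        for page in pages:
            # Rank contributed by every crawled page that links to this one,
            # split evenly across that page's outgoing links
            incoming = sum(
                ranks[other] / len(graph[other])
                for other in pages
                if page in graph[other]
            )
            new_ranks[page] = (1 - damping) / n + damping * incoming
        ranks = new_ranks
    return ranks
```

At serve time, the pages matching a query can then be ordered by combining term relevance from the index with these scores.
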
87 | ## Important Notes
88 |
89 | - **Respecting Websites**: The crawler respects `robots.txt` rules. Please make sure not to overload any websites.
90 |
91 | - **PageRank Algorithm**: The implementation uses an iterative approach to rank pages based on the link structure between them.
92 |
93 | - **Data Storage**: The crawler and indexer use CSV files for data storage (`advanced_pagerank_inverted_index.csv` and `advanced_pagerank.csv`). Make sure these files are writable during execution.
94 |
95 | ## Contributing
96 |
97 | Contributions are welcome! If you'd like to contribute to the development of Sloth Search, feel free to fork the repository, make changes, and submit a pull request.
98 |
99 | ## License
100 |
101 | This project is open-source and available under the MIT License.
102 |
103 | If you have any questions or suggestions, feel free to contact me.
104 |
105 | Happy Searching with Sloth Search! 🦥🔍
106 |
--------------------------------------------------------------------------------
/search/complete_examples/simple_pagerank.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 | import time
4 | import random
5 | import csv
6 | import sys
7 | import os
8 | # Add the parent search/ directory to sys.path so that the sibling
9 | # packages (indexing and serving) can be imported from this script.
10 | # Any imports from indexing/serving must come after this line.
11 | sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
12 | from indexing.simple_indexing import simple_index_page
13 | from serving.pagerank import compute_pagerank
14 |
15 | def sloth_bot():
16 | # Our list of URLs to crawl
17 | urls = ["https://en.wikipedia.org/wiki/Google"]
18 | visited_urls = set()
19 |
20 | # Create the index and graph
21 | index = {} # URL -> page contents
22 | pagerank_graph = {} # URL -> set of URLs it links to
23 | CRAWL_LIMIT = 5
24 | crawl_count = 0
25 |
26 | # Loops through the list of URLs
27 | while urls and crawl_count < CRAWL_LIMIT:
28 | # Grab the next URL
29 | current_url = urls.pop()
30 | if current_url in visited_urls:
31 | continue
32 | print("Time to crawl: " + current_url)
33 | time.sleep(random.uniform(1, 2))
34 | try:
35 | response = requests.get(current_url, timeout=10)
36 | response.raise_for_status()
37 | except requests.RequestException as e:
38 | print(f"Failed to retrieve {current_url}: {e}")
39 | continue
40 |
41 | # Parse the content of the page
42 | webpage = BeautifulSoup(response.content, "html.parser")
43 |
44 | # Add the page to the index
45 | indexed_page = simple_index_page(webpage, current_url)
46 | index[current_url] = indexed_page
47 | visited_urls.add(current_url)
48 |
49 | # Grab the links from the page
50 | hyperlinks = webpage.select("a[href]")
51 | # This is where we store our connected pages
52 | hyperlink_connections = set()
53 | for hyperlink in hyperlinks:
54 | url = hyperlink["href"]
55 | # Normalize the href into an absolute URL
56 | if url.startswith("#"):
57 | continue
58 | if url.startswith("//"):
59 | url = "https:" + url
60 | elif url.startswith("/"):
61 | base_url = "{0.scheme}://{0.netloc}".format(requests.utils.urlparse(current_url))
62 | url = base_url + url
63 | elif not url.startswith("http"):
64 | continue
65 | url = url.split('#')[0]
66 | # Add to the link connections
67 | hyperlink_connections.add(url)
68 | # If we haven't visited this URL yet, add it to our list
69 | if url not in visited_urls:
70 | urls.append(url)
71 |
72 | # Update the page's outgoing links
73 | index[current_url]['hyperlink_connections'] = hyperlink_connections
74 | pagerank_graph[current_url] = hyperlink_connections
75 |
76 | crawl_count += 1
77 | print(f"Crawled count: {crawl_count}, index size: {len(index)}, URLs left: {len(urls)}")
78 |
79 | # Compute PageRank
80 | pagerank_scores = compute_pagerank(pagerank_graph)
81 |
82 | """ This part is for saving the data to CSV files.
83 | However, if you don't want to save the data, you can remove/comment out this part.
84 | If you want to use a database, you can replace this part with a database connection.
85 | """
86 |
87 | with open('simple_pagerank.csv', 'w', newline='', encoding='utf-8') as csvfile:
88 | fieldnames = ["url", "title", "description", "pagerank", "words"]
89 | writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
90 | writer.writeheader()
91 | for url, info in index.items():
92 | writer.writerow({
93 | 'url': url,
94 | 'title': info['title'],
95 | 'description': info['description'],
96 | 'pagerank': pagerank_scores.get(url, 0),
97 | 'words': ', '.join(info['words'])
98 | })
99 |
100 |
101 |
102 | def main():
103 | # Start the crawling process
104 | sloth_bot()
105 |
106 | if __name__ == "__main__":
107 | main()
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/client/search.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | Search Results - My Search Engine
7 |
8 |
9 |
10 |