├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug-report.md
│   │   └── feature-request.md
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── LICENSE
├── README.md
├── requirements.txt
├── scpscraper
│   ├── __init__.py
│   ├── gdrive.py
│   ├── scpscraper.py
│   └── test_scpscraper.py
├── setup.cfg
└── setup.py

/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: JaonHax
7 | 
8 | ---
9 | 
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 | 
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 | 
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 | 
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 | 
26 | **System Information:**
27 | - OS and Version: [e.g. Linux Mint 19.2 Cinnamon]
28 | - Environment: [e.g. terminal, virtual environment, Jupyter notebook, etc.]
29 | - Version of scpscraper: [e.g. 1.0.0a0]
30 | 
31 | **Additional context**
32 | Add any other context about the problem here. This is where you'd put traceback information, as well.
33 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Request
3 | about: Suggest an idea for this project
4 | title: "[Request]"
5 | labels: feature request
6 | assignees: JaonHax
7 | 
8 | ---
9 | 
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 | 
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests, and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 | 
4 | name: Test scpscraper
5 | 
6 | on:
7 |   push:
8 |     branches: [ master ]
9 |   pull_request:
10 |     branches: [ master ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       matrix:
18 |         python-version: [3.6, 3.7, 3.8]
19 | 
20 |     name: Test on Python ${{ matrix.python-version }}
21 | 
22 |     steps:
23 |     - uses: actions/checkout@v2
24 |     - name: Set up Python ${{ matrix.python-version }}
25 |       uses: actions/setup-python@v2
26 |       with:
27 |         python-version: ${{ matrix.python-version }}
28 |     - name: Update to latest pip
29 |       run: pip install --upgrade pip
30 |     - name: Install dependencies
31 |       run: |
32 |         pip install flake8 pytest
33 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
34 |     - name: Lint with flake8
35 |       run: |
36 |         # stop the build if there are Python syntax errors or undefined names
37 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
40 |     - name: Test with pytest
41 |       run: pytest
42 | 
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 | 
4 | name: Upload Python Package
5 | 
6 | on:
7 |   release:
8 |     types: [created]
9 | 
10 | jobs:
11 |   deploy:
12 | 
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |     - uses: actions/checkout@v2
17 |     - name: Set up Python
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: '3.x'
21 |     - name: Install dependencies
22 |       run: |
23 |         python -m pip install --upgrade pip
24 |         pip install setuptools wheel twine
25 |     - name: Build and publish
26 |       env:
27 |         TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
28 |         TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
29 |       run: |
30 |         python setup.py sdist bdist_wheel
31 |         twine upload dist/*
32 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2020 JaonHax
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | # SCP Scraper
3 | A small Python library designed for scraping data from the SCP wiki. It was made with AI training (namely NLP models) and dataset collection (for things like categorizing SCPs for external projects) in mind, and its functions take arguments that make them easy to use in those applications.
4 | 
5 | Below you will find installation instructions and examples of the ways you can use this library. I hope you find it as useful as I have!
6 | 
7 | ## Sample Code
8 | 
9 | ### Installation
10 | `scpscraper` can be installed via `pip install`. Here's the command I recommend using, so you consistently have the latest version:
11 | ```
12 | pip3 install --upgrade scpscraper
13 | ```
14 | 
15 | ### The Basics
16 | #### Importing the Library
17 | ```py
18 | # Before we begin, we obviously have to import scpscraper.
19 | import scpscraper
20 | ```
21 | 
22 | #### Grabbing an SCP's Name
23 | ```py
24 | # Let's use 3001 (Red Reality) as an example.
25 | name = scpscraper.get_scp_name(3001)
26 | 
27 | print(name) # Outputs "Red Reality"
28 | ```
29 | 
30 | #### Grabbing as many details as possible about an SCP
31 | ```py
32 | # Again using 3001 as an example.
33 | info = scpscraper.get_scp(3001)
34 | 
35 | print(info) # Outputs a dictionary with the
36 | # name, object id, rating, page content by section, etc.
37 | ```
38 | 
39 | ### The Fun Stuff
40 | #### Grabbing an SCP's `page-content` div HTML
41 | For reference, the `page-content` div contains what the user actually wrote, without all of the extra stuff Wikidot wraps around it.
42 | ```py
43 | # Once again, 3001 is the example.
44 | scp = scpscraper.get_single_scp(3001)
45 | 
46 | # Grab the page-content div specifically. (find() returns the single matching div.)
47 | content = scp.find('div', id='page-content')
48 | 
49 | print(content) # Outputs "<div id="page-content">
...
" 50 | ``` 51 | 52 | #### Scraping HTML or information from *multiple* SCPs 53 | ```py 54 | # Grab info on SCPs 000-099 55 | scpscraper.scrape_scps(0, 100) 56 | 57 | # Same as above, but only grabbing Keter-class SCPs 58 | scpscraper.scrape_scps(0, 100, tags=['keter']) 59 | 60 | # Grab 000-099 in a format that can be used to train AI 61 | scpscraper.scrape_scps(0, 100, ai_dataset=True) 62 | ``` 63 | ```py 64 | # Scrape the page-content div's HTML from SCP-000 to SCP-099 65 | 66 | # Only including this as an example, but scrape_scps_html() has 67 | # all the same options as scrape_scps(). 68 | scpscraper.scrape_scps_html(0, 100) 69 | ``` 70 | 71 | ### Google Colaboratory Only Usage 72 | Because of the `google.colab` module included in Google Colaboratory, we can do a few extra things there that we can't otherwise. 73 | 74 | #### Mount your Google Drive to the Colaboratory VM 75 | ```py 76 | # Mounts it to the directory /content/drive/ 77 | scpscraper.gdrive.mount() 78 | ``` 79 | 80 | #### Scrape SCP info/HTML and copy to your Google Drive afterwards 81 | ```py 82 | # Requires your Google Drive to be mounted at the directory /content/drive/ 83 | scpscraper.scrape_scps(0, 100, copy_to_drive=True) 84 | 85 | scpscraper.scrape_scps_html(0, 100, copy_to_drive=True) 86 | ``` 87 | 88 | #### Copy other files to/from your Google Drive 89 | ```py 90 | # Requires your Google Drive to be mounted at the directory /content/drive/ 91 | scpscraper.gdrive.copy_to_drive('example.txt') 92 | 93 | scpscraper.gdrive.copy_from_drive('example.txt') 94 | ``` 95 | ## Planned Updates 96 | Potential updates in the future to make scraping data from any website easy/viable, allowing for easy mass collection of data. 97 | 98 | ## Link to GitHub Repo 99 | Please consider checking it out! You can report issues, request features, contribute to this project, etc. in the GitHub Repo. That is the best way to reach me for issues/feedback relating to this project. 100 | 101 | https://github.com/JaonHax/scpscraper/ 102 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4>=4.9.1 2 | bs4>=0.0.1 3 | soupsieve>=2.0.1 4 | tqdm>=4.48.0 5 | lxml 6 | -------------------------------------------------------------------------------- /scpscraper/__init__.py: -------------------------------------------------------------------------------- 1 | from scpscraper.scpscraper import * 2 | import scpscraper.gdrive 3 | -------------------------------------------------------------------------------- /scpscraper/gdrive.py: -------------------------------------------------------------------------------- 1 | import os, shutil, sys 2 | 3 | # Check if user is using Google Colaboratory (and if so, we can do fancier things...) 4 | try: 5 | from google.colab import drive 6 | except: 7 | pass 8 | 9 | # Custom error definitions 10 | class DriveNotMountedError(Exception): 11 | pass 12 | 13 | class PathNotRecognizedError(Exception): 14 | pass 15 | 16 | class PathNotExistsError(Exception): 17 | pass 18 | 19 | class NoColaboratoryVMError(Exception): 20 | pass 21 | 22 | # Special Google Colaboratory functions 23 | def mount(): 24 | """ 25 | Mounts your Google Drive to the Colaboratory VM. 
26 | """ 27 | if 'google.colab' in sys.modules: 28 | drive.mount('/content/drive') 29 | else: 30 | raise NoColaboratoryVMError("You must be in Google Colaboratory to run any Google Drive related functions!") 31 | 32 | def _is_mounted(): 33 | if os.path.isdir('/content/drive'): 34 | return 35 | else: 36 | raise DriveNotMountedError("You must first mount your Google Drive using scpscraper.gdrive.mount()!") 37 | 38 | def copy_to_drive(path: str): 39 | """ 40 | Copies a file or directory to your Google Drive. 41 | """ 42 | _is_mounted() 43 | 44 | if os.path.exists(path): 45 | if os.path.isfile(path): 46 | shutil.copyfile(path, f"/content/drive/My Drive/{path}") 47 | 48 | elif os.path.isdir(path): 49 | shutil.copytree(path, f"/content/drive/My Drive/{path}") 50 | 51 | else: 52 | raise PathNotRecognizedError(f"Path {path} is not a file or a directory!") 53 | 54 | else: 55 | raise PathNotExistsError(f"Path {path} does not exist!") 56 | 57 | def copy_from_drive(path: str): 58 | """ 59 | Copies a file or directory to your Google Drive. 60 | """ 61 | _is_mounted() 62 | 63 | if os.path.exists(f"/content/drive/My Drive/{path}"): 64 | if os.path.isfile(f"/content/drive/My Drive/{path}"): 65 | shutil.copyfile(f"/content/drive/My Drive/{path}", path) 66 | 67 | elif os.path.isdir(f"/content/drive/My Drive/{path}"): 68 | shutil.copytree(f"/content/drive/My Drive/{path}", path) 69 | 70 | else: 71 | raise PathNotRecognizedError(f"Path /content/drive/My Drive/{path} is not a file or a directory!") 72 | 73 | else: 74 | raise PathNotExistsError(f"Path /content/drive/My Drive/{path} does not exist!") 75 | -------------------------------------------------------------------------------- /scpscraper/scpscraper.py: -------------------------------------------------------------------------------- 1 | import os, shutil, sys, re, urllib.request, pytest 2 | from math import ceil 3 | from bs4 import BeautifulSoup 4 | from typing import Union 5 | from tqdm import tqdm 6 | 7 | from scpscraper import gdrive 8 | 9 | def get_single_scp(scp_id: str) -> BeautifulSoup: 10 | """Returns HTML code for the `page-content` div of a given SCP.""" 11 | try: 12 | # Grab the HTML code. 13 | r = urllib.request.urlopen(url=f'http://scp-wiki.wikidot.com/scp-{scp_id}') 14 | 15 | # Return the organized content for parsing. 16 | return BeautifulSoup(r, 'lxml') 17 | 18 | # Error handling. 19 | except Exception as e: 20 | # print(f'\nWARNING: Failed to access SCP Wiki page for SCP-{scp_id}. Error: {e}', file=sys.stderr) 21 | return 22 | 23 | def _get_scp_name(scp_id: int) -> str: 24 | """Gets the name of an SCP from the SCP Series pages. Internal function, shouldn't need to be called by a user.""" 25 | try: 26 | # Determine which series the SCP is in. 27 | if scp_id < 1000: 28 | url = 'http://scp-wiki.wikidot.com/scp-series' 29 | elif scp_id % 1000 == 0: 30 | url = f'http://scp-wiki.wikidot.com/scp-series-{int(scp_id/1000+1)}' 31 | else: 32 | url = f'http://scp-wiki.wikidot.com/scp-series-{ceil(scp_id/1000, 0)}' 33 | 34 | # Grab the HTML and parse as needed. 35 | r = urllib.request.urlopen(url=url) 36 | try: 37 | soup = BeautifulSoup(r, 'lxml') 38 | content = soup.find('div', id='page-content') 39 | list_elements = content.find_all('li') 40 | 41 | for li in list_elements: 42 | if re.findall('[0-9]+', li.find_next('a')['href']): 43 | if int(re.findall('[0-9]+', li.find_next('a')['href'])[0]) == scp_id: 44 | scp_name = re.split('-', li.get_text())[-1] 45 | return scp_name.strip(' ') 46 | 47 | # Handle 404 errors. 
48 | except urllib.error.HTTPError as e: 49 | if e.code == 404: 50 | # print(f'\nWARNING: Unavailable SCP Series for SCP-{scp_id}!', file=sys.stderr) 51 | return 52 | 53 | # Handle other HTTP errors. 54 | else: 55 | # print(f'\nWARNING: Failed to access SCP Series page for SCP-{scp_id}. HTTP Status Code {e.code}. {e.read()}', file=sys.stderr) 56 | return 57 | 58 | # Even more error handling. 59 | except Exception as e: 60 | # print(f'\nWARNING: Failed to access SCP Series page for SCP-{scp_id}. Request Error: {e}', file=sys.stderr) 61 | return 62 | 63 | def parse_scp(soup: BeautifulSoup, scp_id: Union[str, int]) -> dict: 64 | """Parses the HTML content of a page on the SCP wiki. Internal function, shouldn't need to be called by a user.""" 65 | # Just to get this out of the way... 66 | if soup is None: 67 | return None 68 | 69 | # Get rating. 70 | try: 71 | rating = soup.find('span', {'class': 'rate-points'}).contents[1].contents[0].replace('+', '') 72 | 73 | # Error handling. 74 | except AttributeError: 75 | # print(f'No rating found for SCP-{scp_id}!') 76 | rating = 0 77 | 78 | # Get page-content block. 79 | content = soup.find('div', id='page-content') 80 | # print(content) 81 | 82 | # Get main image (if it exists). 83 | try: 84 | main_image = content.find('div', {'class': 'scp-image-block'}).contents[0]['src'] 85 | 86 | # Error handling. 87 | except AttributeError: 88 | # print(f'No main_image found for SCP-{scp_id}!') 89 | main_image = None 90 | 91 | # More error handling. 92 | except KeyError: 93 | # print(f'No main_image found for SCP-{scp_id}') 94 | main_image = None 95 | 96 | # Get image caption 97 | try: 98 | image_caption = content.find('div', {'class': 'scp-image-block'}).contents[2].contents[1].contents[0] 99 | 100 | # Error handling. 101 | except AttributeError: 102 | # print(f'No image_caption found for SCP-{scp_id}!') 103 | image_caption = None 104 | 105 | # Even more error handling. 106 | except KeyError: 107 | # print(f'No image_caption found for SCP-{scp_id}') 108 | image_caption = None 109 | 110 | # Get main content 111 | try: 112 | # Initial variable definitions. 113 | mapping = {} 114 | key = None 115 | # print(content.find_all('p')) 116 | 117 | # Find all the paragraph elements. 118 | for item in content.find_all('p'): 119 | # Grab the paragraph element's first child. 120 | first_child = item.next 121 | 122 | # Use bold portions as keys/identifiers for their sections. 123 | if first_child.name == 'strong': 124 | key = first_child.text.rstrip(': ') 125 | value = first_child.next_sibling.strip(': ') 126 | 127 | else: 128 | # Add subsequent paragraphs to the same section. 129 | if key is not None: 130 | value = f'{mapping[key]}\n{item.get_text(strip=True)}' 131 | 132 | # Don't if there's no section to add them to. 133 | else: 134 | value = None 135 | 136 | # Put that all into the value for the key. 137 | mapping[key] = value 138 | 139 | # Remove the sections that didn't have keys. 140 | try: 141 | mapping.pop(None) 142 | 143 | # Error handling. 144 | except: 145 | pass 146 | 147 | # Error handling. 148 | except AttributeError as e: 149 | # print(f'Can\'t parse content of SCP-{scp_id}! Error: {e}') 150 | mapping = None 151 | 152 | # Get page info. 153 | page_info = soup.find('div', id='page-info') 154 | revision = re.findall('\d+', page_info.next)[0] 155 | last_updated = page_info.find('span')['class'][1].replace('time_', '') 156 | 157 | # Get tags. 
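  # (The scraper assumes Wikidot renders tags as links inside <div class="page-tags"><span>...</span>;
  # iterating over that <span> yields those links, plus stray newline strings filtered out below.)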
158 |   tags_list = soup.find('div', {'class': 'page-tags'}).find('span')
159 |   tags = [tag.string for tag in tags_list if tag.string != '\n']
160 | 
161 |   # Get link to the discussion page.
162 |   discussion_link = 'http://www.scpwiki.com' + soup.find('a', id='discuss-button')['href']
163 | 
164 |   return {
165 |     'id': scp_id,
166 |     'rating': int(rating),
167 |     'image': {
168 |       'src': main_image,
169 |       'caption': image_caption
170 |     },
171 |     'content': mapping,
172 |     'revision': int(revision),
173 |     'last_edited': int(last_updated),
174 |     'tags': tags,
175 |     'discussion': discussion_link
176 |   }
177 | 
178 | def get_scp(scp_id: Union[str, int]) -> dict:
179 |   """
180 |   Returns a dictionary with as much content as possible regarding the SCP ID.
181 | 
182 |   Parameters:
183 |     scp_id: ID of the SCP to grab info for. Should be either a string with leading zeroes (ex. 002) or an integer (ex. 2).
184 |   """
185 | 
186 |   # Zero-pad the ID to at least three digits for get_single_scp.
187 |   if int(scp_id) < 10:
188 |     scp_id = f'00{scp_id}'
189 | 
190 |   elif int(scp_id) < 100:
191 |     scp_id = f'0{scp_id}'
192 | 
193 |   # Get the stuff we need from the page's HTML.
194 |   site_content = get_single_scp(str(scp_id))
195 |   parsed_content = parse_scp(site_content, int(scp_id))
196 | 
197 |   # Get the SCP's name and add it to parsed_content. The name is fetched once
198 |   # and reused, rather than re-scraping the series page for every check.
199 |   scp_name = get_scp_name(int(scp_id))
200 | 
201 |   # get_scp_name returns None when the name couldn't be grabbed (e.g. the SCP
202 |   # doesn't exist yet), in which case no name is added.
203 |   if parsed_content is not None and scp_name is not None:
204 |     parsed_content['name'] = scp_name
205 | 
206 |   return parsed_content
207 | 
208 | def get_scp_name(id: int) -> str:
209 |   """
210 |   Scrapes an SCP's name. Ignores uncreated SCPs. Returns the SCP's name as a string.
211 | 
212 |   Parameters:
213 |     id: The object ID of the SCP whose name you want.
214 |   """
215 |   try:
216 |     # Fetch the name once, then filter out '[ACCESS DENIED]' placeholder titles.
217 |     name = _get_scp_name(id)
218 |     if name is not None and "[ACCESS DENIED]" not in name:
219 |       return name
220 | 
221 |   # Error handling.
222 |   except KeyError as e:
223 |     # print(f"\nWARNING: Failed to scrape SCP-{id}! Error: {e}", file=sys.stderr)
224 |     pass
225 | 
226 | def scrape_scps(min_skip: int=0, max_skip: int=6000, tags: list=[], ai_dataset: bool=False, copy_to_drive: bool=False) -> None:
227 |   """
228 |   Scrapes as much info on all SCPs from min_skip to max_skip - 1 as possible. Writes this info to different files based on its section.
229 | 
230 |   Output files:
231 |     scp-descrips.txt, scp-conprocs.txt, scp-titles.txt, and scp-addenda.txt.
232 | 
233 |   Parameters:
234 |     min_skip: The SCP number to start at. Default: 0
235 |     max_skip: The SCP number to end at plus one. Default: 6000
236 |     tags: The list of tags to grab from. Will ignore SCPs without these tags. An empty list (default) matches all tags.
237 |     ai_dataset: Set to True if the data will later be used to train an AI. Adds "<|endoftext|>" tokens where necessary to divide the dataset for training. Default: False
238 |     copy_to_drive: Set to True to copy the output files to your Google Drive when done creating them. Requires having your Google Drive mounted (preferably with scpscraper.gdrive.mount()). Default: False
239 |   """
240 | 
241 |   # Create/clear the files we need for scraping.
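  # (Mode 'w' truncates any leftovers from a previous run; the files are then
  # reopened in append mode as each SCP is processed below.)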
242 |   filelist = []
243 |   filelist.append(open('scp-descrips.txt', 'w'))
244 |   filelist.append(open('scp-conprocs.txt', 'w'))
245 |   filelist.append(open('scp-titles.txt', 'w'))
246 |   filelist.append(open('scp-addenda.txt', 'w'))
247 |   for i in range(len(filelist)):
248 |     filelist[i].close()
249 | 
250 |   # print('Grabbing and writing skip info...\n', flush=True)
251 | 
252 |   # Initiate loop, create progress bar.
253 |   for i in tqdm(range(min_skip, max_skip), "Fetching skips", total=max_skip, ncols=150, initial=min_skip, unit="skip", file=sys.stdout, bar_format='{desc}... {percentage:3.2f}% |{bar}| [{remaining} remaining, {rate_fmt}]', smoothing=0.01875):
254 |     # Zero-pad the ID to at least three digits.
255 |     if i < 10:
256 |       j = f'00{i}'
257 |     elif i < 100:
258 |       j = f'0{i}'
259 |     else:
260 |       j = str(i)  # Keep j a string so the .replace(j, 'XXXX') calls below work.
261 | 
262 |     try:
263 |       # Get all the things for the SCP.
264 |       mylist = get_scp(i)
265 | 
266 |       # Tag match checking code.
267 |       match = False
268 | 
269 |       if tags:
270 |         for tag in tags:
271 |           if tag in mylist["tags"]:
272 |             match = True
273 |             break
274 | 
275 |       else:
276 |         match = True
277 | 
278 |       # Get the list of keys in the dictionary (so we can search through it later).
279 |       if match:
280 |         keyslist = mylist["content"].keys()
281 | 
282 |         # Put the content in a better format for the AI, if we're making a dataset for one.
283 |         if ai_dataset:
284 |           for k in keyslist:
285 |             mylist["content"][k] = mylist["content"][k].replace(j, 'XXXX')
286 | 
287 |         try:
288 |           # Append the current SCP's description to the description file.
289 |           with open('scp-descrips.txt', 'a') as out:
290 |             try:
291 |               # Add an <|endoftext|> token if it's a dataset for training AI.
292 |               if ai_dataset:
293 |                 out.write('Description: {}\n<|endoftext|>'.format(mylist["content"]["Description"].replace(j, 'XXXX')))
294 |               else:
295 |                 out.write(f'Description: {mylist["content"]["Description"]}\n')
296 | 
297 |               out.write('\n')
298 | 
299 |             # Error handling.
300 |             except Exception as e:
301 |               # print(f'Failed to grab the description of SCP-{j}! Please grab it yourself! Error: {e}')
302 |               pass
303 | 
304 |           # Append the current SCP's conprocs to the conproc file.
305 |           with open('scp-conprocs.txt', 'a') as out:
306 |             try:
307 |               for k in keyslist:
308 |                 # Search the keys for "Containment"; output to the conproc file on a match.
309 |                 if "containment" in k.lower():
310 |                   if ai_dataset:
311 |                     out.write('Special Containment Procedures: {}\n<|endoftext|>\n'.format(mylist["content"][k].replace(j, 'XXXX')))
312 |                   else:
313 |                     out.write(f'Special Containment Procedures: {mylist["content"][k]}\n')
314 | 
315 |               # (The <|endoftext|> token above divides entries when building an AI dataset.)
316 | 
317 | 
318 |               out.write('\n')
319 | 
320 |             # Error handling.
321 |             except Exception:
322 |               # print(f'Failed to grab the conprocs of SCP-{j}! It is probably not an article with a standard format! Please grab them yourself!')
323 |               pass
324 | 
325 |         try:
326 |           # Append the current SCP's title to the title file (if we can grab it).
327 |           with open('scp-titles.txt', 'a') as out:
328 |             # Even more redundancy. I know. This is getting ridiculous.
329 |             if mylist["name"] is not None:
330 |               if "[ACCESS DENIED]" not in mylist["name"]:
331 |                 if ai_dataset:
332 |                   out.write(f'SCP-XXXX: {mylist["name"]}\n')
333 |                 else:
334 |                   out.write(f'SCP-{j}: {mylist["name"]}\n')
335 | 
336 |               # Handle nonexistent SCPs.
337 |               else:
338 |                 # print(f'SCP-{j} doesn\'t exist yet!')
339 |                 pass
340 | 
341 |             else:
342 |               # print(f'SCP-{j} doesn\'t exist yet!')
343 |               pass
344 | 
345 |         # Error handling.
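        # (Reaching this handler usually means mylist["name"] was missing, i.e.
        # the SCP's title couldn't be scraped at all.)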
346 | except Exception as e: 347 | # raise e 348 | # print(f'Failed to grab the title of SCP-{j}! Please grab it yourself! Error: {e}') 349 | pass 350 | 351 | # Find and append addenda (if they exist) to the addenda file. 352 | with open('scp-addenda.txt', 'a') as out: 353 | try: 354 | # Define list or dictionary depending on whether or not we need the keys. 355 | if ai_dataset: 356 | addendalist = [] 357 | else: 358 | addendalist = {} 359 | 360 | for k in keyslist: 361 | # Search keys for "Addendum", add to addendalist if it matches. 362 | if "addendum" in k.lower(): 363 | if ai_dataset: 364 | addendalist.append(mylist["content"][k]) 365 | 366 | # Do the same thing for non-dataset, also adding the keys. 367 | else: 368 | addendalist.update({k: mylist["content"][k]}) 369 | 370 | # Write addenda to addenda file. 371 | if ai_dataset: 372 | for k in addendalist: 373 | buffer = k.strip(': ') 374 | out.write('Addendum XXXX-XX: {}\n<|endoftext|>\n\n'.format(buffer.replace(j, 'XXXX'))) 375 | 376 | # Do the same for non-dataset. 377 | else: 378 | for k in addendalist.keys(): 379 | buffer = f'{k}: {addendalist[k]}' 380 | out.write(f'{buffer}\n\n') 381 | 382 | # Error handling. 383 | except Exception as e: 384 | # print(f'Failed to grab the addenda of SCP-{j}! Please grab them yourself (if they exist)! Error: {e}') 385 | pass 386 | 387 | # More error handling. 388 | except Exception as e: 389 | # print(f'Failed to write the info for SCP-{j}! Error: {e}') 390 | pass 391 | 392 | # Wow, just look at all that error handling! 393 | except Exception as e: 394 | # print(f'Failed to grab the info for {i}! Error: {e}') 395 | pass 396 | # print(mylist) 397 | 398 | filelist_names = [ 399 | 'scp-descrips.txt', 400 | 'scp-conprocs.txt', 401 | 'scp-titles.txt', 402 | 'scp-addenda.txt', 403 | ] 404 | for skip_file in filelist_names: 405 | # Variable definitions. 406 | lines_seen = set() 407 | outfile = open(f'{skip_file}.tmp', "w") 408 | 409 | # Remove duplicate lines. 410 | for line in open(skip_file, 'r'): 411 | if line not in lines_seen: 412 | outfile.write(line) 413 | lines_seen.add(line) 414 | 415 | # Close original outfile connection. 416 | outfile.flush() 417 | outfile.close() 418 | 419 | # Write changes to original file and delete temporary file. 420 | with open(f'{skip_file}.tmp', "r") as infile: 421 | with open(skip_file, 'w') as outfile: 422 | outfile.write(infile.read()) 423 | os.remove(f'{skip_file}.tmp') 424 | if copy_to_drive: 425 | gdrive.copy_to_drive(skip_file) 426 | # print("Done!") 427 | 428 | def scrape_scps_html(min_skip: int=0, max_skip: int=6000, tags: list=[], ai_dataset: bool=False, copy_to_drive: bool=False) -> None: 429 | """ 430 | Scrapes the html code of SCPs min_skip to max_skip - 1. 431 | 432 | Output files: 433 | scp-html.txt. 434 | 435 | Parameters: 436 | min_skip: The SCP number to start at. Default: 0 437 | max_skip: The SCP number to end at plus one. Default: 6000 438 | tags: The list of tags to grab from. Will ignore SCPs without these tags. An empty list (default) matches all tags. 439 | ai_dataset: Set to True if data is later going to be used to train an AI. Adds "<|endoftext|>" tokens where necessary to divide the dataset for training. Default: False 440 | copy_to_drive: Set to True to copy the output files to your Google Drive when done creating them. Requires having your Google Drive mounted (preferably with scpscraper.gdrive.mount()). 
Default: False
441 |   """
442 |   # Create/reset the output text file.
443 |   with open('scp-html.txt', "w"):
444 |     pass
445 | 
446 |   # The placeholder text Wikidot shows for pages that don't exist yet. Matching
447 |   # on the placeholder's visible text is enough to detect those pages.
448 |   blank_page = "This page doesn't exist yet!"
449 | 
450 |   for i in tqdm(range(min_skip, max_skip), "Fetching skips", total=max_skip, ncols=150, initial=min_skip, unit="skip", file=sys.stdout, bar_format='{desc}... {percentage:3.2f}% |{bar}| [{remaining} remaining, {rate_fmt}]', smoothing=0.01875):
451 |     with open('scp-html.txt', "a") as out:
452 |       # Zero-pad the ID to at least three digits.
453 |       if i < 10:
454 |         j = f'00{i}'
455 |       elif i < 100:
456 |         j = f'0{i}'
457 |       else:
458 |         j = str(i)  # Keep j a string so .replace(j, 'XXXX') works below.
459 | 
460 |       soup = get_single_scp(j)
461 | 
462 |       if soup is not None:
463 |         # Get page tags.
464 |         tags_list = soup.find('div', {'class': 'page-tags'}).find('span')
465 |         page_tags = [tag.string for tag in tags_list if tag.string != '\n']
466 | 
467 |         # Tag match checking code.
468 |         match = False
469 | 
470 |         if tags:
471 |           for tag in tags:
472 |             if tag in page_tags:
473 |               match = True
474 |               break
475 | 
476 |         else:
477 |           match = True
478 | 
479 |         if match:
480 |           content = soup.find('div', id='page-content')
481 | 
482 |           # Skip uncreated pages by checking for the placeholder text.
483 |           if content is not None and blank_page not in content.get_text():
484 |             if ai_dataset:
485 |               out.write('{}\n\n<|endoftext|>\n\n\n'.format(str(content).replace(j, 'XXXX')))
486 | 
487 |             else:
488 |               out.write(f'{content}\n\n')
489 | 
490 |           else:
491 |             # print(f'\nThe page for SCP-{j} is blank!', file=sys.stderr)
492 |             pass
493 | 
494 |   if copy_to_drive:
495 |     gdrive.copy_to_drive('scp-html.txt')
496 | 
--------------------------------------------------------------------------------
/scpscraper/test_scpscraper.py:
--------------------------------------------------------------------------------
1 | import scpscraper
2 | 
3 | def test_get_name():
4 |   name_list = [None, 'classification [Blocked]', 'The "Living" Room', 'Biological Motherboard', 'The 12 Rusty Keys and the Door', 'Skeleton Key', 'Fountain of Youth', 'Abdominal Planet', 'Zombie Plague', 'Red Ice', 'Collars of Control', 'Sentient Civil War Memorial Statue', 'A Bad Composition', 'Blue Lady Cigarettes', 'The Concrete Man', 'Pipe Nightmare', 'Organism', 'Shadow Person', 'Super Ball', 'The Monster Pot', 'Unseen Mold', 'Skin Wyrm', 'The Morgue', 'Black Shuck', 'Game Show of Death', 'Worn Wardrobe', 'Afterschool Retention', 'The Vermin God', 'Knowledge', 'Daughter of Shadows', 'The Homunculus', 'What is Love?', "Brothers' Bride", 'The Missing Number', 'Obsidian Ritual Knife', 'Possessive Mask', 'The Reincarnation Pilgrimage of the Yazidi (Kiras Guhorîn)', 'Dwarf Star', 'The Everything Tree', 'Proboscis Engineers', "Evolution's Child", 'Broadcasting Patient', 'A Formerly Winged Horse', 'The Beatle', 'Fission Cannon', 'Atmospheric Converter', '"Predatory" Holly Bush', 'Microbial Mutagen', 'The Cursed SCP Number', 'Plague Doctor', 'To The Cleverest', 'Japanese Obstetrical Model', 'Traveling Train', 'Young Girl', 'Water Nymph', '[unknown]', 'A Beautiful Person', 'The Daily Grind', 'Heart of Darkness', 'Radioactive Mineral', 'Infernal Occult Skeleton', 'Auditory Mind Control', '"Quantum" Computer', '"The World\'s Best TothBrush"', 'Flawed von Neumann Structure', 'Destroyed Organic Catalyst', "Eric's Toy", "The Artist's Pen", 'The Wire Figure', 'Second Chance', 'Iron Wings', 'Degenerative Metamorphic Entity', 'The Foot of the Bed', '"Cain"', 'Quantum Woodlouse', 'Corrosive Snail', '"Able"', 'Rot Skull', 'Guilt', 'Old AI', 'Dark Form', 'Spontaneous Combustion Virus', '"Fernand" the Cannibal', 'An Abandoned Row Home', 'Static Tower', "drawn ''Cassy''", 'The Office of Dr.
[REDACTED]', 'The Stairwell', 'The Lizard King', 'Tophet', "Apocorubik's Cube", 'Nostalgia', '"The Best of The 5th Dimension"', 'Red Sea Object', 'Miniature Event Horizon', 'Gun', 'The "Shy Guy"', 'Old Fairgrounds', 'Surgeon Crabs', 'The Portrait']
5 | 
6 |   for i in range(100):
7 |     assert scpscraper.get_scp_name(i) == name_list[i], 'get_scp_name() is not working properly!'
8 | 
9 | def test_scrape_scps():
10 |   scpscraper.scrape_scps(0, 50)
11 | 
12 |   filelist = [
13 |     'scp-descrips.txt',
14 |     'scp-conprocs.txt',
15 |     'scp-titles.txt',
16 |   ]
17 | 
18 |   for test_file in filelist:
19 |     with open(test_file, 'r') as in_text:
20 |       assert in_text.read() != '', f'scrape_scps() is not working properly! {test_file} is empty!'
21 | 
22 | def test_scps_html():
23 |   scpscraper.scrape_scps_html(0, 50)
24 | 
25 |   with open('scp-html.txt', 'r') as in_text:
26 |     assert in_text.read() != '', 'scrape_scps_html() is not working properly! scp-html.txt is empty!'
27 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | with open("README.md", "r") as fh:
4 |     long_description = fh.read()
5 | 
6 | with open("requirements.txt", "r") as fh:
7 |     requirements = fh.readlines()
8 | 
9 | setup(
10 |     name="scpscraper",
11 |     packages=['scpscraper'],
12 |     version="1.0.1",
13 |     license="MIT",
14 |     author="JaonHax",
15 |     author_email="jaonhax@gmail.com",
16 |     description="A Python library designed for scraping data from the SCP wiki.",
17 |     long_description=long_description,
18 |     long_description_content_type="text/markdown",
19 |     url="https://github.com/JaonHax/scp-scraper",
20 |     keywords=["scp", "foundation", "webscraper", "tensorflow", "dataset"],
21 |     install_requires=requirements,
22 |     include_package_data=True,
23 |     classifiers=[
24 |         "Programming Language :: Python :: 3",
25 |         "Programming Language :: Python :: 3 :: Only",
26 |         "Programming Language :: Python :: 3.6",
27 |         "Programming Language :: Python :: 3.7",
28 |         "Programming Language :: Python :: 3.8",
29 |         "License :: OSI Approved :: MIT License",
30 |         "Operating System :: OS Independent",
31 |         "Development Status :: 3 - Alpha",
32 |         "Intended Audience :: Developers",
33 |         "Natural Language :: English",
34 |         "Topic :: Scientific/Engineering :: Artificial Intelligence"
35 |     ],
36 |     python_requires='>=3.6'
37 | )
38 | 
--------------------------------------------------------------------------------