├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug-report.md
│   │   └── feature-request.md
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── LICENSE
├── README.md
├── requirements.txt
├── scpscraper
│   ├── __init__.py
│   ├── gdrive.py
│   ├── scpscraper.py
│   └── test_scpscraper.py
├── setup.cfg
└── setup.py
/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: Create a report to help us improve
4 | title: "[BUG]"
5 | labels: bug
6 | assignees: JaonHax
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **System Information:**
27 | - OS and Version: [e.g. Linux Mint 19.2 Cinnamon]
28 | - Environment: [e.g. terminal, virtual environment, Jupyter notebook, etc.]
29 | - Version of scpscraper: [e.g. 1.0.0a0]
30 |
31 | **Additional context**
32 | Add any other context about the problem here. This is where you'd put traceback information, as well.
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature Request
3 | about: Suggest an idea for this project
4 | title: "[Request]"
5 | labels: feature request
6 | assignees: JaonHax
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Test scpscraper
5 |
6 | on:
7 | push:
8 | branches: [ master ]
9 | pull_request:
10 | branches: [ master ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: [3.6, 3.7, 3.8]
19 |
20 | name: Test on Python ${{ matrix.python-version }}
21 |
22 | steps:
23 | - uses: actions/checkout@v2
24 | - name: Set up Python ${{ matrix.python-version }}
25 | uses: actions/setup-python@v2
26 | with:
27 | python-version: ${{ matrix.python-version }}
28 | - name: Update to latest pip
29 | run: pip install --upgrade pip
30 | - name: Install dependencies
31 | run: |
32 | pip install flake8 pytest
33 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
34 | - name: Lint with flake8
35 | run: |
36 | # stop the build if there are Python syntax errors or undefined names
37 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
38 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
39 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
40 | - name: Test with pytest
41 | run: pytest
42 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python package using Twine when a release is created
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
3 |
4 | name: Upload Python Package
5 |
6 | on:
7 | release:
8 | types: [created]
9 | branches: [master]
10 |
11 | jobs:
12 | deploy:
13 |
14 | runs-on: ubuntu-latest
15 |
16 | steps:
17 | - uses: actions/checkout@v2
18 | - name: Set up Python
19 | uses: actions/setup-python@v2
20 | with:
21 | python-version: '3.x'
22 | - name: Install dependencies
23 | run: |
24 | python -m pip install --upgrade pip
25 | pip install setuptools wheel twine
26 | - name: Build and publish
27 | env:
28 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
29 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
30 | run: |
31 | python setup.py sdist bdist_wheel
32 | twine upload dist/*
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 JaonHax
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # SCP Scraper
3 | A small Python library designed for scraping data from the SCP wiki. It was made with AI training (namely NLP models) and dataset collection (for things like categorizing SCPs for external projects) in mind, and it provides arguments that make it easy to use for those applications.
4 |
5 | Below you will find installation instructions and examples showing the ways you can use this library. I hope you find it as useful as I have!
6 |
7 | ## Sample Code
8 |
9 | ### Installation
10 | `scpscraper` can be installed via `pip install`. Here's the command I recommend using, so you consistently have the latest version.
11 | ```
12 | pip3 install --upgrade scpscraper
13 | ```
14 |
15 | ### The Basics
16 | #### Importing the Library
17 | ```py
18 | # Before we begin, we obviously have to import scpscraper.
19 | import scpscraper
20 | ```
21 |
22 | #### Grabbing an SCP's Name
23 | ```py
24 | # Let's use 3001 (Red Reality) as an example.
25 | name = scpscraper.get_scp_name(3001)
26 |
27 | print(name) # Outputs "Red Reality"
28 | ```
29 |
30 | #### Grabbing as many details as possible about an SCP
31 | ```py
32 | # Again using 3001 as an example
33 | info = scpscraper.get_scp(3001)
34 |
35 | print(info) # Outputs a dictionary with the
36 | # name, object id, rating, page content by section, etc.
37 | ```
38 |
39 | ### The Fun Stuff
40 | #### Grabbing an SCP's `page-content` div HTML
41 | For reference, the `page-content` div contains what the user actually wrote, without all the extra Wikidot external stuff.
42 | ```py
43 | # Once again, 3001 is the example
44 | scp = scpscraper.get_single_scp(3001)
45 |
46 | # Grab the page-content div specifically
47 | content = scp.find('div', id='page-content')
48 |
49 | print(content) # Outputs the page-content div's HTML
50 | ```
51 |
52 | #### Scraping HTML or information from *multiple* SCPs
53 | ```py
54 | # Grab info on SCPs 000-099
55 | scpscraper.scrape_scps(0, 100)
56 |
57 | # Same as above, but only grabbing Keter-class SCPs
58 | scpscraper.scrape_scps(0, 100, tags=['keter'])
59 |
60 | # Grab 000-099 in a format that can be used to train AI
61 | scpscraper.scrape_scps(0, 100, ai_dataset=True)
62 | ```
63 | ```py
64 | # Scrape the page-content div's HTML from SCP-000 to SCP-099
65 |
66 | # Only including this as an example, but scrape_scps_html() has
67 | # all the same options as scrape_scps().
68 | scpscraper.scrape_scps_html(0, 100)
69 | ```
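Neither scraper returns its results directly; both write to text files in the current working directory (`scp-descrips.txt`, `scp-conprocs.txt`, `scp-titles.txt`, and `scp-addenda.txt` for `scrape_scps()`, and `scp-html.txt` for `scrape_scps_html()`). A minimal sketch of reading the results back afterwards:
```py
scpscraper.scrape_scps(0, 100)

# scp-titles.txt ends up with one "SCP-XXX: Name" entry per line.
with open('scp-titles.txt') as titles:
    for line in titles:
        print(line.strip())
```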
70 |
71 | ### Google Colaboratory Only Usage
72 | Because of the `google.colab` module included in Google Colaboratory, we can do a few extra things there that we can't otherwise.
73 |
74 | #### Mount your Google Drive to the Colaboratory VM
75 | ```py
76 | # Mounts it to the directory /content/drive/
77 | scpscraper.gdrive.mount()
78 | ```
79 |
80 | #### Scrape SCP info/HTML and copy to your Google Drive afterwards
81 | ```py
82 | # Requires your Google Drive to be mounted at the directory /content/drive/
83 | scpscraper.scrape_scps(0, 100, copy_to_drive=True)
84 |
85 | scpscraper.scrape_scps_html(0, 100, copy_to_drive=True)
86 | ```
87 |
88 | #### Copy other files to/from your Google Drive
89 | ```py
90 | # Requires your Google Drive to be mounted at the directory /content/drive/
91 | scpscraper.gdrive.copy_to_drive('example.txt')
92 |
93 | scpscraper.gdrive.copy_from_drive('example.txt')
94 | ```
95 | ## Planned Updates
96 | Potential future updates include support for scraping data from any website, allowing for easy mass collection of data.
97 |
98 | ## Link to GitHub Repo
99 | Please consider checking it out! You can report issues, request features, contribute to this project, and more in the GitHub repo. It is the best way to reach me with issues or feedback relating to this project.
100 |
101 | https://github.com/JaonHax/scpscraper/
102 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4>=4.9.1
2 | bs4>=0.0.1
3 | soupsieve>=2.0.1
4 | tqdm>=4.48.0
5 | lxml
6 |
--------------------------------------------------------------------------------
/scpscraper/__init__.py:
--------------------------------------------------------------------------------
1 | from scpscraper.scpscraper import *
2 | import scpscraper.gdrive
3 |
--------------------------------------------------------------------------------
/scpscraper/gdrive.py:
--------------------------------------------------------------------------------
1 | import os, shutil, sys
2 |
3 | # Check if user is using Google Colaboratory (and if so, we can do fancier things...)
4 | try:
5 | from google.colab import drive
6 | except ImportError:
7 | pass
8 |
9 | # Custom error definitions
10 | class DriveNotMountedError(Exception):
11 | pass
12 |
13 | class PathNotRecognizedError(Exception):
14 | pass
15 |
16 | class PathNotExistsError(Exception):
17 | pass
18 |
19 | class NoColaboratoryVMError(Exception):
20 | pass
21 |
22 | # Special Google Colaboratory functions
23 | def mount():
24 | """
25 | Mounts your Google Drive to the Colaboratory VM.
26 | """
27 | if 'google.colab' in sys.modules:
28 | drive.mount('/content/drive')
29 | else:
30 | raise NoColaboratoryVMError("You must be in Google Colaboratory to run any Google Drive related functions!")
31 |
32 | def _is_mounted():
33 | if os.path.isdir('/content/drive'):
34 | return
35 | else:
36 | raise DriveNotMountedError("You must first mount your Google Drive using scpscraper.gdrive.mount()!")
37 |
38 | def copy_to_drive(path: str):
39 | """
40 | Copies a file or directory to your Google Drive.
41 | """
42 | _is_mounted()
43 |
44 | if os.path.exists(path):
45 | if os.path.isfile(path):
46 | shutil.copyfile(path, f"/content/drive/My Drive/{path}")
47 |
48 | elif os.path.isdir(path):
49 | shutil.copytree(path, f"/content/drive/My Drive/{path}")
50 |
51 | else:
52 | raise PathNotRecognizedError(f"Path {path} is not a file or a directory!")
53 |
54 | else:
55 | raise PathNotExistsError(f"Path {path} does not exist!")
56 |
57 | def copy_from_drive(path: str):
58 | """
59 |   Copies a file or directory from your Google Drive.
60 | """
61 | _is_mounted()
62 |
63 | if os.path.exists(f"/content/drive/My Drive/{path}"):
64 | if os.path.isfile(f"/content/drive/My Drive/{path}"):
65 | shutil.copyfile(f"/content/drive/My Drive/{path}", path)
66 |
67 | elif os.path.isdir(f"/content/drive/My Drive/{path}"):
68 | shutil.copytree(f"/content/drive/My Drive/{path}", path)
69 |
70 | else:
71 | raise PathNotRecognizedError(f"Path /content/drive/My Drive/{path} is not a file or a directory!")
72 |
73 | else:
74 | raise PathNotExistsError(f"Path /content/drive/My Drive/{path} does not exist!")
75 |
--------------------------------------------------------------------------------
/scpscraper/scpscraper.py:
--------------------------------------------------------------------------------
1 | import os, sys, re, urllib.error, urllib.request
2 | from math import ceil
3 | from bs4 import BeautifulSoup
4 | from typing import Union
5 | from tqdm import tqdm
6 |
7 | from scpscraper import gdrive
8 |
9 | def get_single_scp(scp_id: str) -> BeautifulSoup:
10 | """Returns HTML code for the `page-content` div of a given SCP."""
11 | try:
12 | # Grab the HTML code.
13 | r = urllib.request.urlopen(url=f'http://scp-wiki.wikidot.com/scp-{scp_id}')
14 |
15 | # Return the organized content for parsing.
16 | return BeautifulSoup(r, 'lxml')
17 |
18 | # Error handling.
19 | except Exception as e:
20 | # print(f'\nWARNING: Failed to access SCP Wiki page for SCP-{scp_id}. Error: {e}', file=sys.stderr)
21 | return
22 |
23 | def _get_scp_name(scp_id: int) -> str:
24 | """Gets the name of an SCP from the SCP Series pages. Internal function, shouldn't need to be called by a user."""
25 | try:
26 | # Determine which series the SCP is in.
27 | if scp_id < 1000:
28 | url = 'http://scp-wiki.wikidot.com/scp-series'
29 | elif scp_id % 1000 == 0:
30 | url = f'http://scp-wiki.wikidot.com/scp-series-{int(scp_id/1000+1)}'
31 | else:
32 |       url = f'http://scp-wiki.wikidot.com/scp-series-{ceil(scp_id / 1000)}'
33 |
34 | # Grab the HTML and parse as needed.
35 | r = urllib.request.urlopen(url=url)
36 | try:
37 | soup = BeautifulSoup(r, 'lxml')
38 | content = soup.find('div', id='page-content')
39 | list_elements = content.find_all('li')
40 |
41 | for li in list_elements:
42 | if re.findall('[0-9]+', li.find_next('a')['href']):
43 | if int(re.findall('[0-9]+', li.find_next('a')['href'])[0]) == scp_id:
44 | scp_name = re.split('-', li.get_text())[-1]
45 | return scp_name.strip(' ')
46 |
47 | # Handle 404 errors.
48 | except urllib.error.HTTPError as e:
49 | if e.code == 404:
50 | # print(f'\nWARNING: Unavailable SCP Series for SCP-{scp_id}!', file=sys.stderr)
51 | return
52 |
53 | # Handle other HTTP errors.
54 | else:
55 | # print(f'\nWARNING: Failed to access SCP Series page for SCP-{scp_id}. HTTP Status Code {e.code}. {e.read()}', file=sys.stderr)
56 | return
57 |
58 | # Even more error handling.
59 | except Exception as e:
60 | # print(f'\nWARNING: Failed to access SCP Series page for SCP-{scp_id}. Request Error: {e}', file=sys.stderr)
61 | return
62 |
63 | def parse_scp(soup: BeautifulSoup, scp_id: Union[str, int]) -> dict:
64 | """Parses the HTML content of a page on the SCP wiki. Internal function, shouldn't need to be called by a user."""
65 | # Just to get this out of the way...
66 | if soup is None:
67 | return None
68 |
69 | # Get rating.
70 | try:
71 | rating = soup.find('span', {'class': 'rate-points'}).contents[1].contents[0].replace('+', '')
72 |
73 | # Error handling.
74 | except AttributeError:
75 | # print(f'No rating found for SCP-{scp_id}!')
76 | rating = 0
77 |
78 | # Get page-content block.
79 | content = soup.find('div', id='page-content')
80 | # print(content)
81 |
82 | # Get main image (if it exists).
83 | try:
84 | main_image = content.find('div', {'class': 'scp-image-block'}).contents[0]['src']
85 |
86 | # Error handling.
87 | except AttributeError:
88 | # print(f'No main_image found for SCP-{scp_id}!')
89 | main_image = None
90 |
91 | # More error handling.
92 | except KeyError:
93 | # print(f'No main_image found for SCP-{scp_id}')
94 | main_image = None
95 |
96 | # Get image caption
97 | try:
98 | image_caption = content.find('div', {'class': 'scp-image-block'}).contents[2].contents[1].contents[0]
99 |
100 | # Error handling.
101 | except AttributeError:
102 | # print(f'No image_caption found for SCP-{scp_id}!')
103 | image_caption = None
104 |
105 | # Even more error handling.
106 | except KeyError:
107 | # print(f'No image_caption found for SCP-{scp_id}')
108 | image_caption = None
109 |
110 | # Get main content
111 | try:
112 | # Initial variable definitions.
113 | mapping = {}
114 | key = None
115 | # print(content.find_all('p'))
116 |
117 | # Find all the paragraph elements.
118 | for item in content.find_all('p'):
119 | # Grab the paragraph element's first child.
120 | first_child = item.next
121 |
122 | # Use bold portions as keys/identifiers for their sections.
123 | if first_child.name == 'strong':
124 | key = first_child.text.rstrip(': ')
125 | value = first_child.next_sibling.strip(': ')
126 |
127 | else:
128 | # Add subsequent paragraphs to the same section.
129 | if key is not None:
130 | value = f'{mapping[key]}\n{item.get_text(strip=True)}'
131 |
132 | # Don't if there's no section to add them to.
133 | else:
134 | value = None
135 |
136 | # Put that all into the value for the key.
137 | mapping[key] = value
138 |
139 | # Remove the sections that didn't have keys.
140 | try:
141 | mapping.pop(None)
142 |
143 | # Error handling.
144 | except:
145 | pass
146 |
147 | # Error handling.
148 | except AttributeError as e:
149 | # print(f'Can\'t parse content of SCP-{scp_id}! Error: {e}')
150 | mapping = None
151 |
152 | # Get page info.
153 | page_info = soup.find('div', id='page-info')
154 |   revision = re.findall(r'\d+', page_info.next)[0]
155 | last_updated = page_info.find('span')['class'][1].replace('time_', '')
156 |
157 | # Get tags.
158 | tags_list = soup.find('div', {'class': 'page-tags'}).find('span')
159 | tags = [tag.string for tag in tags_list if tag.string != '\n']
160 |
161 | # Get link to the discussion page.
162 | discussion_link = 'http://www.scpwiki.com' + soup.find('a', id='discuss-button')['href']
163 |
164 | return {
165 | 'id': scp_id,
166 | 'rating': int(rating),
167 | 'image': {
168 | 'src': main_image,
169 | 'caption': image_caption
170 | },
171 | 'content': mapping,
172 | 'revision': int(revision),
173 | 'last_edited': int(last_updated),
174 | 'tags': tags,
175 | 'discussion': discussion_link
176 | }
177 |
178 | def get_scp(scp_id: Union[str, int]) -> dict:
179 | """
180 | Returns a dictionary with as much content as possible regarding the SCP ID.
181 |
182 | Parameters:
183 | scp_id: ID of the SCP to grab info for. Should be either a string with leading zeroes (ex. 002) or an integer (ex. 2).
184 | """
185 |
186 | # Make the formatting nice for get_single_scp
187 | if int(scp_id) < 10:
188 | scp_id = f'00{scp_id}'
189 |
190 | elif int(scp_id) < 100:
191 | scp_id = f'0{scp_id}'
192 |
193 | # Get stuff we need from the page's HTML
194 | site_content = get_single_scp(str(scp_id))
195 | parsed_content = parse_scp(site_content, int(scp_id))
196 |
197 |   # Get SCP's name and add it to parsed_content (when both are available).
198 |   scp_name = get_scp_name(int(scp_id))
199 |   if parsed_content is not None and scp_name is not None:
200 |     if '[ACCESS DENIED]' not in scp_name:
201 |       parsed_content['name'] = scp_name
202 |
203 | # Don't add the name if there was an error preventing get_scp_name from grabbing it.
204 | else:
205 | pass
206 |
207 | return parsed_content
208 |
209 | def get_scp_name(id: int) -> str:
210 | """
211 | Scrapes an SCP's name. Ignores uncreated SCPs. Returns the SCP's name as a string.
212 |
213 | Parameters:
214 | id: The SCP you want to retrieve's object ID.
215 | """
216 |   try:
217 |     # Redundant with get_scp's own checks, but the extra layer of fallback is cheap peace of mind.
218 |     name = _get_scp_name(id)
219 |     if name is not None and "[ACCESS DENIED]" not in name:
220 |       return name
221 |
222 | # Error handling
223 | except KeyError as e:
224 | # print(f"\nWARNING: Failed to scrape SCP-{id}! Error: {e}", file=sys.stderr)
225 | pass
226 |
227 | def scrape_scps(min_skip: int=0, max_skip: int=6000, tags: list=[], ai_dataset: bool=False, copy_to_drive: bool=False) -> None:
228 | """
229 | Scrapes as much info on all SCPs from min_skip to max_skip - 1 as possible. Writes this info to different files based on its section.
230 |
231 | Output files:
232 | scp-descrips.txt, scp-conprocs.txt, scp-titles.txt, and scp-addenda.txt.
233 |
234 | Parameters:
235 | min_skip: The SCP number to start at. Default: 0
236 | max_skip: The SCP number to end at plus one. Default: 6000
237 | tags: The list of tags to grab from. Will ignore SCPs without these tags. An empty list (default) matches all tags.
238 | ai_dataset: Set to True if data is later going to be used to train an AI. Adds "<|endoftext|>" tokens where necessary to divide the dataset for training. Default: False
239 | copy_to_drive: Set to True to copy the output files to your Google Drive when done creating them. Requires having your Google Drive mounted (preferably with scpscraper.gdrive.mount()). Default: False
240 | """
241 | # Create/clear the files we need for scraping.
242 | filelist = []
243 | filelist.append(open('scp-descrips.txt', 'w'))
244 | filelist.append(open('scp-conprocs.txt', 'w'))
245 | filelist.append(open('scp-titles.txt', 'w'))
246 | filelist.append(open('scp-addenda.txt', 'w'))
247 | for i in range(len(filelist)):
248 | filelist[i].close()
249 |
250 | # print('Grabbing and writing skip info...\n', flush=True)
251 |
252 | # Initiate loop, create progress bar.
253 | for i in tqdm(range(min_skip, max_skip), "Fetching skips", total=max_skip, ncols=150, initial=min_skip, unit="skip", file=sys.stdout, bar_format='{desc}... {percentage:3.2f}% |{bar}| [{remaining} remaining, {rate_fmt}]', smoothing=0.01875):
254 | # Nice number formatting.
255 | if i < 10:
256 | j = f'00{i}'
257 | elif i < 100:
258 | j = f'0{i}'
259 | else:
260 |       j = str(i)
261 |
262 | try:
263 | # Get all the things for the SCP.
264 | mylist = get_scp(i)
265 |
266 | # Tag match checking code
267 | match = False
268 |
269 | if tags:
270 | for tag in tags:
271 | if tag in mylist["tags"]:
272 | match = True
273 | break
274 |
275 | else:
276 | match = True
277 |
278 | # Get the list of keys in the dictionary (so we can search through it later).
279 | if match:
280 | keyslist = mylist["content"].keys()
281 |
282 | # Put stuff in a better format for the AI, if we're making a dataset for one
283 | if ai_dataset:
284 | for k in keyslist:
285 | mylist["content"][k] = mylist["content"][k].replace(j, 'XXXX')
286 |
287 | try:
288 | # Append current SCP's description to the description file.
289 | with open('scp-descrips.txt', 'a') as out:
290 | try:
291 | # Add <|endoftext|> token if it's a dataset for training AI.
292 | if ai_dataset:
293 | out.write('Description: {}\n<|endoftext|>'.format(mylist["content"]["Description"].replace(j, 'XXXX')))
294 | else:
295 | out.write(f'Description: {mylist["content"]["Description"]}\n')
296 |
297 | out.write('\n')
298 |
299 | # Error handling.
300 | except Exception as e:
301 | # print(f'Failed to grab the description of SCP-{j}! Please grab it yourself! Error: {e}')
302 | pass
303 |
304 | # Append current SCP's conprocs to the conproc file.
305 | with open('scp-conprocs.txt', 'a') as out:
306 | try:
307 | for k in keyslist:
308 | # Search keys for "Containment", output to conproc file if it matches.
309 | if "containment" in k.lower():
310 | if ai_dataset:
311 | out.write('Special Containment Procedures: {}\n<|endoftext|>\n'.format(mylist["content"][k].replace(j, 'XXXX')))
312 | else:
313 | out.write(f'Special Containment Procedures: {mylist["content"][k]}\n')
314 |
315 | # Add <|endoftext|> token if it's a dataset for training AI.
316 |
317 |
318 | out.write('\n')
319 |
320 | # Error handling.
321 | except:
322 | # print(f'Failed to grab the conprocs of SCP-{j}! It is probably not an article with a standard format! Please grab them yourself!')
323 | pass
324 |
325 | try:
326 | # Append current SCP's title to the title file (if we can grab it).
327 | with open('scp-titles.txt', 'a') as out:
328 | # Even more redundancy. I know. This is getting ridiculous.
329 | if mylist["name"] is not None:
330 | if "[ACCESS DENIED]" not in mylist["name"]:
331 | if ai_dataset:
332 | out.write(f'SCP-XXXX: {mylist["name"]}\n')
333 | else:
334 | out.write(f'SCP-{j}: {mylist["name"]}\n')
335 |
336 | # Handle nonexistent SCPs.
337 | else:
338 | # print(f'SCP-{j} doesn\'t exist yet!')
339 | pass
340 |
341 | else:
342 | # print(f'SCP-{j} doesn\'t exist yet!')
343 | pass
344 |
345 | # Error handling.
346 | except Exception as e:
347 | # raise e
348 | # print(f'Failed to grab the title of SCP-{j}! Please grab it yourself! Error: {e}')
349 | pass
350 |
351 | # Find and append addenda (if they exist) to the addenda file.
352 | with open('scp-addenda.txt', 'a') as out:
353 | try:
354 | # Define list or dictionary depending on whether or not we need the keys.
355 | if ai_dataset:
356 | addendalist = []
357 | else:
358 | addendalist = {}
359 |
360 | for k in keyslist:
361 | # Search keys for "Addendum", add to addendalist if it matches.
362 | if "addendum" in k.lower():
363 | if ai_dataset:
364 | addendalist.append(mylist["content"][k])
365 |
366 | # Do the same thing for non-dataset, also adding the keys.
367 | else:
368 | addendalist.update({k: mylist["content"][k]})
369 |
370 | # Write addenda to addenda file.
371 | if ai_dataset:
372 | for k in addendalist:
373 | buffer = k.strip(': ')
374 | out.write('Addendum XXXX-XX: {}\n<|endoftext|>\n\n'.format(buffer.replace(j, 'XXXX')))
375 |
376 | # Do the same for non-dataset.
377 | else:
378 | for k in addendalist.keys():
379 | buffer = f'{k}: {addendalist[k]}'
380 | out.write(f'{buffer}\n\n')
381 |
382 | # Error handling.
383 | except Exception as e:
384 | # print(f'Failed to grab the addenda of SCP-{j}! Please grab them yourself (if they exist)! Error: {e}')
385 | pass
386 |
387 | # More error handling.
388 | except Exception as e:
389 | # print(f'Failed to write the info for SCP-{j}! Error: {e}')
390 | pass
391 |
392 | # Wow, just look at all that error handling!
393 | except Exception as e:
394 | # print(f'Failed to grab the info for {i}! Error: {e}')
395 | pass
396 | # print(mylist)
397 |
398 | filelist_names = [
399 | 'scp-descrips.txt',
400 | 'scp-conprocs.txt',
401 | 'scp-titles.txt',
402 | 'scp-addenda.txt',
403 | ]
404 | for skip_file in filelist_names:
405 | # Variable definitions.
406 | lines_seen = set()
407 | outfile = open(f'{skip_file}.tmp', "w")
408 |
409 | # Remove duplicate lines.
410 | for line in open(skip_file, 'r'):
411 | if line not in lines_seen:
412 | outfile.write(line)
413 | lines_seen.add(line)
414 |
415 | # Close original outfile connection.
416 | outfile.flush()
417 | outfile.close()
418 |
419 | # Write changes to original file and delete temporary file.
420 | with open(f'{skip_file}.tmp', "r") as infile:
421 | with open(skip_file, 'w') as outfile:
422 | outfile.write(infile.read())
423 | os.remove(f'{skip_file}.tmp')
424 | if copy_to_drive:
425 | gdrive.copy_to_drive(skip_file)
426 | # print("Done!")
427 |
428 | def scrape_scps_html(min_skip: int=0, max_skip: int=6000, tags: list=[], ai_dataset: bool=False, copy_to_drive: bool=False) -> None:
429 | """
430 | Scrapes the html code of SCPs min_skip to max_skip - 1.
431 |
432 | Output files:
433 | scp-html.txt.
434 |
435 | Parameters:
436 | min_skip: The SCP number to start at. Default: 0
437 | max_skip: The SCP number to end at plus one. Default: 6000
438 | tags: The list of tags to grab from. Will ignore SCPs without these tags. An empty list (default) matches all tags.
439 | ai_dataset: Set to True if data is later going to be used to train an AI. Adds "<|endoftext|>" tokens where necessary to divide the dataset for training. Default: False
440 | copy_to_drive: Set to True to copy the output files to your Google Drive when done creating them. Requires having your Google Drive mounted (preferably with scpscraper.gdrive.mount()). Default: False
441 | """
442 | # Create/reset text file
443 | with open('scp_html.txt', "w"):
444 | pass
445 |
446 |   # Marker text found on wiki pages that haven't been created yet.
447 |   blank_page = 'This page doesn\'t exist yet!'
448 |
449 | for i in tqdm(range(min_skip, max_skip), "Fetching skips", total=max_skip, ncols=150, initial=min_skip, unit="skip", file=sys.stdout, bar_format='{desc}... {percentage:3.2f}% |{bar}| [{remaining} remaining, {rate_fmt}]', smoothing=0.01875):
450 | with open('scp-html.txt', "a") as out:
451 | if i < 10:
452 | j = f'00{i}'
453 | elif i < 100:
454 | j = f'0{i}'
455 | else:
456 |         j = str(i)
457 |
458 | soup = get_single_scp(j)
459 |
460 | if soup is not None:
461 | # Get page tags
462 | tags_list = soup.find('div', {'class': 'page-tags'}).find('span')
463 | page_tags = [tag.string for tag in tags_list if tag.string != '\n']
464 |
465 | # Tag match checking code
466 | match = False
467 |
468 | if tags != []:
469 | for tag in tags:
470 | if tag in page_tags:
471 | match = True
472 | break
473 |
474 | else:
475 | match = True
476 |
477 | if match:
478 | content = soup.find('div', id='page-content')
479 |
480 |           if blank_page not in str(content):
481 | if ai_dataset:
482 | out.write('{}\n\n<|endoftext|>\n\n\n'.format(str(content).replace(j, 'XXXX')))
483 |
484 | else:
485 | out.write(f'{content}\n\n')
486 |
487 | else:
488 | # print(f'\nThe page for SCP-{j} is blank!', file=sys.stderr)
489 | pass
490 |
491 | if copy_to_drive:
492 | gdrive.copy_to_drive('scp-html.txt')
493 |
--------------------------------------------------------------------------------
/scpscraper/test_scpscraper.py:
--------------------------------------------------------------------------------
1 | import pytest, scpscraper
2 |
3 | def test_get_name():
4 | name_list = [None, 'classification [Blocked]', 'The "Living" Room', 'Biological Motherboard', 'The 12 Rusty Keys and the Door', 'Skeleton Key', 'Fountain of Youth', 'Abdominal Planet', 'Zombie Plague', 'Red Ice', 'Collars of Control', 'Sentient Civil War Memorial Statue', 'A Bad Composition', 'Blue Lady Cigarettes', 'The Concrete Man', 'Pipe Nightmare', 'Organism', 'Shadow Person', 'Super Ball', 'The Monster Pot', 'Unseen Mold', 'Skin Wyrm', 'The Morgue', 'Black Shuck', 'Game Show of Death', 'Worn Wardrobe', 'Afterschool Retention', 'The Vermin God', 'Knowledge', 'Daughter of Shadows', 'The Homunculus', 'What is Love?', "Brothers' Bride", 'The Missing Number', 'Obsidian Ritual Knife', 'Possessive Mask', 'The Reincarnation Pilgrimage of the Yazidi (Kiras Guhorîn)', 'Dwarf Star', 'The Everything Tree', 'Proboscis Engineers', "Evolution's Child", 'Broadcasting Patient', 'A Formerly Winged Horse', 'The Beatle', 'Fission Cannon', 'Atmospheric Converter', '"Predatory" Holly Bush', 'Microbial Mutagen', 'The Cursed SCP Number', 'Plague Doctor', 'To The Cleverest', 'Japanese Obstetrical Model', 'Traveling Train', 'Young Girl', 'Water Nymph', '[unknown]', 'A Beautiful Person', 'The Daily Grind', 'Heart of Darkness', 'Radioactive Mineral', 'Infernal Occult Skeleton', 'Auditory Mind Control', '"Quantum" Computer', '"The World\'s Best TothBrush"', 'Flawed von Neumann Structure', 'Destroyed Organic Catalyst', "Eric's Toy", "The Artist's Pen", 'The Wire Figure', 'Second Chance', 'Iron Wings', 'Degenerative Metamorphic Entity', 'The Foot of the Bed', '"Cain"', 'Quantum Woodlouse', 'Corrosive Snail', '"Able"', 'Rot Skull', 'Guilt', 'Old AI', 'Dark Form', 'Spontaneous Combustion Virus', '"Fernand" the Cannibal', 'An Abandoned Row Home', 'Static Tower', "drawn ''Cassy''", 'The Office of Dr. [REDACTED]', 'The Stairwell', 'The Lizard King', 'Tophet', "Apocorubik's Cube", 'Nostalgia', '"The Best of The 5th Dimension"', 'Red Sea Object', 'Miniature Event Horizon', 'Gun', 'The "Shy Guy"', 'Old Fairgrounds', 'Surgeon Crabs', 'The Portrait']
5 |
6 | for i in range(100):
7 | assert scpscraper.get_scp_name(i) == name_list[i], 'get_scp_name() is not working properly!'
8 |
9 | def test_scrape_scps():
10 | scpscraper.scrape_scps(0, 50)
11 |
12 | filelist = [
13 | 'scp-descrips.txt',
14 | 'scp-conprocs.txt',
15 | 'scp-titles.txt',
16 | ]
17 |
18 | for test_file in filelist:
19 | with open(test_file, 'r') as in_text:
20 | assert in_text.read() != '', f'scrape_scps() is not working properly! {test_file} is empty!'
21 |
22 | def test_scps_html():
23 | scpscraper.scrape_scps_html(0, 50)
24 |
25 | with open('scp-html.txt', 'r') as in_text:
26 |     assert in_text.read() != '', 'scrape_scps_html() is not working properly! scp-html.txt is empty!'
27 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.md
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | with open("README.md", "r") as fh:
4 | long_description = fh.read()
5 |
6 | with open("requirements.txt", "r") as fh:
7 | requirements = fh.readlines()
8 |
9 | setup(
10 | name="scpscraper",
11 | packages=['scpscraper'],
12 | version="1.0.1",
13 | license="MIT",
14 | author="JaonHax",
15 | author_email="jaonhax@gmail.com",
16 | description="A Python library designed for scraping data from the SCP wiki.",
17 | long_description=long_description,
18 | long_description_content_type="text/markdown",
19 | url="https://github.com/JaonHax/scp-scraper",
20 | keywords=["scp", "foundation", "webscraper", "tensorflow", "dataset"],
21 | install_requires=requirements,
22 | include_package_data=True,
23 | classifiers=[
24 | "Programming Language :: Python :: 3",
25 | "Programming Language :: Python :: 3 :: Only",
26 | "Programming Language :: Python :: 3.6",
27 | "Programming Language :: Python :: 3.7",
28 | "Programming Language :: Python :: 3.8",
29 | "License :: OSI Approved :: MIT License",
30 | "Operating System :: OS Independent",
31 | "Development Status :: 3 - Alpha",
32 | "Intended Audience :: Developers",
33 | "Natural Language :: English",
34 | "Topic :: Scientific/Engineering :: Artificial Intelligence"
35 | ],
36 | python_requires='>=3.6'
37 | )
38 |
--------------------------------------------------------------------------------