├── scp_epub ├── __init__.py ├── __main__.py ├── constants │ ├── __init__.py │ └── constants.py ├── download │ ├── __init__.py │ ├── aws.py │ ├── utils.py │ ├── get_wiki.py │ ├── wikidot_api.py │ └── cache.py ├── exceptions │ ├── __init__.py │ └── exceptions.py ├── test_unit │ ├── __init__.py │ ├── process │ │ ├── __init__.py │ │ ├── test_assemble.py │ │ └── test_process_page.py │ ├── download │ │ ├── __init__.py │ │ ├── test_get_wiki.py │ │ ├── wikidot_api.py │ │ ├── test_utils.py │ │ ├── test_wikidot_api.py │ │ └── test_cache.py │ └── _samples │ │ ├── 1509_yui-navset_output.html │ │ ├── 1509_yui-navset.html │ │ ├── scp_1-800-j_footnote_output.html │ │ ├── scp_1-800-j_footnote.html │ │ ├── scp_055.html │ │ ├── scp_055_pyscp.xhtml │ │ └── scp_055.json ├── test_component │ ├── __init__.py │ └── process │ │ ├── __init__.py │ │ ├── test_process_page_cases │ │ ├── _LICENSES.txt │ │ ├── scp-1257_converted.html │ │ └── scp-1257.html │ │ └── test_process_page.py ├── test_platform │ ├── __init__.py │ ├── download │ │ ├── __init__.py │ │ └── test_get_complete_page.py │ └── process │ │ ├── __init__.py │ │ └── test_process_all_pages.py └── process │ ├── assemble.py │ └── process_page.py ├── docs ├── references.md ├── book_definition.md ├── tests.md ├── configuration.md ├── constants.md └── how_it_works.md ├── requirements.txt ├── edge_cases └── _LICENSES.txt ├── README.md ├── LICENSE ├── .gitignore ├── progress.txt └── definitions └── complete_collection.json /scp_epub/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/__main__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/constants/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /scp_epub/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_component/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_platform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_unit/process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_platform/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_platform/process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/references.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | Useful documentation to better understand this project. 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ratelimit 2 | lxml 3 | ebooklib 4 | parameterized 5 | requests 6 | bs4 7 | boto3 8 | -------------------------------------------------------------------------------- /scp_epub/exceptions/exceptions.py: -------------------------------------------------------------------------------- 1 | class SCPEpubError(Exception): 2 | pass 3 | 4 | 5 | class SCPDownloadError(SCPEpubError): 6 | pass 7 | -------------------------------------------------------------------------------- /docs/book_definition.md: -------------------------------------------------------------------------------- 1 | # Book Definition 2 | 3 | This is the file that defines the entire SCP ebook that is created: its structure, contents etc. 4 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page_cases/_LICENSES.txt: -------------------------------------------------------------------------------- 1 | SCP-1257 (http://www.scp-wiki.net/scp-1257): CC-BY-SA-3.0 by http://www.wikidot.com/user:info/sandrewswann 2 | -------------------------------------------------------------------------------- /edge_cases/_LICENSES.txt: -------------------------------------------------------------------------------- 1 | The files in this directory are governed by a separate license than the main project. 
2 | 3 | SCP-3125: CC-BY-SA-3.0 http://www.scp-wiki.net/scp-3125 by qntm (http://www.scp-wiki.net/qntm-s-author-page) 4 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/1509_yui-navset_output.html: -------------------------------------------------------------------------------- 1 |

Effect 1509-1

A specimen.

Effect 1509-1 typically.

Effect 1509-2

Effect SCP-1509-2 occurs.

2 | -------------------------------------------------------------------------------- /scp_epub/download/aws.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | 4 | def get_api_key_from_secretsmanager(): 5 | raise NotImplementedError 6 | 7 | 8 | def retrieve_from_s3_cache(relative_path, item, filetype): 9 | raise NotImplementedError 10 | 11 | 12 | def store_in_s3_cache(contents, relative_path, item, filetype): 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /scp_epub/download/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def filter_tags(pages, include_tags=None): 5 | if include_tags is not None: 6 | pages = [ 7 | page for page in pages 8 | if 'tags' in page and any( 9 | included_tag in page['tags'] for included_tag in include_tags 10 | ) 11 | ] 12 | 13 | return pages 14 | 15 | 16 | def normalize_string(raw_string): 17 | return re.sub('[^a-z0-9\\-]', '_', raw_string) 18 | -------------------------------------------------------------------------------- /docs/tests.md: -------------------------------------------------------------------------------- 1 | # Running tests 2 | 3 | To run tests: 4 | 5 | * Run tests from the `scp_epub` module directory, not from the root of the repository: `cd scp_epub` 6 | * Unit tests: `python3 -m unittest discover -s test_unit -t .` 7 | * Component tests: `python3 -m unittest discover -s test_component -t .` 8 | * Platform tests: CAUTION! These tests actually do stuff such as download pages! Recommended to run one at a time, as some may take several hours: `python3 -m unittest test_platform/path/to/test_file.py` 9 | * Note: some platform tests may prompt you for a Wikidot API key. 
10 | -------------------------------------------------------------------------------- /scp_epub/process/assemble.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import re 3 | 4 | from constants import constants 5 | import process.process_page 6 | 7 | 8 | def process_all_pages(pages): 9 | page_names = [ 10 | page[constants.PAGE_PATH_KEY] 11 | for page in pages 12 | ] 13 | 14 | results = [] 15 | failures = [] 16 | for page in pages: 17 | try: 18 | results.append(process.process_page.process_page(page, url_allow_list=page_names)) 19 | except Exception as exception: 20 | failures.append(exception) 21 | 22 | return results, failures 23 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/1509_yui-navset.html: -------------------------------------------------------------------------------- 1 |
2 | 6 |
7 |
8 |
9 |

A specimen.

10 |
11 |

Effect 1509-1 typically.

12 |
13 | 16 |
17 |
18 | -------------------------------------------------------------------------------- /scp_epub/download/get_wiki.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import download.wikidot_api 5 | from constants import constants 6 | 7 | 8 | def get_scp_wiki(book_definition, refresh=False): 9 | raise NotImplementedError 10 | 11 | 12 | def filter_pages(book_definition): 13 | raise NotImplementedError 14 | 15 | 16 | def get_all_page_metadata(page_name, refresh=False): 17 | raise NotImplementedError 18 | 19 | 20 | def enrich_all_page_metadata_with_contents(page_metadata, refresh=False): 21 | raise NotImplementedError 22 | 23 | 24 | def get_edge_case(page_name): 25 | json_file = os.path.join(constants.EDGE_CASES_DIR, page_name + '.' + constants.EDGE_CASES_FILETYPE) 26 | with open(json_file, 'r', encoding=constants.ENCODING) as edge_case: 27 | page = json.load(edge_case) 28 | 29 | return page 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SCP epub 2 | 3 | Creates an epub from the scp wiki 4 | 5 | **This is a work in progress with no current ETA.** 6 | 7 | You can track the progress in progress.txt 8 | 9 | ## Running the tool 10 | 11 | ### Prerequisites 12 | 13 | You can run the ebook builder locally in Linux, Windows (using WSL), or Mac. 
14 | 15 | Resource requirements: 16 | 17 | * At least 2 GB of available memory 18 | * At least 2 GB of available storage 19 | 20 | You need the following installed: 21 | 22 | * Python 3 and pip3 23 | * All the python modules in requirements.txt: `pip3 install -r requirements.txt` 24 | 25 | You need the following environment variables: 26 | 27 | * `SCP_EPUB_USE_AWS`: this environment variable must be unset: `unset SCP_EPUB_USE_AWS` 28 | * `SCP_EPUB_WIKIDOT_API_KEY`: your read-only Wikidot API Key: `export SCP_EPUB_WIKIDOT_API_KEY=000000000000000000000000000` 29 | -------------------------------------------------------------------------------- /docs/configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | The tool loads configuration information in three ways: 4 | 5 | * If the tool is deployed on AWS, the tool will load infrastructure information (bucket names, locations of secrets in AWSSM etc.) from environment variables that are defined in [constants.py](/scp_epub/constants.py) 6 | * The tool also reads configuration directly from [constants.py](/scp_epub/constants.py). This is not meant to be changed by the end user. 7 | * Any configuration around building a book is contained in a book definition file in the [definitions directory](/definitions). This is documented in [book_definition.md](./book_definition.md) and is meant to be edited by the end user. 8 | 9 | ## Constants file 10 | 11 | All specifications regarding the format of the SCP wiki, caching settings, how to process the page contents etc. are defined in [the constants file](/scp_epub/constants.py). 12 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_1-800-j_footnote_output.html: -------------------------------------------------------------------------------- 1 |

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 elfakyn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_1-800-j_footnote.html: -------------------------------------------------------------------------------- 1 |

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

2 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/test_get_wiki.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | import os 4 | import unittest 5 | import unittest.mock 6 | 7 | import download.get_wiki 8 | from constants import constants 9 | 10 | 11 | class TestGetEdgeCase(unittest.TestCase): 12 | @unittest.mock.patch('builtins.open') 13 | def test_retrieve_from_local_cache(self, mock_open): 14 | # Arrange 15 | expected_relative_path = constants.EDGE_CASES_DIR 16 | expected_item = 'scp-1234' 17 | expected_filetype = constants.EDGE_CASES_FILETYPE 18 | expected_file = os.path.join(expected_relative_path, expected_item + '.' + constants.EDGE_CASES_FILETYPE) 19 | expected_encoding = constants.ENCODING 20 | expected_open_type = 'r' 21 | expected_contents = {'a': 'b'} 22 | expected_encoded_contents = json.dumps(expected_contents) 23 | mock_open.return_value.__enter__.return_value.read.return_value = expected_encoded_contents 24 | 25 | # Act 26 | actual_contents = download.get_wiki.get_edge_case(expected_item) 27 | 28 | # Assert 29 | self.assertEqual(expected_contents, actual_contents) 30 | mock_open.assert_called_once_with(expected_file, expected_open_type, encoding=expected_encoding) 31 | -------------------------------------------------------------------------------- /docs/constants.md: -------------------------------------------------------------------------------- 1 | # Constants 2 | 3 | This file documents constants in [constants.py](scp_epub/constants/constants.py) and what they do. 4 | 5 | Almost every string literal and magic number in the entire program is extracted in this file. 6 | 7 | ## Charset 8 | 9 | Character set-related values. 10 | 11 | ## AWS Execution 12 | 13 | This tool may be expanded in the future to run on AWS automatically. This sets some groundwork for that. 
This is currently not implemented, so enabling AWS use will not work. This may be removed in the future. 14 | 15 | Constant | Explanation 16 | --- | --- 17 | `USE_AWS_VARIABLE` | The environment variable that defines whether to use AWS or not 18 | `USE_AWS_TRUE` | The value of `USE_AWS_VARIABLE` that will be interpreted as "True" 19 | `S3_CACHE_BASE_PATH` | The path in the s3 bucket that will be used to store the page cache 20 | `S3_BUCKET_VARIABLE` | The environment variable that defines which s3 bucket data will be stored in 21 | `API_KEY_SECRETSMANAGER_VARIABLE` | The environment variable that contains the name of the SecretsManager secret that will be used. 22 | 23 | ## Local Execution 24 | 25 | This tool is for the most part intended to be run locally. Some of the key file paths are defined relative to the path of the constants file. 26 | 27 | To be continued... 28 | -------------------------------------------------------------------------------- /scp_epub/test_platform/process/test_process_all_pages.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | import os 8 | import io 9 | 10 | import download.download_scp 11 | import process.assemble 12 | 13 | class TestProcessAllPages(unittest.TestCase): 14 | def setUp(self): 15 | self.maxDiff = 500 16 | 17 | def test_process_all_pages(self): 18 | # Arrange 19 | expected_max_failures = 0 20 | expected_failures = [] 21 | expected_definition = { 22 | "download": { 23 | "download_tags": [ 24 | "scp", 25 | "tale", 26 | "hub", 27 | "supplement" 28 | ], 29 | "edge_cases": [ 30 | "scp-3125" 31 | ] 32 | }, 33 | } 34 | expected_pages = download.get_wiki.get_pages_from_book_definition(expected_definition) 35 | 36 | # Act 37 | actual_processed_pages, actual_failures = process.assemble.process_all_pages(expected_pages) 38 | 39 | # Assert 40 | 
self.assertLessEqual(len(actual_failures), expected_max_failures) 41 | self.assertEqual(expected_failures, actual_failures) 42 | -------------------------------------------------------------------------------- /scp_epub/test_platform/download/test_get_complete_page.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import os 3 | import parameterized.parameterized 4 | import unittest 5 | 6 | from constants import constants 7 | import download.get_wiki 8 | 9 | 10 | TEST_CASES_REGULAR = [ 11 | ['scp-123'], 12 | ['scp-4000'], 13 | ['scp-173'] 14 | ] 15 | 16 | TEST_CASES_EDGE_CASE = [ 17 | ['scp-3125'] 18 | ] 19 | 20 | 21 | class TestGetCompletePageSameWithWithoutCache(unittest.TestCase): 22 | @classmethod 23 | def setUpClass(cls): 24 | os.environ[constants.API_KEY_VARIABLE] = getpass.getpass('Wikidot read-only API key: ') 25 | return super().setUpClass() 26 | 27 | @classmethod 28 | def tearDownClass(cls): 29 | del os.environ[constants.API_KEY_VARIABLE] 30 | return super().tearDownClass() 31 | 32 | def setUp(self): 33 | self.maxDiff = 500 34 | 35 | @parameterized.parameterized.expand(TEST_CASES_REGULAR) 36 | def test_download_page(self, expected_page_name): 37 | # Arrange 38 | expected_page = download.get_wiki.get_complete_page(expected_page_name, refresh=True) 39 | 40 | # Act 41 | actual_page = download.get_wiki.get_complete_page(expected_page_name, refresh=False) 42 | 43 | # Assert 44 | self.assertEqual(expected_page, actual_page) 45 | self.assertEqual(expected_page_name, actual_page[constants.PAGE_PATH_KEY]) 46 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | import os 8 | import io 9 | 10 | import process.process_page 
11 | from constants import constants 12 | 13 | TEST_COMPONENT_PROCESS_PAGE_CASES_DIR = 'test_process_page_cases' 14 | 15 | 16 | class TestProcessPage(unittest.TestCase): 17 | def setUp(self): 18 | self.maxDiff = 0 19 | 20 | @parameterized.expand([ 21 | [ 22 | 'SCP-1257', 23 | 'scp-1257.html', 24 | 'scp-1257_converted.html', 25 | ['scp-173', 'scp-682'], 26 | ], 27 | ]) 28 | def test_process_page(self, expected_page_title, expected_web_html_file, expected_processed_html_file, expected_url_allow_list): 29 | # Arrange 30 | 31 | with open(os.path.join(os.path.dirname(__file__), TEST_COMPONENT_PROCESS_PAGE_CASES_DIR, expected_web_html_file), 'r', encoding=constants.ENCODING) as target_file: 32 | expected_web_html = target_file.read() 33 | 34 | with open(os.path.join(os.path.dirname(__file__), TEST_COMPONENT_PROCESS_PAGE_CASES_DIR, expected_processed_html_file), 'r', encoding=constants.ENCODING) as target_file: 35 | expected_processed_html = target_file.read() 36 | 37 | # Act 38 | actual_processed_html = process.process_page.process_page_html(expected_web_html, expected_page_title, expected_url_allow_list) 39 | 40 | # Assert 41 | self.assertEqual(expected_processed_html, actual_processed_html) 42 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/wikidot_api.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import unittest 4 | import unittest.mock 5 | 6 | import download.wikidot_api 7 | from constants import constants 8 | 9 | class TestGetApiKey(unittest.TestCase): 10 | @unittest.mock.patch('download.aws.get_api_key_from_secretsmanager') 11 | def test_get_api_key_locally(self, mock_get_api_key_from_secretsmanager): 12 | # Arrange 13 | expected_api_key = '000000000000000000000000000' 14 | os.environ.pop(constants.USE_AWS_VARIABLE, None) 15 | os.environ[constants.API_KEY_VARIABLE] = expected_api_key 16 | 17 | # Act 18 | actual_api_key = 
download.wikidot_api._get_api_key() 19 | 20 | # Assert 21 | mock_get_api_key_from_secretsmanager.assert_not_called() 22 | self.assertEqual(expected_api_key, actual_api_key) 23 | 24 | @unittest.mock.patch('download.aws.get_api_key_from_secretsmanager') 25 | def test_get_api_key_with_aws(self, mock_get_api_key_from_secretsmanager): 26 | # Arrange 27 | expected_api_key = '000000000000000000000000000' 28 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 29 | os.environ.pop(constants.API_KEY_VARIABLE, None) 30 | 31 | mock_get_api_key_from_secretsmanager.return_value = expected_api_key 32 | 33 | # Act 34 | actual_api_key = download.wikidot_api._get_api_key() 35 | 36 | # Assert 37 | mock_get_api_key_from_secretsmanager.assert_called_once_with() 38 | self.assertEqual(expected_api_key, actual_api_key) 39 | 40 | class TestWikidotClient(unittest.TestCase): 41 | def setUp(self): 42 | importlib.reload('download.wikidot_api') 43 | 44 | @unittest.mock.patch('download.wikidot_api.get_wikidot_client') 45 | def test_client_closure(self, mock_get_wikidot_client): 46 | # Arrange 47 | expected_client = download.wikidot_api.client() 48 | 49 | # Act 50 | actual_client = download.wikidot_api.client() 51 | 52 | # Assert 53 | self.assertIs(expected_client(), actual_client()) 54 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/test_utils.py: -------------------------------------------------------------------------------- 1 | import download.utils 2 | import unittest 3 | from parameterized import parameterized 4 | 5 | 6 | NORMALIZATION_TEST_CASES = [ 7 | ["fragment:three-farewells-aktus", "fragment_three-farewells-aktus"], 8 | ["scp-3125", "scp-3125"] 9 | ] 10 | 11 | class TestNormalizeString(unittest.TestCase): 12 | @parameterized.expand(NORMALIZATION_TEST_CASES) 13 | def test_normalize_string(self, expected_raw_string, expected_normalized_string): 14 | # Arrange 15 | 16 | # Act 17 | actual_normalized_string = 
download.utils.normalize_string(expected_raw_string) 18 | 19 | # Assert 20 | self.assertEqual(expected_normalized_string, actual_normalized_string) 21 | 22 | 23 | class TestFilterPages(unittest.TestCase): 24 | def test_filter_tags_no_rule(self): 25 | # Arrange 26 | expected_pages = [ 27 | {'tags': ['scp', 'meta']}, 28 | {'tags': ['tale', 'antimemetic']}, 29 | {'tags': ['_sys']}, 30 | {'tags': []}, 31 | {'content': 'whatnot'}, 32 | {} 33 | ] 34 | 35 | expected_filtered_pages = [ 36 | {'tags': ['scp', 'meta']}, 37 | {'tags': ['tale', 'antimemetic']}, 38 | {'tags': ['_sys']}, 39 | {'tags': []}, 40 | {'content': 'whatnot'}, 41 | {} 42 | ] 43 | 44 | expected_tag_filter = None 45 | 46 | # Act 47 | actual_filtered_pages = download.utils.filter_tags(expected_pages) 48 | 49 | # Assert 50 | self.assertEqual(expected_filtered_pages, actual_filtered_pages) 51 | 52 | def test_filter_tags_include_tags(self): 53 | # Arrange 54 | expected_pages = [ 55 | {'tags': ['scp', 'meta']}, 56 | {'tags': ['hub', 'mtf']}, 57 | {'tags': ['tale', 'antimemetic']}, 58 | {'tags': ['_sys']}, 59 | {'tags': []}, 60 | {'content': 'whatnot'}, 61 | {}, 62 | ] 63 | 64 | expected_filtered_pages = [ 65 | {'tags': ['scp', 'meta']}, 66 | {'tags': ['tale', 'antimemetic']}, 67 | ] 68 | 69 | expected_tag_filter = None 70 | expected_include_tags = ['scp', 'tale'] 71 | 72 | # Act 73 | actual_filtered_pages = download.utils.filter_tags(expected_pages, include_tags=expected_include_tags) 74 | 75 | # Assert 76 | self.assertEqual(expected_filtered_pages, actual_filtered_pages) 77 | -------------------------------------------------------------------------------- /scp_epub/download/wikidot_api.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import os 4 | from ratelimit import limits, sleep_and_retry 5 | import requests 6 | import xmlrpc.client 7 | 8 | import download.aws 9 | from download import cache 10 | from constants import constants 11 | from exceptions 
import exceptions


# Singleton XML-RPC client; created lazily so importing this module never
# triggers API-key lookup or network setup.
_wikidot_client = None


def _create_wikidot_client():
    """Build an XML-RPC client authenticated against the Wikidot API endpoint."""
    api_key = _get_api_key()
    return xmlrpc.client.ServerProxy(f'https://{constants.CLIENT_NAME}:{api_key}@{constants.RPC_ENDPOINT}')


def _get_api_key():
    """Return the read-only Wikidot API key.

    Uses AWS Secrets Manager when the AWS toggle environment variable is set;
    otherwise falls back to the local environment variable.
    """
    if os.getenv(constants.USE_AWS_VARIABLE) == constants.USE_AWS_TRUE:
        return download.aws.get_api_key_from_secretsmanager()
    return os.getenv(constants.API_KEY_VARIABLE)


def _get_wikidot_client():
    """Return the shared XML-RPC client, creating it on first use."""
    global _wikidot_client
    if _wikidot_client is None:
        _wikidot_client = _create_wikidot_client()

    return _wikidot_client


def _get_list_of_pages_undecorated(categories, **kwargs):
    """List all pages of the site belonging to the given Wikidot categories."""
    client = _get_wikidot_client()
    list_of_pages = client.pages.select({
        'site': constants.SITE_NAME,
        'categories': categories
    })
    return list_of_pages


def _get_page_metadata_undecorated(page, **kwargs):
    """Fetch API metadata (title, tags, author, ...) for a single page."""
    client = _get_wikidot_client()
    page_data = client.pages.get_meta({
        'site': constants.SITE_NAME,
        'pages': [page]
    })
    return page_data[page]


def _get_web_page_undecorated(page, **kwargs):
    """Scrape the printer-friendly HTML of a page; return None if the fetch fails.

    Bug fix: the previous check (``status_code > 200``) treated any status
    code below 200 as success; only an exact 200 carries a usable page body.
    """
    web_page = requests.get(f'{constants.SITE_DOWNLOAD_HOST}/{page}')
    if web_page.status_code != 200:
        return None
    return web_page.content.decode(constants.ENCODING)


@cache.use_cache(constants.CACHE_PAGE_LIST_DIR, filetype=constants.CACHE_FILETYPE_JSON)
@sleep_and_retry
@limits(calls=constants.RATE_LIMIT_CALLS, period=constants.RATE_LIMIT_PERIOD)
def get_list_of_pages(*args, **kwargs):
    """Cached, rate-limited wrapper around the page-list API call."""
    return _get_list_of_pages_undecorated(*args, **kwargs)


@cache.use_cache(constants.CACHE_PAGES_DIR, filetype=constants.CACHE_FILETYPE_JSON)
@sleep_and_retry
@limits(calls=constants.RATE_LIMIT_CALLS, period=constants.RATE_LIMIT_PERIOD)
def get_page_metadata(*args, **kwargs):
    """Cached, rate-limited wrapper around the page-metadata API call."""
    return _get_page_metadata_undecorated(*args, **kwargs)
**kwargs) 74 | 75 | 76 | @cache.use_cache(constants.CACHE_HTML_DIR, filetype=constants.CACHE_FILETYPE_HTML) 77 | @sleep_and_retry 78 | @limits(calls=constants.RATE_LIMIT_WEB_CALLS, period=constants.RATE_LIMIT_WEB_PERIOD) 79 | def get_web_page(*args, **kwargs): 80 | return _get_web_page_undecorated(*args, **kwargs) 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project stuff 2 | build/ 3 | 4 | # Visual Studio Code 5 | .vscode/ 6 | *.code-workspace 7 | .history/ 8 | 9 | ################################################# 10 | ## Python stuff 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre/ 141 | test.bin 142 | -------------------------------------------------------------------------------- /scp_epub/test_unit/process/test_assemble.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | import os 8 | import io 9 | 10 | import process.assemble 11 | from constants import constants 12 | 13 | 14 | class TestProcessAllPages(unittest.TestCase): 15 | @unittest.mock.patch('process.process_page.process_page') 16 | def test_process_all_pages(self, mock_process_page): 17 | # Arrange 18 | expected_page_names = ['a-1', 'b-2'] 19 | expected_processed_pages = ['result 1', 'result 2'] 20 | expected_failures = [] 21 | 22 | mock_process_page.side_effect = expected_processed_pages 23 | 24 | expected_pages = [ 25 | { 26 | constants.PAGE_PATH_KEY: page_name 27 | } 28 | for page_name in expected_page_names 29 | ] 30 | 31 | expected_calls = [ 32 | unittest.mock.call( 33 | { 34 | constants.PAGE_PATH_KEY: page_name 35 | }, 36 | url_allow_list=expected_page_names 37 | ) 38 | for page_name in expected_page_names 39 | ] 40 | 41 | # Act 42 | actual_processed_pages, actual_failures = process.assemble.process_all_pages(expected_pages) 43 | 44 | # Assert 45 | self.assertEqual(expected_processed_pages, actual_processed_pages) 46 | 
self.assertEqual(expected_failures, actual_failures) 47 | mock_process_page.assert_has_calls(expected_calls) 48 | 49 | @unittest.mock.patch('process.process_page.process_page') 50 | def test_process_all_pages_errors(self, mock_process_page): 51 | # Arrange 52 | expected_page_names = ['a-1', 'b-2'] 53 | expected_processed_pages = ['result 2'] 54 | 55 | expected_error = ValueError() 56 | expected_failures = [expected_error] 57 | 58 | mock_process_page.side_effect = [expected_error, 'result 2'] 59 | 60 | expected_pages = [ 61 | { 62 | constants.PAGE_PATH_KEY: page_name 63 | } 64 | for page_name in expected_page_names 65 | ] 66 | 67 | expected_calls = [ 68 | unittest.mock.call( 69 | { 70 | constants.PAGE_PATH_KEY: page_name 71 | }, 72 | url_allow_list=expected_page_names 73 | ) 74 | for page_name in expected_page_names 75 | ] 76 | 77 | # Act 78 | actual_processed_pages, actual_failures = process.assemble.process_all_pages(expected_pages) 79 | 80 | # Assert 81 | self.assertEqual(expected_processed_pages, actual_processed_pages) 82 | self.assertEqual(expected_failures, actual_failures) 83 | mock_process_page.assert_has_calls(expected_calls) 84 | -------------------------------------------------------------------------------- /scp_epub/download/cache.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import os 4 | 5 | from constants import constants 6 | import download.aws 7 | import download.utils 8 | 9 | 10 | def use_cache(relative_path, filetype=constants.CACHE_DEFAULT_FILETYPE): 11 | def decorator(func): 12 | @functools.wraps(func) 13 | def wrapper(*args, **kwargs): 14 | normalized_item = download.utils.normalize_string(args[0]) 15 | 16 | if 'refresh' in kwargs and kwargs['refresh'] is True: 17 | cached_contents = None 18 | else: 19 | cached_contents = get_cached_contents(relative_path, normalized_item, filetype) 20 | 21 | if cached_contents is not None: 22 | return cached_contents 23 | else: 
def get_cached_contents(relative_path, item, filetype):
    """Read one item from the cache backend (S3 or local); None on a miss.

    Bug fix: previously a JSON cache miss fell through to
    ``json.loads(None)``, which raises TypeError instead of reporting the
    miss to the caller.
    """
    if os.getenv(constants.USE_AWS_VARIABLE) == constants.USE_AWS_TRUE:
        content_string = download.aws.retrieve_from_s3_cache(relative_path, item, filetype)
    else:
        content_string = retrieve_from_local_cache(relative_path, item, filetype)

    if content_string is None:
        return None

    if filetype == constants.CACHE_FILETYPE_JSON:
        return json.loads(content_string)
    return content_string


def set_cached_contents(contents, relative_path, item, filetype):
    """Serialize (JSON filetype only) and write one item to the cache backend."""
    if filetype == constants.CACHE_FILETYPE_JSON:
        content_string = json.dumps(contents)
    else:
        content_string = contents

    if os.getenv(constants.USE_AWS_VARIABLE) == constants.USE_AWS_TRUE:
        download.aws.store_in_s3_cache(content_string, relative_path, item, filetype)
    else:
        store_in_local_cache(content_string, relative_path, item, filetype)


def retrieve_from_local_cache(relative_path, item, filetype):
    """Return the cached file's text from the local cache dir, or None if absent."""
    try:
        filename = item + '.' + filetype
        file_location = os.path.join(constants.LOCAL_CACHE_BASE_PATH, relative_path, filename)
        with open(file_location, 'r', encoding=constants.ENCODING) as local_file:
            contents = local_file.read()

        return contents
    except FileNotFoundError:
        return None


def store_in_local_cache(contents, relative_path, item, filetype):
    """Write the item under the local cache directory, creating dirs as needed."""
    filename = item + '.' + filetype
    file_dir = os.path.join(constants.LOCAL_CACHE_BASE_PATH, relative_path)
    file_location = os.path.join(file_dir, filename)

    os.makedirs(file_dir, exist_ok=True)
    with open(file_location, 'w', encoding=constants.ENCODING) as local_file:
        local_file.write(contents)
+ filetype 70 | file_dir = os.path.join(constants.LOCAL_CACHE_BASE_PATH, relative_path) 71 | file_location = os.path.join(file_dir, filename) 72 | 73 | os.makedirs(file_dir, exist_ok=True) 74 | with open(file_location, 'w', encoding=constants.ENCODING) as local_file: 75 | local_file.write(contents) 76 | -------------------------------------------------------------------------------- /docs/how_it_works.md: -------------------------------------------------------------------------------- 1 | # How SCP epub Works 2 | 3 | SCP epub takes the entirety of the SCP wiki (running on Wikidot) and converts it into ebook format via a series of separate steps. 4 | 5 | 1. Download the SCP wiki 6 | 2. Convert every page from web format into an ebook-friendly format 7 | 3. Organize and assemble the pages in the correct order 8 | 4. Create the ebook 9 | 10 | The **book definition file** controls exactly what pages to download from the SCP wiki and how to organize the book (steps 1, 3, and 4). The [constants file](scp_epub/constants/constants.py) controls how the pages are converted into ebook format (step 2). 11 | 12 | ## Downloading the SCP wiki 13 | 14 | We'll use [the complete collection definition file](definitions/complete_collection.json) as an example. 15 | 16 | First, SCP epub obtains a list of all SCP pages. For example, all pages that we care about (SCP entries, tales, hubs, supplements) are in the same category, `_default`. We get a list of all the pages in the specified categories (usually, `_default` is enough). This uses the Wikidot API and it requires a read-only Wikidot API key. 17 | 18 | Then, using the wikidot API, we obtain metadata on all the pages of interest, specified by tags (in this example, scp, tale, hub, and supplement). 19 | 20 | Now that we have all the pages of interest and their metadata, we download all of them. Some pages (like scp-3125), however, are super complicated or interactive, so they will not be downloaded and processed. 
Instead, they are treated as an edge case and they will be replaced by a version that's processed by hand (located in the [edge_cases folder](edge_cases/)). This is OK because the SCP wiki license allows such use. 21 | 22 | After we have a list of all the pages we care about, we download their actual contents not through the API, but by scraping the HTML of each web page's "printer-friendly" version. The program implements rate limiting to follow Wikidot rate limits and to also be respectful of the site bandwidth. Rate limits are defined in the constants file. 23 | 24 | Because of the way Wikidot works, there is no hierarchy or grouping of pages. This means everything is downloaded in the same directory and is not organized in any way. This will become relevant later. 25 | 26 | All downloaded information is cached so that we don't need to re-download stuff all the time. The cached versions are found in the `build/cache/` directory that will be created when you first run the program. 27 | 28 | ## Converting pages into ebook-friendly format 29 | 30 | Right now, we have a whole bunch of pages in HTML format, and they contain a lot of unnecessary information such as site headers and footers, web links etc. We need to convert all the pages into an ebook-friendly form. Thankfully, the epub ebook format is basically just a huge ZIP archive containing HTML files. There are certain requirements for a page to be epub-compatible but it is otherwise a straightforward conversion process. 31 | 32 | Certain classes and tags need to be removed from the HTML outright. These are defined in the constants file. Currently, we simply remove images, although it would be theoretically possible to include them. 33 | 34 | The SCP wiki uses collapsible blocks a lot (where text is hidden until you click on a dropdown). These are incompatible with epub, so they're unwrapped: we get rid of the collapsible blocks but keep the text inside them.
35 | 36 | There are a number of other items, such as page headers, block quotes, footnotes, and internal links that all need to be properly converted. 37 | 38 | Each page is processed individually and is now converted into ebook-friendly HTML that can be directly assembled into an epub. 39 | 40 | ## Organizing and assembling the pages 41 | 42 | However, we can't do that just yet. The pages need to be put in the correct order. 43 | 44 | To be continued... 45 | -------------------------------------------------------------------------------- /scp_epub/constants/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ######################## 4 | # Charset 5 | 6 | ENCODING = 'utf-8' 7 | 8 | ######################## 9 | # AWS Execution 10 | USE_AWS_VARIABLE = 'SCP_EPUB_USE_AWS' 11 | USE_AWS_TRUE = 'TRUE' 12 | S3_CACHE_BASE_PATH = 'cache/' 13 | S3_BUCKET_VARIABLE = 'SCP_EPUB_S3_BUCKET' 14 | API_KEY_SECRETSMANAGER_VARIABLE = 'SCP_EPUB_API_KEY_SECRET' 15 | 16 | ######################## 17 | # Relative filepaths 18 | 19 | LOCAL_CACHE_BASE_PATH = os.path.join(os.path.dirname(__file__), '../../build/cache') 20 | EDGE_CASES_DIR = os.path.join(os.path.dirname(__file__), '../../edge_cases') 21 | 22 | ######################## 23 | # Caching paths 24 | 25 | CACHE_FILETYPE_JSON = 'json' 26 | CACHE_FILETYPE_HTML = 'html' 27 | CACHE_DEFAULT_FILETYPE = CACHE_FILETYPE_JSON 28 | CACHE_PAGES_DIR = 'pages/' 29 | CACHE_HTML_DIR = 'web/' 30 | CACHE_PAGE_LIST_DIR = 'lists/' 31 | 32 | ######################## 33 | # Scraping 34 | 35 | API_KEY_VARIABLE = 'SCP_EPUB_WIKIDOT_API_KEY' 36 | 37 | SITE_NAME = 'scp-wiki' 38 | SITE_HOST = 'http://scp-wiki.net' 39 | SITE_DOWNLOAD_HOST = 'http://scp-wiki.net/printer--friendly' 40 | 41 | CLIENT_NAME = 'scp-epub' 42 | RPC_ENDPOINT = 'www.wikidot.com/xml-rpc-api.php' 43 | 44 | # Wikidot rate limit is 240 calls per 60 seconds, we're being conservative 45 | RATE_LIMIT_CALLS = 60 46 | 
RATE_LIMIT_PERIOD = 30

# Throttle for direct web scraping of printer-friendly HTML pages.
RATE_LIMIT_WEB_CALLS = 10
RATE_LIMIT_WEB_PERIOD = 5

# Hand-processed replacement pages (edge_cases folder) are stored as JSON.
EDGE_CASES_FILETYPE = 'json'

######################
# Processing pages

# Keys of the raw page record as assembled by the download step.
PAGE_PATH_KEY = 'fullname'
TITLE_SHOWN_KEY = 'title_shown'
TITLE_KEY = 'title'
CREATED_BY_KEY = 'created_by'
CREATED_AT_KEY = 'created_at'
TAGS_KEY = 'tags'
ADDITIONAL_DATA_KEY = 'scp_epub_additional_data'
WEB_HTML_KEY = 'web_html'

# Keys of the processed (ebook-ready) page record.
PROCESSED_NAME_KEY = 'name'
PROCESSED_TITLE_KEY = 'title'
PROCESSED_AUTHOR_KEY = 'created_by'
PROCESSED_CREATION_DATE_KEY = 'created_at'
PROCESSED_TAGS_KEY = 'tags'
PROCESSED_HTML_KEY = 'html'

# Placeholders used when a page's metadata is missing.
EMPTY_TITLE = '███████████'
EMPTY_AUTHOR = 'Unknown'
EMPTY_TIMESTAMP = 'Unknown'


# Parser backend name; presumably passed to BeautifulSoup — confirm in process_page.
BS4_FORMAT = 'lxml'

# id of the element holding the article body on a downloaded page.
PAGE_CONTENT_ID = 'page-content'

# CSS classes stripped outright from downloaded HTML (rating widgets,
# image blocks, footer navigation).
CLASSES_TO_REMOVE = [
    'heritage-rating-module',
    'heritage-emblem',
    'page-rate-widget-box',
    'scp-image-block',
    'image',
    'scp-image-caption',
    'footer-wikiwalk-nav'
]

# Tags removed outright; images are not included in the ebook.
TAGS_TO_REMOVE = [
    'img'
]

# Wikidot collapsible blocks are unwrapped into epub-compatible markup.
COLLAPSIBLE_BLOCK_CLASS = 'collapsible-block'
COLLAPSIBLE_BLOCK_LINK_CLASS = 'collapsible-block-link'
COLLAPSIBLE_BLOCK_CONTENT_CLASS = 'collapsible-block-content'

COLLAPSIBLE_CLASS_NEW = 'collapsible'
COLLAPSIBLE_TITLE_CLASS_NEW = 'collapsible-title'

BLOCKQUOTE_TAG = 'blockquote'
BLOCKQUOTE_CLASS_NEW = 'quote'

# YUI tab views (yui-navset) are rewritten into simple tabview markup.
YUI_NAVSET_CLASS = 'yui-navset'
YUI_NAVSET_CLASS_NEW = 'tabview'
YUI_NAVSET_TAB_CLASS = 'yui-nav'
YUI_NAVSET_TAB_CLASS_NEW = 'tabview-tab'
YUI_NAVSET_TAB_TITLE_IDENTIFIER = 'em'
YUI_NAVSET_TAB_TITLE_CLASS_NEW = 'tab-title'

LINK_TAG = 'a'

# HTML attribute names used while rewriting pages.
HREF_ATTRIBUTE = 'href'
ID_ATTRIBUTE = 'id'
EPUB_TYPE_ATTRIBUTE = 'epub:type'
ONCLICK_ATTRIBUTE = 'onclick'
CLASS_ATTRIBUTE = 'class'

LINK_CLASS_NEW = 'link'
| LINK_EXTENSION = '.xhtml' 122 | 123 | PAGE_TITLE_TAG = 'p' 124 | PAGE_TITLE_CLASS = 'page-title' 125 | 126 | FOOTNOTEREF_TAG = 'sup' 127 | FOOTNOTEREF_CLASS = 'footnoteref' 128 | 129 | FOOTNOTE_CLASS = 'footnote-footer' 130 | 131 | EPUB_TYPE_FOOTNOTEREF = 'noteref' 132 | EPUB_TYPE_FOOTNOTE = 'footnote' 133 | 134 | FOOTNOTE_HREF_PATTERN = r"WIKIDOT\.page\.utils\.scrollToReference\('([a-zA-Z0-9-_]+)'\)" 135 | 136 | ###################### 137 | # Assembling pages 138 | 139 | DEFAULT_BASED_ON = 'name' 140 | -------------------------------------------------------------------------------- /progress.txt: -------------------------------------------------------------------------------- 1 | Punchlist: 2 | 3 | 4 | ============================================== 5 | 6 | TASKS: 7 | 8 | 1. download all pages 9 | * Rewrite downloader to not be a complete janky mess. 10 | [DONE] * Multiple backends for caching: 11 | [DEFERRED] * aws 12 | [DONE] * local 13 | [DONE] * able to select between them 14 | [DONE] * Multiple backends for retrieving the api secret: 15 | [DEFERRED] * aws 16 | [DONE] * local 17 | [DONE] * able to select between them 18 | [DONE] * Single instance of wikidot client 19 | [DONE] * Downloads: 20 | [DONE] * Download list 21 | [DONE] * Download page via API 22 | [DONE] * Download page via web 23 | * Get entire wiki: 24 | * Download list then filter pages based on definitions, download only those pages 25 | [IP] * Download complete page: 26 | [DONE] * API entry 27 | [DONE] * Web download 28 | * Apply edge case if applicable 29 | * Platform tests: 30 | [DONE] * Single page with cache 31 | [DONE] * Single page without cache 32 | [IP] * Edge case downloading 33 | * Downloading everything 34 | 35 | [DONE] 2. PRETTIFY SINGLE PAGE 36 | [CANCELLED] * delete the div that aligns right and is at the very beginning of every page ( should we do this? 
what if there's a page that doesn't have that div but has another one instead with actual content) 37 | [DONE] * WHAT PYSCP DOES (MAKE COMPATIBLE): 38 | [DONE] * remove widget box 39 | [DONE] * yui-navset 40 | [DONE] * collapsible-block 41 | [DONE] * footnoteref, footnote footer: BREAK COMPATIBILITY WITH PYSCP, MAKE NICE FOR EPUB 42 | [DONE] * blockquote 43 | [DONE] * links 44 | [DONE] * images 45 | [DONE] * title 46 | [DONE] * Put them all together to parse a whole page 47 | [DONE] * Component test that correctly parses whole web html files 48 | [DONE] * I want some sort of test that the output of the combined parsers makes sense. Even if just a regression test. 49 | [DONE] * Process all pages 50 | [DONE] * Error handling: e.g. each item in its own try-catch with logging of errors 51 | [DONE] * Log errors verbosely, exactly what failed and how 52 | [DONE] * Platform test: parse LITERALLY ALL THE SCP PAGES and test that none fail, or 2 |
3 |
scp-heritage-v3.png
6 |
7 | 8 |

Item 9 | #: SCP-055

10 |

Object Class: Keter

11 |

Special Containment 12 | Procedures: Object is kept within a five (5) by five (5) by two point five (2.5) meter square room 13 | constructed of cement (fifty (50) centimeter thickness), with a Faraday cage surrounding the cement walls. Access is 14 | via a heavy containment door measuring two (2) by two point five (2.5) meters constructed on bearings to ensure door 15 | closes and locks automatically unless held open deliberately. Security guards are NOT to be posted outside SCP-055's 16 | room. It is further advised that all personnel maintaining or studying other SCP objects in the vicinity try to 17 | maintain a distance of at least fifty (50) meters from the geometric center of the room, as long as this is 18 | reasonably practical.

19 |

Description: SCP-055 is a "self-keeping secret" or 20 | "anti-meme". Information about SCP-055's physical appearance as well as its nature, behavior, and origins 21 | is self-classifying. To clarify:

22 | 25 | 26 | 29 | 41 | 45 | 50 |

All of these facts are periodically rediscovered, usually by chance 51 | readers of this file, causing a great deal of alarm. This state of concern lasts minutes at most, before the matter 52 | is simply forgotten about.

53 |

A great deal of scientific data has been recorded from SCP-055, but cannot be 54 | studied.

55 |

At least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 56 | 19 to another site, meeting failure for reasons unknown.

57 |

SCP-055 may present a major physical threat and 58 | indeed may have killed many hundreds of personnel, and we would not know it. Certainly it presents a gigantic 59 | memetic/mental threat, hence its Keter classification.

60 |

Document #055-1: An Analysis of 61 | SCP-055

62 |

The author puts forward the hypothesis that SCP-055 was never formally acquired by 63 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 64 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 and is in fact an autonomous or remotely-controlled agent, inserted 65 | at Site 19 by an unidentified third party for one or all of the following purposes:

66 | 77 |

No action to counter 78 | any of these potential threats is suggested, or indeed theoretically possible.

79 |

Addendum A: 80 |

81 |
82 |

Hey, if this thing really is an "anti-meme", why doesn't the fact that it's an 83 | "anti-meme" get wiped? We must be wrong about that somehow. Wait a minute, what if we were to keep 84 | notes about what it isn't? Would we remember those? Bartholomew Hughes, NSA

85 |
86 |

87 | Document #055-2: Report of Dr. John Marachek

88 |

Survey team #19-055-127BXE was successfully 89 | able to enter SCP-055's container and ascertain the appearance and, to some degree, the nature of the object. Notes 90 | were taken according to the project methodology (see 91 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588), after which the container was sealed 92 | again.

93 |

Excerpt from a transcript of personnel debriefing follows:

94 |
95 |

Dr. Hughes: Okay, 96 | I'm going to need to ask you some questions about number 55 now.

97 |

98 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Number what?

99 |

Dr. Hughes: SCP object 55. The object you just 100 | examined.

101 |

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Um, I don't know what you're talking about. I 102 | don't think we have a 55.

103 |

Dr. Hughes: Okay, then, \u2588\u2588\u2588\u2588\u2588\u2588\u2588, 104 | I'd like you to tell me what you've been doing for the past two hours.

105 |

106 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: What? I… <subject appears uncomfortable> … I don't 107 | know.

108 |

Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical?

109 |

110 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: That what wasn't… Oh! Right! It isn't round at all! Object 55 111 | isn't round!

112 |

Dr. Hughes: So you remember it now?

113 |

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: 114 | Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't remember. And it's 115 | not a sphere.

116 |

Dr. Hughes: Wait a minute. What's not a sphere?

117 |

118 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Object 55.

119 |

Dr. Hughes: Object what?

120 |

121 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Doc, do you remember agreeing that something wasn't shaped like a 122 | sphere?

123 |

Dr. Hughes: Oh, right!

124 |
125 |

It appears to be possible to remember what SCP-055 126 | is not (negations of fact), and to repeatedly deduce its existence from these memories.

127 |

Personnel involved 128 | in Survey #19-055-127BXE reported moderate levels of disorientation and psychological trauma associated with cycles 129 | of repeated memory and forgetfulness of SCP-055. However, no long-term behavioral or health problems were observed, 130 | and psych assessments of survey personnel showed consistent reports of this distress fading over time.

131 |

132 | Recommendations: It may be worthwhile to post at least one staff member capable of remembering the existence of 133 | SCP-055 to each critical site.

134 | 141 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page_cases/scp-1257_converted.html: -------------------------------------------------------------------------------- 1 |

SCP-1257

2 |
3 |

Item #: SCP-1257

4 |

Object Class: Safe

5 |

Special Containment Procedures: All copies of SCP-1257, encompassing all instances of SCP-1257-1, SCP-1257-2, and SCP-1257-3, are to be kept in the secure media vault at Site-██. Any uncontained copies of SCP-1257 are to be recovered or destroyed by MTF Mu-53 (“Ebert's Thumb”). Because of the nature of the original appearance of SCP-1257, and its widespread exposure to the public, MTF Mu-53 is also tasked to replace any new sources of information about SCP-1257 as they are discovered, in whatever format they may appear, in accordance with Protocol Gamma-1257-A (Codename: “Snopes’ Revenge”)1

6 |

Unauthorized persons exposed to copies of SCP-1257, or who evidence any knowledge about SCP-1257, shall be interrogated by the Foundation Intelligence Department, administered Class A Amnestics, and have implanted post-hypnotic suggestions to reinforce the belief that SCP-1257 is a hoax.2

7 |

Study of SCP-1257 is limited to personnel of Level 3 or higher, subject to approval by the Foundation Intelligence Department.

8 |

Description: SCP-1257 is an American-produced half-hour situation comedy originally titled Raising Danny that aired on the ███ television network for six episodes in 197█. Instances of SCP-1257-1 are the original production reels for all twelve episodes filmed, recovered from the ███ archives in 198█. Instances of SCP-1257-2 are the draft and shooting scripts and copies, including four episodes that were never shot. Instances of SCP-1257-3 are all other video recordings of the six episodes actually aired.

9 |

The anomalous properties of SCP-1257 manifest in any and all video copies produced from the original series, and in any copies of the scripts for those episodes. Every year, beginning in mid-September, video recordings and scripts for Raising Danny will change to reflect a new season of episodes. Replacements will begin with episode one, and progress sequentially through each episode in order during each subsequent week. While the Foundation has access to the first sixteen episodes of each season,3 it appears that each SCP-1257 season runs approximately 24 episodes. Additionally, while new seasons of SCP-1257 occasionally produce hour-long "specials,” copies are always limited to the first half-hour running time of the original episodes of SCP-1257.

10 |

Video copies of the original over-the-air broadcast of the first six episodes of SCP-1257 present a special case. Commercials recorded contemporaneously with SCP-1257 will also show changes consistent with the content of SCP-1257, and updated videos have occasionally shown news bulletins and weather alerts that imply multiple points of divergence between the world that continues to produce SCP-1257 and our own.

11 |

SCP-1257’s original premise had a black man, named Tyler (played by Whitman Mayo), married to a white woman who already had a son by a prior marriage. When the woman dies, prior to the pilot episode, the man is left raising her son, named Danny (played by Danny Bonaduce), as his own. Reviews of the original series recovered by the Foundation were universally unfavorable and referred to it as “The unwanted bastard child of Sanford and Son and The Courtship of Eddie's Father.” Because of the anomalous properties of SCP-1257, the Foundation has only been able to reconstruct a general outline of the original content of the series.

12 |

Addendum 1: Notes on selected episodes of SCP-1257 observed in Foundation custody.

13 |

+ Document S-1257-11

14 |
15 |
    16 |
  • Season 3, Episode 3, “Tyler’s Date”: Episode manifests three months after initial containment. One line in the script implies the assassination of American President Jimmy Carter in late 1978 or early 1979.
  • 17 |
  • Season 4, Episode 1, “Bad Touch pt. 2”: Second half of a “special episode” ending season three. Eric, a young classmate of Danny’s, is the target of a sexual predator using what appears to be anomalous items manufactured by Doctor Wondertainment to lure his victims.
  • 18 |
  • Season 5, Episode 10, “The Senior Trip”: Episode mentions a scandal where 10 members of the UK House of Commons had been publicly revealed as members of a cult that bears a strong resemblance to the Church of the Broken God.
  • 19 |
  • Season 6, Episode 1, “The Freshmen”: The title of SCP-1257 is changed to Danny. The premise of the series changes as well, dropping the Tyler character and sending Danny to college in New York City with five of his classmates from High School.4 The University Lab appears to have specimens of SCP-███, SCP-███ and SCP-███.
  • 20 |
  • Season 6, Episode 11, “The ████████”: Plot of the episode concerns Eric’s suspicions that one of their dormmates might be secretly one of the “████████.” This turns out to be a misunderstanding. From context, the “████████” appear to prey on young women and have become endemic in [REDACTED] and seem to be the result of a containment breach of [REDACTED] in Mexico City.
  • 21 |
  • Season 7, Episode 2, “Eric’s Midterm Caper”: When this episode manifested in SCP-1257-3-12, a new advertisement appeared during the second break for Marshall, Carter, and Dark Ltd. The ad promoted [REDACTED] services for [REDACTED].
  • 22 |
  • Season 10, Episode 1, “The Job Hunt”: Hour-long “special” introducing another change in premise.5 One scene implies that the Global Occult Coalition has become public enough to run “want ads” in the local newspaper.
  • 23 |
  • Season 10, Episode 2, “The New Guy”: The show’s title is officially changed to Agent Danny of the SCP.6 Danny has been employed as Level 1 security at Site-19, and through a series of mishaps, prevents a containment breach of SCP-173.
  • 24 |
  • Season 10, Episode 5, “D-Class Act”: Danny mis-hears a co-worker’s conversation and becomes convinced he has been mistakenly reassigned to D-Class by the HR Department.
  • 25 |
  • Season 10, Episode 11, “Leaping Lizards”: [REDACTED] SCP-682 [REDACTED].
  • 26 |
27 |

Note: Details of SCP-1257 episodes past Season 10 are only available with the approval of the Intelligence Department.

28 |
29 |
30 | 39 | 40 |
-------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_055_pyscp.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | SCP-055: [unknown] 8 | 9 | 10 | 11 | 12 |
13 |

SCP-055: [unknown]

14 |
15 |
16 |
17 |
18 |
19 |

Item #: SCP-055

20 |

Object Class: Keter

21 |

Special Containment Procedures: Object is kept within a five (5) by five (5) by two point 22 | five (2.5) meter square room constructed of cement (fifty (50) centimeter thickness), with a Faraday cage 23 | surrounding the cement walls. Access is via a heavy containment door measuring two (2) by two point five 24 | (2.5) meters constructed on bearings to ensure door closes and locks automatically unless held open 25 | deliberately. Security guards are NOT to be posted outside SCP-055's room. It is further advised that all 26 | personnel maintaining or studying other SCP objects in the vicinity try to maintain a distance of at least 27 | fifty (50) meters from the geometric center of the room, as long as this is reasonably practical.

28 |

Description: SCP-055 is a "self-keeping secret" or "anti-meme". Information about SCP-055's 29 | physical appearance as well as its nature, behavior, and origins is self-classifying. To clarify:

30 | 33 | 36 | 47 | 51 | 55 |

All of these facts are periodically rediscovered, usually by chance readers of this file, causing a great 56 | deal of alarm. This state of concern lasts minutes at most, before the matter is simply forgotten about.

57 |

A great deal of scientific data has been recorded from SCP-055, but cannot be studied.

58 |

At least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 19 to 59 | another site, meeting failure for reasons unknown.

60 |

SCP-055 may present a major physical threat and indeed may have killed many hundreds of personnel, and we 61 | would not know it. Certainly it presents a gigantic memetic/mental threat, hence its Keter classification. 62 |

63 |

Document #055-1: An Analysis of SCP-055

64 |

The author puts forward the hypothesis that SCP-055 was never formally acquired by ████████████ ████████ and 65 | is in fact an autonomous or remotely-controlled agent, inserted at Site 19 by an unidentified third party 66 | for one or all of the following purposes:

67 | 74 |

No action to counter any of these potential threats is suggested, or indeed theoretically possible.

75 |

Addendum A:

76 |
77 |

Hey, if this thing really is an "anti-meme", why doesn't the fact that it's an "anti-meme" get wiped? We 78 | must be wrong about that somehow. Wait a minute, what if we were to keep notes about what it isn't? 79 | Would we remember those? Bartholomew Hughes, NSA

80 |
81 |

Document #055-2: Report of Dr. John Marachek

82 |

Survey team #19-055-127BXE was successfully able to enter SCP-055's container and ascertain the appearance 83 | and, to some degree, the nature of the object. Notes were taken according to the project methodology (see 84 | ████████████), after which the container was sealed again.

85 |

Excerpt from a transcript of personnel debriefing follows:

86 |
87 |

Dr. Hughes: Okay, I'm going to need to ask you some questions about number 55 now.

88 |

███████: Number what?

89 |

Dr. Hughes: SCP object 55. The object you just examined.

90 |

███████: Um, I don't know what you're talking about. I don't think we have a 55.

91 |

Dr. Hughes: Okay, then, ███████, I'd like you to tell me what you've been doing for the past two hours. 92 |

93 |

███████: What? I… <subject appears uncomfortable> … I don't know.

94 |

Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical?

95 |

███████: That what wasn't… Oh! Right! It isn't round at all! Object 55 isn't round!

96 |

Dr. Hughes: So you remember it now?

97 |

███████: Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't 98 | remember. And it's not a sphere.

99 |

Dr. Hughes: Wait a minute. What's not a sphere?

100 |

███████: Object 55.

101 |

Dr. Hughes: Object what?

102 |

███████: Doc, do you remember agreeing that something wasn't shaped like a sphere?

103 |

Dr. Hughes: Oh, right!

104 |
105 |

It appears to be possible to remember what SCP-055 is not (negations of fact), and to repeatedly deduce its 106 | existence from these memories.

107 |

Personnel involved in Survey #19-055-127BXE reported moderate levels of disorientation and psychological 108 | trauma associated with cycles of repeated memory and forgetfulness of SCP-055. However, no long-term 109 | behavioral or health problems were observed, and psych assessments of survey personnel showed consistent 110 | reports of this distress fading over time.

111 |

Recommendations: It may be worthwhile to post at least one staff member capable of remembering the existence 112 | of SCP-055 to each critical site.

113 | 118 |
119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_055.json: -------------------------------------------------------------------------------- 1 | { 2 | "fullname": "scp-055", 3 | "created_at": "2008-07-25T15:59:04+00:00", 4 | "created_by": "xthevilecorruptor", 5 | "updated_at": "2019-05-03T02:04:14+00:00", 6 | "updated_by": "Modern_Erasmus", 7 | "title": "SCP-055", 8 | "title_shown": "SCP-055", 9 | "parent_fullname": null, 10 | "tags": [ 11 | "keter", 12 | "scp", 13 | "meta", 14 | "featured", 15 | "memory-affecting", 16 | "heritage", 17 | "infohazard", 18 | "antimemetic" 19 | ], 20 | "rating": 2733, 21 | "revisions": 37, 22 | "parent_title": null, 23 | "content": "[[include component:heritage-rating]]\n\n**Item #:** SCP-055 \n\n**Object Class:** Keter \n\n**Special Containment Procedures:** Object is kept within a five (5) by five (5) by two point five (2.5) meter square room constructed of cement (fifty (50) centimeter thickness), with a Faraday cage surrounding the cement walls. Access is via a heavy containment door measuring two (2) by two point five (2.5) meters constructed on bearings to ensure door closes and locks automatically unless held open deliberately. Security guards are NOT to be posted outside SCP-055's room. It is further advised that all personnel maintaining or studying other SCP objects in the vicinity try to maintain a distance of at least fifty (50) meters from the geometric center of the room, as long as this is reasonably practical. \n\n**Description:** SCP-055 is a \"self-keeping secret\" or \"anti-meme\". Information about SCP-055's physical appearance as well as its nature, behavior, and origins is self-classifying. To clarify: \n\n* How Site 19 originally acquired SCP-055 is unknown. \n\n* When SCP-055 was obtained, and by whom, is unknown. \n\n* SCP-055's physical appearance is unknown. 
It is not indescribable, or invisible: individuals are perfectly capable of entering SCP-055's container and observing it, taking mental or written notes, making sketches, taking photographs, and even making audio/video recordings. An extensive log of such observations is on file. However, information about SCP-055's physical appearance \"leaks\" out of a human mind soon after such an observation. Individuals tasked with describing SCP-055 afterwards find their minds wandering and lose interest in the task; individuals tasked with sketching a copy of a photograph of SCP-055 are unable to remember what the photograph looks like, as are researchers overseeing these tests. Security personnel who have observed SCP-055 via closed-circuit television cameras emerge after a full shift exhausted and effectively amnesiac about the events of the previous hours. \n\n* Who authorized the construction of SCP-055's containment room, why it was constructed in this way, or what the purpose of the described Containment Procedures may be, are all unknown. \n\n* Despite SCP-055's container being easily accessible, all personnel at Site 19 claim no knowledge of SCP-055's existence when challenged.\n \n\nAll of these facts are periodically rediscovered, usually by chance readers of this file, causing a great deal of alarm. This state of concern lasts minutes at most, before the matter is simply forgotten about. \n\nA great deal of scientific data has been recorded from SCP-055, but cannot be studied. \n\nAt least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 19 to another site, meeting failure for reasons unknown. \n\nSCP-055 may present a major physical threat and indeed may have killed many hundreds of personnel, and we would not know it. Certainly it presents a gigantic memetic/mental threat, hence its Keter classification. 
\n\n**Document #055-1:** An Analysis of SCP-055 \n\nThe author puts forward the hypothesis that SCP-055 was never formally acquired by \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 and is in fact an autonomous or remotely-controlled agent, inserted at Site 19 by an unidentified third party for one or all of the following purposes: \n\n* to silently observe, or interfere with, activities at Site 19 \n* to silently observe, or interfere with, activities at other SCP locations \n* to silently observe, or interfere with, activities of humanity worldwide \n* to silently observe, or interfere with, other SCP objects \n* to silently observe, or interfere with, \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\n \n\nNo action to counter any of these potential threats is suggested, or indeed theoretically possible.\n\n**Addendum A**: \n> Hey, if this thing really is an \"anti-meme\", why doesn't the fact that it's an \"anti-meme\" get wiped? We must be wrong about that somehow. Wait a minute, what if we were to keep notes about what it isn't? Would we remember those? //Bartholomew Hughes, NSA//\n\n**Document #055-2:** Report of Dr. John Marachek\n\nSurvey team #19-055-127BXE was successfully able to enter SCP-055's container and ascertain the appearance and, to some degree, the nature of the object. Notes were taken according to the project methodology (see \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588), after which the container was sealed again. \n\nExcerpt from a transcript of personnel debriefing follows:\n\n> Dr. Hughes: Okay, I'm going to need to ask you some questions about number 55 now.\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Number what? \n> \n> Dr. Hughes: SCP object 55. The object you just examined.\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Um, I don't know what you're talking about. I don't think we //have// a 55. 
\n> \n> Dr. Hughes: Okay, then, \u2588\u2588\u2588\u2588\u2588\u2588\u2588, I'd like you to tell me what you've been doing for the past two hours.\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: What? I... ... I don't know. \n> \n> Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical? \n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: That what wasn't... Oh! Right! It isn't round at all! Object 55 isn't round!\n> \n> Dr. Hughes: So you remember it now? \n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't remember. And it's not a sphere. \n> \n> Dr. Hughes: Wait a minute. What's not a sphere? \n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Object 55.\n> \n> Dr. Hughes: Object what?\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Doc, do you remember agreeing that something wasn't shaped like a sphere?\n> \n> Dr. Hughes: Oh, right!\n\nIt appears to be possible to remember what SCP-055 is not (negations of fact), and to repeatedly deduce its existence from these memories. \n\nPersonnel involved in Survey #19-055-127BXE reported moderate levels of disorientation and psychological trauma associated with cycles of repeated memory and forgetfulness of SCP-055. However, no long-term behavioral or health problems were observed, and psych assessments of survey personnel showed consistent reports of this distress fading over time.\n\nRecommendations: It may be worthwhile to post at least one staff member capable of remembering the existence of SCP-055 to each critical site.\n\n[[footnoteblock]]\n\n[[div class=\"footer-wikiwalk-nav\"]]\n[[=]]\n<< [[[SCP-054]]] | SCP-055 | [[[SCP-056]]] >>\n[[/=]]\n[[/div]]", 24 | "html": "\n\n
\n
\n
\"scp-heritage-v3.png\"
\n
\n
\n

Item #: SCP-055

\n

Object Class: Keter

\n

Special Containment Procedures: Object is kept within a five (5) by five (5) by two point five (2.5) meter square room constructed of cement (fifty (50) centimeter thickness), with a Faraday cage surrounding the cement walls. Access is via a heavy containment door measuring two (2) by two point five (2.5) meters constructed on bearings to ensure door closes and locks automatically unless held open deliberately. Security guards are NOT to be posted outside SCP-055's room. It is further advised that all personnel maintaining or studying other SCP objects in the vicinity try to maintain a distance of at least fifty (50) meters from the geometric center of the room, as long as this is reasonably practical.

\n

Description: SCP-055 is a "self-keeping secret" or "anti-meme". Information about SCP-055's physical appearance as well as its nature, behavior, and origins is self-classifying. To clarify:

\n\n\n\n\n\n

All of these facts are periodically rediscovered, usually by chance readers of this file, causing a great deal of alarm. This state of concern lasts minutes at most, before the matter is simply forgotten about.

\n

A great deal of scientific data has been recorded from SCP-055, but cannot be studied.

\n

At least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 19 to another site, meeting failure for reasons unknown.

\n

SCP-055 may present a major physical threat and indeed may have killed many hundreds of personnel, and we would not know it. Certainly it presents a gigantic memetic/mental threat, hence its Keter classification.

\n

Document #055-1: An Analysis of SCP-055

\n

The author puts forward the hypothesis that SCP-055 was never formally acquired by \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 and is in fact an autonomous or remotely-controlled agent, inserted at Site 19 by an unidentified third party for one or all of the following purposes:

\n\n

No action to counter any of these potential threats is suggested, or indeed theoretically possible.

\n

Addendum A:

\n
\n

Hey, if this thing really is an "anti-meme", why doesn't the fact that it's an "anti-meme" get wiped? We must be wrong about that somehow. Wait a minute, what if we were to keep notes about what it isn't? Would we remember those? Bartholomew Hughes, NSA

\n
\n

Document #055-2: Report of Dr. John Marachek

\n

Survey team #19-055-127BXE was successfully able to enter SCP-055's container and ascertain the appearance and, to some degree, the nature of the object. Notes were taken according to the project methodology (see \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588), after which the container was sealed again.

\n

Excerpt from a transcript of personnel debriefing follows:

\n
\n

Dr. Hughes: Okay, I'm going to need to ask you some questions about number 55 now.

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Number what?

\n

Dr. Hughes: SCP object 55. The object you just examined.

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Um, I don't know what you're talking about. I don't think we have a 55.

\n

Dr. Hughes: Okay, then, \u2588\u2588\u2588\u2588\u2588\u2588\u2588, I'd like you to tell me what you've been doing for the past two hours.

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: What? I… <subject appears uncomfortable> … I don't know.

\n

Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: That what wasn't… Oh! Right! It isn't round at all! Object 55 isn't round!

\n

Dr. Hughes: So you remember it now?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't remember. And it's not a sphere.

\n

Dr. Hughes: Wait a minute. What's not a sphere?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Object 55.

\n

Dr. Hughes: Object what?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Doc, do you remember agreeing that something wasn't shaped like a sphere?

\n

Dr. Hughes: Oh, right!

\n
\n

It appears to be possible to remember what SCP-055 is not (negations of fact), and to repeatedly deduce its existence from these memories.

\n

Personnel involved in Survey #19-055-127BXE reported moderate levels of disorientation and psychological trauma associated with cycles of repeated memory and forgetfulness of SCP-055. However, no long-term behavioral or health problems were observed, and psych assessments of survey personnel showed consistent reports of this distress fading over time.

\n

Recommendations: It may be worthwhile to post at least one staff member capable of remembering the existence of SCP-055 to each critical site.

\n
\n
\n

« SCP-054 | SCP-055 | SCP-056 »

\n
\n
\n", 25 | "children": 0, 26 | "comments": 412, 27 | "commented_at": "2020-04-14T20:34:20+00:00", 28 | "commented_by": "ZELYNER" 29 | } 30 | -------------------------------------------------------------------------------- /definitions/complete_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "download": { 3 | "categories": ["_default"], 4 | "tags_to_download": [ 5 | "scp", 6 | "tale", 7 | "hub", 8 | "supplement" 9 | ], 10 | "edge_cases": [ 11 | "scp-3125" 12 | ] 13 | }, 14 | "meta": { 15 | "title": "SCP Foundation — The Complete Collection", 16 | "author": "Various Authors", 17 | "publisher": "github.com/elfakyn/scp_epub", 18 | "identifier": "f2c8fbad-0cb0-4ae7-93c7-3a634a7e540e", 19 | "language": "en" 20 | }, 21 | "toc": [ 22 | { 23 | "toc_entry": "SCP Foundation — The Complete Collection", 24 | "children": [ 25 | { 26 | "toc_entry": "SCPs by Series", 27 | "regex": "^scp-series(-\\d+)?$" 28 | }, 29 | { 30 | "toc_entry": "SCP Tales by Series", 31 | "regex": "^scp-series(-\\d+)?-tales-edition$" 32 | }, 33 | { 34 | "toc_entry": "Hubs", 35 | "tags_all": [ 36 | "hub" 37 | ] 38 | } 39 | ] 40 | }, 41 | { 42 | "toc_entry": "SCP Series 1", 43 | "children": [ 44 | { 45 | "toc_entry": "000 to 099", 46 | "regex": "^scp-0\\d\\d$" 47 | }, 48 | { 49 | "toc_entry": "100 to 199", 50 | "regex": "^scp-1\\d\\d$" 51 | }, 52 | { 53 | "toc_entry": "200 to 299", 54 | "regex": "^scp-2\\d\\d$" 55 | }, 56 | { 57 | "toc_entry": "300 to 399", 58 | "regex": "^scp-3\\d\\d$" 59 | }, 60 | { 61 | "toc_entry": "400 to 499", 62 | "regex": "^scp-4\\d\\d$" 63 | }, 64 | { 65 | "toc_entry": "500 to 599", 66 | "regex": "^scp-5\\d\\d$" 67 | }, 68 | { 69 | "toc_entry": "600 to 699", 70 | "regex": "^scp-6\\d\\d$" 71 | }, 72 | { 73 | "toc_entry": "700 to 799", 74 | "regex": "^scp-7\\d\\d$" 75 | }, 76 | { 77 | "toc_entry": "800 to 899", 78 | "regex": "^scp-8\\d\\d$" 79 | }, 80 | { 81 | "toc_entry": "900 to 999", 82 | "regex": "^scp-9\\d\\d$" 83 | } 84 
| ] 85 | }, 86 | { 87 | "toc_entry": "SCP Series 2", 88 | "children": [ 89 | { 90 | "toc_entry": "1000 to 1099", 91 | "regex": "^scp-10\\d\\d$" 92 | }, 93 | { 94 | "toc_entry": "1100 to 1199", 95 | "regex": "^scp-11\\d\\d$" 96 | }, 97 | { 98 | "toc_entry": "1200 to 1299", 99 | "regex": "^scp-12\\d\\d$" 100 | }, 101 | { 102 | "toc_entry": "1300 to 1399", 103 | "regex": "^scp-13\\d\\d$" 104 | }, 105 | { 106 | "toc_entry": "1400 to 1499", 107 | "regex": "^scp-14\\d\\d$" 108 | }, 109 | { 110 | "toc_entry": "1500 to 1599", 111 | "regex": "^scp-15\\d\\d$" 112 | }, 113 | { 114 | "toc_entry": "1600 to 1699", 115 | "regex": "^scp-16\\d\\d$" 116 | }, 117 | { 118 | "toc_entry": "1700 to 1799", 119 | "regex": "^scp-17\\d\\d$" 120 | }, 121 | { 122 | "toc_entry": "1800 to 1899", 123 | "regex": "^scp-18\\d\\d$" 124 | }, 125 | { 126 | "toc_entry": "1900 to 1999", 127 | "regex": "^scp-19\\d\\d$" 128 | } 129 | ] 130 | }, 131 | { 132 | "toc_entry": "SCP Series 3", 133 | "children": [ 134 | { 135 | "toc_entry": "2000 to 2099", 136 | "regex": "^scp-20\\d\\d$" 137 | }, 138 | { 139 | "toc_entry": "2100 to 2199", 140 | "regex": "^scp-21\\d\\d$" 141 | }, 142 | { 143 | "toc_entry": "2200 to 2299", 144 | "regex": "^scp-22\\d\\d$" 145 | }, 146 | { 147 | "toc_entry": "2300 to 2399", 148 | "regex": "^scp-23\\d\\d$" 149 | }, 150 | { 151 | "toc_entry": "2400 to 2499", 152 | "regex": "^scp-24\\d\\d$" 153 | }, 154 | { 155 | "toc_entry": "2500 to 2599", 156 | "regex": "^scp-25\\d\\d$" 157 | }, 158 | { 159 | "toc_entry": "2600 to 2699", 160 | "regex": "^scp-26\\d\\d$" 161 | }, 162 | { 163 | "toc_entry": "2700 to 2799", 164 | "regex": "^scp-27\\d\\d$" 165 | }, 166 | { 167 | "toc_entry": "2800 to 2899", 168 | "regex": "^scp-28\\d\\d$" 169 | }, 170 | { 171 | "toc_entry": "2900 to 2999", 172 | "regex": "^scp-29\\d\\d$" 173 | } 174 | ] 175 | }, 176 | { 177 | "toc_entry": "SCP Series 4", 178 | "children": [ 179 | { 180 | "toc_entry": "3000 to 3099", 181 | "regex": "^scp-30\\d\\d$" 182 | }, 183 | { 184 | 
"toc_entry": "3100 to 3199", 185 | "regex": "^scp-31\\d\\d$" 186 | }, 187 | { 188 | "toc_entry": "3200 to 3299", 189 | "regex": "^scp-32\\d\\d$" 190 | }, 191 | { 192 | "toc_entry": "3300 to 3399", 193 | "regex": "^scp-33\\d\\d$" 194 | }, 195 | { 196 | "toc_entry": "3400 to 3499", 197 | "regex": "^scp-34\\d\\d$" 198 | }, 199 | { 200 | "toc_entry": "3500 to 3599", 201 | "regex": "^scp-35\\d\\d$" 202 | }, 203 | { 204 | "toc_entry": "3600 to 3699", 205 | "regex": "^scp-36\\d\\d$" 206 | }, 207 | { 208 | "toc_entry": "3700 to 3799", 209 | "regex": "^scp-37\\d\\d$" 210 | }, 211 | { 212 | "toc_entry": "3800 to 3899", 213 | "regex": "^scp-38\\d\\d$" 214 | }, 215 | { 216 | "toc_entry": "3900 to 3999", 217 | "regex": "^scp-39\\d\\d$" 218 | } 219 | ] 220 | }, 221 | { 222 | "toc_entry": "SCP Series 5", 223 | "children": [ 224 | { 225 | "toc_entry": "4000 to 4099", 226 | "regex": "^scp-40\\d\\d$" 227 | }, 228 | { 229 | "toc_entry": "4100 to 4199", 230 | "regex": "^scp-41\\d\\d$" 231 | }, 232 | { 233 | "toc_entry": "4200 to 4299", 234 | "regex": "^scp-42\\d\\d$" 235 | }, 236 | { 237 | "toc_entry": "4300 to 4399", 238 | "regex": "^scp-43\\d\\d$" 239 | }, 240 | { 241 | "toc_entry": "4400 to 4499", 242 | "regex": "^scp-44\\d\\d$" 243 | }, 244 | { 245 | "toc_entry": "4500 to 4599", 246 | "regex": "^scp-45\\d\\d$" 247 | }, 248 | { 249 | "toc_entry": "4600 to 4699", 250 | "regex": "^scp-46\\d\\d$" 251 | }, 252 | { 253 | "toc_entry": "4700 to 4799", 254 | "regex": "^scp-47\\d\\d$" 255 | }, 256 | { 257 | "toc_entry": "4800 to 4899", 258 | "regex": "^scp-48\\d\\d$" 259 | }, 260 | { 261 | "toc_entry": "4900 to 4999", 262 | "regex": "^scp-49\\d\\d$" 263 | } 264 | ] 265 | }, 266 | { 267 | "toc_entry": "SCP Series 6", 268 | "children": [ 269 | { 270 | "toc_entry": "5000 to 5099", 271 | "regex": "^scp-50\\d\\d$" 272 | }, 273 | { 274 | "toc_entry": "5100 to 5199", 275 | "regex": "^scp-51\\d\\d$" 276 | }, 277 | { 278 | "toc_entry": "5200 to 5299", 279 | "regex": "^scp-52\\d\\d$" 280 | }, 281 | { 
282 | "toc_entry": "5300 to 5399", 283 | "regex": "^scp-53\\d\\d$" 284 | }, 285 | { 286 | "toc_entry": "5400 to 5499", 287 | "regex": "^scp-54\\d\\d$" 288 | }, 289 | { 290 | "toc_entry": "5500 to 5599", 291 | "regex": "^scp-55\\d\\d$" 292 | }, 293 | { 294 | "toc_entry": "5600 to 5699", 295 | "regex": "^scp-56\\d\\d$" 296 | }, 297 | { 298 | "toc_entry": "5700 to 5799", 299 | "regex": "^scp-57\\d\\d$" 300 | }, 301 | { 302 | "toc_entry": "5800 to 5899", 303 | "regex": "^scp-58\\d\\d$" 304 | }, 305 | { 306 | "toc_entry": "5900 to 5999", 307 | "regex": "^scp-59\\d\\d$" 308 | } 309 | ] 310 | }, 311 | { 312 | "toc_entry": "Other SCPs", 313 | "children": [ 314 | { 315 | "toc_entry": "International SCPs", 316 | "tags_all": [ 317 | "scp", 318 | "international" 319 | ] 320 | }, 321 | { 322 | "toc_entry": "Explained SCPs", 323 | "tags_all": [ 324 | "scp", 325 | "explained" 326 | ] 327 | }, 328 | { 329 | "toc_entry": "Archived SCPs", 330 | "tags_all": [ 331 | "scp", 332 | "archived" 333 | ] 334 | }, 335 | { 336 | "toc_entry": "Joke SCPs", 337 | "tags_all": [ 338 | "scp", 339 | "joke" 340 | ] 341 | }, 342 | { 343 | "toc_entry": "Other SCPs", 344 | "tags_all": [ 345 | "scp" 346 | ] 347 | } 348 | ] 349 | }, 350 | { 351 | "toc_entry": "Tales", 352 | "children": [ 353 | { 354 | "toc_entry": "Tales A", 355 | "regex": "^a", 356 | "tags_all": [ 357 | "tale" 358 | ], 359 | "based_on": "title" 360 | }, 361 | { 362 | "toc_entry": "Tales B", 363 | "regex": "^b", 364 | "tags_all": [ 365 | "tale" 366 | ], 367 | "based_on": "title" 368 | }, 369 | { 370 | "toc_entry": "Tales C", 371 | "regex": "^c", 372 | "tags_all": [ 373 | "tale" 374 | ], 375 | "based_on": "title" 376 | }, 377 | { 378 | "toc_entry": "Tales D", 379 | "regex": "^d", 380 | "tags_all": [ 381 | "tale" 382 | ], 383 | "based_on": "title" 384 | }, 385 | { 386 | "toc_entry": "Tales E", 387 | "regex": "^e", 388 | "tags_all": [ 389 | "tale" 390 | ], 391 | "based_on": "title" 392 | }, 393 | { 394 | "toc_entry": "Tales F", 395 | "regex": 
"^f", 396 | "tags_all": [ 397 | "tale" 398 | ], 399 | "based_on": "title" 400 | }, 401 | { 402 | "toc_entry": "Tales G", 403 | "regex": "^g", 404 | "tags_all": [ 405 | "tale" 406 | ], 407 | "based_on": "title" 408 | }, 409 | { 410 | "toc_entry": "Tales H", 411 | "regex": "^h", 412 | "tags_all": [ 413 | "tale" 414 | ], 415 | "based_on": "title" 416 | }, 417 | { 418 | "toc_entry": "Tales I", 419 | "regex": "^i", 420 | "tags_all": [ 421 | "tale" 422 | ], 423 | "based_on": "title" 424 | }, 425 | { 426 | "toc_entry": "Tales J", 427 | "regex": "^j", 428 | "tags_all": [ 429 | "tale" 430 | ], 431 | "based_on": "title" 432 | }, 433 | { 434 | "toc_entry": "Tales K", 435 | "regex": "^k", 436 | "tags_all": [ 437 | "tale" 438 | ], 439 | "based_on": "title" 440 | }, 441 | { 442 | "toc_entry": "Tales L", 443 | "regex": "^l", 444 | "tags_all": [ 445 | "tale" 446 | ], 447 | "based_on": "title" 448 | }, 449 | { 450 | "toc_entry": "Tales M", 451 | "regex": "^m", 452 | "tags_all": [ 453 | "tale" 454 | ], 455 | "based_on": "title" 456 | }, 457 | { 458 | "toc_entry": "Tales N", 459 | "regex": "^n", 460 | "tags_all": [ 461 | "tale" 462 | ], 463 | "based_on": "title" 464 | }, 465 | { 466 | "toc_entry": "Tales O", 467 | "regex": "^o", 468 | "tags_all": [ 469 | "tale" 470 | ], 471 | "based_on": "title" 472 | }, 473 | { 474 | "toc_entry": "Tales P", 475 | "regex": "^p", 476 | "tags_all": [ 477 | "tale" 478 | ], 479 | "based_on": "title" 480 | }, 481 | { 482 | "toc_entry": "Tales Q", 483 | "regex": "^q", 484 | "tags_all": [ 485 | "tale" 486 | ], 487 | "based_on": "title" 488 | }, 489 | { 490 | "toc_entry": "Tales R", 491 | "regex": "^r", 492 | "tags_all": [ 493 | "tale" 494 | ], 495 | "based_on": "title" 496 | }, 497 | { 498 | "toc_entry": "Tales S", 499 | "regex": "^s", 500 | "tags_all": [ 501 | "tale" 502 | ], 503 | "based_on": "title" 504 | }, 505 | { 506 | "toc_entry": "Tales T", 507 | "regex": "^t", 508 | "tags_all": [ 509 | "tale" 510 | ], 511 | "based_on": "title" 512 | }, 513 | { 514 
| "toc_entry": "Tales U", 515 | "regex": "^u", 516 | "tags_all": [ 517 | "tale" 518 | ], 519 | "based_on": "title" 520 | }, 521 | { 522 | "toc_entry": "Tales V", 523 | "regex": "^v", 524 | "tags_all": [ 525 | "tale" 526 | ], 527 | "based_on": "title" 528 | }, 529 | { 530 | "toc_entry": "Tales W", 531 | "regex": "^w", 532 | "tags_all": [ 533 | "tale" 534 | ], 535 | "based_on": "title" 536 | }, 537 | { 538 | "toc_entry": "Tales X", 539 | "regex": "^x", 540 | "tags_all": [ 541 | "tale" 542 | ], 543 | "based_on": "title" 544 | }, 545 | { 546 | "toc_entry": "Tales Y", 547 | "regex": "^y", 548 | "tags_all": [ 549 | "tale" 550 | ], 551 | "based_on": "title" 552 | }, 553 | { 554 | "toc_entry": "Tales Z", 555 | "regex": "^z", 556 | "tags_all": [ 557 | "tale" 558 | ], 559 | "based_on": "title" 560 | }, 561 | { 562 | "toc_entry": "Other Tales", 563 | "tags_all": [ 564 | "tale" 565 | ], 566 | "based_on": "title" 567 | } 568 | ] 569 | }, 570 | { 571 | "toc_entry": "Index", 572 | "special": "index" 573 | } 574 | ] 575 | } 576 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/test_cache.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | import os 4 | 5 | import download.cache 6 | from constants import constants 7 | 8 | 9 | class TestUseCache(unittest.TestCase): 10 | @unittest.mock.patch('download.utils.normalize_string') 11 | @unittest.mock.patch('download.cache.set_cached_contents') 12 | @unittest.mock.patch('download.cache.get_cached_contents') 13 | def test_use_cache_no_refresh_found_in_cache(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 14 | # Arrange 15 | expected_func = unittest.mock.MagicMock() 16 | expected_relative_path = 'foo/bar' 17 | expected_filetype = 'json' 18 | expected_item = 'Tale Of Three Soldiers' 19 | expected_refresh = False 20 | 21 | expected_normalized_item = 
'tale-of-three-soldiers' 22 | expected_contents = 'contents' 23 | expected_cached_contents = expected_contents 24 | 25 | expected_args = [expected_item] 26 | expected_kwargs = { 27 | 'refresh': expected_refresh 28 | } 29 | 30 | mock_get_cached_contents.return_value = expected_cached_contents 31 | mock_normalize_string.return_value = expected_normalized_item 32 | 33 | # Act 34 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 35 | 36 | # Assert 37 | mock_normalize_string.assert_called_once_with(expected_item) 38 | mock_get_cached_contents.assert_called_once_with(expected_relative_path, expected_normalized_item, expected_filetype) 39 | mock_set_cached_contents.assert_not_called() 40 | expected_func.assert_not_called() 41 | self.assertEqual(expected_contents, actual_contents) 42 | 43 | @unittest.mock.patch('download.utils.normalize_string') 44 | @unittest.mock.patch('download.cache.set_cached_contents') 45 | @unittest.mock.patch('download.cache.get_cached_contents') 46 | def test_use_cache_implicit_no_refresh_found_in_cache(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 47 | # Arrange 48 | expected_func = unittest.mock.MagicMock() 49 | expected_relative_path = 'foo/bar' 50 | expected_filetype = 'json' 51 | expected_item = 'Tale Of Three Soldiers' 52 | 53 | expected_normalized_item = 'tale-of-three-soldiers' 54 | expected_contents = 'contents' 55 | expected_cached_contents = expected_contents 56 | 57 | expected_args = [expected_item] 58 | expected_kwargs = dict() 59 | 60 | mock_get_cached_contents.return_value = expected_cached_contents 61 | mock_normalize_string.return_value = expected_normalized_item 62 | 63 | # Act 64 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 65 | 66 | # Assert 67 | mock_normalize_string.assert_called_once_with(expected_item) 68 | 
mock_get_cached_contents.assert_called_once_with(expected_relative_path, expected_normalized_item, expected_filetype) 69 | mock_set_cached_contents.assert_not_called() 70 | expected_func.assert_not_called() 71 | self.assertEqual(expected_contents, actual_contents) 72 | 73 | @unittest.mock.patch('download.utils.normalize_string') 74 | @unittest.mock.patch('download.cache.set_cached_contents') 75 | @unittest.mock.patch('download.cache.get_cached_contents') 76 | def test_use_cache_no_refresh_not_found_in_cache(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 77 | # Arrange 78 | expected_func = unittest.mock.MagicMock() 79 | expected_relative_path = 'foo/bar' 80 | expected_filetype = 'json' 81 | expected_item = 'Tale Of Three Soldiers' 82 | expected_refresh = False 83 | 84 | expected_normalized_item = 'tale-of-three-soldiers' 85 | expected_contents = 'contents' 86 | expected_cached_contents = None 87 | 88 | expected_args = [expected_item] 89 | expected_kwargs = { 90 | 'refresh': expected_refresh 91 | } 92 | 93 | mock_get_cached_contents.return_value = expected_cached_contents 94 | mock_normalize_string.return_value = expected_normalized_item 95 | expected_func.return_value = expected_contents 96 | 97 | # Act 98 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 99 | 100 | # Assert 101 | mock_normalize_string.assert_called_once_with(expected_item) 102 | mock_get_cached_contents.assert_called_once_with(expected_relative_path, expected_normalized_item, expected_filetype) 103 | mock_set_cached_contents.assert_called_once_with(expected_contents, expected_relative_path, expected_normalized_item, expected_filetype) 104 | expected_func.assert_called_once_with(*expected_args, **expected_kwargs) 105 | self.assertEqual(expected_contents, actual_contents) 106 | 107 | @unittest.mock.patch('download.utils.normalize_string') 108 | 
@unittest.mock.patch('download.cache.set_cached_contents') 109 | @unittest.mock.patch('download.cache.get_cached_contents') 110 | def test_use_cache_refresh(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 111 | # Arrange 112 | expected_func = unittest.mock.MagicMock() 113 | expected_relative_path = 'foo/bar' 114 | expected_filetype = 'json' 115 | expected_item = 'Tale Of Three Soldiers' 116 | expected_refresh = True 117 | 118 | expected_normalized_item = 'tale-of-three-soldiers' 119 | expected_contents = 'contents' 120 | expected_cached_contents = None 121 | 122 | expected_args = [expected_item] 123 | expected_kwargs = { 124 | 'refresh': expected_refresh 125 | } 126 | 127 | mock_get_cached_contents.return_value = expected_cached_contents 128 | mock_normalize_string.return_value = expected_normalized_item 129 | expected_func.return_value = expected_contents 130 | 131 | # Act 132 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 133 | 134 | # Assert 135 | mock_normalize_string.assert_called_once_with(expected_item) 136 | mock_get_cached_contents.assert_not_called() 137 | mock_set_cached_contents.assert_called_once_with(expected_contents, expected_relative_path, expected_normalized_item, expected_filetype) 138 | expected_func.assert_called_once_with(*expected_args, **expected_kwargs) 139 | self.assertEqual(expected_contents, actual_contents) 140 | 141 | 142 | class TestGetCachedContents(unittest.TestCase): 143 | @unittest.mock.patch('json.loads') 144 | @unittest.mock.patch('download.aws.retrieve_from_s3_cache') 145 | @unittest.mock.patch('download.cache.retrieve_from_local_cache') 146 | def test_get_cached_contents_locally(self, mock_retrieve_from_local_cache, mock_retrieve_from_s3_cache, mock_loads): 147 | # Arrange 148 | os.environ.pop(constants.USE_AWS_VARIABLE, None) 149 | expected_filetype = 'html' 150 | expected_relative_path = 'foo/bar/' 151 
| expected_item = 'scp-123' 152 | 153 | # Act 154 | actual_contents = download.cache.get_cached_contents(expected_relative_path, expected_item, expected_filetype) 155 | 156 | # Assert 157 | self.assertEqual(mock_retrieve_from_local_cache.return_value, actual_contents) 158 | mock_loads.assert_not_called() 159 | mock_retrieve_from_s3_cache.assert_not_called() 160 | mock_retrieve_from_local_cache.assert_called_once_with(expected_relative_path, expected_item, expected_filetype) 161 | 162 | @unittest.mock.patch('json.loads') 163 | @unittest.mock.patch('download.aws.retrieve_from_s3_cache') 164 | @unittest.mock.patch('download.cache.retrieve_from_local_cache') 165 | def test_get_cached_contents_s3(self, mock_retrieve_from_local_cache, mock_retrieve_from_s3_cache, mock_loads): 166 | # Arrange 167 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 168 | expected_filetype = 'html' 169 | expected_relative_path = 'foo/bar/' 170 | expected_item = 'scp-123' 171 | 172 | # Act 173 | actual_contents = download.cache.get_cached_contents(expected_relative_path, expected_item, expected_filetype) 174 | 175 | # Assert 176 | self.assertEqual(mock_retrieve_from_s3_cache.return_value, actual_contents) 177 | mock_loads.assert_not_called() 178 | mock_retrieve_from_s3_cache.assert_called_once_with(expected_relative_path, expected_item, expected_filetype) 179 | mock_retrieve_from_local_cache.assert_not_called() 180 | 181 | @unittest.mock.patch('json.loads') 182 | @unittest.mock.patch('download.aws.retrieve_from_s3_cache') 183 | @unittest.mock.patch('download.cache.retrieve_from_local_cache') 184 | def test_get_cached_contents_load_json(self, mock_retrieve_from_local_cache, mock_retrieve_from_s3_cache, mock_loads): 185 | # Arrange 186 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 187 | expected_filetype = 'json' 188 | expected_relative_path = 'foo/bar/' 189 | expected_item = 'scp-123' 190 | expected_contents = mock_loads.return_value 191 | 192 | # Act 193 | 
actual_contents = download.cache.get_cached_contents(expected_relative_path, expected_item, expected_filetype) 194 | 195 | # Assert 196 | self.assertEqual(expected_contents, actual_contents) 197 | mock_loads.assert_called_once_with(mock_retrieve_from_s3_cache.return_value) 198 | mock_retrieve_from_s3_cache.assert_called_once_with(expected_relative_path, expected_item, expected_filetype) 199 | mock_retrieve_from_local_cache.assert_not_called() 200 | 201 | 202 | class TestRetrieveFromLocalCache(unittest.TestCase): 203 | @unittest.mock.patch('builtins.open') 204 | def test_retrieve_from_local_cache(self, mock_open): 205 | # Arrange 206 | expected_relative_path = 'foo/bar' 207 | expected_item = 'scp-123' 208 | expected_filetype = 'json' 209 | expected_cache_file = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path, expected_item + '.' + expected_filetype) 210 | expected_encoding = constants.ENCODING 211 | expected_open_type = 'r' 212 | expected_contents = mock_open.return_value.__enter__.return_value.read.return_value 213 | 214 | # Act 215 | actual_contents = download.cache.retrieve_from_local_cache(expected_relative_path, expected_item, expected_filetype) 216 | 217 | # Assert 218 | self.assertEqual(expected_contents, actual_contents) 219 | mock_open.assert_called_once_with(expected_cache_file, expected_open_type, encoding=expected_encoding) 220 | 221 | @unittest.mock.patch('builtins.open') 222 | def test_retrieve_from_local_cache_file_not_found(self, mock_open): 223 | # Arrange 224 | expected_relative_path = 'foo/bar' 225 | expected_item = 'scp-123' 226 | expected_filetype = 'json' 227 | expected_cache_file = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path, expected_item + '.' 
+ expected_filetype) 228 | expected_encoding = constants.ENCODING 229 | expected_open_type = 'r' 230 | mock_open.return_value.__enter__.side_effect = FileNotFoundError 231 | 232 | expected_contents = None 233 | 234 | # Act 235 | actual_contents = download.cache.retrieve_from_local_cache(expected_relative_path, expected_item, expected_filetype) 236 | 237 | # Assert 238 | self.assertEqual(expected_contents, actual_contents) 239 | mock_open.assert_called_once_with(expected_cache_file, expected_open_type, encoding=expected_encoding) 240 | 241 | 242 | class TestStoreInLocalCache(unittest.TestCase): 243 | @unittest.mock.patch('os.makedirs') 244 | @unittest.mock.patch('builtins.open') 245 | def test_store_in_local_cache(self, mock_open, mock_makedirs): 246 | # Arrange 247 | expected_relative_path = 'foo/bar' 248 | expected_item = 'scp-123' 249 | expected_filetype = 'json' 250 | expected_cache_dir = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path) 251 | expected_cache_file = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path, expected_item + '.' 
+ expected_filetype) 252 | expected_encoding = constants.ENCODING 253 | expected_exist_ok = True 254 | expected_open_type = 'w' 255 | expected_contents = 'contents' 256 | 257 | # Act 258 | actual_contents = download.cache.store_in_local_cache(expected_contents, expected_relative_path, expected_item, expected_filetype) 259 | 260 | # Assert 261 | mock_makedirs.assert_called_once_with(expected_cache_dir, exist_ok=expected_exist_ok) 262 | mock_open.assert_called_once_with(expected_cache_file, expected_open_type, encoding=expected_encoding) 263 | mock_open.return_value.__enter__.return_value.write.assert_called_once_with(expected_contents) 264 | 265 | 266 | class TestSetCachedContents(unittest.TestCase): 267 | @unittest.mock.patch('json.dumps') 268 | @unittest.mock.patch('download.aws.store_in_s3_cache') 269 | @unittest.mock.patch('download.cache.store_in_local_cache') 270 | def test_set_cached_contents_locally(self, mock_store_in_local_cache, mock_store_in_s3_cache, mock_loads): 271 | # Arrange 272 | os.environ.pop(constants.USE_AWS_VARIABLE, None) 273 | expected_filetype = 'html' 274 | expected_relative_path = 'foo/bar/' 275 | expected_item = 'scp-123' 276 | expected_contents = 'contents' 277 | 278 | # Act 279 | download.cache.set_cached_contents(expected_contents, expected_relative_path, expected_item, expected_filetype) 280 | 281 | # Assert 282 | mock_loads.assert_not_called() 283 | mock_store_in_s3_cache.assert_not_called() 284 | mock_store_in_local_cache.assert_called_once_with(expected_contents, expected_relative_path, expected_item, expected_filetype) 285 | 286 | @unittest.mock.patch('json.dumps') 287 | @unittest.mock.patch('download.aws.store_in_s3_cache') 288 | @unittest.mock.patch('download.cache.store_in_local_cache') 289 | def test_set_cached_contents_s3(self, mock_store_in_local_cache, mock_store_in_s3_cache, mock_loads): 290 | # Arrange 291 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 292 | expected_filetype = 'html' 293 | 
expected_relative_path = 'foo/bar/' 294 | expected_item = 'scp-123' 295 | expected_contents = 'contents' 296 | 297 | # Act 298 | download.cache.set_cached_contents(expected_contents, expected_relative_path, expected_item, expected_filetype) 299 | 300 | # Assert 301 | mock_loads.assert_not_called() 302 | mock_store_in_local_cache.assert_not_called() 303 | mock_store_in_s3_cache.assert_called_once_with(expected_contents, expected_relative_path, expected_item, expected_filetype) 304 | 305 | @unittest.mock.patch('json.dumps') 306 | @unittest.mock.patch('download.aws.store_in_s3_cache') 307 | @unittest.mock.patch('download.cache.store_in_local_cache') 308 | def test_set_cached_contents_load_json(self, mock_store_in_local_cache, mock_store_in_s3_cache, mock_loads): 309 | # Arrange 310 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 311 | expected_filetype = 'json' 312 | expected_relative_path = 'foo/bar/' 313 | expected_item = 'scp-123' 314 | expected_contents = {'contents': 'contents'} 315 | 316 | # Act 317 | download.cache.set_cached_contents(expected_contents, expected_relative_path, expected_item, expected_filetype) 318 | 319 | # Assert 320 | mock_loads.assert_called_once_with(expected_contents) 321 | mock_store_in_s3_cache.assert_called_once_with(mock_loads.return_value, expected_relative_path, expected_item, expected_filetype) 322 | mock_store_in_local_cache.assert_not_called() 323 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page_cases/scp-1257.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | SCP Foundation: SCP-1257 8 | 9 | 10 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 42 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 95 | 96 | 101 |
102 |
103 |
104 |
105 | SCP-1257 106 |
107 | 108 |
109 | 110 | 111 |
rating: +198+x
112 |

Item #: SCP-1257

113 |

Object Class: Safe

114 |

Special Containment Procedures: All copies of SCP-1257, encompassing all instances of SCP-1257-1, SCP-1257-2, and SCP-1257-3, are to be kept in the secure media vault at Site-██. Any uncontained copies of SCP-1257 are to be recovered or destroyed by MTF Mu-53 (“Ebert's Thumb”). Because of the nature of the original appearance of SCP-1257, and its widespread exposure to the public, MTF Mu-53 is also tasked to replace any new sources of information about SCP-1257 as they are discovered, in whatever format they may appear, in accordance with Protocol Gamma-1257-A (Codename: “Snopes’ Revenge”)1

115 |

Unauthorized persons exposed to copies of SCP-1257, or who evidence any knowledge about SCP-1257, shall be interrogated by the Foundation Intelligence Department, administered Class A Amnestics, and have implanted post-hypnotic suggestions to reinforce the belief that SCP-1257 is a hoax.2

116 |

Study of SCP-1257 is limited to personnel of Level 3 or higher, subject to approval by the Foundation Intelligence Department.

117 |

Description: SCP-1257 is an American-produced half-hour situation comedy originally titled Raising Danny that aired on the ███ television network for six episodes in 197█. Instances of SCP-1257-1 are the original production reels for all twelve episodes filmed, recovered from the ███ archives in 198█. Instances of SCP-1257-2 are the draft and shooting scripts and copies, including four episodes that were never shot. Instances of SCP-1257-3 are all other video recordings of the six episodes actually aired.

118 |

The anomalous properties of SCP-1257 manifest in any and all video copies produced from the original series, and in any copies of the scripts for those episodes. Every year, beginning in mid-September, video recordings and scripts for Raising Danny will change to reflect a new season of episodes. Replacements will begin with episode one, and progress sequentially through each episode in order during each subsequent week. While the Foundation has access to the first sixteen episodes of each season,3 it appears that each SCP-1257 season runs approximately 24 episodes. Additionally, while new seasons of SCP-1257 occasionally produce hour-long "specials,” copies are always limited to the first half-hour running time of the original episodes of SCP-1257.

119 |

Video copies of the original over-the-air broadcast of the first six episodes of SCP-1257 present a special case. Commercials recorded contemporaneously with SCP-1257 will also show changes consistent with the content of SCP-1257, and updated videos have occasionally shown news bulletins and weather alerts that imply multiple points of divergence between the world that continues to produce SCP-1257 and our own.

120 |

SCP-1257’s original premise had a black man, named Tyler (played by Whitman Mayo), married to a white woman who already had a son by a prior marriage. When the woman dies, prior to the pilot episode, the man is left raising her son, named Danny (played by Danny Bonaduce), as his own. Reviews of the original series recovered by the Foundation were universally unfavorable and referred to it as “The unwanted bastard child of Sanford and Son and The Courtship of Eddie's Father.” Because of the anomalous properties of SCP-1257, the Foundation has only been able to reconstruct a general outline of the original content of the series.

121 |

Addendum 1: Notes on selected episodes of SCP-1257 observed in Foundation custody.

122 |
123 | 124 | 144 |
145 | 154 | 159 | 160 |
161 | 162 |
163 | page revision: 16, last edited: 25 May 2018 20:16 164 |
165 | 166 |
167 |
168 | 169 | 170 | 171 | 172 |
173 | 174 |
175 | Unless stated otherwise Content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License 176 |
177 | 178 |
179 | 180 | 181 | 182 | 201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /scp_epub/test_unit/process/test_process_page.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | 8 | import process.process_page 9 | from constants import constants 10 | 11 | 12 | class TestProcessPage(unittest.TestCase): 13 | def setUp(self): 14 | self.maxDiff = None 15 | 16 | @unittest.mock.patch('process.process_page.process_page_html') 17 | def test_process_page(self, mock_process_page_html): 18 | # Arrange 19 | expected_url_allow_list = None 20 | 21 | expected_fullname = "personal-log-of-iceberg" 22 | expected_title = "Not Title Shown: Personal Log of █████ \"Iceberg\" ████" 23 | expected_title_shown = "Title Shown: Personal Log of █████ \"Iceberg\" ████" 24 | expected_created_at = "2008-10-16T21:06:01+00:00" 25 | expected_created_by = "unknown" 26 | expected_tags = [ 27 | "doctor-kondraki", 28 | "doctor-iceberg", 29 | "doctor-gears", 30 | "tale" 31 | ] 32 | expected_web_html = "blablabla" 33 | expected_substitute_html = None 34 | expected_processed_html = "
processed html
" 35 | 36 | expected_processed_title = expected_title_shown 37 | 38 | mock_process_page_html.return_value = expected_processed_html 39 | 40 | expected_page = { 41 | "fullname": expected_fullname, 42 | "created_at": expected_created_at, 43 | "created_by": expected_created_by, 44 | "updated_at": "2019-09-15T01:08:04+00:00", 45 | "updated_by": "Elogee FishTruck", 46 | "title": expected_title, 47 | "title_shown": expected_title_shown, 48 | "parent_fullname": None, 49 | "tags": expected_tags, 50 | "rating": 38, 51 | "revisions": 36, 52 | "parent_title": None, 53 | "content": "", 54 | "children": 0, 55 | "comments": 5, 56 | "commented_at": "2015-09-16T18:15:32+00:00", 57 | "commented_by": "Decibelles", 58 | "scp_epub_additional_data": { 59 | "web_html": expected_web_html 60 | } 61 | } 62 | 63 | expected_processed_page = { 64 | "name": expected_fullname, 65 | "title": expected_processed_title, 66 | "created_by": expected_created_by, 67 | "created_at": expected_created_at, 68 | "tags": expected_tags, 69 | "html": expected_processed_html, 70 | } 71 | 72 | # Act 73 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 74 | 75 | # Assert 76 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 77 | self.assertEqual(expected_processed_page, actual_processed_page) 78 | 79 | @unittest.mock.patch('process.process_page.process_page_html') 80 | def test_process_page_no_title_shown(self, mock_process_page_html): 81 | # Arrange 82 | expected_url_allow_list = None 83 | 84 | expected_fullname = "personal-log-of-iceberg" 85 | expected_title = "Not Title Shown: Personal Log of █████ \"Iceberg\" ████" 86 | expected_title_shown = None 87 | expected_created_at = "2008-10-16T21:06:01+00:00" 88 | expected_created_by = "unknown" 89 | expected_tags = [ 90 | "doctor-kondraki", 91 | "doctor-iceberg", 92 | "doctor-gears", 93 | "tale" 94 | ] 95 | 
expected_web_html = "blablabla" 96 | expected_substitute_html = None 97 | expected_processed_html = "
processed html
" 98 | 99 | expected_processed_title = expected_title 100 | 101 | mock_process_page_html.return_value = expected_processed_html 102 | 103 | expected_page = { 104 | "fullname": expected_fullname, 105 | "created_at": expected_created_at, 106 | "created_by": expected_created_by, 107 | "updated_at": "2019-09-15T01:08:04+00:00", 108 | "updated_by": "Elogee FishTruck", 109 | "title": expected_title, 110 | "title_shown": expected_title_shown, 111 | "parent_fullname": None, 112 | "tags": expected_tags, 113 | "rating": 38, 114 | "revisions": 36, 115 | "parent_title": None, 116 | "content": "", 117 | "children": 0, 118 | "comments": 5, 119 | "commented_at": "2015-09-16T18:15:32+00:00", 120 | "commented_by": "Decibelles", 121 | "scp_epub_additional_data": { 122 | "web_html": expected_web_html 123 | } 124 | } 125 | 126 | expected_processed_page = { 127 | "name": expected_fullname, 128 | "title": expected_processed_title, 129 | "created_by": expected_created_by, 130 | "created_at": expected_created_at, 131 | "tags": expected_tags, 132 | "html": expected_processed_html, 133 | } 134 | 135 | # Act 136 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 137 | 138 | # Assert 139 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 140 | self.assertEqual(expected_processed_page, actual_processed_page) 141 | 142 | @unittest.mock.patch('process.process_page.process_page_html') 143 | def test_process_page_missing_fields(self, mock_process_page_html): 144 | # Arrange 145 | expected_url_allow_list = None 146 | 147 | expected_fullname = "personal-log-of-iceberg" 148 | expected_title = None 149 | expected_title_shown = None 150 | expected_created_at = None 151 | expected_created_by = None 152 | expected_tags = None 153 | expected_web_html = "blablabla" 154 | expected_substitute_html = None 155 | expected_processed_html = "
processed html
" 156 | 157 | expected_processed_title = constants.EMPTY_TITLE 158 | 159 | mock_process_page_html.return_value = expected_processed_html 160 | 161 | expected_page = { 162 | "fullname": expected_fullname, 163 | "created_at": expected_created_at, 164 | "created_by": expected_created_by, 165 | "updated_at": "2019-09-15T01:08:04+00:00", 166 | "updated_by": "Elogee FishTruck", 167 | "title": expected_title, 168 | "title_shown": expected_title_shown, 169 | "parent_fullname": None, 170 | "tags": expected_tags, 171 | "rating": 38, 172 | "revisions": 36, 173 | "parent_title": None, 174 | "content": "", 175 | "children": 0, 176 | "comments": 5, 177 | "commented_at": "2015-09-16T18:15:32+00:00", 178 | "commented_by": "Decibelles", 179 | "scp_epub_additional_data": { 180 | "web_html": expected_web_html 181 | } 182 | } 183 | 184 | expected_processed_page = { 185 | "name": expected_fullname, 186 | "title": expected_processed_title, 187 | "created_by": constants.EMPTY_AUTHOR, 188 | "created_at": constants.EMPTY_TIMESTAMP, 189 | "tags": [], 190 | "html": expected_processed_html, 191 | } 192 | 193 | # Act 194 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 195 | 196 | # Assert 197 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 198 | self.assertEqual(expected_processed_page, actual_processed_page) 199 | 200 | @unittest.mock.patch('process.process_page.process_page_html') 201 | def test_process_page_url_allow_list(self, mock_process_page_html): 202 | # Arrange 203 | expected_url_allow_list = ['a', 'b'] 204 | 205 | expected_fullname = "personal-log-of-iceberg" 206 | expected_title = "Not Title Shown: Personal Log of █████ \"Iceberg\" ████" 207 | expected_title_shown = "Title Shown: Personal Log of █████ \"Iceberg\" ████" 208 | expected_created_at = "2008-10-16T21:06:01+00:00" 209 | expected_created_by = "unknown" 210 | expected_tags 
= [ 211 | "doctor-kondraki", 212 | "doctor-iceberg", 213 | "doctor-gears", 214 | "tale" 215 | ] 216 | expected_web_html = "blablabla" 217 | expected_substitute_html = None 218 | expected_processed_html = "
processed html
" 219 | 220 | expected_processed_title = expected_title_shown 221 | 222 | mock_process_page_html.return_value = expected_processed_html 223 | 224 | expected_page = { 225 | "fullname": expected_fullname, 226 | "created_at": expected_created_at, 227 | "created_by": expected_created_by, 228 | "updated_at": "2019-09-15T01:08:04+00:00", 229 | "updated_by": "Elogee FishTruck", 230 | "title": expected_title, 231 | "title_shown": expected_title_shown, 232 | "parent_fullname": None, 233 | "tags": expected_tags, 234 | "rating": 38, 235 | "revisions": 36, 236 | "parent_title": None, 237 | "content": "", 238 | "children": 0, 239 | "comments": 5, 240 | "commented_at": "2015-09-16T18:15:32+00:00", 241 | "commented_by": "Decibelles", 242 | "scp_epub_additional_data": { 243 | "web_html": expected_web_html 244 | } 245 | } 246 | 247 | expected_processed_page = { 248 | "name": expected_fullname, 249 | "title": expected_processed_title, 250 | "created_by": expected_created_by, 251 | "created_at": expected_created_at, 252 | "tags": expected_tags, 253 | "html": expected_processed_html, 254 | } 255 | 256 | # Act 257 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 258 | 259 | # Assert 260 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 261 | self.assertEqual(expected_processed_page, actual_processed_page) 262 | 263 | 264 | class TestGetPageContent(unittest.TestCase): 265 | def setUp(self): 266 | self.maxDiff = None 267 | 268 | def create_soup(self, html): 269 | return bs4.BeautifulSoup(html, "html.parser") 270 | 271 | @parameterized.expand([ 272 | [ 273 | 'simple page content', 274 | 'outside
inside
', 275 | '
inside
' 276 | ], 277 | [ 278 | 'not found', 279 | 'outside', 280 | 'None' 281 | ], 282 | ]) 283 | def test_get_page_content(self, reason, expected_html_string, expected_output_string): 284 | # Arrange 285 | expected_page_content_id = 'page-content' 286 | 287 | # Act 288 | actual_output = process.process_page.get_page_content(expected_html_string, page_content_id=expected_page_content_id) 289 | 290 | # Assert 291 | self.assertEqual(expected_output_string, str(actual_output)) 292 | 293 | 294 | class TestProcessContentFunctions(unittest.TestCase): 295 | def setUp(self): 296 | self.maxDiff = None 297 | 298 | def create_soup(self, html): 299 | return bs4.BeautifulSoup(html, "html.parser") 300 | 301 | @parameterized.expand([ 302 | [ 303 | 'nothing to remove', 304 | '
asdf
', 305 | '
asdf
' 306 | ], 307 | [ 308 | 'complete removal', 309 | '
asdf
', 310 | '' 311 | ], 312 | [ 313 | 'nested', 314 | 'outside
qwq
asdf
qwrq
outside', 315 | 'outsideoutside' 316 | ], 317 | [ 318 | 'reverse nested', 319 | 'outside
qwq
asdf
qwrq
outside', 320 | 'outsideoutside' 321 | ], 322 | ]) 323 | def test_remove_by_class(self, reason, expected_html_string, expected_output_string): 324 | # Arrange 325 | expected_classses_to_remove = [ 326 | 'foo', 327 | 'bar' 328 | ] 329 | 330 | expected_content = self.create_soup(expected_html_string) 331 | expected_output = None 332 | 333 | # Act 334 | actual_output = process.process_page.remove_classes(expected_content, classes_to_remove=expected_classses_to_remove) 335 | 336 | # Assert 337 | self.assertEqual(expected_output_string, str(expected_content)) 338 | self.assertEqual(expected_output, actual_output) 339 | 340 | @parameterized.expand([ 341 | [ 342 | 'nothing to remove', 343 | 'asdf', 344 | 'asdf' 345 | ], 346 | [ 347 | 'complete removal', 348 | '', 349 | '' 350 | ], 351 | [ 352 | 'simple removal', 353 | 'outsideoutside', 354 | 'outsideoutside' 355 | ], 356 | [ 357 | 'singletag', 358 | 'outsideoutside', 359 | 'outsideoutside' 360 | ], 361 | ]) 362 | def test_remove_by_tags(self, reason, expected_html_string, expected_output_string): 363 | # Arrange 364 | expected_tags_to_remove = [ 365 | 'img' 366 | ] 367 | 368 | expected_content = self.create_soup(expected_html_string) 369 | expected_output = None 370 | 371 | # Act 372 | actual_output = process.process_page.remove_tags(expected_content, tags_to_remove=expected_tags_to_remove) 373 | 374 | # Assert 375 | self.assertEqual(expected_output_string, str(expected_content)) 376 | self.assertEqual(expected_output, actual_output) 377 | 378 | @parameterized.expand([ 379 | [ 380 | 'scp-047', 381 | '''outside
''', 382 | '''outside

> Show details

  • Pathogenicity: Severe skin colonisation around sebaceous glands. Modification of skin pH to levels that become toxic to skin cells. Massive inflammation and immune cell infiltration. Eventual breakdown of skin structure leading to sepsis.
  • Transmission: Transmitted by skin-to-skin contact. Can remain active on inorganic surfaces for up to five hours.
  • Lethality: Approximately 40% mortality rate. Runs its course in 2-6 weeks. Very visible symptoms within 5-10 hours; contagious within 2-5 hours.
  • Handling: As soon as visible symptoms form, victims must be quarantined. Deceased victims should be incinerated.
''' 383 | ], 384 | [ 385 | 'multiple_items_spurious_newline', 386 | '''
''', 387 | '''

+ Document S-1257-11

\n
  • Season 3, Episode 3, “Tyler’s Date”: Episode manifests three months after initial containment. One line in the script implies the assassination of American President Jimmy Carter in late 1978 or early 1979.
  • Season 4, Episode 1, “Bad Touch pt. 2”: Second half of a “special episode” ending season three. Eric, a young classmate of Danny’s, is the target of a sexual predator using what appears to be anomalous items manufactured by Doctor Wondertainment to lure his victims.
  • Season 5, Episode 10, “The Senior Trip”: Episode mentions a scandal where 10 members of the UK House of Commons had been publicly revealed as members of a cult that bears a strong resemblance to the Church of the Broken God.
  • Season 6, Episode 1, “The Freshmen”: The title of SCP-1257 is changed to Danny. The premise of the series changes as well, dropping the Tyler character and sending Danny to college in New York City with five of his classmates from High School.4 The University Lab appears to have specimens of SCP-███, SCP-███ and SCP-███.
  • Season 6, Episode 11, “The ████████”: Plot of the episode concerns Eric’s suspicions that one of their dormmates might be secretly one of the “████████.” This turns out to be a misunderstanding. From context, the “████████” appear to prey on young women and have become endemic in [REDACTED] and seem to be the result of a containment breach of [REDACTED] in Mexico City.
  • Season 7, Episode 2, “Eric’s Midterm Caper”: When this episode manifested in SCP-1257-3-12, a new advertisement appeared during the second break for Marshall, Carter, and Dark Ltd. The ad promoted [REDACTED] services for [REDACTED].
  • Season 10, Episode 1, “The Job Hunt”: Hour-long “special” introducing another change in premise.5 One scene implies that the Global Occult Coalition has become public enough to run “want ads” in the local newspaper.
  • Season 10, Episode 2, “The New Guy”: The show’s title is officially changed to Agent Danny of the SCP.6 Danny has been employed as Level 1 security at Site-19, and through a series of mishaps, prevents a containment breach of SCP-173.
  • Season 10, Episode 5, “D-Class Act”: Danny mis-hears a co-worker’s conversation and becomes convinced he has been mistakenly reassigned to D-Class by the HR Department.
  • Season 10, Episode 11, “Leaping Lizards”: [REDACTED] SCP-682 [REDACTED].

Note: Details of SCP-1257 episodes past Season 10 are only available with the approval of the Intelligence Department.

''' 388 | 389 | ] 390 | ]) 391 | def test_unwrap_collapsible_blocks(self, reason, expected_html_string, expected_output_string): 392 | # Arrange 393 | expected_content = self.create_soup(expected_html_string) 394 | expected_output = None 395 | 396 | # Act 397 | actual_output = process.process_page.unwrap_collapsible_blocks(expected_content) 398 | 399 | # Assert 400 | self.assertEqual(expected_output_string, str(expected_content)) 401 | self.assertEqual(expected_output, actual_output) 402 | 403 | @parameterized.expand([ 404 | [ 405 | 'simple', 406 | '''outside

I love peace. I'd kill to preserve it

''', 407 | '''outside

I love peace. I'd kill to preserve it

''' 408 | ], 409 | ]) 410 | def test_divify_blockquotes(self, reason, expected_html_string, expected_output_string): 411 | # Arrange 412 | expected_content = self.create_soup(expected_html_string) 413 | expected_output = None 414 | 415 | # Act 416 | actual_output = process.process_page.divify_blockquotes(expected_content) 417 | 418 | # Assert 419 | self.assertEqual(expected_output_string, str(expected_content)) 420 | self.assertEqual(expected_output, actual_output) 421 | 422 | @parameterized.expand([ 423 | [ 424 | 'two with nested div', 425 | '''

A specimen.

Effect 1509-1 typically.

''', 426 | '''

Effect 1509-1

A specimen.

Effect 1509-1 typically.

Effect 1509-2

Effect SCP-1509-2 occurs.

''' 427 | ], 428 | ]) 429 | def test_unwrap_navset(self, reason, expected_html_string, expected_output_string): 430 | # Arrange 431 | expected_content = self.create_soup(expected_html_string) 432 | expected_output = None 433 | 434 | # Act 435 | actual_output = process.process_page.unwrap_yui_navset(expected_content) 436 | 437 | # Assert 438 | self.assertEqual(expected_output_string, str(expected_content)) 439 | self.assertEqual(expected_output, actual_output) 440 | 441 | @parameterized.expand([ 442 | [ 443 | 'no links', 444 | '''asdf''', 445 | '''asdf''' 446 | ], 447 | [ 448 | 'non-href anchors', 449 | '''asdfasdfasdfasdf''', 450 | '''asdfasdfasdfasdf''' 451 | ], 452 | [ 453 | 'expanded internal link', 454 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 455 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 456 | ], 457 | [ 458 | 'other internal link', 459 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 460 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 461 | ], 462 | [ 463 | 'implicit internal link', 464 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 465 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 466 | ], 467 | [ 468 | 'external link', 469 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 470 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 471 | ], 472 | [ 473 | 'multiple links', 474 | '''

This is by Autonomic (AARS821) RAISA. AAR

asdf

This is by Autonomic (AARS821) RAISA. AAR

This is by Autonomic (AARS821) RAISA. AAR

''', 475 | '''

This is by Autonomic (AARS821) RAISA. AAR

asdf

This is by Autonomic (AARS821) RAISA. AAR

This is by Autonomic (AARS821) RAISA. AAR

''' 476 | ], 477 | [ 478 | 'not in book', 479 | '''asdf''', 480 | '''asdf''' 481 | ], 482 | [ 483 | 'not in book, implicit', 484 | '''asdf''', 485 | '''asdf''' 486 | ], 487 | [ 488 | 'ignore footnote links', 489 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''', 490 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''' 491 | ] 492 | ]) 493 | def test_fix_links(self, reason, expected_html_string, expected_output_string): 494 | # Arrange 495 | expected_url_allow_list = ['scp-3281', 'scp-1234'] 496 | 497 | expected_content = self.create_soup(expected_html_string) 498 | expected_output = None 499 | 500 | # Act 501 | actual_output = process.process_page.fix_links(expected_content, url_allow_list=expected_url_allow_list) 502 | 503 | # Assert 504 | self.assertEqual(expected_output_string, str(expected_content)) 505 | self.assertEqual(expected_output, actual_output) 506 | 507 | @parameterized.expand([ 508 | [ 509 | 'no links', 510 | '''asdf''', 511 | '''asdf''' 512 | ], 513 | [ 514 | 'non-href anchors', 515 | '''asdfasdfasdfasdf''', 516 | '''asdfasdfasdfasdf''' 517 | ], 518 | [ 519 | 'not in book, implicit', 520 | '''asdf''', 521 | '''asdf''' 522 | ], 523 | ]) 524 | def test_fix_links_no_whitelist(self, reason, expected_html_string, expected_output_string): 525 | # Arrange 526 | expected_url_allow_list = None 527 | 528 | expected_content = self.create_soup(expected_html_string) 529 | expected_output = None 530 | 531 | # Act 532 | actual_output = process.process_page.fix_links(expected_content, url_allow_list=expected_url_allow_list) 533 | 534 | # Assert 535 | self.assertEqual(expected_output_string, str(expected_content)) 536 | self.assertEqual(expected_output, actual_output) 537 | 538 | @parameterized.expand([ 539 | [ 540 | 'simple add title', 541 | '''asdf''', 542 | '''

Hi there!

asdf''' 543 | ], 544 | [ 545 | 'some other tags', 546 | '''
asdf
''', 547 | '''

Hi there!

asdf
''' 548 | ] 549 | ]) 550 | def test_add_title(self, reason, expected_html_string, expected_output_string): 551 | # Arrange 552 | expected_title = 'Hi there!' 553 | 554 | expected_content = self.create_soup(expected_html_string) 555 | expected_output = None 556 | 557 | # Act 558 | actual_output = process.process_page.add_title(expected_content, expected_title) 559 | 560 | # Assert 561 | self.assertEqual(expected_output_string, str(expected_content)) 562 | self.assertEqual(expected_output, actual_output) 563 | 564 | @parameterized.expand([ 565 | [ 566 | 'just the noteref', 567 | '''1''', 568 | '''1''' 569 | ], 570 | [ 571 | 'just the footnote', 572 | '''''', 573 | '''''' 574 | ], 575 | [ 576 | 'noterefs and footnotes', 577 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''', 578 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''' 579 | ], 580 | ]) 581 | def test_fix_footnotes(self, reason, expected_html_string, expected_output_string): 582 | # Arrange 583 | expected_content = self.create_soup(expected_html_string) 584 | expected_output = None 585 | 586 | # Act 587 | actual_output = process.process_page.fix_footnotes(expected_content) 588 | 589 | # Assert 590 | self.assertEqual(expected_output_string, str(expected_content)) 591 | self.assertEqual(expected_output, actual_output) 592 | 593 | 594 | class TestHelpers(unittest.TestCase): 595 | def test_get_filename_from_name(self): 596 | # Arrange 597 | expected_name = 'scp-1234' 598 | expected_filename = 'scp-1234.xhtml' 599 | 600 | # Act 601 | actual_filename = process.process_page.get_filename(expected_name) 602 | 603 | # Assert 604 | self.assertEqual(expected_filename, actual_filename) 605 | --------------------------------------------------------------------------------