├── scp_epub ├── __init__.py ├── __main__.py ├── constants │ ├── __init__.py │ └── constants.py ├── download │ ├── __init__.py │ ├── aws.py │ ├── utils.py │ ├── get_wiki.py │ ├── wikidot_api.py │ └── cache.py ├── exceptions │ ├── __init__.py │ └── exceptions.py ├── test_unit │ ├── __init__.py │ ├── process │ │ ├── __init__.py │ │ ├── test_assemble.py │ │ └── test_process_page.py │ ├── download │ │ ├── __init__.py │ │ ├── test_get_wiki.py │ │ ├── wikidot_api.py │ │ ├── test_utils.py │ │ ├── test_wikidot_api.py │ │ └── test_cache.py │ └── _samples │ │ ├── 1509_yui-navset_output.html │ │ ├── 1509_yui-navset.html │ │ ├── scp_1-800-j_footnote_output.html │ │ ├── scp_1-800-j_footnote.html │ │ ├── scp_055.html │ │ ├── scp_055_pyscp.xhtml │ │ └── scp_055.json ├── test_component │ ├── __init__.py │ └── process │ │ ├── __init__.py │ │ ├── test_process_page_cases │ │ ├── _LICENSES.txt │ │ ├── scp-1257_converted.html │ │ └── scp-1257.html │ │ └── test_process_page.py ├── test_platform │ ├── __init__.py │ ├── download │ │ ├── __init__.py │ │ └── test_get_complete_page.py │ └── process │ │ ├── __init__.py │ │ └── test_process_all_pages.py └── process │ ├── assemble.py │ └── process_page.py ├── docs ├── references.md ├── book_definition.md ├── tests.md ├── configuration.md ├── constants.md └── how_it_works.md ├── requirements.txt ├── edge_cases └── _LICENSES.txt ├── README.md ├── LICENSE ├── .gitignore ├── progress.txt └── definitions └── complete_collection.json /scp_epub/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/__main__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/constants/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /scp_epub/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/exceptions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_component/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_platform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_unit/process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_platform/download/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_platform/process/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/references.md: -------------------------------------------------------------------------------- 1 | # References 2 | 3 | Useful documentation to better understand this project. 4 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ratelimit 2 | lxml 3 | ebooklib 4 | parameterized 5 | requests 6 | bs4 7 | boto3 8 | -------------------------------------------------------------------------------- /scp_epub/exceptions/exceptions.py: -------------------------------------------------------------------------------- 1 | class SCPEpubError(Exception): 2 | pass 3 | 4 | 5 | class SCPDownloadError(SCPEpubError): 6 | pass 7 | -------------------------------------------------------------------------------- /docs/book_definition.md: -------------------------------------------------------------------------------- 1 | # Book Definition 2 | 3 | This is the file that defines the entire SCP ebook that is created: its structure, contents etc. 4 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page_cases/_LICENSES.txt: -------------------------------------------------------------------------------- 1 | SCP-1257 (http://www.scp-wiki.net/scp-1257): CC-BY-SA-3.0 by http://www.wikidot.com/user:info/sandrewswann 2 | -------------------------------------------------------------------------------- /edge_cases/_LICENSES.txt: -------------------------------------------------------------------------------- 1 | The files in this directory are governed by a separate license than the main project. 
2 | 3 | SCP-3125: CC-BY-SA-3.0 http://www.scp-wiki.net/scp-3125 by qntm (http://www.scp-wiki.net/qntm-s-author-page) 4 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/1509_yui-navset_output.html: -------------------------------------------------------------------------------- 1 |

Effect 1509-1

A specimen.

Effect 1509-1 typically.

Effect 1509-2

Effect SCP-1509-2 occurs.

2 | -------------------------------------------------------------------------------- /scp_epub/download/aws.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | 3 | 4 | def get_api_key_from_secretsmanager(): 5 | raise NotImplementedError 6 | 7 | 8 | def retrieve_from_s3_cache(relative_path, item, filetype): 9 | raise NotImplementedError 10 | 11 | 12 | def store_in_s3_cache(contents, relative_path, item, filetype): 13 | raise NotImplementedError 14 | -------------------------------------------------------------------------------- /scp_epub/download/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def filter_tags(pages, include_tags=None): 5 | if include_tags is not None: 6 | pages = [ 7 | page for page in pages 8 | if 'tags' in page and any( 9 | included_tag in page['tags'] for included_tag in include_tags 10 | ) 11 | ] 12 | 13 | return pages 14 | 15 | 16 | def normalize_string(raw_string): 17 | return re.sub('[^a-z0-9\\-]', '_', raw_string) 18 | -------------------------------------------------------------------------------- /docs/tests.md: -------------------------------------------------------------------------------- 1 | # Running tests 2 | 3 | To run tests: 4 | 5 | * Run tests from the `scp_epub` module directory, not from the root of the repository: `cd scp_epub` 6 | * Unit tests: `python3 -m unittest discover -s test_unit -t .` 7 | * Component tests: `python3 -m unittest discover -s test_component -t .` 8 | * Platform tests: CAUTION! These tests actually do stuff such as download pages! Recommended to run one at a time, as some may take several hours: `python3 -m unittest test_platform/path/to/test_file.py` 9 | * Note: some platform tests may prompt you for a Wikidot API key. 
10 | -------------------------------------------------------------------------------- /scp_epub/process/assemble.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import re 3 | 4 | from constants import constants 5 | import process.process_page 6 | 7 | 8 | def process_all_pages(pages): 9 | page_names = [ 10 | page[constants.PAGE_PATH_KEY] 11 | for page in pages 12 | ] 13 | 14 | results = [] 15 | failures = [] 16 | for page in pages: 17 | try: 18 | results.append(process.process_page.process_page(page, url_allow_list=page_names)) 19 | except Exception as exception: 20 | failures.append(exception) 21 | 22 | return results, failures 23 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/1509_yui-navset.html: -------------------------------------------------------------------------------- 1 |
2 | 6 |
7 |
8 |
9 |

A specimen.

10 |
11 |

Effect 1509-1 typically.

12 |
13 | 16 |
17 |
18 | -------------------------------------------------------------------------------- /scp_epub/download/get_wiki.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import download.wikidot_api 5 | from constants import constants 6 | 7 | 8 | def get_scp_wiki(book_definition, refresh=False): 9 | raise NotImplementedError 10 | 11 | 12 | def filter_pages(book_definition): 13 | raise NotImplementedError 14 | 15 | 16 | def get_all_page_metadata(page_name, refresh=False): 17 | raise NotImplementedError 18 | 19 | 20 | def enrich_all_page_metadata_with_contents(page_metadata, refresh=False): 21 | raise NotImplementedError 22 | 23 | 24 | def get_edge_case(page_name): 25 | json_file = os.path.join(constants.EDGE_CASES_DIR, page_name + '.' + constants.EDGE_CASES_FILETYPE) 26 | with open(json_file, 'r', encoding=constants.ENCODING) as edge_case: 27 | page = json.load(edge_case) 28 | 29 | return page 30 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SCP epub 2 | 3 | Creates an epub from the scp wiki 4 | 5 | **This is a work in progress with no current ETA.** 6 | 7 | You can track the progress in progress.txt 8 | 9 | ## Running the tool 10 | 11 | ### Prerequisites 12 | 13 | You can run the ebook builder locally in Linux, Windows (using WSL), or Mac. 
14 | 15 | Resource requirements: 16 | 17 | * At least 2 GB of available memory 18 | * At least 2 GB of available storage 19 | 20 | You need the following installed: 21 | 22 | * Python 3 and pip3 23 | * All the python modules in requirements.txt: `pip3 install -r requirements.txt` 24 | 25 | You need the following environment variables: 26 | 27 | * `SCP_EPUB_USE_AWS`: this environment variable must be unset: `unset SCP_EPUB_USE_AWS` 28 | * `SCP_EPUB_WIKIDOT_API_KEY`: your read-only Wikidot API Key: `export SCP_EPUB_WIKIDOT_API_KEY=000000000000000000000000000` 29 | -------------------------------------------------------------------------------- /docs/configuration.md: -------------------------------------------------------------------------------- 1 | # Configuration 2 | 3 | The tool loads configuration information in three ways: 4 | 5 | * If the tool is deployed on AWS, the tool will load infrastructure information (bucket names, locations of secrets in AWSSM etc.) from environment variables that are defined in [constants.py](/scp_epub/constants.py) 6 | * The tool also reads configuration directly from [constants.py](/scp_epub/constants.py). This is not meant to be changed by the end user. 7 | * Any configuration around building a book is contained in a book definition file in the [definitions directory](/definitions). This is documented in [book_definition.md](./book_definition.md) and is meant to be edited by the end user. 8 | 9 | ## Constants file 10 | 11 | All specifications regarding the format of the SCP wiki, caching settings, how to process the page contents etc. are defined in [the constants file](/scp_epub/constants.py). 12 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_1-800-j_footnote_output.html: -------------------------------------------------------------------------------- 1 |

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 elfakyn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_1-800-j_footnote.html: -------------------------------------------------------------------------------- 1 |

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

2 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/test_get_wiki.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import json 3 | import os 4 | import unittest 5 | import unittest.mock 6 | 7 | import download.get_wiki 8 | from constants import constants 9 | 10 | 11 | class TestGetEdgeCase(unittest.TestCase): 12 | @unittest.mock.patch('builtins.open') 13 | def test_retrieve_from_local_cache(self, mock_open): 14 | # Arrange 15 | expected_relative_path = constants.EDGE_CASES_DIR 16 | expected_item = 'scp-1234' 17 | expected_filetype = constants.EDGE_CASES_FILETYPE 18 | expected_file = os.path.join(expected_relative_path, expected_item + '.' + constants.EDGE_CASES_FILETYPE) 19 | expected_encoding = constants.ENCODING 20 | expected_open_type = 'r' 21 | expected_contents = {'a': 'b'} 22 | expected_encoded_contents = json.dumps(expected_contents) 23 | mock_open.return_value.__enter__.return_value.read.return_value = expected_encoded_contents 24 | 25 | # Act 26 | actual_contents = download.get_wiki.get_edge_case(expected_item) 27 | 28 | # Assert 29 | self.assertEqual(expected_contents, actual_contents) 30 | mock_open.assert_called_once_with(expected_file, expected_open_type, encoding=expected_encoding) 31 | -------------------------------------------------------------------------------- /docs/constants.md: -------------------------------------------------------------------------------- 1 | # Constants 2 | 3 | This file documents constants in [constants.py](scp_epub/constants/constants.py) and what they do. 4 | 5 | Almost every string literal and magic number in the entire program is extracted in this file. 6 | 7 | ## Charset 8 | 9 | Character set-related values. 10 | 11 | ## AWS Execution 12 | 13 | This tool may be expanded in the future to run on AWS automatically. This sets some groundwork for that. 
This is currently not implemented, so enabling AWS use will not work. This may be removed in the future. 14 | 15 | Constant | Explanation 16 | --- | --- 17 | `USE_AWS_VARIABLE` | The environment variable that defines whether to use AWS or not 18 | `USE_AWS_TRUE` | The value of `USE_AWS_VARIABLE` that will be interpreted as "True" 19 | `S3_CACHE_BASE_PATH` | The path in the s3 bucket that will be used to store the page cache 20 | `S3_BUCKET_VARIABLE` | The environment variable that defines which s3 bucket data will be stored in 21 | `API_KEY_SECRETSMANAGER_VARIABLE` | The environment variable that contains the name of the SecretsManager secret that will be used. 22 | 23 | ## Local Execution 24 | 25 | This tool is for the most part intended to be run locally. Some of the key file paths are defined relative to the path of the constants file. 26 | 27 | To be continued... 28 | -------------------------------------------------------------------------------- /scp_epub/test_platform/process/test_process_all_pages.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | import os 8 | import io 9 | 10 | import download.download_scp 11 | import process.assemble 12 | 13 | class TestProcessAllPages(unittest.TestCase): 14 | def setUp(self): 15 | self.maxDiff = 500 16 | 17 | def test_process_all_pages(self): 18 | # Arrange 19 | expected_max_failures = 0 20 | expected_failures = [] 21 | expected_definition = { 22 | "download": { 23 | "download_tags": [ 24 | "scp", 25 | "tale", 26 | "hub", 27 | "supplement" 28 | ], 29 | "edge_cases": [ 30 | "scp-3125" 31 | ] 32 | }, 33 | } 34 | expected_pages = download.get_wiki.get_pages_from_book_definition(expected_definition) 35 | 36 | # Act 37 | actual_processed_pages, actual_failures = process.assemble.process_all_pages(expected_pages) 38 | 39 | # Assert 40 | 
self.assertLessEqual(len(actual_failures), expected_max_failures) 41 | self.assertEqual(expected_failures, actual_failures) 42 | -------------------------------------------------------------------------------- /scp_epub/test_platform/download/test_get_complete_page.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import os 3 | import parameterized.parameterized 4 | import unittest 5 | 6 | from constants import constants 7 | import download.get_wiki 8 | 9 | 10 | TEST_CASES_REGULAR = [ 11 | ['scp-123'], 12 | ['scp-4000'], 13 | ['scp-173'] 14 | ] 15 | 16 | TEST_CASES_EDGE_CASE = [ 17 | ['scp-3125'] 18 | ] 19 | 20 | 21 | class TestGetCompletePageSameWithWithoutCache(unittest.TestCase): 22 | @classmethod 23 | def setUpClass(cls): 24 | os.environ[constants.API_KEY_VARIABLE] = getpass.getpass('Wikidot read-only API key: ') 25 | return super().setUpClass() 26 | 27 | @classmethod 28 | def tearDownClass(cls): 29 | del os.environ[constants.API_KEY_VARIABLE] 30 | return super().tearDownClass() 31 | 32 | def setUp(self): 33 | self.maxDiff = 500 34 | 35 | @parameterized.parameterized.expand(TEST_CASES_REGULAR) 36 | def test_download_page(self, expected_page_name): 37 | # Arrange 38 | expected_page = download.get_wiki.get_complete_page(expected_page_name, refresh=True) 39 | 40 | # Act 41 | actual_page = download.get_wiki.get_complete_page(expected_page_name, refresh=False) 42 | 43 | # Assert 44 | self.assertEqual(expected_page, actual_page) 45 | self.assertEqual(expected_page_name, actual_page[constants.PAGE_PATH_KEY]) 46 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | import os 8 | import io 9 | 10 | import process.process_page 
11 | from constants import constants 12 | 13 | TEST_COMPONENT_PROCESS_PAGE_CASES_DIR = 'test_process_page_cases' 14 | 15 | 16 | class TestProcessPage(unittest.TestCase): 17 | def setUp(self): 18 | self.maxDiff = 0 19 | 20 | @parameterized.expand([ 21 | [ 22 | 'SCP-1257', 23 | 'scp-1257.html', 24 | 'scp-1257_converted.html', 25 | ['scp-173', 'scp-682'], 26 | ], 27 | ]) 28 | def test_process_page(self, expected_page_title, expected_web_html_file, expected_processed_html_file, expected_url_allow_list): 29 | # Arrange 30 | 31 | with open(os.path.join(os.path.dirname(__file__), TEST_COMPONENT_PROCESS_PAGE_CASES_DIR, expected_web_html_file), 'r', encoding=constants.ENCODING) as target_file: 32 | expected_web_html = target_file.read() 33 | 34 | with open(os.path.join(os.path.dirname(__file__), TEST_COMPONENT_PROCESS_PAGE_CASES_DIR, expected_processed_html_file), 'r', encoding=constants.ENCODING) as target_file: 35 | expected_processed_html = target_file.read() 36 | 37 | # Act 38 | actual_processed_html = process.process_page.process_page_html(expected_web_html, expected_page_title, expected_url_allow_list) 39 | 40 | # Assert 41 | self.assertEqual(expected_processed_html, actual_processed_html) 42 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/wikidot_api.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import unittest 4 | import unittest.mock 5 | 6 | import download.wikidot_api 7 | from constants import constants 8 | 9 | class TestGetApiKey(unittest.TestCase): 10 | @unittest.mock.patch('download.aws.get_api_key_from_secretsmanager') 11 | def test_get_api_key_locally(self, mock_get_api_key_from_secretsmanager): 12 | # Arrange 13 | expected_api_key = '000000000000000000000000000' 14 | os.environ.pop(constants.USE_AWS_VARIABLE, None) 15 | os.environ[constants.API_KEY_VARIABLE] = expected_api_key 16 | 17 | # Act 18 | actual_api_key = 
download.wikidot_api._get_api_key() 19 | 20 | # Assert 21 | mock_get_api_key_from_secretsmanager.assert_not_called() 22 | self.assertEqual(expected_api_key, actual_api_key) 23 | 24 | @unittest.mock.patch('download.aws.get_api_key_from_secretsmanager') 25 | def test_get_api_key_with_aws(self, mock_get_api_key_from_secretsmanager): 26 | # Arrange 27 | expected_api_key = '000000000000000000000000000' 28 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 29 | os.environ.pop(constants.API_KEY_VARIABLE, None) 30 | 31 | mock_get_api_key_from_secretsmanager.return_value = expected_api_key 32 | 33 | # Act 34 | actual_api_key = download.wikidot_api._get_api_key() 35 | 36 | # Assert 37 | mock_get_api_key_from_secretsmanager.assert_called_once_with() 38 | self.assertEqual(expected_api_key, actual_api_key) 39 | 40 | class TestWikidotClient(unittest.TestCase): 41 | def setUp(self): 42 | importlib.reload('download.wikidot_api') 43 | 44 | @unittest.mock.patch('download.wikidot_api.get_wikidot_client') 45 | def test_client_closure(self, mock_get_wikidot_client): 46 | # Arrange 47 | expected_client = download.wikidot_api.client() 48 | 49 | # Act 50 | actual_client = download.wikidot_api.client() 51 | 52 | # Assert 53 | self.assertIs(expected_client(), actual_client()) 54 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/test_utils.py: -------------------------------------------------------------------------------- 1 | import download.utils 2 | import unittest 3 | from parameterized import parameterized 4 | 5 | 6 | NORMALIZATION_TEST_CASES = [ 7 | ["fragment:three-farewells-aktus", "fragment_three-farewells-aktus"], 8 | ["scp-3125", "scp-3125"] 9 | ] 10 | 11 | class TestNormalizeString(unittest.TestCase): 12 | @parameterized.expand(NORMALIZATION_TEST_CASES) 13 | def test_normalize_string(self, expected_raw_string, expected_normalized_string): 14 | # Arrange 15 | 16 | # Act 17 | actual_normalized_string = 
download.utils.normalize_string(expected_raw_string) 18 | 19 | # Assert 20 | self.assertEqual(expected_normalized_string, actual_normalized_string) 21 | 22 | 23 | class TestFilterPages(unittest.TestCase): 24 | def test_filter_tags_no_rule(self): 25 | # Arrange 26 | expected_pages = [ 27 | {'tags': ['scp', 'meta']}, 28 | {'tags': ['tale', 'antimemetic']}, 29 | {'tags': ['_sys']}, 30 | {'tags': []}, 31 | {'content': 'whatnot'}, 32 | {} 33 | ] 34 | 35 | expected_filtered_pages = [ 36 | {'tags': ['scp', 'meta']}, 37 | {'tags': ['tale', 'antimemetic']}, 38 | {'tags': ['_sys']}, 39 | {'tags': []}, 40 | {'content': 'whatnot'}, 41 | {} 42 | ] 43 | 44 | expected_tag_filter = None 45 | 46 | # Act 47 | actual_filtered_pages = download.utils.filter_tags(expected_pages) 48 | 49 | # Assert 50 | self.assertEqual(expected_filtered_pages, actual_filtered_pages) 51 | 52 | def test_filter_tags_include_tags(self): 53 | # Arrange 54 | expected_pages = [ 55 | {'tags': ['scp', 'meta']}, 56 | {'tags': ['hub', 'mtf']}, 57 | {'tags': ['tale', 'antimemetic']}, 58 | {'tags': ['_sys']}, 59 | {'tags': []}, 60 | {'content': 'whatnot'}, 61 | {}, 62 | ] 63 | 64 | expected_filtered_pages = [ 65 | {'tags': ['scp', 'meta']}, 66 | {'tags': ['tale', 'antimemetic']}, 67 | ] 68 | 69 | expected_tag_filter = None 70 | expected_include_tags = ['scp', 'tale'] 71 | 72 | # Act 73 | actual_filtered_pages = download.utils.filter_tags(expected_pages, include_tags=expected_include_tags) 74 | 75 | # Assert 76 | self.assertEqual(expected_filtered_pages, actual_filtered_pages) 77 | -------------------------------------------------------------------------------- /scp_epub/download/wikidot_api.py: -------------------------------------------------------------------------------- 1 | 2 | import json 3 | import os 4 | from ratelimit import limits, sleep_and_retry 5 | import requests 6 | import xmlrpc.client 7 | 8 | import download.aws 9 | from download import cache 10 | from constants import constants 11 | from exceptions 
import exceptions


# Singleton XML-RPC client; created lazily so importing this module never
# triggers API-key lookup or network setup.
_wikidot_client = None


def _create_wikidot_client():
    """Build an XML-RPC client authenticated against the Wikidot API endpoint."""
    api_key = _get_api_key()
    return xmlrpc.client.ServerProxy(f'https://{constants.CLIENT_NAME}:{api_key}@{constants.RPC_ENDPOINT}')


def _get_api_key():
    """Return the read-only Wikidot API key.

    Uses AWS Secrets Manager when the AWS toggle environment variable is set;
    otherwise falls back to the local environment variable.
    """
    if os.getenv(constants.USE_AWS_VARIABLE) == constants.USE_AWS_TRUE:
        return download.aws.get_api_key_from_secretsmanager()
    return os.getenv(constants.API_KEY_VARIABLE)


def _get_wikidot_client():
    """Return the shared XML-RPC client, creating it on first use."""
    global _wikidot_client
    if _wikidot_client is None:
        _wikidot_client = _create_wikidot_client()

    return _wikidot_client


def _get_list_of_pages_undecorated(categories, **kwargs):
    """List all pages of the site belonging to the given Wikidot categories."""
    client = _get_wikidot_client()
    list_of_pages = client.pages.select({
        'site': constants.SITE_NAME,
        'categories': categories
    })
    return list_of_pages


def _get_page_metadata_undecorated(page, **kwargs):
    """Fetch API metadata (title, tags, author, ...) for a single page."""
    client = _get_wikidot_client()
    page_data = client.pages.get_meta({
        'site': constants.SITE_NAME,
        'pages': [page]
    })
    return page_data[page]


def _get_web_page_undecorated(page, **kwargs):
    """Scrape the printer-friendly HTML of a page; return None if the fetch fails.

    Bug fix: the previous check (``status_code > 200``) treated any status
    code below 200 as success; only an exact 200 carries a usable page body.
    """
    web_page = requests.get(f'{constants.SITE_DOWNLOAD_HOST}/{page}')
    if web_page.status_code != 200:
        return None
    return web_page.content.decode(constants.ENCODING)


@cache.use_cache(constants.CACHE_PAGE_LIST_DIR, filetype=constants.CACHE_FILETYPE_JSON)
@sleep_and_retry
@limits(calls=constants.RATE_LIMIT_CALLS, period=constants.RATE_LIMIT_PERIOD)
def get_list_of_pages(*args, **kwargs):
    """Cached, rate-limited wrapper around the page-list API call."""
    return _get_list_of_pages_undecorated(*args, **kwargs)


@cache.use_cache(constants.CACHE_PAGES_DIR, filetype=constants.CACHE_FILETYPE_JSON)
@sleep_and_retry
@limits(calls=constants.RATE_LIMIT_CALLS, period=constants.RATE_LIMIT_PERIOD)
def get_page_metadata(*args, **kwargs):
    """Cached, rate-limited wrapper around the page-metadata API call."""
    return _get_page_metadata_undecorated(*args, **kwargs)
**kwargs) 74 | 75 | 76 | @cache.use_cache(constants.CACHE_HTML_DIR, filetype=constants.CACHE_FILETYPE_HTML) 77 | @sleep_and_retry 78 | @limits(calls=constants.RATE_LIMIT_WEB_CALLS, period=constants.RATE_LIMIT_WEB_PERIOD) 79 | def get_web_page(*args, **kwargs): 80 | return _get_web_page_undecorated(*args, **kwargs) 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project stuff 2 | build/ 3 | 4 | # Visual Studio Code 5 | .vscode/ 6 | *.code-workspace 7 | .history/ 8 | 9 | ################################################# 10 | ## Python stuff 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | pip-wheel-metadata/ 35 | share/python-wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | *.py,cover 62 | .hypothesis/ 63 | .pytest_cache/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | .python-version 97 | 98 | # pipenv 99 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 100 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 101 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 102 | # install all needed dependencies. 103 | #Pipfile.lock 104 | 105 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 106 | __pypackages__/ 107 | 108 | # Celery stuff 109 | celerybeat-schedule 110 | celerybeat.pid 111 | 112 | # SageMath parsed files 113 | *.sage.py 114 | 115 | # Environments 116 | .env 117 | .venv 118 | env/ 119 | venv/ 120 | ENV/ 121 | env.bak/ 122 | venv.bak/ 123 | 124 | # Spyder project settings 125 | .spyderproject 126 | .spyproject 127 | 128 | # Rope project settings 129 | .ropeproject 130 | 131 | # mkdocs documentation 132 | /site 133 | 134 | # mypy 135 | .mypy_cache/ 136 | .dmypy.json 137 | dmypy.json 138 | 139 | # Pyre type checker 140 | .pyre/ 141 | test.bin 142 | -------------------------------------------------------------------------------- /scp_epub/test_unit/process/test_assemble.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | import os 8 | import io 9 | 10 | import process.assemble 11 | from constants import constants 12 | 13 | 14 | class TestProcessAllPages(unittest.TestCase): 15 | @unittest.mock.patch('process.process_page.process_page') 16 | def test_process_all_pages(self, mock_process_page): 17 | # Arrange 18 | expected_page_names = ['a-1', 'b-2'] 19 | expected_processed_pages = ['result 1', 'result 2'] 20 | expected_failures = [] 21 | 22 | mock_process_page.side_effect = expected_processed_pages 23 | 24 | expected_pages = [ 25 | { 26 | constants.PAGE_PATH_KEY: page_name 27 | } 28 | for page_name in expected_page_names 29 | ] 30 | 31 | expected_calls = [ 32 | unittest.mock.call( 33 | { 34 | constants.PAGE_PATH_KEY: page_name 35 | }, 36 | url_allow_list=expected_page_names 37 | ) 38 | for page_name in expected_page_names 39 | ] 40 | 41 | # Act 42 | actual_processed_pages, actual_failures = process.assemble.process_all_pages(expected_pages) 43 | 44 | # Assert 45 | self.assertEqual(expected_processed_pages, actual_processed_pages) 46 | 
self.assertEqual(expected_failures, actual_failures) 47 | mock_process_page.assert_has_calls(expected_calls) 48 | 49 | @unittest.mock.patch('process.process_page.process_page') 50 | def test_process_all_pages_errors(self, mock_process_page): 51 | # Arrange 52 | expected_page_names = ['a-1', 'b-2'] 53 | expected_processed_pages = ['result 2'] 54 | 55 | expected_error = ValueError() 56 | expected_failures = [expected_error] 57 | 58 | mock_process_page.side_effect = [expected_error, 'result 2'] 59 | 60 | expected_pages = [ 61 | { 62 | constants.PAGE_PATH_KEY: page_name 63 | } 64 | for page_name in expected_page_names 65 | ] 66 | 67 | expected_calls = [ 68 | unittest.mock.call( 69 | { 70 | constants.PAGE_PATH_KEY: page_name 71 | }, 72 | url_allow_list=expected_page_names 73 | ) 74 | for page_name in expected_page_names 75 | ] 76 | 77 | # Act 78 | actual_processed_pages, actual_failures = process.assemble.process_all_pages(expected_pages) 79 | 80 | # Assert 81 | self.assertEqual(expected_processed_pages, actual_processed_pages) 82 | self.assertEqual(expected_failures, actual_failures) 83 | mock_process_page.assert_has_calls(expected_calls) 84 | -------------------------------------------------------------------------------- /scp_epub/download/cache.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import json 3 | import os 4 | 5 | from constants import constants 6 | import download.aws 7 | import download.utils 8 | 9 | 10 | def use_cache(relative_path, filetype=constants.CACHE_DEFAULT_FILETYPE): 11 | def decorator(func): 12 | @functools.wraps(func) 13 | def wrapper(*args, **kwargs): 14 | normalized_item = download.utils.normalize_string(args[0]) 15 | 16 | if 'refresh' in kwargs and kwargs['refresh'] is True: 17 | cached_contents = None 18 | else: 19 | cached_contents = get_cached_contents(relative_path, normalized_item, filetype) 20 | 21 | if cached_contents is not None: 22 | return cached_contents 23 | else: 
def get_cached_contents(relative_path, item, filetype):
    """Read one item from the cache backend (S3 or local); None on a miss.

    Bug fix: previously a JSON cache miss fell through to
    ``json.loads(None)``, which raises TypeError instead of reporting the
    miss to the caller.
    """
    if os.getenv(constants.USE_AWS_VARIABLE) == constants.USE_AWS_TRUE:
        content_string = download.aws.retrieve_from_s3_cache(relative_path, item, filetype)
    else:
        content_string = retrieve_from_local_cache(relative_path, item, filetype)

    if content_string is None:
        return None

    if filetype == constants.CACHE_FILETYPE_JSON:
        return json.loads(content_string)
    return content_string


def set_cached_contents(contents, relative_path, item, filetype):
    """Serialize (JSON filetype only) and write one item to the cache backend."""
    if filetype == constants.CACHE_FILETYPE_JSON:
        content_string = json.dumps(contents)
    else:
        content_string = contents

    if os.getenv(constants.USE_AWS_VARIABLE) == constants.USE_AWS_TRUE:
        download.aws.store_in_s3_cache(content_string, relative_path, item, filetype)
    else:
        store_in_local_cache(content_string, relative_path, item, filetype)


def retrieve_from_local_cache(relative_path, item, filetype):
    """Return the cached file's text from the local cache dir, or None if absent."""
    try:
        filename = item + '.' + filetype
        file_location = os.path.join(constants.LOCAL_CACHE_BASE_PATH, relative_path, filename)
        with open(file_location, 'r', encoding=constants.ENCODING) as local_file:
            contents = local_file.read()

        return contents
    except FileNotFoundError:
        return None


def store_in_local_cache(contents, relative_path, item, filetype):
    """Write the item under the local cache directory, creating dirs as needed."""
    filename = item + '.' + filetype
    file_dir = os.path.join(constants.LOCAL_CACHE_BASE_PATH, relative_path)
    file_location = os.path.join(file_dir, filename)

    os.makedirs(file_dir, exist_ok=True)
    with open(file_location, 'w', encoding=constants.ENCODING) as local_file:
        local_file.write(contents)
+ filetype 70 | file_dir = os.path.join(constants.LOCAL_CACHE_BASE_PATH, relative_path) 71 | file_location = os.path.join(file_dir, filename) 72 | 73 | os.makedirs(file_dir, exist_ok=True) 74 | with open(file_location, 'w', encoding=constants.ENCODING) as local_file: 75 | local_file.write(contents) 76 | -------------------------------------------------------------------------------- /docs/how_it_works.md: -------------------------------------------------------------------------------- 1 | # How SCP epub Works 2 | 3 | SCP epub takes the entirety of the SCP wiki (running on Wikidot) and converts it into ebook format via a series of separate steps. 4 | 5 | 1. Download the SCP wiki 6 | 2. Convert every page from web format into an ebook-friendly format 7 | 3. Organize and assemble the pages in the correct order 8 | 4. Create the ebook 9 | 10 | The **book definition file** controls exactly what pages to download from the SCP wiki and how to organize the book (steps 1, 3, and 4). The [constants file](scp_epub/constants/constants.py) controls how the pages are converted into ebook format (step 2). 11 | 12 | ## Downloading the SCP wiki 13 | 14 | We'll use [the complete collection definition file](definitions/complete_collection.json) as an example. 15 | 16 | First, SCP epub obtains a list of all SCP pages. For example, all pages that we care about (SCP entries, tales, hubs, supplements) are in the same category, `_default`. We get a list of all the pages in the specified categories (usually, `_default` is enough). This uses the Wikidot API and it requires a read-only Wikidot API key. 17 | 18 | Then, using the wikidot API, we obtain metadata on all the pages of interest, specified by tags (in this example, scp, tale, hub, and supplement). 19 | 20 | Now that we have all the pages of interest and their metadata, we download all of them. Some pages (like scp-3125), however, are super complicated or interactive, so they will not be downloaded and processed. 
Instead, they are treated as an edge case and they will be replaced by a version that's processed by hand (located in the [edge_cases folder](edge_cases/)). This is OK because the SCP wiki license allows such use. 21 | 22 | After we have a list of all the pages we care about, we download their actual contents not through the API, but by scraping the HTML of each web page's "printer-friendly" version. The program implements rate limiting to follow Wikidot rate limits and to also be respectful of the site bandwidth. Rate limits are defined in the constants file. 23 | 24 | Because of the way Wikidot works, there is no hierarchy or grouping of pages. This means everything is downloaded in the same directory and is not organized in any way. This will become relevant later. 25 | 26 | All downloaded information is cached so that we don't need to re-download stuff all the time. The cached versions are found in the `build/cache/` directory that will be created when you first run the program. 27 | 28 | ## Converting pages into ebook-friendly format 29 | 30 | Right now, we have a whole bunch of pages in HTML format, and they contain a lot of unnecessary information such as site headers and footers, web links etc. We need to convert all the pages into an ebook-friendly form. Thankfully, the epub ebook format is basically just a huge ZIP archive containing HTML files. There are certain requirements for a page to be epub-compatible but it is otherwise a straightforward conversion process. 31 | 32 | Certain classes and tags need to be removed from the HTML outright. These are defined in the constants file. Currently, we simply remove images, although it would be theoretically possible to include them. 33 | 34 | The SCP wiki uses collapsible blocks a lot (where text is hidden until you click on a dropdown). These are incompatible with epub, so they're unwrapped: we get rid of the collapsible blocks but keep the text inside them.
35 | 36 | There are a number of other items, such as page headers, block quotes, footnotes, and internal links that all need to be properly converted. 37 | 38 | Each page is processed individually and is now converted into ebook-friendly HTML that can be directly assembled into an epub. 39 | 40 | ## Organizing and assembling the pages 41 | 42 | However, we can't do that just yet. The pages need to be put in the correct order. 43 | 44 | To be continued... 45 | -------------------------------------------------------------------------------- /scp_epub/constants/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | ######################## 4 | # Charset 5 | 6 | ENCODING = 'utf-8' 7 | 8 | ######################## 9 | # AWS Execution 10 | USE_AWS_VARIABLE = 'SCP_EPUB_USE_AWS' 11 | USE_AWS_TRUE = 'TRUE' 12 | S3_CACHE_BASE_PATH = 'cache/' 13 | S3_BUCKET_VARIABLE = 'SCP_EPUB_S3_BUCKET' 14 | API_KEY_SECRETSMANAGER_VARIABLE = 'SCP_EPUB_API_KEY_SECRET' 15 | 16 | ######################## 17 | # Relative filepaths 18 | 19 | LOCAL_CACHE_BASE_PATH = os.path.join(os.path.dirname(__file__), '../../build/cache') 20 | EDGE_CASES_DIR = os.path.join(os.path.dirname(__file__), '../../edge_cases') 21 | 22 | ######################## 23 | # Caching paths 24 | 25 | CACHE_FILETYPE_JSON = 'json' 26 | CACHE_FILETYPE_HTML = 'html' 27 | CACHE_DEFAULT_FILETYPE = CACHE_FILETYPE_JSON 28 | CACHE_PAGES_DIR = 'pages/' 29 | CACHE_HTML_DIR = 'web/' 30 | CACHE_PAGE_LIST_DIR = 'lists/' 31 | 32 | ######################## 33 | # Scraping 34 | 35 | API_KEY_VARIABLE = 'SCP_EPUB_WIKIDOT_API_KEY' 36 | 37 | SITE_NAME = 'scp-wiki' 38 | SITE_HOST = 'http://scp-wiki.net' 39 | SITE_DOWNLOAD_HOST = 'http://scp-wiki.net/printer--friendly' 40 | 41 | CLIENT_NAME = 'scp-epub' 42 | RPC_ENDPOINT = 'www.wikidot.com/xml-rpc-api.php' 43 | 44 | # Wikidot rate limit is 240 calls per 60 seconds, we're being conservative 45 | RATE_LIMIT_CALLS = 60 46 | 
RATE_LIMIT_PERIOD = 30

# Throttle for direct web scraping of printer-friendly HTML pages.
RATE_LIMIT_WEB_CALLS = 10
RATE_LIMIT_WEB_PERIOD = 5

# Hand-processed replacement pages (edge_cases folder) are stored as JSON.
EDGE_CASES_FILETYPE = 'json'

######################
# Processing pages

# Keys of the raw page record as assembled by the download step.
PAGE_PATH_KEY = 'fullname'
TITLE_SHOWN_KEY = 'title_shown'
TITLE_KEY = 'title'
CREATED_BY_KEY = 'created_by'
CREATED_AT_KEY = 'created_at'
TAGS_KEY = 'tags'
ADDITIONAL_DATA_KEY = 'scp_epub_additional_data'
WEB_HTML_KEY = 'web_html'

# Keys of the processed (ebook-ready) page record.
PROCESSED_NAME_KEY = 'name'
PROCESSED_TITLE_KEY = 'title'
PROCESSED_AUTHOR_KEY = 'created_by'
PROCESSED_CREATION_DATE_KEY = 'created_at'
PROCESSED_TAGS_KEY = 'tags'
PROCESSED_HTML_KEY = 'html'

# Placeholders used when a page's metadata is missing.
EMPTY_TITLE = '███████████'
EMPTY_AUTHOR = 'Unknown'
EMPTY_TIMESTAMP = 'Unknown'


# Parser backend name; presumably passed to BeautifulSoup — confirm in process_page.
BS4_FORMAT = 'lxml'

# id of the element holding the article body on a downloaded page.
PAGE_CONTENT_ID = 'page-content'

# CSS classes stripped outright from downloaded HTML (rating widgets,
# image blocks, footer navigation).
CLASSES_TO_REMOVE = [
    'heritage-rating-module',
    'heritage-emblem',
    'page-rate-widget-box',
    'scp-image-block',
    'image',
    'scp-image-caption',
    'footer-wikiwalk-nav'
]

# Tags removed outright; images are not included in the ebook.
TAGS_TO_REMOVE = [
    'img'
]

# Wikidot collapsible blocks are unwrapped into epub-compatible markup.
COLLAPSIBLE_BLOCK_CLASS = 'collapsible-block'
COLLAPSIBLE_BLOCK_LINK_CLASS = 'collapsible-block-link'
COLLAPSIBLE_BLOCK_CONTENT_CLASS = 'collapsible-block-content'

COLLAPSIBLE_CLASS_NEW = 'collapsible'
COLLAPSIBLE_TITLE_CLASS_NEW = 'collapsible-title'

BLOCKQUOTE_TAG = 'blockquote'
BLOCKQUOTE_CLASS_NEW = 'quote'

# YUI tab views (yui-navset) are rewritten into simple tabview markup.
YUI_NAVSET_CLASS = 'yui-navset'
YUI_NAVSET_CLASS_NEW = 'tabview'
YUI_NAVSET_TAB_CLASS = 'yui-nav'
YUI_NAVSET_TAB_CLASS_NEW = 'tabview-tab'
YUI_NAVSET_TAB_TITLE_IDENTIFIER = 'em'
YUI_NAVSET_TAB_TITLE_CLASS_NEW = 'tab-title'

LINK_TAG = 'a'

# HTML attribute names used while rewriting pages.
HREF_ATTRIBUTE = 'href'
ID_ATTRIBUTE = 'id'
EPUB_TYPE_ATTRIBUTE = 'epub:type'
ONCLICK_ATTRIBUTE = 'onclick'
CLASS_ATTRIBUTE = 'class'

LINK_CLASS_NEW = 'link'
| LINK_EXTENSION = '.xhtml' 122 | 123 | PAGE_TITLE_TAG = 'p' 124 | PAGE_TITLE_CLASS = 'page-title' 125 | 126 | FOOTNOTEREF_TAG = 'sup' 127 | FOOTNOTEREF_CLASS = 'footnoteref' 128 | 129 | FOOTNOTE_CLASS = 'footnote-footer' 130 | 131 | EPUB_TYPE_FOOTNOTEREF = 'noteref' 132 | EPUB_TYPE_FOOTNOTE = 'footnote' 133 | 134 | FOOTNOTE_HREF_PATTERN = r"WIKIDOT\.page\.utils\.scrollToReference\('([a-zA-Z0-9-_]+)'\)" 135 | 136 | ###################### 137 | # Assembling pages 138 | 139 | DEFAULT_BASED_ON = 'name' 140 | -------------------------------------------------------------------------------- /progress.txt: -------------------------------------------------------------------------------- 1 | Punchlist: 2 | 3 | 4 | ============================================== 5 | 6 | TASKS: 7 | 8 | 1. download all pages 9 | * Rewrite downloader to not be a complete janky mess. 10 | [DONE] * Multiple backends for caching: 11 | [DEFERRED] * aws 12 | [DONE] * local 13 | [DONE] * able to select between them 14 | [DONE] * Multiple backends for retrieving the api secret: 15 | [DEFERRED] * aws 16 | [DONE] * local 17 | [DONE] * able to select between them 18 | [DONE] * Single instance of wikidot client 19 | [DONE] * Downloads: 20 | [DONE] * Download list 21 | [DONE] * Download page via API 22 | [DONE] * Download page via web 23 | * Get entire wiki: 24 | * Download list then filter pages based on definitions, download only those pages 25 | [IP] * Download complete page: 26 | [DONE] * API entry 27 | [DONE] * Web download 28 | * Apply edge case if applicable 29 | * Platform tests: 30 | [DONE] * Single page with cache 31 | [DONE] * Single page without cache 32 | [IP] * Edge case downloading 33 | * Downloading everything 34 | 35 | [DONE] 2. PRETTIFY SINGLE PAGE 36 | [CANCELLED] * delete the div that aligns right and is at the very beginning of every page ( should we do this? 
what if there's a page that doesn't have that div but has another one instead with actual content) 37 | [DONE] * WHAT PYSCP DOES (MAKE COMPATIBLE): 38 | [DONE] * remove widget box 39 | [DONE] * yui-navset 40 | [DONE] * collapsible-block 41 | [DONE] * footnoteref, footnote footer: BREAK COMPATIBILITY WITH PYSCP, MAKE NICE FOR EPUB 42 | [DONE] * blockquote 43 | [DONE] * links 44 | [DONE] * images 45 | [DONE] * title 46 | [DONE] * Put them all together to parse a whole page 47 | [DONE] * Component test that correctly parses whole web html files 48 | [DONE] * I want some sort of test that the output of the combined parsers makes sense. Even if just a regression test. 49 | [DONE] * Process all pages 50 | [DONE] * Error handling: e.g. each item in its own try-catch with logging of errors 51 | [DONE] * Log errors verbosely, exactly what failed and how 52 | [DONE] * Platform test: parse LITERALLY ALL THE SCP PAGES and test that none fail, or 2 |
3 |
scp-heritage-v3.png
6 |
7 | 8 |

Item 9 | #: SCP-055

10 |

Object Class: Keter

11 |

Special Containment 12 | Procedures: Object is kept within a five (5) by five (5) by two point five (2.5) meter square room 13 | constructed of cement (fifty (50) centimeter thickness), with a Faraday cage surrounding the cement walls. Access is 14 | via a heavy containment door measuring two (2) by two point five (2.5) meters constructed on bearings to ensure door 15 | closes and locks automatically unless held open deliberately. Security guards are NOT to be posted outside SCP-055's 16 | room. It is further advised that all personnel maintaining or studying other SCP objects in the vicinity try to 17 | maintain a distance of at least fifty (50) meters from the geometric center of the room, as long as this is 18 | reasonably practical.

19 |

Description: SCP-055 is a "self-keeping secret" or 20 | "anti-meme". Information about SCP-055's physical appearance as well as its nature, behavior, and origins 21 | is self-classifying. To clarify:

22 | 25 | 26 | 29 | 41 | 45 | 50 |

All of these facts are periodically rediscovered, usually by chance 51 | readers of this file, causing a great deal of alarm. This state of concern lasts minutes at most, before the matter 52 | is simply forgotten about.

53 |

A great deal of scientific data has been recorded from SCP-055, but cannot be 54 | studied.

55 |

At least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 56 | 19 to another site, meeting failure for reasons unknown.

57 |

SCP-055 may present a major physical threat and 58 | indeed may have killed many hundreds of personnel, and we would not know it. Certainly it presents a gigantic 59 | memetic/mental threat, hence its Keter classification.

60 |

Document #055-1: An Analysis of 61 | SCP-055

62 |

The author puts forward the hypothesis that SCP-055 was never formally acquired by 63 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 64 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 and is in fact an autonomous or remotely-controlled agent, inserted 65 | at Site 19 by an unidentified third party for one or all of the following purposes:

66 | 77 |

No action to counter 78 | any of these potential threats is suggested, or indeed theoretically possible.

79 |

Addendum A: 80 |

81 |
82 |

Hey, if this thing really is an "anti-meme", why doesn't the fact that it's an 83 | "anti-meme" get wiped? We must be wrong about that somehow. Wait a minute, what if we were to keep 84 | notes about what it isn't? Would we remember those? Bartholomew Hughes, NSA

85 |
86 |

87 | Document #055-2: Report of Dr. John Marachek

88 |

Survey team #19-055-127BXE was successfully 89 | able to enter SCP-055's container and ascertain the appearance and, to some degree, the nature of the object. Notes 90 | were taken according to the project methodology (see 91 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588), after which the container was sealed 92 | again.

93 |

Excerpt from a transcript of personnel debriefing follows:

94 |
95 |

Dr. Hughes: Okay, 96 | I'm going to need to ask you some questions about number 55 now.

97 |

98 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Number what?

99 |

Dr. Hughes: SCP object 55. The object you just 100 | examined.

101 |

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Um, I don't know what you're talking about. I 102 | don't think we have a 55.

103 |

Dr. Hughes: Okay, then, \u2588\u2588\u2588\u2588\u2588\u2588\u2588, 104 | I'd like you to tell me what you've been doing for the past two hours.

105 |

106 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: What? I… <subject appears uncomfortable> … I don't 107 | know.

108 |

Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical?

109 |

110 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: That what wasn't… Oh! Right! It isn't round at all! Object 55 111 | isn't round!

112 |

Dr. Hughes: So you remember it now?

113 |

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: 114 | Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't remember. And it's 115 | not a sphere.

116 |

Dr. Hughes: Wait a minute. What's not a sphere?

117 |

118 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Object 55.

119 |

Dr. Hughes: Object what?

120 |

121 | \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Doc, do you remember agreeing that something wasn't shaped like a 122 | sphere?

123 |

Dr. Hughes: Oh, right!

124 |
125 |

It appears to be possible to remember what SCP-055 126 | is not (negations of fact), and to repeatedly deduce its existence from these memories.

127 |

Personnel involved 128 | in Survey #19-055-127BXE reported moderate levels of disorientation and psychological trauma associated with cycles 129 | of repeated memory and forgetfulness of SCP-055. However, no long-term behavioral or health problems were observed, 130 | and psych assessments of survey personnel showed consistent reports of this distress fading over time.

131 |

132 | Recommendations: It may be worthwhile to post at least one staff member capable of remembering the existence of 133 | SCP-055 to each critical site.

134 | 141 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page_cases/scp-1257_converted.html: -------------------------------------------------------------------------------- 1 |

SCP-1257

2 |
3 |

Item #: SCP-1257

4 |

Object Class: Safe

5 |

Special Containment Procedures: All copies of SCP-1257, encompassing all instances of SCP-1257-1, SCP-1257-2, and SCP-1257-3, are to be kept in the secure media vault at Site-██. Any uncontained copies of SCP-1257 are to be recovered or destroyed by MTF Mu-53 (“Ebert's Thumb”). Because of the nature of the original appearance of SCP-1257, and its widespread exposure to the public, MTF Mu-53 is also tasked to replace any new sources of information about SCP-1257 as they are discovered, in whatever format they may appear, in accordance with Protocol Gamma-1257-A (Codename: “Snopes’ Revenge”)1

6 |

Unauthorized persons exposed to copies of SCP-1257, or who evidence any knowledge about SCP-1257, shall be interrogated by the Foundation Intelligence Department, administered Class A Amnestics, and have implanted post-hypnotic suggestions to reinforce the belief that SCP-1257 is a hoax.2

7 |

Study of SCP-1257 is limited to personnel of Level 3 or higher, subject to approval by the Foundation Intelligence Department.

8 |

Description: SCP-1257 is an American-produced half-hour situation comedy originally titled Raising Danny that aired on the ███ television network for six episodes in 197█. Instances of SCP-1257-1 are the original production reels for all twelve episodes filmed, recovered from the ███ archives in 198█. Instances of SCP-1257-2 are the draft and shooting scripts and copies, including four episodes that were never shot. Instances of SCP-1257-3 are all other video recordings of the six episodes actually aired.

9 |

The anomalous properties of SCP-1257 manifest in any and all video copies produced from the original series, and in any copies of the scripts for those episodes. Every year, beginning in mid-September, video recordings and scripts for Raising Danny will change to reflect a new season of episodes. Replacements will begin with episode one, and progress sequentially through each episode in order during each subsequent week. While the Foundation has access to the first sixteen episodes of each season,3 it appears that each SCP-1257 season runs approximately 24 episodes. Additionally, while new seasons of SCP-1257 occasionally produce hour-long "specials,” copies are always limited to the first half-hour running time of the original episodes of SCP-1257.

10 |

Video copies of the original over-the-air broadcast of the first six episodes of SCP-1257 present a special case. Commercials recorded contemporaneously with SCP-1257 will also show changes consistent with the content of SCP-1257, and updated videos have occasionally shown news bulletins and weather alerts that imply multiple points of divergence between the world that continues to produce SCP-1257 and our own.

11 |

SCP-1257’s original premise had a black man, named Tyler (played by Whitman Mayo), married to a white woman who already had a son by a prior marriage. When the woman dies, prior to the pilot episode, the man is left raising her son, named Danny (played by Danny Bonaduce), as his own. Reviews of the original series recovered by the Foundation were universally unfavorable and referred to it as “The unwanted bastard child of Sanford and Son and The Courtship of Eddie's Father.” Because of the anomalous properties of SCP-1257, the Foundation has only been able to reconstruct a general outline of the original content of the series.

12 |

Addendum 1: Notes on selected episodes of SCP-1257 observed in Foundation custody.

13 |

+ Document S-1257-11

14 |
15 |
    16 |
  • Season 3, Episode 3, “Tyler’s Date”: Episode manifests three months after initial containment. One line in the script implies the assassination of American President Jimmy Carter in late 1978 or early 1979.
  • 17 |
  • Season 4, Episode 1, “Bad Touch pt. 2”: Second half of a “special episode” ending season three. Eric, a young classmate of Danny’s, is the target of a sexual predator using what appears to be anomalous items manufactured by Doctor Wondertainment to lure his victims.
  • 18 |
  • Season 5, Episode 10, “The Senior Trip”: Episode mentions a scandal where 10 members of the UK House of Commons had been publicly revealed as members of a cult that bears a strong resemblance to the Church of the Broken God.
  • 19 |
  • Season 6, Episode 1, “The Freshmen”: The title of SCP-1257 is changed to Danny. The premise of the series changes as well, dropping the Tyler character and sending Danny to college in New York City with five of his classmates from High School.4 The University Lab appears to have specimens of SCP-███, SCP-███ and SCP-███.
  • 20 |
  • Season 6, Episode 11, “The ████████”: Plot of the episode concerns Eric’s suspicions that one of their dormmates might be secretly one of the “████████.” This turns out to be a misunderstanding. From context, the “████████” appear to prey on young women and have become endemic in [REDACTED] and seem to be the result of a containment breach of [REDACTED] in Mexico City.
  • 21 |
  • Season 7, Episode 2, “Eric’s Midterm Caper”: When this episode manifested in SCP-1257-3-12, a new advertisement appeared during the second break for Marshall, Carter, and Dark Ltd. The ad promoted [REDACTED] services for [REDACTED].
  • 22 |
  • Season 10, Episode 1, “The Job Hunt”: Hour-long “special” introducing another change in premise.5 One scene implies that the Global Occult Coalition has become public enough to run “want ads” in the local newspaper.
  • 23 |
  • Season 10, Episode 2, “The New Guy”: The show’s title is officially changed to Agent Danny of the SCP.6 Danny has been employed as Level 1 security at Site-19, and through a series of mishaps, prevents a containment breach of SCP-173.
  • 24 |
  • Season 10, Episode 5, “D-Class Act”: Danny mis-hears a co-worker’s conversation and becomes convinced he has been mistakenly reassigned to D-Class by the HR Department.
  • 25 |
  • Season 10, Episode 11, “Leaping Lizards”: [REDACTED] SCP-682 [REDACTED].
  • 26 |
27 |

Note: Details of SCP-1257 episodes past Season 10 are only available with the approval of the Intelligence Department.

28 |
29 |
30 | 39 | 40 |
-------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_055_pyscp.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | 6 | 7 | SCP-055: [unknown] 8 | 9 | 10 | 11 | 12 |
13 |

SCP-055: [unknown]

14 |
15 |
16 |
17 |
18 |
19 |

Item #: SCP-055

20 |

Object Class: Keter

21 |

Special Containment Procedures: Object is kept within a five (5) by five (5) by two point 22 | five (2.5) meter square room constructed of cement (fifty (50) centimeter thickness), with a Faraday cage 23 | surrounding the cement walls. Access is via a heavy containment door measuring two (2) by two point five 24 | (2.5) meters constructed on bearings to ensure door closes and locks automatically unless held open 25 | deliberately. Security guards are NOT to be posted outside SCP-055's room. It is further advised that all 26 | personnel maintaining or studying other SCP objects in the vicinity try to maintain a distance of at least 27 | fifty (50) meters from the geometric center of the room, as long as this is reasonably practical.

28 |

Description: SCP-055 is a "self-keeping secret" or "anti-meme". Information about SCP-055's 29 | physical appearance as well as its nature, behavior, and origins is self-classifying. To clarify:

30 | 33 | 36 | 47 | 51 | 55 |

All of these facts are periodically rediscovered, usually by chance readers of this file, causing a great 56 | deal of alarm. This state of concern lasts minutes at most, before the matter is simply forgotten about.

57 |

A great deal of scientific data has been recorded from SCP-055, but cannot be studied.

58 |

At least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 19 to 59 | another site, meeting failure for reasons unknown.

60 |

SCP-055 may present a major physical threat and indeed may have killed many hundreds of personnel, and we 61 | would not know it. Certainly it presents a gigantic memetic/mental threat, hence its Keter classification. 62 |

63 |

Document #055-1: An Analysis of SCP-055

64 |

The author puts forward the hypothesis that SCP-055 was never formally acquired by ████████████ ████████ and 65 | is in fact an autonomous or remotely-controlled agent, inserted at Site 19 by an unidentified third party 66 | for one or all of the following purposes:

67 | 74 |

No action to counter any of these potential threats is suggested, or indeed theoretically possible.

75 |

Addendum A:

76 |
77 |

Hey, if this thing really is an "anti-meme", why doesn't the fact that it's an "anti-meme" get wiped? We 78 | must be wrong about that somehow. Wait a minute, what if we were to keep notes about what it isn't? 79 | Would we remember those? Bartholomew Hughes, NSA

80 |
81 |

Document #055-2: Report of Dr. John Marachek

82 |

Survey team #19-055-127BXE was successfully able to enter SCP-055's container and ascertain the appearance 83 | and, to some degree, the nature of the object. Notes were taken according to the project methodology (see 84 | ████████████), after which the container was sealed again.

85 |

Excerpt from a transcript of personnel debriefing follows:

86 |
87 |

Dr. Hughes: Okay, I'm going to need to ask you some questions about number 55 now.

88 |

███████: Number what?

89 |

Dr. Hughes: SCP object 55. The object you just examined.

90 |

███████: Um, I don't know what you're talking about. I don't think we have a 55.

91 |

Dr. Hughes: Okay, then, ███████, I'd like you to tell me what you've been doing for the past two hours. 92 |

93 |

███████: What? I… <subject appears uncomfortable> … I don't know.

94 |

Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical?

95 |

███████: That what wasn't… Oh! Right! It isn't round at all! Object 55 isn't round!

96 |

Dr. Hughes: So you remember it now?

97 |

███████: Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't 98 | remember. And it's not a sphere.

99 |

Dr. Hughes: Wait a minute. What's not a sphere?

100 |

███████: Object 55.

101 |

Dr. Hughes: Object what?

102 |

███████: Doc, do you remember agreeing that something wasn't shaped like a sphere?

103 |

Dr. Hughes: Oh, right!

104 |
105 |

It appears to be possible to remember what SCP-055 is not (negations of fact), and to repeatedly deduce its 106 | existence from these memories.

107 |

Personnel involved in Survey #19-055-127BXE reported moderate levels of disorientation and psychological 108 | trauma associated with cycles of repeated memory and forgetfulness of SCP-055. However, no long-term 109 | behavioral or health problems were observed, and psych assessments of survey personnel showed consistent 110 | reports of this distress fading over time.

111 |

Recommendations: It may be worthwhile to post at least one staff member capable of remembering the existence 112 | of SCP-055 to each critical site.

113 | 118 |
119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /scp_epub/test_unit/_samples/scp_055.json: -------------------------------------------------------------------------------- 1 | { 2 | "fullname": "scp-055", 3 | "created_at": "2008-07-25T15:59:04+00:00", 4 | "created_by": "xthevilecorruptor", 5 | "updated_at": "2019-05-03T02:04:14+00:00", 6 | "updated_by": "Modern_Erasmus", 7 | "title": "SCP-055", 8 | "title_shown": "SCP-055", 9 | "parent_fullname": null, 10 | "tags": [ 11 | "keter", 12 | "scp", 13 | "meta", 14 | "featured", 15 | "memory-affecting", 16 | "heritage", 17 | "infohazard", 18 | "antimemetic" 19 | ], 20 | "rating": 2733, 21 | "revisions": 37, 22 | "parent_title": null, 23 | "content": "[[include component:heritage-rating]]\n\n**Item #:** SCP-055 \n\n**Object Class:** Keter \n\n**Special Containment Procedures:** Object is kept within a five (5) by five (5) by two point five (2.5) meter square room constructed of cement (fifty (50) centimeter thickness), with a Faraday cage surrounding the cement walls. Access is via a heavy containment door measuring two (2) by two point five (2.5) meters constructed on bearings to ensure door closes and locks automatically unless held open deliberately. Security guards are NOT to be posted outside SCP-055's room. It is further advised that all personnel maintaining or studying other SCP objects in the vicinity try to maintain a distance of at least fifty (50) meters from the geometric center of the room, as long as this is reasonably practical. \n\n**Description:** SCP-055 is a \"self-keeping secret\" or \"anti-meme\". Information about SCP-055's physical appearance as well as its nature, behavior, and origins is self-classifying. To clarify: \n\n* How Site 19 originally acquired SCP-055 is unknown. \n\n* When SCP-055 was obtained, and by whom, is unknown. \n\n* SCP-055's physical appearance is unknown. 
It is not indescribable, or invisible: individuals are perfectly capable of entering SCP-055's container and observing it, taking mental or written notes, making sketches, taking photographs, and even making audio/video recordings. An extensive log of such observations is on file. However, information about SCP-055's physical appearance \"leaks\" out of a human mind soon after such an observation. Individuals tasked with describing SCP-055 afterwards find their minds wandering and lose interest in the task; individuals tasked with sketching a copy of a photograph of SCP-055 are unable to remember what the photograph looks like, as are researchers overseeing these tests. Security personnel who have observed SCP-055 via closed-circuit television cameras emerge after a full shift exhausted and effectively amnesiac about the events of the previous hours. \n\n* Who authorized the construction of SCP-055's containment room, why it was constructed in this way, or what the purpose of the described Containment Procedures may be, are all unknown. \n\n* Despite SCP-055's container being easily accessible, all personnel at Site 19 claim no knowledge of SCP-055's existence when challenged.\n \n\nAll of these facts are periodically rediscovered, usually by chance readers of this file, causing a great deal of alarm. This state of concern lasts minutes at most, before the matter is simply forgotten about. \n\nA great deal of scientific data has been recorded from SCP-055, but cannot be studied. \n\nAt least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 19 to another site, meeting failure for reasons unknown. \n\nSCP-055 may present a major physical threat and indeed may have killed many hundreds of personnel, and we would not know it. Certainly it presents a gigantic memetic/mental threat, hence its Keter classification. 
\n\n**Document #055-1:** An Analysis of SCP-055 \n\nThe author puts forward the hypothesis that SCP-055 was never formally acquired by \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 and is in fact an autonomous or remotely-controlled agent, inserted at Site 19 by an unidentified third party for one or all of the following purposes: \n\n* to silently observe, or interfere with, activities at Site 19 \n* to silently observe, or interfere with, activities at other SCP locations \n* to silently observe, or interfere with, activities of humanity worldwide \n* to silently observe, or interfere with, other SCP objects \n* to silently observe, or interfere with, \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\n \n\nNo action to counter any of these potential threats is suggested, or indeed theoretically possible.\n\n**Addendum A**: \n> Hey, if this thing really is an \"anti-meme\", why doesn't the fact that it's an \"anti-meme\" get wiped? We must be wrong about that somehow. Wait a minute, what if we were to keep notes about what it isn't? Would we remember those? //Bartholomew Hughes, NSA//\n\n**Document #055-2:** Report of Dr. John Marachek\n\nSurvey team #19-055-127BXE was successfully able to enter SCP-055's container and ascertain the appearance and, to some degree, the nature of the object. Notes were taken according to the project methodology (see \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588), after which the container was sealed again. \n\nExcerpt from a transcript of personnel debriefing follows:\n\n> Dr. Hughes: Okay, I'm going to need to ask you some questions about number 55 now.\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Number what? \n> \n> Dr. Hughes: SCP object 55. The object you just examined.\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Um, I don't know what you're talking about. I don't think we //have// a 55. 
\n> \n> Dr. Hughes: Okay, then, \u2588\u2588\u2588\u2588\u2588\u2588\u2588, I'd like you to tell me what you've been doing for the past two hours.\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: What? I... ... I don't know. \n> \n> Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical? \n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: That what wasn't... Oh! Right! It isn't round at all! Object 55 isn't round!\n> \n> Dr. Hughes: So you remember it now? \n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't remember. And it's not a sphere. \n> \n> Dr. Hughes: Wait a minute. What's not a sphere? \n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Object 55.\n> \n> Dr. Hughes: Object what?\n> \n> \u2588\u2588\u2588\u2588\u2588\u2588\u2588: Doc, do you remember agreeing that something wasn't shaped like a sphere?\n> \n> Dr. Hughes: Oh, right!\n\nIt appears to be possible to remember what SCP-055 is not (negations of fact), and to repeatedly deduce its existence from these memories. \n\nPersonnel involved in Survey #19-055-127BXE reported moderate levels of disorientation and psychological trauma associated with cycles of repeated memory and forgetfulness of SCP-055. However, no long-term behavioral or health problems were observed, and psych assessments of survey personnel showed consistent reports of this distress fading over time.\n\nRecommendations: It may be worthwhile to post at least one staff member capable of remembering the existence of SCP-055 to each critical site.\n\n[[footnoteblock]]\n\n[[div class=\"footer-wikiwalk-nav\"]]\n[[=]]\n<< [[[SCP-054]]] | SCP-055 | [[[SCP-056]]] >>\n[[/=]]\n[[/div]]", 24 | "html": "\n\n
\n
\n
\"scp-heritage-v3.png\"
\n
\n
\n

Item #: SCP-055

\n

Object Class: Keter

\n

Special Containment Procedures: Object is kept within a five (5) by five (5) by two point five (2.5) meter square room constructed of cement (fifty (50) centimeter thickness), with a Faraday cage surrounding the cement walls. Access is via a heavy containment door measuring two (2) by two point five (2.5) meters constructed on bearings to ensure door closes and locks automatically unless held open deliberately. Security guards are NOT to be posted outside SCP-055's room. It is further advised that all personnel maintaining or studying other SCP objects in the vicinity try to maintain a distance of at least fifty (50) meters from the geometric center of the room, as long as this is reasonably practical.

\n

Description: SCP-055 is a "self-keeping secret" or "anti-meme". Information about SCP-055's physical appearance as well as its nature, behavior, and origins is self-classifying. To clarify:

\n\n\n\n\n\n

All of these facts are periodically rediscovered, usually by chance readers of this file, causing a great deal of alarm. This state of concern lasts minutes at most, before the matter is simply forgotten about.

\n

A great deal of scientific data has been recorded from SCP-055, but cannot be studied.

\n

At least one attempt has been made to destroy SCP-055, or possibly move it from containment at Site 19 to another site, meeting failure for reasons unknown.

\n

SCP-055 may present a major physical threat and indeed may have killed many hundreds of personnel, and we would not know it. Certainly it presents a gigantic memetic/mental threat, hence its Keter classification.

\n

Document #055-1: An Analysis of SCP-055

\n

The author puts forward the hypothesis that SCP-055 was never formally acquired by \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588 and is in fact an autonomous or remotely-controlled agent, inserted at Site 19 by an unidentified third party for one or all of the following purposes:

\n\n

No action to counter any of these potential threats is suggested, or indeed theoretically possible.

\n

Addendum A:

\n
\n

Hey, if this thing really is an "anti-meme", why doesn't the fact that it's an "anti-meme" get wiped? We must be wrong about that somehow. Wait a minute, what if we were to keep notes about what it isn't? Would we remember those? Bartholomew Hughes, NSA

\n
\n

Document #055-2: Report of Dr. John Marachek

\n

Survey team #19-055-127BXE was successfully able to enter SCP-055's container and ascertain the appearance and, to some degree, the nature of the object. Notes were taken according to the project methodology (see \u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588\u2588), after which the container was sealed again.

\n

Excerpt from a transcript of personnel debriefing follows:

\n
\n

Dr. Hughes: Okay, I'm going to need to ask you some questions about number 55 now.

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Number what?

\n

Dr. Hughes: SCP object 55. The object you just examined.

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Um, I don't know what you're talking about. I don't think we have a 55.

\n

Dr. Hughes: Okay, then, \u2588\u2588\u2588\u2588\u2588\u2588\u2588, I'd like you to tell me what you've been doing for the past two hours.

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: What? I… <subject appears uncomfortable> … I don't know.

\n

Dr. Hughes: Okay, then, do you remember that we all agreed that it wasn't spherical?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: That what wasn't… Oh! Right! It isn't round at all! Object 55 isn't round!

\n

Dr. Hughes: So you remember it now?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Well, no. I mean, I don't know what it is, but I know there is one. It's something you can't remember. And it's not a sphere.

\n

Dr. Hughes: Wait a minute. What's not a sphere?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Object 55.

\n

Dr. Hughes: Object what?

\n

\u2588\u2588\u2588\u2588\u2588\u2588\u2588: Doc, do you remember agreeing that something wasn't shaped like a sphere?

\n

Dr. Hughes: Oh, right!

\n
\n

It appears to be possible to remember what SCP-055 is not (negations of fact), and to repeatedly deduce its existence from these memories.

\n

Personnel involved in Survey #19-055-127BXE reported moderate levels of disorientation and psychological trauma associated with cycles of repeated memory and forgetfulness of SCP-055. However, no long-term behavioral or health problems were observed, and psych assessments of survey personnel showed consistent reports of this distress fading over time.

\n

Recommendations: It may be worthwhile to post at least one staff member capable of remembering the existence of SCP-055 to each critical site.

\n
\n
\n

« SCP-054 | SCP-055 | SCP-056 »

\n
\n
\n", 25 | "children": 0, 26 | "comments": 412, 27 | "commented_at": "2020-04-14T20:34:20+00:00", 28 | "commented_by": "ZELYNER" 29 | } 30 | -------------------------------------------------------------------------------- /definitions/complete_collection.json: -------------------------------------------------------------------------------- 1 | { 2 | "download": { 3 | "categories": ["_default"], 4 | "tags_to_download": [ 5 | "scp", 6 | "tale", 7 | "hub", 8 | "supplement" 9 | ], 10 | "edge_cases": [ 11 | "scp-3125" 12 | ] 13 | }, 14 | "meta": { 15 | "title": "SCP Foundation — The Complete Collection", 16 | "author": "Various Authors", 17 | "publisher": "github.com/elfakyn/scp_epub", 18 | "identifier": "f2c8fbad-0cb0-4ae7-93c7-3a634a7e540e", 19 | "language": "en" 20 | }, 21 | "toc": [ 22 | { 23 | "toc_entry": "SCP Foundation — The Complete Collection", 24 | "children": [ 25 | { 26 | "toc_entry": "SCPs by Series", 27 | "regex": "^scp-series(-\\d+)?$" 28 | }, 29 | { 30 | "toc_entry": "SCP Tales by Series", 31 | "regex": "^scp-series(-\\d+)?-tales-edition$" 32 | }, 33 | { 34 | "toc_entry": "Hubs", 35 | "tags_all": [ 36 | "hub" 37 | ] 38 | } 39 | ] 40 | }, 41 | { 42 | "toc_entry": "SCP Series 1", 43 | "children": [ 44 | { 45 | "toc_entry": "000 to 099", 46 | "regex": "^scp-0\\d\\d$" 47 | }, 48 | { 49 | "toc_entry": "100 to 199", 50 | "regex": "^scp-1\\d\\d$" 51 | }, 52 | { 53 | "toc_entry": "200 to 299", 54 | "regex": "^scp-2\\d\\d$" 55 | }, 56 | { 57 | "toc_entry": "300 to 399", 58 | "regex": "^scp-3\\d\\d$" 59 | }, 60 | { 61 | "toc_entry": "400 to 499", 62 | "regex": "^scp-4\\d\\d$" 63 | }, 64 | { 65 | "toc_entry": "500 to 599", 66 | "regex": "^scp-5\\d\\d$" 67 | }, 68 | { 69 | "toc_entry": "600 to 699", 70 | "regex": "^scp-6\\d\\d$" 71 | }, 72 | { 73 | "toc_entry": "700 to 799", 74 | "regex": "^scp-7\\d\\d$" 75 | }, 76 | { 77 | "toc_entry": "800 to 899", 78 | "regex": "^scp-8\\d\\d$" 79 | }, 80 | { 81 | "toc_entry": "900 to 999", 82 | "regex": "^scp-9\\d\\d$" 83 | } 84 
| ] 85 | }, 86 | { 87 | "toc_entry": "SCP Series 2", 88 | "children": [ 89 | { 90 | "toc_entry": "1000 to 1099", 91 | "regex": "^scp-10\\d\\d$" 92 | }, 93 | { 94 | "toc_entry": "1100 to 1199", 95 | "regex": "^scp-11\\d\\d$" 96 | }, 97 | { 98 | "toc_entry": "1200 to 1299", 99 | "regex": "^scp-12\\d\\d$" 100 | }, 101 | { 102 | "toc_entry": "1300 to 1399", 103 | "regex": "^scp-13\\d\\d$" 104 | }, 105 | { 106 | "toc_entry": "1400 to 1499", 107 | "regex": "^scp-14\\d\\d$" 108 | }, 109 | { 110 | "toc_entry": "1500 to 1599", 111 | "regex": "^scp-15\\d\\d$" 112 | }, 113 | { 114 | "toc_entry": "1600 to 1699", 115 | "regex": "^scp-16\\d\\d$" 116 | }, 117 | { 118 | "toc_entry": "1700 to 1799", 119 | "regex": "^scp-17\\d\\d$" 120 | }, 121 | { 122 | "toc_entry": "1800 to 1899", 123 | "regex": "^scp-18\\d\\d$" 124 | }, 125 | { 126 | "toc_entry": "1900 to 1999", 127 | "regex": "^scp-19\\d\\d$" 128 | } 129 | ] 130 | }, 131 | { 132 | "toc_entry": "SCP Series 3", 133 | "children": [ 134 | { 135 | "toc_entry": "2000 to 2099", 136 | "regex": "^scp-20\\d\\d$" 137 | }, 138 | { 139 | "toc_entry": "2100 to 2199", 140 | "regex": "^scp-21\\d\\d$" 141 | }, 142 | { 143 | "toc_entry": "2200 to 2299", 144 | "regex": "^scp-22\\d\\d$" 145 | }, 146 | { 147 | "toc_entry": "2300 to 2399", 148 | "regex": "^scp-23\\d\\d$" 149 | }, 150 | { 151 | "toc_entry": "2400 to 2499", 152 | "regex": "^scp-24\\d\\d$" 153 | }, 154 | { 155 | "toc_entry": "2500 to 2599", 156 | "regex": "^scp-25\\d\\d$" 157 | }, 158 | { 159 | "toc_entry": "2600 to 2699", 160 | "regex": "^scp-26\\d\\d$" 161 | }, 162 | { 163 | "toc_entry": "2700 to 2799", 164 | "regex": "^scp-27\\d\\d$" 165 | }, 166 | { 167 | "toc_entry": "2800 to 2899", 168 | "regex": "^scp-28\\d\\d$" 169 | }, 170 | { 171 | "toc_entry": "2900 to 2999", 172 | "regex": "^scp-29\\d\\d$" 173 | } 174 | ] 175 | }, 176 | { 177 | "toc_entry": "SCP Series 4", 178 | "children": [ 179 | { 180 | "toc_entry": "3000 to 3099", 181 | "regex": "^scp-30\\d\\d$" 182 | }, 183 | { 184 | 
"toc_entry": "3100 to 3199", 185 | "regex": "^scp-31\\d\\d$" 186 | }, 187 | { 188 | "toc_entry": "3200 to 3299", 189 | "regex": "^scp-32\\d\\d$" 190 | }, 191 | { 192 | "toc_entry": "3300 to 3399", 193 | "regex": "^scp-33\\d\\d$" 194 | }, 195 | { 196 | "toc_entry": "3400 to 3499", 197 | "regex": "^scp-34\\d\\d$" 198 | }, 199 | { 200 | "toc_entry": "3500 to 3599", 201 | "regex": "^scp-35\\d\\d$" 202 | }, 203 | { 204 | "toc_entry": "3600 to 3699", 205 | "regex": "^scp-36\\d\\d$" 206 | }, 207 | { 208 | "toc_entry": "3700 to 3799", 209 | "regex": "^scp-37\\d\\d$" 210 | }, 211 | { 212 | "toc_entry": "3800 to 3899", 213 | "regex": "^scp-38\\d\\d$" 214 | }, 215 | { 216 | "toc_entry": "3900 to 3999", 217 | "regex": "^scp-39\\d\\d$" 218 | } 219 | ] 220 | }, 221 | { 222 | "toc_entry": "SCP Series 5", 223 | "children": [ 224 | { 225 | "toc_entry": "4000 to 4099", 226 | "regex": "^scp-40\\d\\d$" 227 | }, 228 | { 229 | "toc_entry": "4100 to 4199", 230 | "regex": "^scp-41\\d\\d$" 231 | }, 232 | { 233 | "toc_entry": "4200 to 4299", 234 | "regex": "^scp-42\\d\\d$" 235 | }, 236 | { 237 | "toc_entry": "4300 to 4399", 238 | "regex": "^scp-43\\d\\d$" 239 | }, 240 | { 241 | "toc_entry": "4400 to 4499", 242 | "regex": "^scp-44\\d\\d$" 243 | }, 244 | { 245 | "toc_entry": "4500 to 4599", 246 | "regex": "^scp-45\\d\\d$" 247 | }, 248 | { 249 | "toc_entry": "4600 to 4699", 250 | "regex": "^scp-46\\d\\d$" 251 | }, 252 | { 253 | "toc_entry": "4700 to 4799", 254 | "regex": "^scp-47\\d\\d$" 255 | }, 256 | { 257 | "toc_entry": "4800 to 4899", 258 | "regex": "^scp-48\\d\\d$" 259 | }, 260 | { 261 | "toc_entry": "4900 to 4999", 262 | "regex": "^scp-49\\d\\d$" 263 | } 264 | ] 265 | }, 266 | { 267 | "toc_entry": "SCP Series 6", 268 | "children": [ 269 | { 270 | "toc_entry": "5000 to 5099", 271 | "regex": "^scp-50\\d\\d$" 272 | }, 273 | { 274 | "toc_entry": "5100 to 5199", 275 | "regex": "^scp-51\\d\\d$" 276 | }, 277 | { 278 | "toc_entry": "5200 to 5299", 279 | "regex": "^scp-52\\d\\d$" 280 | }, 281 | { 
282 | "toc_entry": "5300 to 5399", 283 | "regex": "^scp-53\\d\\d$" 284 | }, 285 | { 286 | "toc_entry": "5400 to 5499", 287 | "regex": "^scp-54\\d\\d$" 288 | }, 289 | { 290 | "toc_entry": "5500 to 5599", 291 | "regex": "^scp-55\\d\\d$" 292 | }, 293 | { 294 | "toc_entry": "5600 to 5699", 295 | "regex": "^scp-56\\d\\d$" 296 | }, 297 | { 298 | "toc_entry": "5700 to 5799", 299 | "regex": "^scp-57\\d\\d$" 300 | }, 301 | { 302 | "toc_entry": "5800 to 5899", 303 | "regex": "^scp-58\\d\\d$" 304 | }, 305 | { 306 | "toc_entry": "5900 to 5999", 307 | "regex": "^scp-59\\d\\d$" 308 | } 309 | ] 310 | }, 311 | { 312 | "toc_entry": "Other SCPs", 313 | "children": [ 314 | { 315 | "toc_entry": "International SCPs", 316 | "tags_all": [ 317 | "scp", 318 | "international" 319 | ] 320 | }, 321 | { 322 | "toc_entry": "Explained SCPs", 323 | "tags_all": [ 324 | "scp", 325 | "explained" 326 | ] 327 | }, 328 | { 329 | "toc_entry": "Archived SCPs", 330 | "tags_all": [ 331 | "scp", 332 | "archived" 333 | ] 334 | }, 335 | { 336 | "toc_entry": "Joke SCPs", 337 | "tags_all": [ 338 | "scp", 339 | "joke" 340 | ] 341 | }, 342 | { 343 | "toc_entry": "Other SCPs", 344 | "tags_all": [ 345 | "scp" 346 | ] 347 | } 348 | ] 349 | }, 350 | { 351 | "toc_entry": "Tales", 352 | "children": [ 353 | { 354 | "toc_entry": "Tales A", 355 | "regex": "^a", 356 | "tags_all": [ 357 | "tale" 358 | ], 359 | "based_on": "title" 360 | }, 361 | { 362 | "toc_entry": "Tales B", 363 | "regex": "^b", 364 | "tags_all": [ 365 | "tale" 366 | ], 367 | "based_on": "title" 368 | }, 369 | { 370 | "toc_entry": "Tales C", 371 | "regex": "^c", 372 | "tags_all": [ 373 | "tale" 374 | ], 375 | "based_on": "title" 376 | }, 377 | { 378 | "toc_entry": "Tales D", 379 | "regex": "^d", 380 | "tags_all": [ 381 | "tale" 382 | ], 383 | "based_on": "title" 384 | }, 385 | { 386 | "toc_entry": "Tales E", 387 | "regex": "^e", 388 | "tags_all": [ 389 | "tale" 390 | ], 391 | "based_on": "title" 392 | }, 393 | { 394 | "toc_entry": "Tales F", 395 | "regex": 
"^f", 396 | "tags_all": [ 397 | "tale" 398 | ], 399 | "based_on": "title" 400 | }, 401 | { 402 | "toc_entry": "Tales G", 403 | "regex": "^g", 404 | "tags_all": [ 405 | "tale" 406 | ], 407 | "based_on": "title" 408 | }, 409 | { 410 | "toc_entry": "Tales H", 411 | "regex": "^h", 412 | "tags_all": [ 413 | "tale" 414 | ], 415 | "based_on": "title" 416 | }, 417 | { 418 | "toc_entry": "Tales I", 419 | "regex": "^i", 420 | "tags_all": [ 421 | "tale" 422 | ], 423 | "based_on": "title" 424 | }, 425 | { 426 | "toc_entry": "Tales J", 427 | "regex": "^j", 428 | "tags_all": [ 429 | "tale" 430 | ], 431 | "based_on": "title" 432 | }, 433 | { 434 | "toc_entry": "Tales K", 435 | "regex": "^k", 436 | "tags_all": [ 437 | "tale" 438 | ], 439 | "based_on": "title" 440 | }, 441 | { 442 | "toc_entry": "Tales L", 443 | "regex": "^l", 444 | "tags_all": [ 445 | "tale" 446 | ], 447 | "based_on": "title" 448 | }, 449 | { 450 | "toc_entry": "Tales M", 451 | "regex": "^m", 452 | "tags_all": [ 453 | "tale" 454 | ], 455 | "based_on": "title" 456 | }, 457 | { 458 | "toc_entry": "Tales N", 459 | "regex": "^n", 460 | "tags_all": [ 461 | "tale" 462 | ], 463 | "based_on": "title" 464 | }, 465 | { 466 | "toc_entry": "Tales O", 467 | "regex": "^o", 468 | "tags_all": [ 469 | "tale" 470 | ], 471 | "based_on": "title" 472 | }, 473 | { 474 | "toc_entry": "Tales P", 475 | "regex": "^p", 476 | "tags_all": [ 477 | "tale" 478 | ], 479 | "based_on": "title" 480 | }, 481 | { 482 | "toc_entry": "Tales Q", 483 | "regex": "^q", 484 | "tags_all": [ 485 | "tale" 486 | ], 487 | "based_on": "title" 488 | }, 489 | { 490 | "toc_entry": "Tales R", 491 | "regex": "^r", 492 | "tags_all": [ 493 | "tale" 494 | ], 495 | "based_on": "title" 496 | }, 497 | { 498 | "toc_entry": "Tales S", 499 | "regex": "^s", 500 | "tags_all": [ 501 | "tale" 502 | ], 503 | "based_on": "title" 504 | }, 505 | { 506 | "toc_entry": "Tales T", 507 | "regex": "^t", 508 | "tags_all": [ 509 | "tale" 510 | ], 511 | "based_on": "title" 512 | }, 513 | { 514 
| "toc_entry": "Tales U", 515 | "regex": "^u", 516 | "tags_all": [ 517 | "tale" 518 | ], 519 | "based_on": "title" 520 | }, 521 | { 522 | "toc_entry": "Tales V", 523 | "regex": "^v", 524 | "tags_all": [ 525 | "tale" 526 | ], 527 | "based_on": "title" 528 | }, 529 | { 530 | "toc_entry": "Tales W", 531 | "regex": "^w", 532 | "tags_all": [ 533 | "tale" 534 | ], 535 | "based_on": "title" 536 | }, 537 | { 538 | "toc_entry": "Tales X", 539 | "regex": "^x", 540 | "tags_all": [ 541 | "tale" 542 | ], 543 | "based_on": "title" 544 | }, 545 | { 546 | "toc_entry": "Tales Y", 547 | "regex": "^y", 548 | "tags_all": [ 549 | "tale" 550 | ], 551 | "based_on": "title" 552 | }, 553 | { 554 | "toc_entry": "Tales Z", 555 | "regex": "^z", 556 | "tags_all": [ 557 | "tale" 558 | ], 559 | "based_on": "title" 560 | }, 561 | { 562 | "toc_entry": "Other Tales", 563 | "tags_all": [ 564 | "tale" 565 | ], 566 | "based_on": "title" 567 | } 568 | ] 569 | }, 570 | { 571 | "toc_entry": "Index", 572 | "special": "index" 573 | } 574 | ] 575 | } 576 | -------------------------------------------------------------------------------- /scp_epub/test_unit/download/test_cache.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | import os 4 | 5 | import download.cache 6 | from constants import constants 7 | 8 | 9 | class TestUseCache(unittest.TestCase): 10 | @unittest.mock.patch('download.utils.normalize_string') 11 | @unittest.mock.patch('download.cache.set_cached_contents') 12 | @unittest.mock.patch('download.cache.get_cached_contents') 13 | def test_use_cache_no_refresh_found_in_cache(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 14 | # Arrange 15 | expected_func = unittest.mock.MagicMock() 16 | expected_relative_path = 'foo/bar' 17 | expected_filetype = 'json' 18 | expected_item = 'Tale Of Three Soldiers' 19 | expected_refresh = False 20 | 21 | expected_normalized_item = 
'tale-of-three-soldiers' 22 | expected_contents = 'contents' 23 | expected_cached_contents = expected_contents 24 | 25 | expected_args = [expected_item] 26 | expected_kwargs = { 27 | 'refresh': expected_refresh 28 | } 29 | 30 | mock_get_cached_contents.return_value = expected_cached_contents 31 | mock_normalize_string.return_value = expected_normalized_item 32 | 33 | # Act 34 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 35 | 36 | # Assert 37 | mock_normalize_string.assert_called_once_with(expected_item) 38 | mock_get_cached_contents.assert_called_once_with(expected_relative_path, expected_normalized_item, expected_filetype) 39 | mock_set_cached_contents.assert_not_called() 40 | expected_func.assert_not_called() 41 | self.assertEqual(expected_contents, actual_contents) 42 | 43 | @unittest.mock.patch('download.utils.normalize_string') 44 | @unittest.mock.patch('download.cache.set_cached_contents') 45 | @unittest.mock.patch('download.cache.get_cached_contents') 46 | def test_use_cache_implicit_no_refresh_found_in_cache(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 47 | # Arrange 48 | expected_func = unittest.mock.MagicMock() 49 | expected_relative_path = 'foo/bar' 50 | expected_filetype = 'json' 51 | expected_item = 'Tale Of Three Soldiers' 52 | 53 | expected_normalized_item = 'tale-of-three-soldiers' 54 | expected_contents = 'contents' 55 | expected_cached_contents = expected_contents 56 | 57 | expected_args = [expected_item] 58 | expected_kwargs = dict() 59 | 60 | mock_get_cached_contents.return_value = expected_cached_contents 61 | mock_normalize_string.return_value = expected_normalized_item 62 | 63 | # Act 64 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 65 | 66 | # Assert 67 | mock_normalize_string.assert_called_once_with(expected_item) 68 | 
mock_get_cached_contents.assert_called_once_with(expected_relative_path, expected_normalized_item, expected_filetype) 69 | mock_set_cached_contents.assert_not_called() 70 | expected_func.assert_not_called() 71 | self.assertEqual(expected_contents, actual_contents) 72 | 73 | @unittest.mock.patch('download.utils.normalize_string') 74 | @unittest.mock.patch('download.cache.set_cached_contents') 75 | @unittest.mock.patch('download.cache.get_cached_contents') 76 | def test_use_cache_no_refresh_not_found_in_cache(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 77 | # Arrange 78 | expected_func = unittest.mock.MagicMock() 79 | expected_relative_path = 'foo/bar' 80 | expected_filetype = 'json' 81 | expected_item = 'Tale Of Three Soldiers' 82 | expected_refresh = False 83 | 84 | expected_normalized_item = 'tale-of-three-soldiers' 85 | expected_contents = 'contents' 86 | expected_cached_contents = None 87 | 88 | expected_args = [expected_item] 89 | expected_kwargs = { 90 | 'refresh': expected_refresh 91 | } 92 | 93 | mock_get_cached_contents.return_value = expected_cached_contents 94 | mock_normalize_string.return_value = expected_normalized_item 95 | expected_func.return_value = expected_contents 96 | 97 | # Act 98 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 99 | 100 | # Assert 101 | mock_normalize_string.assert_called_once_with(expected_item) 102 | mock_get_cached_contents.assert_called_once_with(expected_relative_path, expected_normalized_item, expected_filetype) 103 | mock_set_cached_contents.assert_called_once_with(expected_contents, expected_relative_path, expected_normalized_item, expected_filetype) 104 | expected_func.assert_called_once_with(*expected_args, **expected_kwargs) 105 | self.assertEqual(expected_contents, actual_contents) 106 | 107 | @unittest.mock.patch('download.utils.normalize_string') 108 | 
@unittest.mock.patch('download.cache.set_cached_contents') 109 | @unittest.mock.patch('download.cache.get_cached_contents') 110 | def test_use_cache_refresh(self, mock_get_cached_contents, mock_set_cached_contents, mock_normalize_string): 111 | # Arrange 112 | expected_func = unittest.mock.MagicMock() 113 | expected_relative_path = 'foo/bar' 114 | expected_filetype = 'json' 115 | expected_item = 'Tale Of Three Soldiers' 116 | expected_refresh = True 117 | 118 | expected_normalized_item = 'tale-of-three-soldiers' 119 | expected_contents = 'contents' 120 | expected_cached_contents = None 121 | 122 | expected_args = [expected_item] 123 | expected_kwargs = { 124 | 'refresh': expected_refresh 125 | } 126 | 127 | mock_get_cached_contents.return_value = expected_cached_contents 128 | mock_normalize_string.return_value = expected_normalized_item 129 | expected_func.return_value = expected_contents 130 | 131 | # Act 132 | actual_contents = download.cache.use_cache(expected_relative_path, expected_filetype)(expected_func)(*expected_args, **expected_kwargs) 133 | 134 | # Assert 135 | mock_normalize_string.assert_called_once_with(expected_item) 136 | mock_get_cached_contents.assert_not_called() 137 | mock_set_cached_contents.assert_called_once_with(expected_contents, expected_relative_path, expected_normalized_item, expected_filetype) 138 | expected_func.assert_called_once_with(*expected_args, **expected_kwargs) 139 | self.assertEqual(expected_contents, actual_contents) 140 | 141 | 142 | class TestGetCachedContents(unittest.TestCase): 143 | @unittest.mock.patch('json.loads') 144 | @unittest.mock.patch('download.aws.retrieve_from_s3_cache') 145 | @unittest.mock.patch('download.cache.retrieve_from_local_cache') 146 | def test_get_cached_contents_locally(self, mock_retrieve_from_local_cache, mock_retrieve_from_s3_cache, mock_loads): 147 | # Arrange 148 | os.environ.pop(constants.USE_AWS_VARIABLE, None) 149 | expected_filetype = 'html' 150 | expected_relative_path = 'foo/bar/' 151 
| expected_item = 'scp-123' 152 | 153 | # Act 154 | actual_contents = download.cache.get_cached_contents(expected_relative_path, expected_item, expected_filetype) 155 | 156 | # Assert 157 | self.assertEqual(mock_retrieve_from_local_cache.return_value, actual_contents) 158 | mock_loads.assert_not_called() 159 | mock_retrieve_from_s3_cache.assert_not_called() 160 | mock_retrieve_from_local_cache.assert_called_once_with(expected_relative_path, expected_item, expected_filetype) 161 | 162 | @unittest.mock.patch('json.loads') 163 | @unittest.mock.patch('download.aws.retrieve_from_s3_cache') 164 | @unittest.mock.patch('download.cache.retrieve_from_local_cache') 165 | def test_get_cached_contents_s3(self, mock_retrieve_from_local_cache, mock_retrieve_from_s3_cache, mock_loads): 166 | # Arrange 167 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 168 | expected_filetype = 'html' 169 | expected_relative_path = 'foo/bar/' 170 | expected_item = 'scp-123' 171 | 172 | # Act 173 | actual_contents = download.cache.get_cached_contents(expected_relative_path, expected_item, expected_filetype) 174 | 175 | # Assert 176 | self.assertEqual(mock_retrieve_from_s3_cache.return_value, actual_contents) 177 | mock_loads.assert_not_called() 178 | mock_retrieve_from_s3_cache.assert_called_once_with(expected_relative_path, expected_item, expected_filetype) 179 | mock_retrieve_from_local_cache.assert_not_called() 180 | 181 | @unittest.mock.patch('json.loads') 182 | @unittest.mock.patch('download.aws.retrieve_from_s3_cache') 183 | @unittest.mock.patch('download.cache.retrieve_from_local_cache') 184 | def test_get_cached_contents_load_json(self, mock_retrieve_from_local_cache, mock_retrieve_from_s3_cache, mock_loads): 185 | # Arrange 186 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 187 | expected_filetype = 'json' 188 | expected_relative_path = 'foo/bar/' 189 | expected_item = 'scp-123' 190 | expected_contents = mock_loads.return_value 191 | 192 | # Act 193 | 
actual_contents = download.cache.get_cached_contents(expected_relative_path, expected_item, expected_filetype) 194 | 195 | # Assert 196 | self.assertEqual(expected_contents, actual_contents) 197 | mock_loads.assert_called_once_with(mock_retrieve_from_s3_cache.return_value) 198 | mock_retrieve_from_s3_cache.assert_called_once_with(expected_relative_path, expected_item, expected_filetype) 199 | mock_retrieve_from_local_cache.assert_not_called() 200 | 201 | 202 | class TestRetrieveFromLocalCache(unittest.TestCase): 203 | @unittest.mock.patch('builtins.open') 204 | def test_retrieve_from_local_cache(self, mock_open): 205 | # Arrange 206 | expected_relative_path = 'foo/bar' 207 | expected_item = 'scp-123' 208 | expected_filetype = 'json' 209 | expected_cache_file = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path, expected_item + '.' + expected_filetype) 210 | expected_encoding = constants.ENCODING 211 | expected_open_type = 'r' 212 | expected_contents = mock_open.return_value.__enter__.return_value.read.return_value 213 | 214 | # Act 215 | actual_contents = download.cache.retrieve_from_local_cache(expected_relative_path, expected_item, expected_filetype) 216 | 217 | # Assert 218 | self.assertEqual(expected_contents, actual_contents) 219 | mock_open.assert_called_once_with(expected_cache_file, expected_open_type, encoding=expected_encoding) 220 | 221 | @unittest.mock.patch('builtins.open') 222 | def test_retrieve_from_local_cache_file_not_found(self, mock_open): 223 | # Arrange 224 | expected_relative_path = 'foo/bar' 225 | expected_item = 'scp-123' 226 | expected_filetype = 'json' 227 | expected_cache_file = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path, expected_item + '.' 
+ expected_filetype) 228 | expected_encoding = constants.ENCODING 229 | expected_open_type = 'r' 230 | mock_open.return_value.__enter__.side_effect = FileNotFoundError 231 | 232 | expected_contents = None 233 | 234 | # Act 235 | actual_contents = download.cache.retrieve_from_local_cache(expected_relative_path, expected_item, expected_filetype) 236 | 237 | # Assert 238 | self.assertEqual(expected_contents, actual_contents) 239 | mock_open.assert_called_once_with(expected_cache_file, expected_open_type, encoding=expected_encoding) 240 | 241 | 242 | class TestStoreInLocalCache(unittest.TestCase): 243 | @unittest.mock.patch('os.makedirs') 244 | @unittest.mock.patch('builtins.open') 245 | def test_store_in_local_cache(self, mock_open, mock_makedirs): 246 | # Arrange 247 | expected_relative_path = 'foo/bar' 248 | expected_item = 'scp-123' 249 | expected_filetype = 'json' 250 | expected_cache_dir = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path) 251 | expected_cache_file = os.path.join(constants.LOCAL_CACHE_BASE_PATH, expected_relative_path, expected_item + '.' 
+ expected_filetype) 252 | expected_encoding = constants.ENCODING 253 | expected_exist_ok = True 254 | expected_open_type = 'w' 255 | expected_contents = 'contents' 256 | 257 | # Act 258 | actual_contents = download.cache.store_in_local_cache(expected_contents, expected_relative_path, expected_item, expected_filetype) 259 | 260 | # Assert 261 | mock_makedirs.assert_called_once_with(expected_cache_dir, exist_ok=expected_exist_ok) 262 | mock_open.assert_called_once_with(expected_cache_file, expected_open_type, encoding=expected_encoding) 263 | mock_open.return_value.__enter__.return_value.write.assert_called_once_with(expected_contents) 264 | 265 | 266 | class TestSetCachedContents(unittest.TestCase): 267 | @unittest.mock.patch('json.dumps') 268 | @unittest.mock.patch('download.aws.store_in_s3_cache') 269 | @unittest.mock.patch('download.cache.store_in_local_cache') 270 | def test_set_cached_contents_locally(self, mock_store_in_local_cache, mock_store_in_s3_cache, mock_loads): 271 | # Arrange 272 | os.environ.pop(constants.USE_AWS_VARIABLE, None) 273 | expected_filetype = 'html' 274 | expected_relative_path = 'foo/bar/' 275 | expected_item = 'scp-123' 276 | expected_contents = 'contents' 277 | 278 | # Act 279 | download.cache.set_cached_contents(expected_contents, expected_relative_path, expected_item, expected_filetype) 280 | 281 | # Assert 282 | mock_loads.assert_not_called() 283 | mock_store_in_s3_cache.assert_not_called() 284 | mock_store_in_local_cache.assert_called_once_with(expected_contents, expected_relative_path, expected_item, expected_filetype) 285 | 286 | @unittest.mock.patch('json.dumps') 287 | @unittest.mock.patch('download.aws.store_in_s3_cache') 288 | @unittest.mock.patch('download.cache.store_in_local_cache') 289 | def test_set_cached_contents_s3(self, mock_store_in_local_cache, mock_store_in_s3_cache, mock_loads): 290 | # Arrange 291 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 292 | expected_filetype = 'html' 293 | 
expected_relative_path = 'foo/bar/' 294 | expected_item = 'scp-123' 295 | expected_contents = 'contents' 296 | 297 | # Act 298 | download.cache.set_cached_contents(expected_contents, expected_relative_path, expected_item, expected_filetype) 299 | 300 | # Assert 301 | mock_loads.assert_not_called() 302 | mock_store_in_local_cache.assert_not_called() 303 | mock_store_in_s3_cache.assert_called_once_with(expected_contents, expected_relative_path, expected_item, expected_filetype) 304 | 305 | @unittest.mock.patch('json.dumps') 306 | @unittest.mock.patch('download.aws.store_in_s3_cache') 307 | @unittest.mock.patch('download.cache.store_in_local_cache') 308 | def test_set_cached_contents_load_json(self, mock_store_in_local_cache, mock_store_in_s3_cache, mock_loads): 309 | # Arrange 310 | os.environ[constants.USE_AWS_VARIABLE] = constants.USE_AWS_TRUE 311 | expected_filetype = 'json' 312 | expected_relative_path = 'foo/bar/' 313 | expected_item = 'scp-123' 314 | expected_contents = {'contents': 'contents'} 315 | 316 | # Act 317 | download.cache.set_cached_contents(expected_contents, expected_relative_path, expected_item, expected_filetype) 318 | 319 | # Assert 320 | mock_loads.assert_called_once_with(expected_contents) 321 | mock_store_in_s3_cache.assert_called_once_with(mock_loads.return_value, expected_relative_path, expected_item, expected_filetype) 322 | mock_store_in_local_cache.assert_not_called() 323 | -------------------------------------------------------------------------------- /scp_epub/test_component/process/test_process_page_cases/scp-1257.html: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 7 | SCP Foundation: SCP-1257 8 | 9 | 10 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 42 | 45 | 46 | 47 | 48 | 49 | 50 |
51 | 95 | 96 | 101 |
102 |
103 |
104 |
105 | SCP-1257 106 |
107 | 108 |
109 | 110 | 111 |
rating: +198+x
112 |

Item #: SCP-1257

113 |

Object Class: Safe

114 |

Special Containment Procedures: All copies of SCP-1257, encompassing all instances of SCP-1257-1, SCP-1257-2, and SCP-1257-3, are to be kept in the secure media vault at Site-██. Any uncontained copies of SCP-1257 are to be recovered or destroyed by MTF Mu-53 (“Ebert's Thumb”). Because of the nature of the original appearance of SCP-1257, and its widespread exposure to the public, MTF Mu-53 is also tasked to replace any new sources of information about SCP-1257 as they are discovered, in whatever format they may appear, in accordance with Protocol Gamma-1257-A (Codename: “Snopes’ Revenge”)1

115 |

Unauthorized persons exposed to copies of SCP-1257, or who evidence any knowledge about SCP-1257, shall be interrogated by the Foundation Intelligence Department, administered Class A Amnestics, and have implanted post-hypnotic suggestions to reinforce the belief that SCP-1257 is a hoax.2

116 |

Study of SCP-1257 is limited to personnel of Level 3 or higher, subject to approval by the Foundation Intelligence Department.

117 |

Description: SCP-1257 is an American-produced half-hour situation comedy originally titled Raising Danny that aired on the ███ television network for six episodes in 197█. Instances of SCP-1257-1 are the original production reels for all twelve episodes filmed, recovered from the ███ archives in 198█. Instances of SCP-1257-2 are the draft and shooting scripts and copies, including four episodes that were never shot. Instances of SCP-1257-3 are all other video recordings of the six episodes actually aired.

118 |

The anomalous properties of SCP-1257 manifest in any and all video copies produced from the original series, and in any copies of the scripts for those episodes. Every year, beginning in mid-September, video recordings and scripts for Raising Danny will change to reflect a new season of episodes. Replacements will begin with episode one, and progress sequentially through each episode in order during each subsequent week. While the Foundation has access to the first sixteen episodes of each season,3 it appears that each SCP-1257 season runs approximately 24 episodes. Additionally, while new seasons of SCP-1257 occasionally produce hour-long "specials,” copies are always limited to the first half-hour running time of the original episodes of SCP-1257.

119 |

Video copies of the original over-the-air broadcast of the first six episodes of SCP-1257 present a special case. Commercials recorded contemporaneously with SCP-1257 will also show changes consistent with the content of SCP-1257, and updated videos have occasionally shown news bulletins and weather alerts that imply multiple points of divergence between the world that continues to produce SCP-1257 and our own.

120 |

SCP-1257’s original premise had a black man, named Tyler (played by Whitman Mayo), married to a white woman who already had a son by a prior marriage. When the woman dies, prior to the pilot episode, the man is left raising her son, named Danny (played by Danny Bonaduce), as his own. Reviews of the original series recovered by the Foundation were universally unfavorable and referred to it as “The unwanted bastard child of Sanford and Son and The Courtship of Eddie's Father.” Because of the anomalous properties of SCP-1257, the Foundation has only been able to reconstruct a general outline of the original content of the series.

121 |

Addendum 1: Notes on selected episodes of SCP-1257 observed in Foundation custody.

122 |
123 | 124 | 144 |
145 | 154 | 159 | 160 |
161 | 162 |
163 | page revision: 16, last edited: 25 May 2018 20:16 164 |
165 | 166 |
167 |
168 | 169 | 170 | 171 | 172 |
173 | 174 |
175 | Unless stated otherwise Content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License 176 |
177 | 178 |
179 | 180 | 181 | 182 | 201 | 202 | 203 | 204 | 205 | -------------------------------------------------------------------------------- /scp_epub/test_unit/process/test_process_page.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import unittest.mock 3 | 4 | from parameterized import parameterized 5 | import bs4 6 | import json 7 | 8 | import process.process_page 9 | from constants import constants 10 | 11 | 12 | class TestProcessPage(unittest.TestCase): 13 | def setUp(self): 14 | self.maxDiff = None 15 | 16 | @unittest.mock.patch('process.process_page.process_page_html') 17 | def test_process_page(self, mock_process_page_html): 18 | # Arrange 19 | expected_url_allow_list = None 20 | 21 | expected_fullname = "personal-log-of-iceberg" 22 | expected_title = "Not Title Shown: Personal Log of █████ \"Iceberg\" ████" 23 | expected_title_shown = "Title Shown: Personal Log of █████ \"Iceberg\" ████" 24 | expected_created_at = "2008-10-16T21:06:01+00:00" 25 | expected_created_by = "unknown" 26 | expected_tags = [ 27 | "doctor-kondraki", 28 | "doctor-iceberg", 29 | "doctor-gears", 30 | "tale" 31 | ] 32 | expected_web_html = "blablabla" 33 | expected_substitute_html = None 34 | expected_processed_html = "
processed html
" 35 | 36 | expected_processed_title = expected_title_shown 37 | 38 | mock_process_page_html.return_value = expected_processed_html 39 | 40 | expected_page = { 41 | "fullname": expected_fullname, 42 | "created_at": expected_created_at, 43 | "created_by": expected_created_by, 44 | "updated_at": "2019-09-15T01:08:04+00:00", 45 | "updated_by": "Elogee FishTruck", 46 | "title": expected_title, 47 | "title_shown": expected_title_shown, 48 | "parent_fullname": None, 49 | "tags": expected_tags, 50 | "rating": 38, 51 | "revisions": 36, 52 | "parent_title": None, 53 | "content": "", 54 | "children": 0, 55 | "comments": 5, 56 | "commented_at": "2015-09-16T18:15:32+00:00", 57 | "commented_by": "Decibelles", 58 | "scp_epub_additional_data": { 59 | "web_html": expected_web_html 60 | } 61 | } 62 | 63 | expected_processed_page = { 64 | "name": expected_fullname, 65 | "title": expected_processed_title, 66 | "created_by": expected_created_by, 67 | "created_at": expected_created_at, 68 | "tags": expected_tags, 69 | "html": expected_processed_html, 70 | } 71 | 72 | # Act 73 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 74 | 75 | # Assert 76 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 77 | self.assertEqual(expected_processed_page, actual_processed_page) 78 | 79 | @unittest.mock.patch('process.process_page.process_page_html') 80 | def test_process_page_no_title_shown(self, mock_process_page_html): 81 | # Arrange 82 | expected_url_allow_list = None 83 | 84 | expected_fullname = "personal-log-of-iceberg" 85 | expected_title = "Not Title Shown: Personal Log of █████ \"Iceberg\" ████" 86 | expected_title_shown = None 87 | expected_created_at = "2008-10-16T21:06:01+00:00" 88 | expected_created_by = "unknown" 89 | expected_tags = [ 90 | "doctor-kondraki", 91 | "doctor-iceberg", 92 | "doctor-gears", 93 | "tale" 94 | ] 95 | 
expected_web_html = "blablabla" 96 | expected_substitute_html = None 97 | expected_processed_html = "
processed html
" 98 | 99 | expected_processed_title = expected_title 100 | 101 | mock_process_page_html.return_value = expected_processed_html 102 | 103 | expected_page = { 104 | "fullname": expected_fullname, 105 | "created_at": expected_created_at, 106 | "created_by": expected_created_by, 107 | "updated_at": "2019-09-15T01:08:04+00:00", 108 | "updated_by": "Elogee FishTruck", 109 | "title": expected_title, 110 | "title_shown": expected_title_shown, 111 | "parent_fullname": None, 112 | "tags": expected_tags, 113 | "rating": 38, 114 | "revisions": 36, 115 | "parent_title": None, 116 | "content": "", 117 | "children": 0, 118 | "comments": 5, 119 | "commented_at": "2015-09-16T18:15:32+00:00", 120 | "commented_by": "Decibelles", 121 | "scp_epub_additional_data": { 122 | "web_html": expected_web_html 123 | } 124 | } 125 | 126 | expected_processed_page = { 127 | "name": expected_fullname, 128 | "title": expected_processed_title, 129 | "created_by": expected_created_by, 130 | "created_at": expected_created_at, 131 | "tags": expected_tags, 132 | "html": expected_processed_html, 133 | } 134 | 135 | # Act 136 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 137 | 138 | # Assert 139 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 140 | self.assertEqual(expected_processed_page, actual_processed_page) 141 | 142 | @unittest.mock.patch('process.process_page.process_page_html') 143 | def test_process_page_missing_fields(self, mock_process_page_html): 144 | # Arrange 145 | expected_url_allow_list = None 146 | 147 | expected_fullname = "personal-log-of-iceberg" 148 | expected_title = None 149 | expected_title_shown = None 150 | expected_created_at = None 151 | expected_created_by = None 152 | expected_tags = None 153 | expected_web_html = "blablabla" 154 | expected_substitute_html = None 155 | expected_processed_html = "
processed html
" 156 | 157 | expected_processed_title = constants.EMPTY_TITLE 158 | 159 | mock_process_page_html.return_value = expected_processed_html 160 | 161 | expected_page = { 162 | "fullname": expected_fullname, 163 | "created_at": expected_created_at, 164 | "created_by": expected_created_by, 165 | "updated_at": "2019-09-15T01:08:04+00:00", 166 | "updated_by": "Elogee FishTruck", 167 | "title": expected_title, 168 | "title_shown": expected_title_shown, 169 | "parent_fullname": None, 170 | "tags": expected_tags, 171 | "rating": 38, 172 | "revisions": 36, 173 | "parent_title": None, 174 | "content": "", 175 | "children": 0, 176 | "comments": 5, 177 | "commented_at": "2015-09-16T18:15:32+00:00", 178 | "commented_by": "Decibelles", 179 | "scp_epub_additional_data": { 180 | "web_html": expected_web_html 181 | } 182 | } 183 | 184 | expected_processed_page = { 185 | "name": expected_fullname, 186 | "title": expected_processed_title, 187 | "created_by": constants.EMPTY_AUTHOR, 188 | "created_at": constants.EMPTY_TIMESTAMP, 189 | "tags": [], 190 | "html": expected_processed_html, 191 | } 192 | 193 | # Act 194 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 195 | 196 | # Assert 197 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 198 | self.assertEqual(expected_processed_page, actual_processed_page) 199 | 200 | @unittest.mock.patch('process.process_page.process_page_html') 201 | def test_process_page_url_allow_list(self, mock_process_page_html): 202 | # Arrange 203 | expected_url_allow_list = ['a', 'b'] 204 | 205 | expected_fullname = "personal-log-of-iceberg" 206 | expected_title = "Not Title Shown: Personal Log of █████ \"Iceberg\" ████" 207 | expected_title_shown = "Title Shown: Personal Log of █████ \"Iceberg\" ████" 208 | expected_created_at = "2008-10-16T21:06:01+00:00" 209 | expected_created_by = "unknown" 210 | expected_tags 
= [ 211 | "doctor-kondraki", 212 | "doctor-iceberg", 213 | "doctor-gears", 214 | "tale" 215 | ] 216 | expected_web_html = "blablabla" 217 | expected_substitute_html = None 218 | expected_processed_html = "
processed html
" 219 | 220 | expected_processed_title = expected_title_shown 221 | 222 | mock_process_page_html.return_value = expected_processed_html 223 | 224 | expected_page = { 225 | "fullname": expected_fullname, 226 | "created_at": expected_created_at, 227 | "created_by": expected_created_by, 228 | "updated_at": "2019-09-15T01:08:04+00:00", 229 | "updated_by": "Elogee FishTruck", 230 | "title": expected_title, 231 | "title_shown": expected_title_shown, 232 | "parent_fullname": None, 233 | "tags": expected_tags, 234 | "rating": 38, 235 | "revisions": 36, 236 | "parent_title": None, 237 | "content": "", 238 | "children": 0, 239 | "comments": 5, 240 | "commented_at": "2015-09-16T18:15:32+00:00", 241 | "commented_by": "Decibelles", 242 | "scp_epub_additional_data": { 243 | "web_html": expected_web_html 244 | } 245 | } 246 | 247 | expected_processed_page = { 248 | "name": expected_fullname, 249 | "title": expected_processed_title, 250 | "created_by": expected_created_by, 251 | "created_at": expected_created_at, 252 | "tags": expected_tags, 253 | "html": expected_processed_html, 254 | } 255 | 256 | # Act 257 | actual_processed_page = process.process_page.process_page(expected_page, url_allow_list=expected_url_allow_list) 258 | 259 | # Assert 260 | mock_process_page_html.assert_called_once_with(expected_web_html, expected_processed_title, url_allow_list=expected_url_allow_list) 261 | self.assertEqual(expected_processed_page, actual_processed_page) 262 | 263 | 264 | class TestGetPageContent(unittest.TestCase): 265 | def setUp(self): 266 | self.maxDiff = None 267 | 268 | def create_soup(self, html): 269 | return bs4.BeautifulSoup(html, "html.parser") 270 | 271 | @parameterized.expand([ 272 | [ 273 | 'simple page content', 274 | 'outside
inside
', 275 | '
inside
' 276 | ], 277 | [ 278 | 'not found', 279 | 'outside', 280 | 'None' 281 | ], 282 | ]) 283 | def test_get_page_content(self, reason, expected_html_string, expected_output_string): 284 | # Arrange 285 | expected_page_content_id = 'page-content' 286 | 287 | # Act 288 | actual_output = process.process_page.get_page_content(expected_html_string, page_content_id=expected_page_content_id) 289 | 290 | # Assert 291 | self.assertEqual(expected_output_string, str(actual_output)) 292 | 293 | 294 | class TestProcessContentFunctions(unittest.TestCase): 295 | def setUp(self): 296 | self.maxDiff = None 297 | 298 | def create_soup(self, html): 299 | return bs4.BeautifulSoup(html, "html.parser") 300 | 301 | @parameterized.expand([ 302 | [ 303 | 'nothing to remove', 304 | '
asdf
', 305 | '
asdf
' 306 | ], 307 | [ 308 | 'complete removal', 309 | '
asdf
', 310 | '' 311 | ], 312 | [ 313 | 'nested', 314 | 'outside
qwq
asdf
qwrq
outside', 315 | 'outsideoutside' 316 | ], 317 | [ 318 | 'reverse nested', 319 | 'outside
qwq
asdf
qwrq
outside', 320 | 'outsideoutside' 321 | ], 322 | ]) 323 | def test_remove_by_class(self, reason, expected_html_string, expected_output_string): 324 | # Arrange 325 | expected_classses_to_remove = [ 326 | 'foo', 327 | 'bar' 328 | ] 329 | 330 | expected_content = self.create_soup(expected_html_string) 331 | expected_output = None 332 | 333 | # Act 334 | actual_output = process.process_page.remove_classes(expected_content, classes_to_remove=expected_classses_to_remove) 335 | 336 | # Assert 337 | self.assertEqual(expected_output_string, str(expected_content)) 338 | self.assertEqual(expected_output, actual_output) 339 | 340 | @parameterized.expand([ 341 | [ 342 | 'nothing to remove', 343 | 'asdf', 344 | 'asdf' 345 | ], 346 | [ 347 | 'complete removal', 348 | '', 349 | '' 350 | ], 351 | [ 352 | 'simple removal', 353 | 'outsideoutside', 354 | 'outsideoutside' 355 | ], 356 | [ 357 | 'singletag', 358 | 'outsideoutside', 359 | 'outsideoutside' 360 | ], 361 | ]) 362 | def test_remove_by_tags(self, reason, expected_html_string, expected_output_string): 363 | # Arrange 364 | expected_tags_to_remove = [ 365 | 'img' 366 | ] 367 | 368 | expected_content = self.create_soup(expected_html_string) 369 | expected_output = None 370 | 371 | # Act 372 | actual_output = process.process_page.remove_tags(expected_content, tags_to_remove=expected_tags_to_remove) 373 | 374 | # Assert 375 | self.assertEqual(expected_output_string, str(expected_content)) 376 | self.assertEqual(expected_output, actual_output) 377 | 378 | @parameterized.expand([ 379 | [ 380 | 'scp-047', 381 | '''outside
''', 382 | '''outside

> Show details

  • Pathogenicity: Severe skin colonisation around sebaceous glands. Modification of skin pH to levels that become toxic to skin cells. Massive inflammation and immune cell infiltration. Eventual breakdown of skin structure leading to sepsis.
  • Transmission: Transmitted by skin-to-skin contact. Can remain active on inorganic surfaces for up to five hours.
  • Lethality: Approximately 40% mortality rate. Runs its course in 2-6 weeks. Very visible symptoms within 5-10 hours; contagious within 2-5 hours.
  • Handling: As soon as visible symptoms form, victims must be quarantined. Deceased victims should be incinerated.
''' 383 | ], 384 | [ 385 | 'multiple_items_spurious_newline', 386 | '''
''', 387 | '''

+ Document S-1257-11

\n
  • Season 3, Episode 3, “Tyler’s Date”: Episode manifests three months after initial containment. One line in the script implies the assassination of American President Jimmy Carter in late 1978 or early 1979.
  • Season 4, Episode 1, “Bad Touch pt. 2”: Second half of a “special episode” ending season three. Eric, a young classmate of Danny’s, is the target of a sexual predator using what appears to be anomalous items manufactured by Doctor Wondertainment to lure his victims.
  • Season 5, Episode 10, “The Senior Trip”: Episode mentions a scandal where 10 members of the UK House of Commons had been publicly revealed as members of a cult that bears a strong resemblance to the Church of the Broken God.
  • Season 6, Episode 1, “The Freshmen”: The title of SCP-1257 is changed to Danny. The premise of the series changes as well, dropping the Tyler character and sending Danny to college in New York City with five of his classmates from High School.4 The University Lab appears to have specimens of SCP-███, SCP-███ and SCP-███.
  • Season 6, Episode 11, “The ████████”: Plot of the episode concerns Eric’s suspicions that one of their dormmates might be secretly one of the “████████.” This turns out to be a misunderstanding. From context, the “████████” appear to prey on young women and have become endemic in [REDACTED] and seem to be the result of a containment breach of [REDACTED] in Mexico City.
  • Season 7, Episode 2, “Eric’s Midterm Caper”: When this episode manifested in SCP-1257-3-12, a new advertisement appeared during the second break for Marshall, Carter, and Dark Ltd. The ad promoted [REDACTED] services for [REDACTED].
  • Season 10, Episode 1, “The Job Hunt”: Hour-long “special” introducing another change in premise.5 One scene implies that the Global Occult Coalition has become public enough to run “want ads” in the local newspaper.
  • Season 10, Episode 2, “The New Guy”: The show’s title is officially changed to Agent Danny of the SCP.6 Danny has been employed as Level 1 security at Site-19, and through a series of mishaps, prevents a containment breach of SCP-173.
  • Season 10, Episode 5, “D-Class Act”: Danny mis-hears a co-worker’s conversation and becomes convinced he has been mistakenly reassigned to D-Class by the HR Department.
  • Season 10, Episode 11, “Leaping Lizards”: [REDACTED] SCP-682 [REDACTED].

Note: Details of SCP-1257 episodes past Season 10 are only available with the approval of the Intelligence Department.

''' 388 | 389 | ] 390 | ]) 391 | def test_unwrap_collapsible_blocks(self, reason, expected_html_string, expected_output_string): 392 | # Arrange 393 | expected_content = self.create_soup(expected_html_string) 394 | expected_output = None 395 | 396 | # Act 397 | actual_output = process.process_page.unwrap_collapsible_blocks(expected_content) 398 | 399 | # Assert 400 | self.assertEqual(expected_output_string, str(expected_content)) 401 | self.assertEqual(expected_output, actual_output) 402 | 403 | @parameterized.expand([ 404 | [ 405 | 'simple', 406 | '''outside

I love peace. I'd kill to preserve it

''', 407 | '''outside

I love peace. I'd kill to preserve it

''' 408 | ], 409 | ]) 410 | def test_divify_blockquotes(self, reason, expected_html_string, expected_output_string): 411 | # Arrange 412 | expected_content = self.create_soup(expected_html_string) 413 | expected_output = None 414 | 415 | # Act 416 | actual_output = process.process_page.divify_blockquotes(expected_content) 417 | 418 | # Assert 419 | self.assertEqual(expected_output_string, str(expected_content)) 420 | self.assertEqual(expected_output, actual_output) 421 | 422 | @parameterized.expand([ 423 | [ 424 | 'two with nested div', 425 | '''

A specimen.

Effect 1509-1 typically.

''', 426 | '''

Effect 1509-1

A specimen.

Effect 1509-1 typically.

Effect 1509-2

Effect SCP-1509-2 occurs.

''' 427 | ], 428 | ]) 429 | def test_unwrap_navset(self, reason, expected_html_string, expected_output_string): 430 | # Arrange 431 | expected_content = self.create_soup(expected_html_string) 432 | expected_output = None 433 | 434 | # Act 435 | actual_output = process.process_page.unwrap_yui_navset(expected_content) 436 | 437 | # Assert 438 | self.assertEqual(expected_output_string, str(expected_content)) 439 | self.assertEqual(expected_output, actual_output) 440 | 441 | @parameterized.expand([ 442 | [ 443 | 'no links', 444 | '''asdf''', 445 | '''asdf''' 446 | ], 447 | [ 448 | 'non-href anchors', 449 | '''asdfasdfasdfasdf''', 450 | '''asdfasdfasdfasdf''' 451 | ], 452 | [ 453 | 'expanded internal link', 454 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 455 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 456 | ], 457 | [ 458 | 'other internal link', 459 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 460 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 461 | ], 462 | [ 463 | 'implicit internal link', 464 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 465 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 466 | ], 467 | [ 468 | 'external link', 469 | '''

This is by Autonomic (AARS821) RAISA. AAR

''', 470 | '''

This is by Autonomic (AARS821) RAISA. AAR

''' 471 | ], 472 | [ 473 | 'multiple links', 474 | '''

This is by Autonomic (AARS821) RAISA. AAR

asdf

This is by Autonomic (AARS821) RAISA. AAR

This is by Autonomic (AARS821) RAISA. AAR

''', 475 | '''

This is by Autonomic (AARS821) RAISA. AAR

asdf

This is by Autonomic (AARS821) RAISA. AAR

This is by Autonomic (AARS821) RAISA. AAR

''' 476 | ], 477 | [ 478 | 'not in book', 479 | '''asdf''', 480 | '''asdf''' 481 | ], 482 | [ 483 | 'not in book, implicit', 484 | '''asdf''', 485 | '''asdf''' 486 | ], 487 | [ 488 | 'ignore footnote links', 489 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''', 490 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''' 491 | ] 492 | ]) 493 | def test_fix_links(self, reason, expected_html_string, expected_output_string): 494 | # Arrange 495 | expected_url_allow_list = ['scp-3281', 'scp-1234'] 496 | 497 | expected_content = self.create_soup(expected_html_string) 498 | expected_output = None 499 | 500 | # Act 501 | actual_output = process.process_page.fix_links(expected_content, url_allow_list=expected_url_allow_list) 502 | 503 | # Assert 504 | self.assertEqual(expected_output_string, str(expected_content)) 505 | self.assertEqual(expected_output, actual_output) 506 | 507 | @parameterized.expand([ 508 | [ 509 | 'no links', 510 | '''asdf''', 511 | '''asdf''' 512 | ], 513 | [ 514 | 'non-href anchors', 515 | '''asdfasdfasdfasdf''', 516 | '''asdfasdfasdfasdf''' 517 | ], 518 | [ 519 | 'not in book, implicit', 520 | '''asdf''', 521 | '''asdf''' 522 | ], 523 | ]) 524 | def test_fix_links_no_whitelist(self, reason, expected_html_string, expected_output_string): 525 | # Arrange 526 | expected_url_allow_list = None 527 | 528 | expected_content = self.create_soup(expected_html_string) 529 | expected_output = None 530 | 531 | # Act 532 | actual_output = process.process_page.fix_links(expected_content, url_allow_list=expected_url_allow_list) 533 | 534 | # Assert 535 | self.assertEqual(expected_output_string, str(expected_content)) 536 | self.assertEqual(expected_output, actual_output) 537 | 538 | @parameterized.expand([ 539 | [ 540 | 'simple add title', 541 | '''asdf''', 542 | '''

Hi there!

asdf''' 543 | ], 544 | [ 545 | 'some other tags', 546 | '''
asdf
''', 547 | '''

Hi there!

asdf
''' 548 | ] 549 | ]) 550 | def test_add_title(self, reason, expected_html_string, expected_output_string): 551 | # Arrange 552 | expected_title = 'Hi there!' 553 | 554 | expected_content = self.create_soup(expected_html_string) 555 | expected_output = None 556 | 557 | # Act 558 | actual_output = process.process_page.add_title(expected_content, expected_title) 559 | 560 | # Assert 561 | self.assertEqual(expected_output_string, str(expected_content)) 562 | self.assertEqual(expected_output, actual_output) 563 | 564 | @parameterized.expand([ 565 | [ 566 | 'just the noteref', 567 | '''1''', 568 | '''1''' 569 | ], 570 | [ 571 | 'just the footnote', 572 | '''''', 573 | '''''' 574 | ], 575 | [ 576 | 'noterefs and footnotes', 577 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''', 578 | '''

Special Containment Procedures: SCP-1-800-J can be easily and safely stored anywhere in your home! SCP-1-800-J can be used safely by any member of the family1! No stains! No mess! No permanent physical or mental trauma!

Companies like Marshall, Carter, and Dark Ltd. and Dr. Wondertainment would charge you FORTUNES for similar products. But SCP-1-800-J is only $19.99! That's right! SCP-1-800-J is only $19.992!

''' 579 | ], 580 | ]) 581 | def test_fix_footnotes(self, reason, expected_html_string, expected_output_string): 582 | # Arrange 583 | expected_content = self.create_soup(expected_html_string) 584 | expected_output = None 585 | 586 | # Act 587 | actual_output = process.process_page.fix_footnotes(expected_content) 588 | 589 | # Assert 590 | self.assertEqual(expected_output_string, str(expected_content)) 591 | self.assertEqual(expected_output, actual_output) 592 | 593 | 594 | class TestHelpers(unittest.TestCase): 595 | def test_get_filename_from_name(self): 596 | # Arrange 597 | expected_name = 'scp-1234' 598 | expected_filename = 'scp-1234.xhtml' 599 | 600 | # Act 601 | actual_filename = process.process_page.get_filename(expected_name) 602 | 603 | # Assert 604 | self.assertEqual(expected_filename, actual_filename) 605 | --------------------------------------------------------------------------------