├── .gitignore ├── LICENSE ├── README.md ├── main.py ├── poetry.lock └── pyproject.toml /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | .DS_Store 131 | *.html 132 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Collin Heist 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TPDbCollectionMaker 2 | Python script to quickly make Kometa (PMM) poster entries from 3 | [ThePosterDatabase](https://theposterdb.com) (TPDb) sets. 4 | 5 | Because TPDb doesn't permit automated scraping, this tool reads HTML files. 6 | 7 | This tool will read handle collections, movies, shows, and season posters all in 8 | one file and output YAML that can be used in Kometa metadata files. 9 | An example of part of the output for TheDoctor30's 10 | [Marvel Television Set](https://theposterdb.com/set/11318) is shown below: 11 | 12 | ```yaml 13 | # -------------------------------------------------------------------------------- 14 | # collections 15 | Marvel Television: 16 | url_poster: https://theposterdb.com/api/assets/19724 17 | # -------------------------------------------------------------------------------- 18 | # shows 19 | Marvel's Daredevil: 20 | url_poster: https://theposterdb.com/api/assets/19725 21 | seasons: 22 | 1: {url_poster: https://theposterdb.com/api/assets/19726} 23 | 2: {url_poster: https://theposterdb.com/api/assets/19727} 24 | 3: {url_poster: https://theposterdb.com/api/assets/19728} 25 | Marvel's Jessica Jones: 26 | url_poster: https://theposterdb.com/api/assets/19729 27 | seasons: 28 | 1: {url_poster: https://theposterdb.com/api/assets/19730} 29 | 2: {url_poster: https://theposterdb.com/api/assets/19731} 30 | 3: {url_poster: https://theposterdb.com/api/assets/19732} 31 | Marvel's Luke Cage: 32 | url_poster: https://theposterdb.com/api/assets/19733 33 | seasons: 34 | 1: {url_poster: 
https://theposterdb.com/api/assets/19734} 35 | 2: {url_poster: https://theposterdb.com/api/assets/19735} 36 | # etc.. 37 | ``` 38 | 39 | # How to Use 40 | ## Help Menu 41 | This is a Python command-line tool. All arguments are shown with `--help`: 42 | 43 | ```console 44 | $ poetry run python main.py -h 45 | usage: main.py [-h] [-p] [-q] HTML_FILE 46 | 47 | TPDb Collection Maker 48 | 49 | positional arguments: 50 | HTML_FILE file with TPDb Collection page HTML to scrape 51 | 52 | optional arguments: 53 | -h, --help show this help message and exit 54 | -p, --primary-only only parse the primary set (ignore any Additional Sets) 55 | -q, --always-quote put all titles in quotes ("") 56 | ``` 57 | 58 | ## Installation 59 | 60 | > NOTE: If copying these commands, do __not__ copy the `$` - that is just to 61 | show this is a _command_. 62 | 63 | 1. Install `poetry` - see [here](https://python-poetry.org/docs/#installation) 64 | with the `pipx` method. 65 | 66 | 2. Download this tool: 67 | ```console 68 | $ git clone https://github.com/CollinHeist/TPDbCollectionMaker/ 69 | ``` 70 | 71 | 3. Install the Python dependencies: 72 | ```console 73 | cd TPDbCollectionMaker 74 | poetry install 75 | ``` 76 | 77 | 4. Run the script (see [Arguments](#arguments) for details). 78 | ``` 79 | poetry run python main.py -h 80 | ``` 81 | 82 | ## Getting Page HTML 83 | Because TPDb doesn't permit automated scraping, this tool reads HTML files. To 84 | get the HTML of a set, right-click the set page and select `Inspect`: 85 | 86 | 87 | 88 | This should launch your browser's HTML inspector. It should look something like: 89 | 90 | 91 | 92 | Go to the top-most HTML element (if HTML is selected, hold the left-arrow key 93 | to collapse all the HTML). The top-most HTML should look like: 94 | 95 | ```html 96 | 97 | 98 | ... 99 | ``` 100 | 101 | Right-click the `` element, go to `Copy` > 102 | `Inner HTML`. 
Your clipboard now has the complete HTML of the set page; paste 103 | this into some file alongside the `main.py` file of this project. This file will 104 | be the input to the script (see below). 105 | 106 | ## Arguments 107 | ### `html` 108 | Input HTML file to parse. 109 | 110 | ### `-p`, `--primary-only` 111 | Only parse the primary content on the given HTML page, ignoring any Additional 112 | Sets. If unspecified, then the entire page is parsed. 113 | 114 | ### `-q`, `--always-quote` 115 | Quote all titles in the output. If unspecified, only titles with colons are 116 | quoted. 117 | 118 | Below is an example of this argument: 119 | 120 | ```console 121 | $ poetry run python main.py in.html --always-quote 122 | "Iron Man (2008)": 123 | url_poster: https://theposterdb.com/api/assets/9773 124 | "The Incredible Hulk (2008)": 125 | url_poster: https://theposterdb.com/api/assets/9775 126 | "Iron Man 2 (2010)": 127 | url_poster: https://theposterdb.com/api/assets/9776 128 | ``` 129 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from pathlib import Path 3 | from re import compile as re_compile 4 | from typing import Iterable, Literal 5 | 6 | try: 7 | from bs4 import BeautifulSoup 8 | except ImportError: 9 | print(f'Missing required packages - execute "poetry install"') 10 | exit(1) 11 | 12 | 13 | # Create ArgumentParser object and arguments 14 | parser = ArgumentParser(description='TPDb Collection Maker') 15 | parser.add_argument( 16 | 'html', 17 | type=Path, 18 | metavar='HTML_FILE', 19 | help='file with TPDb Collection page HTML to scrape') 20 | parser.add_argument( 21 | '-p', '--primary-only', 22 | action='store_true', 23 | help='only parse the primary set (ignore any Additional Sets)') 24 | parser.add_argument( 25 | '-q', '--always-quote', 26 | action='store_true', 27 | help='put all titles in 
# NOTE(review): this source was recovered from an HTML dump; f-string bodies
# that looked like HTML tags (e.g. '<Content ...>') were stripped by the
# extraction and have been reconstructed below - confirm against upstream.

ContentType = Literal[
    'category', 'collection', 'show', 'movie', 'company', 'season'
]

# CSS class of the <div> wrapping the primary set's posters on a TPDb set page
PRIMARY_CONTENT_CLASS: str = 'row d-flex flex-wrap m-0 w-100 mx-n1 mt-n1'


class Content:
    """
    This class describes some type of content. Content contains a poster ID,
    a type, title, year, and a URL.
    """

    # Poster URL format for all TPDb posters
    POSTER_URL = 'https://theposterdb.com/api/assets/{id}'

    # Regex to match yearless titles and season names from full titles, e.g.
    # "Show (2015)", "Show (2015) - Season 1", or "Show (2015) - Specials"
    YEARLESS_REGEX = re_compile(r'^(.*) \(\d+\)($| - (?:Season \d+|(Specials)))$')
    SEASON_REGEX = re_compile(r'^.* - (?:Season (\d+)|(Specials))$')

    __slots__ = (
        'poster_id', 'content_type', 'title', 'use_year', 'must_quote', 'url',
        'yearless_title', 'season_number', 'sub_content',
    )

    def __init__(self, poster_id: int,
            content_type: ContentType,
            title: str, *,
            must_quote: bool = False) -> None:
        """
        Initialize the content described by the given attributes.

        Args:
            poster_id: TPDb poster ID of the content.
            content_type: Type of content being created. Overridden to
                'season' if the title matches a season/Specials pattern.
            title: Title of the content.
            must_quote: Override for whether the finalized title of this
                content should be quoted or not. Keyword only.
        """

        self.poster_id = poster_id
        self.content_type = content_type
        self.title = title
        self.use_year = False
        # Titles containing ': ' would break YAML mapping keys - force quoting
        self.must_quote = must_quote or ': ' in self.title
        self.url = self.POSTER_URL.format(id=self.poster_id)

        # Attempt to parse the yearless title; fall back to full title
        if (group := self.YEARLESS_REGEX.match(self.title)) is None:
            self.yearless_title = self.title
        else:
            self.yearless_title = group.group(1)

        # If a season name is in the title, parse it; Specials map to season 0
        if (season_group := self.SEASON_REGEX.match(self.title)) is None:
            self.season_number = None
        else:
            self.content_type = 'season'
            if season_group.group(2) == 'Specials':
                self.season_number = 0
            else:
                self.season_number = int(season_group.group(1))

        # No subcontent yet (populated via add_sub_content)
        self.sub_content = {}


    @property
    def final_title(self) -> str:
        """
        The finalized title for this Content. Quoted and utilizing the
        year if necessary.
        """

        title = self.title if self.use_year else self.yearless_title
        return f'"{title}"' if self.must_quote else title


    def __repr__(self) -> str:
        attributes = ', '.join(
            f'{attr}={getattr(self, attr)!r}' for attr in self.__slots__
            if not attr.startswith('__')
        )

        # Reconstructed (stripped in the recovered source) - see NOTE above
        return f'<Content {attributes}>'


    def __str__(self) -> str:
        """
        The string representation of this content. This is the formatted
        content string used within PMM, and the return format depends on
        the content type of this object.
        """

        if self.content_type in ('category', 'collection', 'movie', 'company'):
            return f'{self.final_title}:\n  url_poster: {self.url}'
        elif self.content_type == 'show':
            base = f'{self.final_title}:\n  url_poster: {self.url}'
            if len(self.sub_content) > 0:
                # Seasons are emitted in ascending season-number order
                sub = '\n    '.join(str(self.sub_content[season])
                                    for season in sorted(self.sub_content))
                return f'{base}\n  seasons:\n    {sub}'

            return base
        elif self.content_type == 'season':
            return f'{self.season_number}: ' + '{url_poster: ' + self.url + '}'

        # Fallback for unrecognized content types (reconstructed - see NOTE)
        return f'<Content of unparsed type "{self.content_type}">'


    def is_sub_content_of(self, content: 'Content') -> bool:
        """
        Determine whether the given content object is the parent content
        of this object.

        Args:
            content: Object being compared against.

        Returns:
            True if the given object is the parent of this content.
            False otherwise.
        """

        # Only a show can have a season child
        if self.content_type != 'season' or content.content_type != 'show':
            return False

        return (content.yearless_title == self.yearless_title
                and content.title in self.title)


    def is_parent_content_of(self, content: 'Content') -> bool:
        """Logical composite of `is_sub_content_of` on this object."""

        return content.is_sub_content_of(self)


    def add_sub_content(self, content: 'Content') -> None:
        """
        Add the given content as sub content of this object.

        Args:
            content: Sub-content being added; stored by its season number.
        """

        self.sub_content[content.season_number] = content


class ContentList:
    """
    This class describes a container list of Content objects. This is a
    glorified dictionary of lists for each content type.
    """


    def __init__(self) -> None:
        # One bucket per content type, filled in insertion order
        self.content: dict[ContentType, Iterable[Content]] = {
            'category': [],
            'collection': [],
            'movie': [],
            'show': [],
            'season': [],
            'company': [],
        }


    def __bool__(self) -> bool:
        """Whether this object contains any content."""

        return any(content for content in self.content.values())


    def __repr__(self) -> str:
        # Reconstructed (stripped in the recovered source) - see NOTE above
        return f'<ContentList {self.content!r}>'


    def __divider(self, label: str, /) -> str:
        return f'\n# {"-"*80}\n# {label}\n# {"-" * 80}'


    def add_content(self, new: Content) -> None:
        """
        Add the given content to this object. This finds any existing
        content the new content could be a child or parent of, and adds
        this object to them if indicated.

        Args:
            new: Content being added.
        """

        # Check if new content belongs to any existing shows
        for existing in self.content['show']:
            if new.is_sub_content_of(existing):
                existing.add_sub_content(new)
                # Can only belong to one show, stop looping
                break

        # Check if any existing seasons belong to new content
        for existing in self.content['season']:
            if new.is_parent_content_of(existing):
                new.add_sub_content(existing)

        # Check for content of this same title; disambiguate with the year
        for existing in self.content[new.content_type]:
            if existing.title == new.title:
                new.use_year = True
                break

        self.content[new.content_type].append(new)


    def print(self) -> None:
        """
        Print this object. This prints segmented sections of each type
        of Content in this object.
        """

        # Print each content group
        for content_type, content_list in self.content.items():
            # Don't print empty content sets, or base seasons (seasons are
            # normally printed nested under their parent show)
            if not content_list or content_type == 'season':
                continue

            # Print divider, content type header, and all content
            print(self.__divider(content_type + 's'))
            for content in content_list:
                print(str(content))

        # Print season content if no parent show content was parsed.
        # NOTE(review): seasons whose show was parsed are assumed attached to
        # it; orphan seasons alongside parsed shows are dropped - confirm.
        if self.content['season'] and not self.content['show']:
            print(self.__divider('Unassigned Content'))
            for content in self.content['season']:
                print(str(content))


"""If file is entrypoint, parse args"""
if __name__ == '__main__':
    # Parse given arguments
    args = parser.parse_args()

    # Get page HTML from file if provided
    if not args.html.exists():
        # FIX: was args.html_file, which does not exist on the namespace and
        # raised AttributeError instead of printing this message
        print(f'File "{args.html.resolve()}" does not exist')
        exit(1)

    # Open file and read content (HTML is conventionally UTF-8)
    with args.html.open('r', encoding='utf-8') as file_handle:
        html = file_handle.read()

    # Create BeautifulSoup element of HTML
    page = BeautifulSoup(html, 'html.parser')

    # If only doing primary content, filter webpage
    if args.primary_only:
        webpage = page.find('div', class_=PRIMARY_CONTENT_CLASS)
        # FIX: find() returns None when the class is absent; fail clearly
        # instead of raising AttributeError on .find_all below
        if webpage is None:
            print('No primary set found on the given page')
            exit(1)
        posters = webpage.find_all('div', class_='overlay rounded-poster')
    else:
        posters = page.find_all('div', class_='overlay rounded-poster')

    # Get all posters in this set, create Content and add to list
    content_list = ContentList()
    for poster_element in posters:
        content = Content(
            # Attributes are strings in the HTML; coerce to the declared int
            int(poster_element.attrs['data-poster-id']),
            poster_element.attrs['data-poster-type'].lower(),
            poster_element.find('p', class_='p-0 mb-1 text-break').string,
            must_quote=args.always_quote,
        )
        content_list.add_content(content)

    if content_list:
        content_list.print()
    else:
        print('No content identified!')
content_list.print() 302 | else: 303 | print(f'No content identified!') 304 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. 2 | 3 | [[package]] 4 | name = "beautifulsoup4" 5 | version = "4.12.3" 6 | description = "Screen-scraping library" 7 | optional = false 8 | python-versions = ">=3.6.0" 9 | files = [ 10 | {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, 11 | {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, 12 | ] 13 | 14 | [package.dependencies] 15 | soupsieve = ">1.2" 16 | 17 | [package.extras] 18 | cchardet = ["cchardet"] 19 | chardet = ["chardet"] 20 | charset-normalizer = ["charset-normalizer"] 21 | html5lib = ["html5lib"] 22 | lxml = ["lxml"] 23 | 24 | [[package]] 25 | name = "soupsieve" 26 | version = "2.5" 27 | description = "A modern CSS selector implementation for Beautiful Soup." 
28 | optional = false 29 | python-versions = ">=3.8" 30 | files = [ 31 | {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, 32 | {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, 33 | ] 34 | 35 | [metadata] 36 | lock-version = "2.0" 37 | python-versions = "^3.12" 38 | content-hash = "99f2a8fe1acf290a6f68cacbf4817fb2ec5bc643cc7d1380fe234555f962eb59" 39 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "tpdbcollectionmaker" 3 | version = "0.2.1" 4 | description = "Quickly make Plex Meta Manager poster entries from ThePosterDatabase sets" 5 | authors = ["Collin Heist "] 6 | license = "MIT" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.12" 11 | beautifulsoup4 = "^4.12.3" 12 | 13 | 14 | [build-system] 15 | requires = ["poetry-core"] 16 | build-backend = "poetry.core.masonry.api" 17 | --------------------------------------------------------------------------------