├── .gitignore
├── LICENSE
├── README.md
├── main.py
├── poetry.lock
└── pyproject.toml
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 | .DS_Store
131 | *.html
132 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Collin Heist
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TPDbCollectionMaker
2 | Python script to quickly make Kometa (PMM) poster entries from
3 | [ThePosterDatabase](https://theposterdb.com) (TPDb) sets.
4 |
5 | Because TPDb doesn't permit automated scraping, this tool reads HTML files.
6 |
7 | This tool handles collection, movie, show, and season posters all in
8 | one file and output YAML that can be used in Kometa metadata files.
9 | An example of part of the output for TheDoctor30's
10 | [Marvel Television Set](https://theposterdb.com/set/11318) is shown below:
11 |
12 | ```yaml
13 | # --------------------------------------------------------------------------------
14 | # collections
15 | Marvel Television:
16 | url_poster: https://theposterdb.com/api/assets/19724
17 | # --------------------------------------------------------------------------------
18 | # shows
19 | Marvel's Daredevil:
20 | url_poster: https://theposterdb.com/api/assets/19725
21 | seasons:
22 | 1: {url_poster: https://theposterdb.com/api/assets/19726}
23 | 2: {url_poster: https://theposterdb.com/api/assets/19727}
24 | 3: {url_poster: https://theposterdb.com/api/assets/19728}
25 | Marvel's Jessica Jones:
26 | url_poster: https://theposterdb.com/api/assets/19729
27 | seasons:
28 | 1: {url_poster: https://theposterdb.com/api/assets/19730}
29 | 2: {url_poster: https://theposterdb.com/api/assets/19731}
30 | 3: {url_poster: https://theposterdb.com/api/assets/19732}
31 | Marvel's Luke Cage:
32 | url_poster: https://theposterdb.com/api/assets/19733
33 | seasons:
34 | 1: {url_poster: https://theposterdb.com/api/assets/19734}
35 | 2: {url_poster: https://theposterdb.com/api/assets/19735}
36 | # etc..
37 | ```
38 |
39 | # How to Use
40 | ## Help Menu
41 | This is a Python command-line tool. All arguments are shown with `--help`:
42 |
43 | ```console
44 | $ poetry run python main.py -h
45 | usage: main.py [-h] [-p] [-q] HTML_FILE
46 |
47 | TPDb Collection Maker
48 |
49 | positional arguments:
50 | HTML_FILE file with TPDb Collection page HTML to scrape
51 |
52 | optional arguments:
53 | -h, --help show this help message and exit
54 | -p, --primary-only only parse the primary set (ignore any Additional Sets)
55 | -q, --always-quote put all titles in quotes ("")
56 | ```
57 |
58 | ## Installation
59 |
60 | > NOTE: If copying these commands, do __not__ copy the `$` - that is just to
61 | show this is a _command_.
62 |
63 | 1. Install `poetry` - see [here](https://python-poetry.org/docs/#installation)
64 | with the `pipx` method.
65 |
66 | 2. Download this tool:
67 | ```console
68 | $ git clone https://github.com/CollinHeist/TPDbCollectionMaker/
69 | ```
70 |
71 | 3. Install the Python dependencies:
72 | ```console
73 | cd TPDbCollectionMaker
74 | poetry install
75 | ```
76 |
77 | 4. Run the script (see [Arguments](#arguments) for details).
78 | ```
79 | poetry run python main.py -h
80 | ```
81 |
82 | ## Getting Page HTML
83 | Because TPDb doesn't permit automated scraping, this tool reads HTML files. To
84 | get the HTML of a set, right-click the set page and select `Inspect`:
85 |
86 |
87 |
88 | This should launch your browser's HTML inspector. It should look something like:
89 |
90 |
91 |
92 | Go to the top-most HTML element (if HTML is selected, hold the left-arrow key
93 | to collapse all the HTML). The top-most HTML should look like:
94 |
95 | ```html
96 |
97 |
98 | ...
99 | ```
100 |
101 | Right-click the `<html>` element, go to `Copy` >
102 | `Inner HTML`. Your clipboard now has the complete HTML of the set page; paste
103 | this into some file alongside the `main.py` file of this project. This file will
104 | be the input to the script (see below).
105 |
106 | ## Arguments
107 | ### `html`
108 | Input HTML file to parse.
109 |
110 | ### `-p`, `--primary-only`
111 | Only parse the primary content on the given HTML page, ignoring any Additional
112 | Sets. If unspecified, then the entire page is parsed.
113 |
114 | ### `-q`, `--always-quote`
115 | Quote all titles in the output. If unspecified, only titles with colons are
116 | quoted.
117 |
118 | Below is an example of this argument:
119 |
120 | ```console
121 | $ poetry run python main.py in.html --always-quote
122 | "Iron Man (2008)":
123 | url_poster: https://theposterdb.com/api/assets/9773
124 | "The Incredible Hulk (2008)":
125 | url_poster: https://theposterdb.com/api/assets/9775
126 | "Iron Man 2 (2010)":
127 | url_poster: https://theposterdb.com/api/assets/9776
128 | ```
129 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from pathlib import Path
3 | from re import compile as re_compile
4 | from typing import Iterable, Literal
5 |
6 | try:
7 | from bs4 import BeautifulSoup
8 | except ImportError:
9 | print(f'Missing required packages - execute "poetry install"')
10 | exit(1)
11 |
12 |
# Command-line interface: one positional HTML file and two boolean switches
parser = ArgumentParser(description='TPDb Collection Maker')

# Positional input file; argparse converts the raw string into a Path
parser.add_argument(
    'html',
    metavar='HTML_FILE',
    help='file with TPDb Collection page HTML to scrape',
    type=Path,
)

# Restrict parsing to the primary set of the page
parser.add_argument(
    '-p',
    '--primary-only',
    help='only parse the primary set (ignore any Additional Sets)',
    action='store_true',
)

# Force quoting of every title in the output
parser.add_argument(
    '-q',
    '--always-quote',
    help='put all titles in quotes ("")',
    action='store_true',
)
28 |
29 |
# Content types a Content object can carry. The script passes the lowercased
# `data-poster-type` attribute of each poster; 'season' is additionally
# assigned by Content itself when a title ends in a season suffix.
ContentType = Literal[
    'category',
    'collection',
    'show',
    'movie',
    'company',
    'season',
]

# CSS class string used to locate the container <div> of the primary set
# when --primary-only is given
PRIMARY_CONTENT_CLASS: str = (
    'row d-flex flex-wrap m-0 w-100 mx-n1 mt-n1'
)
34 |
35 |
36 | class Content:
37 | """
38 | This class describes some type of content. Content contains a poster ID,
39 | a type, title, year, and a URL.
40 | """
41 |
42 | """Poster URL format for all TPDb posters"""
43 | POSTER_URL = 'https://theposterdb.com/api/assets/{id}'
44 |
45 | """Regex to match yearless titles and season names from full titles"""
46 | YEARLESS_REGEX=re_compile(r'^(.*) \(\d+\)($| - (?:Season \d+|(Specials)))$')
47 | SEASON_REGEX = re_compile(r'^.* - (?:Season (\d+)|(Specials))$')
48 |
49 | __slots__ = (
50 | 'poster_id', 'content_type', 'title', 'use_year', 'must_quote', 'url',
51 | 'yearless_title', 'season_number', 'sub_content',
52 | )
53 |
54 | def __init__(self, poster_id: int,
55 | content_type: ContentType,
56 | title: str, *,
57 | must_quote: bool = False) -> None:
58 | """
59 | Initialize the content described by the given attributes.
60 |
61 | Args:
62 | poster_id: TPDb poster ID of the content.
63 | content_type: Type of content being created.
64 | title: Title of the content.
65 | must_quote: Override for whether the finalized title of this
66 | content should be quoted or not. Keyword only.
67 | """
68 |
69 | self.poster_id = poster_id
70 | self.content_type = content_type
71 | self.title = title
72 | self.use_year = False
73 | self.must_quote = must_quote or ': ' in self.title
74 | self.url = self.POSTER_URL.format(id=self.poster_id)
75 |
76 | # Attempt to parse the yearless title
77 | if (group := self.YEARLESS_REGEX.match(self.title)) is None:
78 | self.yearless_title = self.title
79 | else:
80 | self.yearless_title = group.group(1)
81 |
82 | # If season name is in the title, parse
83 | if (season_group := self.SEASON_REGEX.match(self.title)) is None:
84 | self.season_number = None
85 | else:
86 | self.content_type = 'season'
87 | if season_group.group(2) == 'Specials':
88 | self.season_number = 0
89 | else:
90 | self.season_number = int(season_group.group(1))
91 |
92 | # No subcontent yet
93 | self.sub_content = {}
94 |
95 |
96 | @property
97 | def final_title(self) -> str:
98 | """
99 | The finalized title for this Content. Quoted and utilizing the
100 | year if necessary.
101 | """
102 |
103 | title = self.title if self.use_year else self.yearless_title
104 | return f'"{title}"' if self.must_quote else title
105 |
106 |
107 | def __repr__(self) -> str:
108 | attributes = ', '.join(
109 | f'{attr}={getattr(self, attr)!r}' for attr in self.__slots__
110 | if not attr.startswith('__')
111 | )
112 |
113 | return f''
114 |
115 |
116 | def __str__(self) -> str:
117 | """
118 | The string representation of this content. This is the formatted
119 | content string used within PMM, and the return format depends on
120 | the content type of this object.
121 | """
122 |
123 | if self.content_type in ('category', 'collection', 'movie', 'company'):
124 | return f'{self.final_title}:\n url_poster: {self.url}'
125 | elif self.content_type == 'show':
126 | base = f'{self.final_title}:\n url_poster: {self.url}'
127 | if len(self.sub_content) > 0:
128 | sub = '\n '.join(str(self.sub_content[season])
129 | for season in sorted(self.sub_content))
130 | return f'{base}\n seasons:\n {sub}'
131 |
132 | return base
133 | elif self.content_type == 'season':
134 | return f'{self.season_number}: ' + '{url_poster: ' + self.url + '}'
135 |
136 | return f''
137 |
138 |
139 | def is_sub_content_of(self, content: 'Content') -> bool:
140 | """
141 | Determine whether the given content object is the parent content
142 | of this object.
143 |
144 | Args:
145 | content: Object being compared against.
146 |
147 | Returns:
148 | True if the given object is the parent of this content.
149 | False otherwise.
150 | """
151 |
152 | # Only a show can have a season child
153 | if self.content_type != 'season' or content.content_type != 'show':
154 | return False
155 |
156 | return (content.yearless_title == self.yearless_title
157 | and content.title in self.title)
158 |
159 |
160 | def is_parent_content_of(self, content: 'Content') -> bool:
161 | """Logical composite of `is_sub_content_of` on this object."""
162 |
163 | return content.is_sub_content_of(self)
164 |
165 |
166 | def add_sub_content(self, content: 'Content') -> None:
167 | """
168 | Add the given content as sub content of this object.
169 |
170 | Args:
171 | content: Sub-content being added.
172 | """
173 |
174 | self.sub_content[content.season_number] = content
175 |
176 |
class ContentList:
    """
    This class describes a container list of Content objects. This is a
    glorified dictionary of lists for each content type.
    """

    def __init__(self) -> None:
        """Create an empty list for every known content type."""

        # Insertion order here is the order the sections are printed in
        self.content: dict[ContentType, list[Content]] = {
            'category': [],
            'collection': [],
            'movie': [],
            'show': [],
            'season': [],
            'company': [],
        }

    def __bool__(self) -> bool:
        """Whether this object contains any content."""

        return any(self.content.values())

    def __repr__(self) -> str:
        """Debug representation showing the per-type content lists."""

        return f'<ContentList content={self.content!r}>'

    def __divider(self, label: str, /) -> str:
        """Return the commented section header printed above `label`."""

        return f'\n# {"-"*80}\n# {label}\n# {"-" * 80}'

    def add_content(self, new: 'Content') -> None:
        """
        Add the given content to this object. This finds any existing
        content the new content could be a child or parent of, and adds
        this object to them if indicated.

        Args:
            new: Content being added.
        """

        # Check if new content belongs to any existing shows
        for existing in self.content['show']:
            if new.is_sub_content_of(existing):
                existing.add_sub_content(new)
                # Can only belong to one show, stop looping
                break

        # Check if any existing seasons belong to new content
        for existing in self.content['season']:
            if new.is_parent_content_of(existing):
                new.add_sub_content(existing)

        # Content with a title that is already present keeps its year so
        # the two entries do not share a key in the output
        for existing in self.content[new.content_type]:
            if existing.title == new.title:
                new.use_year = True
                break

        self.content[new.content_type].append(new)

    def print(self) -> None:
        """
        Print this object. This prints segmented sections of each type
        of Content in this object.
        """

        # Irregular plurals for section headers; other types just get an "s"
        plurals = {'category': 'categories', 'company': 'companies'}

        # Print each content group
        for content_type, content_list in self.content.items():
            # Don't print empty content sets, or base seasons
            if not content_list or content_type == 'season':
                continue

            # Print divider, content type header, and all content
            print(self.__divider(plurals.get(content_type, content_type + 's')))
            for content in content_list:
                print(str(content))

        # Print season content if no parent show content was parsed
        # NOTE(review): when at least one show exists, seasons that matched
        # no show are not printed anywhere - confirm this is intended
        if self.content['season'] and not self.content['show']:
            print(self.__divider('Unassigned Content'))
            for content in self.content['season']:
                print(str(content))
263 |
264 |
# If file is entrypoint, parse args and print the generated entries
if __name__ == '__main__':
    # Parse given arguments
    args = parser.parse_args()

    # The positional argument is stored as `args.html` (HTML_FILE is only
    # its display name), so that is the attribute to report on
    if not args.html.exists():
        print(f'File "{args.html.resolve()}" does not exist')
        raise SystemExit(1)

    # Open file and read content
    with args.html.open('r') as file_handle:
        html = file_handle.read()

    # Create BeautifulSoup element of HTML
    page = BeautifulSoup(html, 'html.parser')

    # If only doing primary content, filter webpage
    if args.primary_only:
        webpage = page.find('div', class_=PRIMARY_CONTENT_CLASS)
        # find() returns None when the primary container is absent; treat
        # that as a page with no posters instead of crashing
        if webpage is None:
            posters = []
        else:
            posters = webpage.find_all('div', class_='overlay rounded-poster')
    else:
        posters = page.find_all('div', class_='overlay rounded-poster')

    # Get all posters in this set, create Content and add to list
    content_list = ContentList()
    for poster_element in posters:
        content = Content(
            poster_element.attrs['data-poster-id'],
            poster_element.attrs['data-poster-type'].lower(),
            poster_element.find('p', class_='p-0 mb-1 text-break').string,
            must_quote=args.always_quote,
        )
        content_list.add_content(content)

    if content_list:
        content_list.print()
    else:
        print('No content identified!')
304 |
--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
1 | # This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
2 |
3 | [[package]]
4 | name = "beautifulsoup4"
5 | version = "4.12.3"
6 | description = "Screen-scraping library"
7 | optional = false
8 | python-versions = ">=3.6.0"
9 | files = [
10 | {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
11 | {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
12 | ]
13 |
14 | [package.dependencies]
15 | soupsieve = ">1.2"
16 |
17 | [package.extras]
18 | cchardet = ["cchardet"]
19 | chardet = ["chardet"]
20 | charset-normalizer = ["charset-normalizer"]
21 | html5lib = ["html5lib"]
22 | lxml = ["lxml"]
23 |
24 | [[package]]
25 | name = "soupsieve"
26 | version = "2.5"
27 | description = "A modern CSS selector implementation for Beautiful Soup."
28 | optional = false
29 | python-versions = ">=3.8"
30 | files = [
31 | {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
32 | {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
33 | ]
34 |
35 | [metadata]
36 | lock-version = "2.0"
37 | python-versions = "^3.12"
38 | content-hash = "99f2a8fe1acf290a6f68cacbf4817fb2ec5bc643cc7d1380fe234555f962eb59"
39 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "tpdbcollectionmaker"
3 | version = "0.2.1"
4 | description = "Quickly make Plex Meta Manager poster entries from ThePosterDatabase sets"
5 | authors = ["Collin Heist "]
6 | license = "MIT"
7 | readme = "README.md"
8 |
9 | [tool.poetry.dependencies]
10 | python = "^3.12"
11 | beautifulsoup4 = "^4.12.3"
12 |
13 |
14 | [build-system]
15 | requires = ["poetry-core"]
16 | build-backend = "poetry.core.masonry.api"
17 |
--------------------------------------------------------------------------------