├── .gitignore
├── LICENSE
├── README.md
├── main.py
├── poetry.lock
└── pyproject.toml


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | .DS_Store
131 | *.html
132 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Collin Heist
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # TPDbCollectionMaker
  2 | Python script to quickly make Kometa (PMM) poster entries from
  3 | [ThePosterDatabase](https://theposterdb.com) (TPDb) sets.
  4 | 
  5 | Because TPDb doesn't permit automated scraping, this tool reads HTML files.
  6 | 
  7 | This tool handles collections, movies, shows, and season posters all in
  8 | one file and output YAML that can be used in Kometa metadata files.
  9 | An example of part of the output for TheDoctor30's
 10 | [Marvel Television Set](https://theposterdb.com/set/11318) is shown below:
 11 | 
 12 | ```yaml
 13 | # --------------------------------------------------------------------------------
 14 | # collections
 15 | Marvel Television:
 16 |   url_poster: https://theposterdb.com/api/assets/19724
 17 | # --------------------------------------------------------------------------------
 18 | # shows
 19 | Marvel's Daredevil:
 20 |   url_poster: https://theposterdb.com/api/assets/19725
 21 |   seasons:
 22 |     1: {url_poster: https://theposterdb.com/api/assets/19726}
 23 |     2: {url_poster: https://theposterdb.com/api/assets/19727}
 24 |     3: {url_poster: https://theposterdb.com/api/assets/19728}
 25 | Marvel's Jessica Jones:
 26 |   url_poster: https://theposterdb.com/api/assets/19729
 27 |   seasons:
 28 |     1: {url_poster: https://theposterdb.com/api/assets/19730}
 29 |     2: {url_poster: https://theposterdb.com/api/assets/19731}
 30 |     3: {url_poster: https://theposterdb.com/api/assets/19732}
 31 | Marvel's Luke Cage:
 32 |   url_poster: https://theposterdb.com/api/assets/19733
 33 |   seasons:
 34 |     1: {url_poster: https://theposterdb.com/api/assets/19734}
 35 |     2: {url_poster: https://theposterdb.com/api/assets/19735}
 36 | # etc..
 37 | ```
 38 | 
 39 | # How to Use
 40 | ## Help Menu
 41 | This is a Python command-line tool. All arguments are shown with `--help`:
 42 | 
 43 | ```console
 44 | $ poetry run python main.py -h
 45 | usage: main.py [-h] [-p] [-q] HTML_FILE
 46 | 
 47 | TPDb Collection Maker
 48 | 
 49 | positional arguments:
 50 |   HTML_FILE           file with TPDb Collection page HTML to scrape
 51 | 
 52 | optional arguments:
 53 |   -h, --help          show this help message and exit
 54 |   -p, --primary-only  only parse the primary set (ignore any Additional Sets)
 55 |   -q, --always-quote  put all titles in quotes ("")
 56 |   ```
 57 | 
 58 | ## Installation
 59 | 
 60 | > NOTE: If copying these commands, do __not__ copy the `$` - that is just to
 61 | show this is a _command_.
 62 | 
 63 | 1. Install `poetry` - see [here](https://python-poetry.org/docs/#installation)
 64 | with the `pipx` method.
 65 | 
 66 | 2. Download this tool:
 67 | ```console
 68 | $ git clone https://github.com/CollinHeist/TPDbCollectionMaker/
 69 | ```
 70 | 
 71 | 3. Install the Python dependencies:
 72 | ```console
 73 | cd TPDbCollectionMaker
 74 | poetry install
 75 | ```
 76 | 
 77 | 4. Run the script (see [Arguments](#arguments) for details).
 78 | ```
 79 | poetry run python main.py -h
 80 | ```
 81 | 
 82 | ## Getting Page HTML
 83 | Because TPDb doesn't permit automated scraping, this tool reads HTML files. To
 84 | get the HTML of a set, right-click the set page and select `Inspect`:
 85 | 
 86 | <img src="https://user-images.githubusercontent.com/17693271/168729610-42ac80fc-afb7-40b4-a6bd-39b3f310619c.jpg" width="600"/>
 87 | 
 88 | This should launch your browser's HTML inspector. It should look something like:
 89 | 
 90 | <img src="https://user-images.githubusercontent.com/17693271/168729837-eacfc4d8-29d3-4968-80f2-17ed164a8884.jpg" width="600"/>
 91 | 
 92 | Go to the top-most HTML element (if HTML is selected, hold the left-arrow key
 93 | to collapse all the HTML). The top-most HTML should look like:
 94 | 
 95 | ```html
 96 | <!DOCTYPE html>
 97 | <html class="h-100" lang="en"><head>
 98 | ...
 99 | ```
100 | 
101 | Right-click the `<html class="h-100" lang="en"><head>` element, go to `Copy` >
102 | `Inner HTML`. Your clipboard now has the complete HTML of the set page; paste
103 | this into some file alongside the `main.py` file of this project. This file will
104 | be the input to the script (see below).
105 | 
106 | ## Arguments
107 | ### `html`
108 | Input HTML file to parse.
109 | 
110 | ### `-p`, `--primary-only`
111 | Only parse the primary content on the given HTML page, ignoring any Additional
112 | Sets. If unspecified, then the entire page is parsed.
113 | 
114 | ### `-q`, `--always-quote`
115 | Quote all titles in the output. If unspecified, only titles with colons are
116 | quoted.
117 | 
118 | Below is an example of this argument:
119 | 
120 | ```console
121 | $ poetry run python main.py in.html --always-quote
122 | "Iron Man (2008)":
123 |   url_poster: https://theposterdb.com/api/assets/9773
124 | "The Incredible Hulk (2008)":
125 |   url_poster: https://theposterdb.com/api/assets/9775
126 | "Iron Man 2 (2010)":
127 |   url_poster: https://theposterdb.com/api/assets/9776
128 | ```
129 | 


--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
  1 | from argparse import ArgumentParser
  2 | from pathlib import Path
  3 | from re import compile as re_compile
  4 | from typing import Iterable, Literal
  5 | 
  6 | try:
  7 |     from bs4 import BeautifulSoup
  8 | except ImportError:
  9 |     print(f'Missing required packages - execute "poetry install"')
 10 |     exit(1)
 11 | 
 12 | 
 13 | # Create ArgumentParser object and arguments
 14 | parser = ArgumentParser(description='TPDb Collection Maker')
 15 | parser.add_argument(
 16 |     'html',
 17 |     type=Path,
 18 |     metavar='HTML_FILE',
 19 |     help='file with TPDb Collection page HTML to scrape')
 20 | parser.add_argument(
 21 |     '-p', '--primary-only',
 22 |     action='store_true',
 23 |     help='only parse the primary set (ignore any Additional Sets)')
 24 | parser.add_argument(
 25 |     '-q', '--always-quote',
 26 |     action='store_true',
 27 |     help='put all titles in quotes ("")')
 28 | 
 29 | 
 30 | ContentType = Literal[
 31 |     'category', 'collection', 'show', 'movie', 'company', 'season'
 32 | ]
 33 | PRIMARY_CONTENT_CLASS: str = 'row d-flex flex-wrap m-0 w-100 mx-n1 mt-n1'
 34 | 
 35 | 
 36 | class Content:
 37 |     """
 38 |     This class describes some type of content. Content contains a poster ID,
 39 |     a type, title, year, and a URL.
 40 |     """
 41 | 
 42 |     """Poster URL format for all TPDb posters"""
 43 |     POSTER_URL = 'https://theposterdb.com/api/assets/{id}'
 44 | 
 45 |     """Regex to match yearless titles and season names from full titles"""
 46 |     YEARLESS_REGEX=re_compile(r'^(.*) \(\d+\)($| - (?:Season \d+|(Specials)))$')
 47 |     SEASON_REGEX = re_compile(r'^.* - (?:Season (\d+)|(Specials))$')
 48 | 
 49 |     __slots__ = (
 50 |         'poster_id', 'content_type', 'title', 'use_year', 'must_quote', 'url',
 51 |         'yearless_title', 'season_number', 'sub_content',    
 52 |     )
 53 | 
 54 |     def __init__(self, poster_id: int,
 55 |             content_type: ContentType,
 56 |             title: str, *,
 57 |             must_quote: bool = False) -> None:
 58 |         """
 59 |         Initialize the content described by the given attributes.
 60 | 
 61 |         Args:
 62 |             poster_id: TPDb poster ID of the content.
 63 |             content_type: Type of content being created.
 64 |             title: Title of the content.
 65 |             must_quote: Override for whether the finalized title of this
 66 |                 content should be quoted or not. Keyword only.
 67 |         """
 68 | 
 69 |         self.poster_id = poster_id
 70 |         self.content_type = content_type
 71 |         self.title = title
 72 |         self.use_year = False
 73 |         self.must_quote = must_quote or ': ' in self.title
 74 |         self.url = self.POSTER_URL.format(id=self.poster_id)
 75 | 
 76 |         # Attempt to parse the yearless title
 77 |         if (group := self.YEARLESS_REGEX.match(self.title)) is None:
 78 |             self.yearless_title = self.title
 79 |         else:
 80 |             self.yearless_title = group.group(1)
 81 | 
 82 |         # If season name is in the title, parse 
 83 |         if (season_group := self.SEASON_REGEX.match(self.title)) is None:
 84 |             self.season_number = None
 85 |         else:
 86 |             self.content_type = 'season'
 87 |             if season_group.group(2) == 'Specials':
 88 |                 self.season_number = 0
 89 |             else:
 90 |                 self.season_number = int(season_group.group(1))
 91 | 
 92 |         # No subcontent yet
 93 |         self.sub_content = {}
 94 | 
 95 | 
 96 |     @property
 97 |     def final_title(self) -> str:
 98 |         """
 99 |         The finalized title for this Content. Quoted and utilizing the
100 |         year if necessary.
101 |         """
102 | 
103 |         title = self.title if self.use_year else self.yearless_title
104 |         return f'"{title}"' if self.must_quote else title
105 | 
106 | 
107 |     def __repr__(self) -> str:
108 |         attributes = ', '.join(
109 |             f'{attr}={getattr(self, attr)!r}' for attr in self.__slots__
110 |             if not attr.startswith('__')
111 |         )
112 | 
113 |         return f'<Content {attributes}>'
114 | 
115 | 
116 |     def __str__(self) -> str:
117 |         """
118 |         The string representation of this content. This is the formatted
119 |         content string used within PMM, and the return format depends on
120 |         the content type of this object.
121 |         """
122 | 
123 |         if self.content_type in ('category', 'collection', 'movie', 'company'):
124 |             return f'{self.final_title}:\n  url_poster: {self.url}'
125 |         elif self.content_type == 'show':
126 |             base = f'{self.final_title}:\n  url_poster: {self.url}'
127 |             if len(self.sub_content) > 0:
128 |                 sub = '\n    '.join(str(self.sub_content[season])
129 |                                     for season in sorted(self.sub_content))
130 |                 return f'{base}\n  seasons:\n    {sub}'
131 |             
132 |             return base
133 |         elif self.content_type == 'season':
134 |             return f'{self.season_number}: ' + '{url_poster: ' + self.url + '}'
135 | 
136 |         return f'<Bad content type "{self.content_type}">'
137 | 
138 | 
139 |     def is_sub_content_of(self, content: 'Content') -> bool:
140 |         """
141 |         Determine whether the given content object is the parent content
142 |         of this object. 
143 | 
144 |         Args:
145 |             content: Object being compared against.
146 | 
147 |         Returns:
148 |             True if the given object is the parent of this content.
149 |             False otherwise.
150 |         """
151 | 
152 |         # Only a show can have a season child
153 |         if self.content_type != 'season' or content.content_type != 'show':
154 |             return False
155 | 
156 |         return (content.yearless_title == self.yearless_title
157 |                 and content.title in self.title)
158 | 
159 |     
160 |     def is_parent_content_of(self, content: 'Content') -> bool:
161 |         """Logical composite of `is_sub_content_of` on this object."""
162 | 
163 |         return content.is_sub_content_of(self)
164 | 
165 | 
166 |     def add_sub_content(self, content: 'Content') -> None:
167 |         """
168 |         Add the given content as sub content of this object.
169 | 
170 |         Args:
171 |             content: Sub-content being added.
172 |         """
173 | 
174 |         self.sub_content[content.season_number] = content
175 | 
176 | 
class ContentList:
    """
    This class describes a container list of Content objects. This is a
    glorified dictionary of lists for each content type
    """


    def __init__(self) -> None:
        # Sections are printed in the order the keys are listed here
        self.content: dict[ContentType, list[Content]] = {
            'category': [],
            'collection': [],
            'movie': [],
            'show': [],
            'season': [],
            'company': [],
        }


    def __bool__(self) -> bool:
        """Whether this object contains any content."""

        return any(content for content in self.content.values())


    def __repr__(self) -> str:
        return f'<ContentList {self.content}>'


    def __divider(self, label: str, /) -> str:
        """Get the commented section header for the given label."""

        return f'\n# {"-"*80}\n# {label}\n# {"-" * 80}'


    def add_content(self, new: 'Content') -> None:
        """
        Add the given content to this object. This finds any existing
        content the new content could be a child or parent of, and adds
        this object to them if indicated.

        Args:
            new: Content being added.
        """

        # Check if new content belongs to any existing shows
        for existing in self.content['show']:
            if new.is_sub_content_of(existing):
                existing.add_sub_content(new)
                # Can only belong to one show, stop looping
                break

        # Check if any existing seasons belong to new content
        for existing in self.content['season']:
            if new.is_parent_content_of(existing):
                new.add_sub_content(existing)

        # Check for content of this same title
        for existing in self.content[new.content_type]:
            if existing.title == new.title:
                new.use_year = True
                break

        self.content[new.content_type].append(new)


    def print(self) -> None:
        """
        Print this object. This prints segmented sections of each type
        of Content in this object, followed by a section of any seasons
        which are not the sub content of any parsed show.
        """

        # Section labels that are not simply the content type plus "s"
        irregular_plurals = {'category': 'categories', 'company': 'companies'}

        # Print each content group
        for content_type, content_list in self.content.items():
            # Don't print empty content sets, or base seasons
            if not content_list or content_type == 'season':
                continue

            # Print divider, content type header, and all content
            label = irregular_plurals.get(content_type, content_type + 's')
            print(self.__divider(label))
            for content in content_list:
                print(str(content))

        # Seasons are otherwise only printed nested under their show, so
        # any season without a parsed parent show is printed separately
        # instead of being silently dropped
        shows = self.content['show']
        unassigned = [
            season for season in self.content['season']
            if not any(season.is_sub_content_of(show) for show in shows)
        ]
        if unassigned:
            print(self.__divider('Unassigned Content'))
            for content in unassigned:
                print(str(content))
263 | 
264 | 
# If file is entrypoint, parse args
if __name__ == '__main__':
    # Parse given arguments
    args = parser.parse_args()

    # The positional argument is stored as `args.html`; `HTML_FILE` is
    # only its metavar, so there is no `args.html_file` attribute
    if not args.html.exists():
        print(f'File "{args.html.resolve()}" does not exist')
        raise SystemExit(1)

    # Read raw bytes so BeautifulSoup detects the encoding itself rather
    # than relying on the platform's default text encoding
    html = args.html.read_bytes()

    # Create BeautifulSoup element of HTML
    page = BeautifulSoup(html, 'html.parser')

    # If only doing primary content, filter webpage
    if args.primary_only:
        webpage = page.find('div', class_=PRIMARY_CONTENT_CLASS)
        # No primary container means no posters; this falls through to
        # the "No content identified!" message below
        if webpage is None:
            posters = []
        else:
            posters = webpage.find_all('div', class_='overlay rounded-poster')
    else:
        posters = page.find_all('div', class_='overlay rounded-poster')

    # Get all posters in this set, create Content and add to list
    content_list = ContentList()
    for poster_element in posters:
        # Content requires a string title - skip posters without one
        title_element = poster_element.find('p', class_='p-0 mb-1 text-break')
        title = None if title_element is None else title_element.string
        if title is None:
            poster_id = poster_element.attrs.get('data-poster-id', '?')
            print(f'# Skipped poster {poster_id} - no title found')
            continue

        content = Content(
            poster_element.attrs['data-poster-id'],
            poster_element.attrs['data-poster-type'].lower(),
            title,
            must_quote=args.always_quote,
        )
        content_list.add_content(content)

    if content_list:
        content_list.print()
    else:
        print('No content identified!')
304 | 


--------------------------------------------------------------------------------
/poetry.lock:
--------------------------------------------------------------------------------
 1 | # This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 2 | 
 3 | [[package]]
 4 | name = "beautifulsoup4"
 5 | version = "4.12.3"
 6 | description = "Screen-scraping library"
 7 | optional = false
 8 | python-versions = ">=3.6.0"
 9 | files = [
10 |     {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"},
11 |     {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"},
12 | ]
13 | 
14 | [package.dependencies]
15 | soupsieve = ">1.2"
16 | 
17 | [package.extras]
18 | cchardet = ["cchardet"]
19 | chardet = ["chardet"]
20 | charset-normalizer = ["charset-normalizer"]
21 | html5lib = ["html5lib"]
22 | lxml = ["lxml"]
23 | 
24 | [[package]]
25 | name = "soupsieve"
26 | version = "2.5"
27 | description = "A modern CSS selector implementation for Beautiful Soup."
28 | optional = false
29 | python-versions = ">=3.8"
30 | files = [
31 |     {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"},
32 |     {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"},
33 | ]
34 | 
35 | [metadata]
36 | lock-version = "2.0"
37 | python-versions = "^3.12"
38 | content-hash = "99f2a8fe1acf290a6f68cacbf4817fb2ec5bc643cc7d1380fe234555f962eb59"
39 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "tpdbcollectionmaker"
 3 | version = "0.2.1"
 4 | description = "Quickly make Plex Meta Manager poster entries from ThePosterDatabase sets"
 5 | authors = ["Collin Heist <collinheist@gmail.com>"]
 6 | license = "MIT"
 7 | readme = "README.md"
 8 | 
 9 | [tool.poetry.dependencies]
10 | python = "^3.12"
11 | beautifulsoup4 = "^4.12.3"
12 | 
13 | 
14 | [build-system]
15 | requires = ["poetry-core"]
16 | build-backend = "poetry.core.masonry.api"
17 | 


--------------------------------------------------------------------------------