├── .github ├── dependabot.yml └── workflows │ └── submodules.yml ├── .gitignore ├── .gitmodules ├── README.md ├── generate.py ├── poetry.lock ├── pyproject.toml ├── scrapy.cfg ├── sitelist.md ├── tester.py ├── tester.ui └── tpdb ├── BaseMovieScraper.py ├── BaseOCR.py ├── BasePerformerScraper.py ├── BaseSceneScraper.py ├── BaseScraper.py ├── __init__.py ├── custommiddlewares.py ├── helpers ├── __init__.py ├── flare_solverr.py ├── http.py ├── scrapy_dpath │ ├── __init__.py │ ├── dpath.py │ ├── middleware.py │ └── response.py └── scrapy_flare │ ├── __init__.py │ ├── middleware.py │ ├── request.py │ └── response.py ├── items.py ├── middlewares.py ├── pipelines.py └── settings.py.example /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | -------------------------------------------------------------------------------- /.github/workflows/submodules.yml: -------------------------------------------------------------------------------- 1 | name: 'Submodules Sync' 2 | 3 | on: 4 | # Allows you to run this workflow manually from the Actions tab or through HTTP API 5 | workflow_dispatch: 6 | 7 | jobs: 8 | sync: 9 | name: 'Submodules Sync' 10 | runs-on: ubuntu-latest 11 | 12 | # Use the Bash shell regardless whether the GitHub Actions runner is ubuntu-latest, macos-latest, or windows-latest 13 | defaults: 14 | run: 15 | shell: bash 16 | 17 | steps: 18 | # Checkout the repository to the GitHub Actions runner 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | with: 22 | submodules: 'recursive' 23 | 24 | - uses: actions/setup-python@v3 25 | with: 26 | python-version: '3.x' # Version range or exact version of a Python version to use, using SemVer's version range syntax 27 | architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified 28 | 29 | - name: Install poetry 30 | uses: abatilo/actions-poetry@v2 31 | 32 | # Update references 33 | - name: Git Submodule Update 34 | run: | 35 | git pull --recurse-submodules 36 | git submodule foreach git pull origin main --ff-only 37 | 38 | - name: Commit update 39 | run: | 40 | git config --global user.name 'Git bot' 41 | git config --global user.email 'bot@noreply.github.com' 42 | git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }} 43 | cp tpdb/settings.py.example tpdb/settings.py 44 | poetry install 45 | poetry run python3 generate.py 46 | rm tpdb/settings.py 47 | git commit -am "Auto updated submodule references" && git push || echo "No changes to commit" 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | settings.py 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | pip-wheel-metadata/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | .idea 133 | .vscode 134 | 135 | .flakeheaven_cache 136 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tpdb/spiders"] 2 | path = tpdb/spiders 3 | url = https://github.com/ThePornDatabase/scrapers.git 4 | --------------------------------------------------------------------------------
/README.md: -------------------------------------------------------------------------------- 1 | # Scrapy for TPDB 2 | 3 | This is the Scrapy framework for TPDB's scrapers. 4 | 5 | ### Installation 6 | 7 | Clone this repo: 8 | 9 | ``git clone --recurse-submodules https://github.com/ThePornDatabase/scrapy.git`` 10 | 11 | Install the packages using Poetry: 12 | 13 | ``poetry install`` 14 | 15 | Next, change directory to `tpdb`, create a copy of the example settings, and edit it if necessary: 16 | 17 | ``` 18 | cd tpdb 19 | cp settings.py.example settings.py 20 | ``` 21 | 22 | You can then run a scraper using `scrapy crawl` **ScraperName**: 23 | 24 | ``poetry run scrapy crawl Vixen`` 25 | 26 | ### How it works 27 | 28 | Each scraper is in its own Python file, placed in the tpdb/spiders folder; the spiders are stored in another repository (pulled in as a git submodule) so 29 | people can contribute.
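To give a feel for the layout, a scraper roughly follows the shape below. The class name, URLs and selectors here are made-up placeholders for illustration, not a real spider from the scrapers repository: each spider subclasses one of the `Base*Scraper` classes, declares a `selector_map`, and yields requests to `parse_scene` from `get_scenes`.

```
# tpdb/spiders/ExampleSite.py (hypothetical sketch)
import scrapy

from tpdb.BaseSceneScraper import BaseSceneScraper


class ExampleSiteSpider(BaseSceneScraper):
    name = 'ExampleSite'
    network = 'Example Network'
    start_urls = ['https://www.example.com']

    # Keys below are the ones the base classes look up (title, date, image, etc.).
    selector_map = {
        'title': '//h1/text()',
        'description': '//div[@class="description"]//text()',
        'date': '//span[@class="released"]/text()',
        'image': '//video/@poster',
        'performers': '//a[@class="model"]/text()',
        'tags': '//a[@class="tag"]/text()',
        'external_id': r'scenes/(\d+)',
        'pagination': '/scenes?page=%s',
    }

    def get_scenes(self, response):
        # Link out to each scene page; parse_scene (from BaseSceneScraper) builds the item.
        for scene in response.xpath('//a[@class="scene-link"]/@href').getall():
            yield scrapy.Request(self.format_link(response, scene), callback=self.parse_scene)
```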
30 | 31 | 32 | # Site Lists 33 | 34 | [sitelist.md](sitelist.md) 35 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import inspect 4 | import os 5 | 6 | from urllib.parse import urlparse 7 | 8 | from scrapy import spiderloader 9 | from scrapy.utils import project 10 | from mdutils.mdutils import MdUtils 11 | 12 | 13 | class Generator: 14 | mdFile = '' 15 | 16 | @staticmethod 17 | def loop_spiders(): 18 | data = [ 19 | 'Network', 20 | 'Parent', 21 | 'URL', 22 | 'Class', 23 | ] 24 | 25 | settings = project.get_project_settings() 26 | spider_loader = spiderloader.SpiderLoader.from_settings(settings) 27 | spiders = spider_loader.list() 28 | for spider in spiders: 29 | spider_class = spider_loader.load(spider) 30 | 31 | start_urls = [''] 32 | if hasattr(spider_class, 'start_urls'): 33 | start_urls = [urlparse(url) for url in spider_class.start_urls if isinstance(url, str)] 34 | start_urls = ['%s://%s' % (url.scheme, url.hostname) for url in start_urls] 35 | 36 | for url in start_urls: 37 | data.extend([ 38 | spider_class.network.title() if hasattr(spider_class, 'network') else '', 39 | spider_class.parent.title() if hasattr(spider_class, 'parent') else '', 40 | url, 41 | os.path.basename(inspect.getfile(spider_class)) 42 | ]) 43 | 44 | return data 45 | 46 | def main(self): 47 | md_file = MdUtils(file_name='sitelist', title='Scraper Site List') 48 | md_file.new_header(level=1, title='Sites') 49 | data = self.loop_spiders() 50 | md_file.new_line() 51 | md_file.new_table(columns=4, rows=int(len(data) / 4), text=data, text_align='center') 52 | md_file.create_md_file() 53 | 54 | 55 | if __name__ == '__main__': 56 | g = Generator() 57 | g.main() 58 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "tpdb" 3 | version = "1.0" 4 | description = "" 5 | authors = ["DirtyRacer1337 "] 6 | 7 | [tool.poetry.dependencies] 8 | python = ">=3.8,<3.13" 9 | scrapy = "*" 10 | tldextract = "*" 11 | pymongo = "*" 12 | python-slugify = "*" 13 | dateparser = "*" 14 | requests = "*" 15 | yapf = "*" 16 | extruct = "*" 17 | chompjs = "*" 18 | mdutils = "*" 19 | PySide6 = "*" 20 | Unidecode = "*" 21 | dpath = "*" 22 | scrapy_splash = "*" 23 | googletrans = "*" 24 | deep_translator = "*" 25 | pytesseract = "*" 26 | pillow = "*" 27 | furl = "*" 28 | pycountry = "*" 29 | 30 | [tool.poetry.group.dev.dependencies] 31 | ruff = "*" 32 | 33 | [tool.ruff] 34 | exclude = [".git", "__pycache__"] 35 | line-length = 320 36 | indent-width = 4 37 | target-version = "py39" 38 | 39 | [tool.ruff.format] 40 | quote-style = "single" 41 | indent-style = "space" 42 | line-ending = "auto" 43 | 44 | [tool.ruff.lint] 45 | select = ["E", "F"] 46 | ignore = ["E501", "E722"] 47 | 48 | [build-system] 49 | requires = ["poetry-core>=1.8.1"] 50 | build-backend = "poetry.core.masonry.api" 51 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = tpdb.settings 
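As a note on how `scrapy.cfg` and `generate.py` fit together: the `[settings]` entry above is what lets Scrapy (and therefore `generate.py`) discover the spiders in `tpdb/spiders`. A minimal sketch of the same lookup, assuming `tpdb/settings.py` has been created as described in the README:

```
from scrapy import spiderloader
from scrapy.utils import project

# Load tpdb/settings.py via the scrapy.cfg [settings] entry, then list the
# spider names that can be passed to `scrapy crawl` (generate.py walks this
# same list to build sitelist.md).
settings = project.get_project_settings()
spider_names = spiderloader.SpiderLoader.from_settings(settings).list()
print(sorted(spider_names))
```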
-------------------------------------------------------------------------------- /tester.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pathlib import Path 4 | 5 | from scrapy.http import TextResponse 6 | from scrapy.utils import project 7 | from PySide6.QtUiTools import QUiLoader 8 | from PySide6.QtWidgets import QApplication, QStyleFactory, QTreeWidgetItem 9 | from PySide6.QtCore import QFile, QIODevice, QCoreApplication, Qt 10 | 11 | from tpdb.helpers.http import Http 12 | from tpdb.helpers.scrapy_dpath import DPathResponse 13 | from tpdb.BaseScraper import BaseScraper 14 | 15 | 16 | class GUI: 17 | request = None 18 | response = None 19 | headers = {} 20 | 21 | def __init__(self): 22 | QCoreApplication.setAttribute(Qt.AA_ShareOpenGLContexts) 23 | app = QApplication(sys.argv) 24 | style = QStyleFactory.create('Fusion') 25 | app.setStyle(style) 26 | 27 | ui_file_name = '%s.ui' % Path(__file__).stem 28 | ui_file = QFile(ui_file_name) 29 | if not ui_file.open(QIODevice.ReadOnly): 30 | print('Cannot open %s: %s' % (ui_file_name, ui_file.errorString())) 31 | sys.exit(-1) 32 | 33 | loader = QUiLoader() 34 | self.window = loader.load(ui_file) 35 | ui_file.close() 36 | if not self.window: 37 | print(loader.errorString()) 38 | sys.exit(-1) 39 | 40 | self.connect() 41 | self.setting() 42 | 43 | self.window.show() 44 | 45 | sys.exit(app.exec()) 46 | 47 | def connect(self): 48 | self.window.pushButton.pressed.connect(self.load) 49 | self.window.lineEdit_2.editingFinished.connect(self.get) 50 | 51 | def setting(self): 52 | settings = project.get_project_settings() 53 | self.headers['User-Agent'] = settings.get('USER_AGENT', default='Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36') 54 | 55 | def get_response(self, content, request=None): 56 | url = request.url if request else '' 57 | response = TextResponse(url=url, headers=self.headers, body=content) 58 | response = DPathResponse(request, response) 59 | 60 | return response 61 | 62 | def load(self): 63 | self.request = None 64 | self.response = None 65 | 66 | url = self.window.lineEdit.text() 67 | if url: 68 | self.request = Http.get(url, headers=self.headers) 69 | 70 | if self.request is not None: 71 | self.response = self.get_response(self.request.content, self.request) 72 | 73 | self.window.label.setText('{0}'.format(url)) 74 | self.window.plainTextEdit.setPlainText(self.request.text) 75 | else: 76 | text = self.window.plainTextEdit.toPlainText().encode('UTF-8') 77 | if text: 78 | self.response = self.get_response(text) 79 | 80 | self.window.label.setText('From TextBox') 81 | 82 | def get(self): 83 | result = None 84 | self.window.treeWidget.clear() 85 | 86 | selector = self.window.lineEdit_2.text().strip() 87 | if self.response: 88 | result = BaseScraper.process_xpath(self.response, selector) 89 | 90 | if result: 91 | self.window.lineEdit_3.setText(result.get().strip()) 92 | data = {k: v.strip() for k, v in enumerate(result.getall())} 93 | 94 | tree = QTreeWidgetItem() 95 | items = self.fill_item(tree, data) 96 | self.window.treeWidget.addTopLevelItems(items) 97 | self.window.treeWidget.expandAll() 98 | 99 | def fill_item(self, item, value): 100 | def new_item(parent, item_text, item_val=None): 101 | child = QTreeWidgetItem([item_text]) 102 | self.fill_item(child, item_val) 103 | parent.addChild(child) 104 | 105 | if value is None: 106 | return None 107 | 108 | if isinstance(value, dict): 109 | for key, val in 
sorted(value.items()): 110 | new_item(item, str(key), val) 111 | elif isinstance(value, (list, tuple)): 112 | for val in value: 113 | text = (str(val) if not isinstance(val, (dict, list, tuple)) else '[%s]' % type(val).__name__) 114 | new_item(item, text, val) 115 | else: 116 | new_item(item, str(value)) 117 | 118 | return [item] 119 | 120 | 121 | if __name__ == '__main__': 122 | GUI() 123 | --------------------------------------------------------------------------------
/tester.ui: -------------------------------------------------------------------------------- [tester.ui: Qt Designer XML for the "Scrapy Selector Tester" window — a 632x505 MainWindow with a URL field and Load button, an Expression input, a Result field and a results tree, plus a menu bar; the XML markup did not survive extraction and is summarized here instead.] --------------------------------------------------------------------------------
/tpdb/BaseMovieScraper.py: -------------------------------------------------------------------------------- 1 | import string 2 | import scrapy 3 | 4 | from tpdb.BaseScraper import BaseScraper 5 | from tpdb.items import MovieItem 6 | 7 | 8 | class BaseMovieScraper(BaseScraper): 9 | custom_tpdb_settings = { 10 | 'ITEM_PIPELINES': { 11 | 'tpdb.pipelines.TpdbApiMoviePipeline': 400, 12 | }, 13 | 'DOWNLOADER_MIDDLEWARES': { 14 | 'tpdb.custommiddlewares.CustomProxyMiddleware': 350, 15 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, 16 | 'tpdb.helpers.scrapy_dpath.DPathMiddleware': 542, 17 | 'tpdb.middlewares.TpdbMovieDownloaderMiddleware': 543, 18 | } 19 | } 20 | 21 | def parse(self, response, **kwargs): 22 | movies = self.get_movies(response) 23 | count = 0 24 | for movie in movies: 25 | count += 1 26 | yield movie 27 | 28 | if count: 29 | if 'page' in response.meta and response.meta['page'] < self.limit_pages: 30 | meta = response.meta 31 | meta['page'] = meta['page'] + 1 32 | print('NEXT PAGE: ' + str(meta['page'])) 33 | yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), 34 | callback=self.parse, 35 | meta=meta, 36 | headers=self.headers, 37 | cookies=self.cookies) 38 | 39 | def get_movies(self, response): 40 | return [] 41 | 42 | def parse_movie(self, response): 43 | item = MovieItem() 44 | 45 | if 'title' in response.meta and response.meta['title']: 46 | item['title'] = response.meta['title'] 47 | else: 48 | item['title'] = self.get_title(response) 49 | 50 | if 'description' in response.meta: 51 | item['description'] = response.meta['description'] 52 | else: 53 | item['description'] = self.get_description(response) 54 | 55 | if hasattr(self, 'site'): 56 | item['site'] = self.site 57 | elif 'site' in response.meta: 58 | item['site'] = response.meta['site'] 59 | else: 60 | item['site'] = self.get_site(response) 61 | 62 | if hasattr(self, 'network'): 63 | item['network'] = self.network 64 | elif 'network' in response.meta: 65 | item['network'] = response.meta['network'] 66 | else: 67 | item['network'] = item['site'] 68 | 69 | if 'date' in response.meta: 70 | item['date'] =
response.meta['date'] 71 | else: 72 | item['date'] = self.get_date(response) 73 | 74 | if 'front' in response.meta: 75 | item['front'] = response.meta['front'] 76 | else: 77 | item['front'] = self.get_image(response, 'front') 78 | 79 | if 'front' not in item or not item['front']: 80 | item['front'] = None 81 | 82 | if item['front']: 83 | item['front_blob'] = self.get_image_blob_from_link(item['front']) 84 | 85 | if 'back' in response.meta: 86 | item['back'] = response.meta['back'] 87 | else: 88 | item['back'] = self.get_image(response, 'back') 89 | 90 | if 'back' not in item or not item['back']: 91 | item['back'] = None 92 | 93 | if item['back']: 94 | item['back_blob'] = self.get_image_blob_from_link(item['back']) 95 | 96 | if 'front_blob' not in item: 97 | item['front_blob'] = None 98 | 99 | if 'back_blob' not in item: 100 | item['back_blob'] = None 101 | 102 | if 'performers' in response.meta: 103 | item['performers'] = response.meta['performers'] 104 | else: 105 | item['performers'] = self.get_performers(response) 106 | 107 | if 'tags' in response.meta: 108 | item['tags'] = response.meta['tags'] 109 | else: 110 | item['tags'] = self.get_tags(response) 111 | 112 | if 'id' in response.meta: 113 | item['id'] = response.meta['id'] 114 | else: 115 | item['id'] = self.get_id(response) 116 | 117 | if 'trailer' in response.meta: 118 | item['trailer'] = response.meta['trailer'] 119 | else: 120 | item['trailer'] = self.get_trailer(response) 121 | 122 | if 'studio' in response.meta: 123 | item['studio'] = response.meta['studio'] 124 | else: 125 | item['studio'] = self.get_studio(response) 126 | 127 | if 'director' in response.meta: 128 | item['director'] = response.meta['director'] 129 | else: 130 | item['director'] = self.get_director(response) 131 | 132 | if 'format' in response.meta: 133 | item['format'] = response.meta['format'] 134 | else: 135 | item['format'] = self.get_format(response) 136 | 137 | if 'length' in response.meta: 138 | item['length'] = response.meta['length'] 139 | else: 140 | item['length'] = self.get_length(response) 141 | 142 | if 'year' in response.meta: 143 | item['year'] = response.meta['year'] 144 | else: 145 | item['year'] = self.get_year(response) 146 | 147 | if 'rating' in response.meta: 148 | item['rating'] = response.meta['rating'] 149 | else: 150 | item['rating'] = self.get_rating(response) 151 | 152 | if 'sku' in response.meta: 153 | item['sku'] = response.meta['sku'] 154 | else: 155 | item['sku'] = self.get_sku(response) 156 | 157 | if 'upc' in response.meta: 158 | item['upc'] = response.meta['upc'] 159 | else: 160 | item['upc'] = self.get_upc(response) 161 | 162 | item['url'] = self.get_url(response) 163 | 164 | yield self.check_item(item, self.days) 165 | 166 | def get_description(self, response): 167 | if 'description' in self.get_selector_map(): 168 | description = self.get_element(response, 'description', 're_description') 169 | if isinstance(description, list): 170 | description = ' '.join(description) 171 | return self.cleanup_description(description) 172 | return '' 173 | 174 | def get_date(self, response): 175 | if 'date' in self.get_selector_map(): 176 | scenedate = self.cleanup_text(self.get_element(response, 'date', 're_date')) 177 | if scenedate: 178 | date_formats = self.get_selector_map('date_formats') if 'date_formats' in self.get_selector_map() else None 179 | return self.parse_date(scenedate, date_formats=date_formats).isoformat() 180 | return self.parse_date('today').isoformat() 181 | 182 | def get_performers(self, response): 183 | if 
'performers' in self.get_selector_map(): 184 | performers = self.get_element(response, 'performers', "list") 185 | if performers and isinstance(performers, list): 186 | return list(map(lambda x: string.capwords(x.strip()), performers)) 187 | return [] 188 | 189 | def get_tags(self, response): 190 | if 'tags' in self.get_selector_map(): 191 | tags = self.get_element(response, 'tags', "list") 192 | if tags and isinstance(tags, list): 193 | new_tags = [] 194 | for tag in tags: 195 | if ',' in tag: 196 | new_tags.extend(tag.split(',')) 197 | elif tag: 198 | new_tags.append(tag) 199 | return list(map(lambda x: string.capwords(x.strip()), new_tags)) 200 | return [] 201 | 202 | def get_image(self, response, side=None): 203 | if not side: 204 | side = 'image' 205 | if side in self.get_selector_map(): 206 | image = self.process_xpath(response, self.get_selector_map(side)) 207 | if image: 208 | image_re = 're_' + side 209 | image = self.get_from_regex(image.get(), image_re) 210 | if image: 211 | return self.format_link(response, image) 212 | return None 213 | 214 | def get_title(self, response): 215 | if 'title' in self.get_selector_map(): 216 | return string.capwords(self.cleanup_text(self.get_element(response, 'title', 're_title'))) 217 | return '' 218 | 219 | def get_trailer(self, response): 220 | if 'trailer' in self.get_selector_map(): 221 | return self.format_link(response, self.get_element(response, 'trailer', 're_trailer')) 222 | return '' 223 | 224 | def get_studio(self, response): 225 | if 'studio' in self.get_selector_map(): 226 | return string.capwords(self.cleanup_text(self.get_element(response, 'studio', 're_studio'))) 227 | return '' 228 | 229 | def get_director(self, response): 230 | if 'director' in self.get_selector_map(): 231 | director = self.get_element(response, 'director', 're_director') 232 | if director and isinstance(director, list): 233 | director = ", ".join(director) 234 | return string.capwords(self.cleanup_text(director)) 235 | return '' 236 | 237 | def get_format(self, response): 238 | if 'format' in self.get_selector_map(): 239 | return string.capwords(self.cleanup_text(self.get_element(response, 'format', 're_format'))) 240 | return '' 241 | 242 | def get_length(self, response): 243 | if 'length' in self.get_selector_map(): 244 | return string.capwords(self.cleanup_text(self.get_element(response, 'length', 're_length'))) 245 | return '' 246 | 247 | def get_year(self, response): 248 | if 'year' in self.get_selector_map(): 249 | return string.capwords(self.cleanup_text(self.get_element(response, 'year', 're_year'))) 250 | return '' 251 | 252 | def get_rating(self, response): 253 | if 'rating' in self.get_selector_map(): 254 | return string.capwords(self.cleanup_text(self.get_element(response, 'rating', 're_rating'))) 255 | return '' 256 | 257 | def get_sku(self, response): 258 | if 'sku' in self.get_selector_map(): 259 | return string.capwords(self.cleanup_text(self.get_element(response, 'sku', 're_sku'))) 260 | return '' 261 | 262 | def get_upc(self, response): 263 | if 'upc' in self.get_selector_map(): 264 | return string.capwords(self.cleanup_text(self.get_element(response, 'upc', 're_upc'))) 265 | return '' 266 | -------------------------------------------------------------------------------- /tpdb/BaseOCR.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import Any 3 | 4 | import pytesseract 5 | 6 | from PIL import Image, ImageFilter, ImageOps 7 | from unidecode import unidecode 8 | 9 | 
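# Usage sketch (illustrative comment, not part of the original module): scrapers that need to read
# text out of a downloaded image can pass the raw bytes straight in, e.g.
#     text = BaseOCR().get_data_from_image(image_bytes)
# The helper pads and greyscales the image, sharpens edges, runs pytesseract, and returns the
# unidecoded, stripped string.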
10 | class BaseOCR: 11 | def get_data_from_image(self, image: bytes) -> Any: 12 | image = self._get_image_from_bytes(image) 13 | image = self._image_pre_processing(image) 14 | text: str = pytesseract.image_to_string(image) 15 | res = self._text_post_processing(text) 16 | 17 | return res 18 | 19 | @staticmethod 20 | def _image_pre_processing(image: Image.Image) -> Image.Image: 21 | image = ImageOps.expand(image, 20) 22 | image = image.convert('L') 23 | image = image.filter(ImageFilter.EDGE_ENHANCE_MORE) 24 | 25 | return image 26 | 27 | @staticmethod 28 | def _text_post_processing(text: str) -> str: 29 | text = unidecode(text) 30 | text = text.strip() 31 | 32 | return text 33 | 34 | @staticmethod 35 | def _get_image_from_bytes(data: bytes): 36 | data = BytesIO(data) 37 | return Image.open(data) 38 | -------------------------------------------------------------------------------- /tpdb/BasePerformerScraper.py: -------------------------------------------------------------------------------- 1 | import string 2 | import scrapy 3 | 4 | from tpdb.BaseScraper import BaseScraper 5 | from tpdb.items import PerformerItem 6 | 7 | 8 | class BasePerformerScraper(BaseScraper): 9 | custom_tpdb_settings = { 10 | 'ITEM_PIPELINES': { 11 | 'tpdb.pipelines.TpdbApiPerformerPipeline': 400, 12 | }, 13 | 'DOWNLOADER_MIDDLEWARES': { 14 | 'tpdb.custommiddlewares.CustomProxyMiddleware': 350, 15 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, 16 | 'tpdb.helpers.scrapy_dpath.DPathMiddleware': 542, 17 | 'tpdb.middlewares.TpdbPerformerDownloaderMiddleware': 543, 18 | } 19 | } 20 | 21 | def parse(self, response, **kwargs): 22 | performers = self.get_performers(response) 23 | count = 0 24 | for performer in performers: 25 | count += 1 26 | yield performer 27 | 28 | if count: 29 | if 'page' in response.meta and response.meta['page'] < self.limit_pages: 30 | meta = response.meta 31 | meta['page'] = meta['page'] + 1 32 | print('NEXT PAGE: ' + str(meta['page'])) 33 | yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), 34 | callback=self.parse, 35 | meta=meta, 36 | headers=self.headers, 37 | cookies=self.cookies) 38 | 39 | def get_performers(self, response): 40 | return [] 41 | 42 | def parse_performer(self, response): 43 | item = PerformerItem() 44 | 45 | if 'name' in response.meta and response.meta['name']: 46 | item['name'] = response.meta['name'] 47 | else: 48 | item['name'] = self.get_name(response) 49 | 50 | if 'image' in response.meta and response.meta['image']: 51 | item['image'] = response.meta['image'] 52 | else: 53 | item['image'] = self.get_image(response) 54 | 55 | if 'image' not in item or not item['image']: 56 | item['image'] = None 57 | 58 | if 'image_blob' in response.meta: 59 | item['image_blob'] = response.meta['image_blob'] 60 | else: 61 | item['image_blob'] = self.get_image_blob(response) 62 | 63 | if ('image_blob' not in item or not item['image_blob']) and item['image']: 64 | item['image_blob'] = self.get_image_blob_from_link(item['image']) 65 | 66 | if 'image_blob' not in item: 67 | item['image_blob'] = None 68 | 69 | if 'bio' in response.meta and response.meta['bio']: 70 | item['bio'] = response.meta['bio'] 71 | else: 72 | item['bio'] = self.get_bio(response) 73 | 74 | if 'gender' in response.meta and response.meta['gender']: 75 | item['gender'] = response.meta['gender'] 76 | else: 77 | item['gender'] = self.get_gender(response) 78 | 79 | if 'birthday' in response.meta and response.meta['birthday']: 80 | item['birthday'] = response.meta['birthday'] 81 | else: 
82 | item['birthday'] = self.get_birthday(response) 83 | 84 | if 'astrology' in response.meta and response.meta['astrology']: 85 | item['astrology'] = response.meta['astrology'] 86 | else: 87 | item['astrology'] = self.get_astrology(response) 88 | 89 | if 'birthplace' in response.meta and response.meta['birthplace']: 90 | item['birthplace'] = response.meta['birthplace'] 91 | else: 92 | item['birthplace'] = self.get_birthplace(response) 93 | 94 | if 'ethnicity' in response.meta and response.meta['ethnicity']: 95 | item['ethnicity'] = response.meta['ethnicity'] 96 | else: 97 | item['ethnicity'] = self.get_ethnicity(response) 98 | 99 | if 'nationality' in response.meta and response.meta['nationality']: 100 | item['nationality'] = response.meta['nationality'] 101 | else: 102 | item['nationality'] = self.get_nationality(response) 103 | 104 | if 'eyecolor' in response.meta and response.meta['eyecolor']: 105 | item['eyecolor'] = response.meta['eyecolor'] 106 | else: 107 | item['eyecolor'] = self.get_eyecolor(response) 108 | 109 | if 'haircolor' in response.meta and response.meta['haircolor']: 110 | item['haircolor'] = response.meta['haircolor'] 111 | else: 112 | item['haircolor'] = self.get_haircolor(response) 113 | 114 | if 'height' in response.meta and response.meta['height']: 115 | item['height'] = response.meta['height'] 116 | else: 117 | item['height'] = self.get_height(response) 118 | 119 | if 'weight' in response.meta and response.meta['weight']: 120 | item['weight'] = response.meta['weight'] 121 | else: 122 | item['weight'] = self.get_weight(response) 123 | 124 | if 'measurements' in response.meta and response.meta['measurements']: 125 | item['measurements'] = response.meta['measurements'] 126 | else: 127 | item['measurements'] = self.get_measurements(response) 128 | 129 | if 'tattoos' in response.meta and response.meta['tattoos']: 130 | item['tattoos'] = response.meta['tattoos'] 131 | else: 132 | item['tattoos'] = self.get_tattoos(response) 133 | 134 | if 'piercings' in response.meta and response.meta['piercings']: 135 | item['piercings'] = response.meta['piercings'] 136 | else: 137 | item['piercings'] = self.get_piercings(response) 138 | 139 | if 'cupsize' in response.meta and response.meta['cupsize']: 140 | item['cupsize'] = response.meta['cupsize'] 141 | else: 142 | item['cupsize'] = self.get_cupsize(response) 143 | 144 | if 'fakeboobs' in response.meta and response.meta['fakeboobs']: 145 | item['fakeboobs'] = response.meta['fakeboobs'] 146 | else: 147 | item['fakeboobs'] = self.get_fakeboobs(response) 148 | 149 | item['url'] = self.get_url(response) 150 | 151 | if hasattr(self, 'network'): 152 | item['network'] = self.network 153 | elif 'network' in response.meta: 154 | item['network'] = response.meta['network'] 155 | else: 156 | item['network'] = self.get_network(response) 157 | 158 | yield item 159 | 160 | def get_name(self, response): 161 | if 'name' in self.selector_map: 162 | name = self.get_element(response, 'name', 're_name') 163 | if isinstance(name, list): 164 | name = ''.join(name).strip() 165 | return string.capwords(self.cleanup_text(name)) 166 | return '' 167 | 168 | def get_bio(self, response): 169 | if 'bio' in self.get_selector_map(): 170 | bio = self.get_element(response, 'bio', 're_bio') 171 | if isinstance(bio, list): 172 | bio = ' '.join(bio) 173 | return self.cleanup_description(bio) 174 | return '' 175 | 176 | def get_gender(self, response): 177 | if 'gender' in self.selector_map: 178 | gender = self.process_xpath(response, self.get_selector_map('gender')) 179 | 
if gender: 180 | gender = self.get_from_regex(gender.get(), 're_gender') 181 | if gender: 182 | return self.cleanup_text(gender).title() 183 | 184 | return '' 185 | 186 | def get_birthday(self, response): 187 | if 'birthday' in self.selector_map: 188 | birthday = self.cleanup_text(self.get_element(response, 'birthday', 're_birthday')) 189 | if birthday: 190 | return self.parse_date(birthday).isoformat() 191 | return '' 192 | 193 | def get_astrology(self, response): 194 | if 'astrology' in self.selector_map: 195 | return string.capwords(self.cleanup_text(self.get_element(response, 'astrology', 're_astrology'))) 196 | return '' 197 | 198 | def get_birthplace(self, response): 199 | if 'birthplace' in self.selector_map: 200 | return string.capwords(self.cleanup_text(self.get_element(response, 'birthplace', 're_birthplace'))) 201 | return '' 202 | 203 | def get_ethnicity(self, response): 204 | if 'ethnicity' in self.selector_map: 205 | return string.capwords(self.cleanup_text(self.get_element(response, 'ethnicity', 're_ethnicity'))) 206 | return '' 207 | 208 | def get_nationality(self, response): 209 | if 'nationality' in self.selector_map: 210 | return string.capwords(self.cleanup_text(self.get_element(response, 'nationality', 're_nationality'))) 211 | return '' 212 | 213 | def get_eyecolor(self, response): 214 | if 'eyecolor' in self.selector_map: 215 | return string.capwords(self.cleanup_text(self.get_element(response, 'eyecolor', 're_eyecolor'))) 216 | return '' 217 | 218 | def get_haircolor(self, response): 219 | if 'haircolor' in self.selector_map: 220 | return string.capwords(self.cleanup_text(self.get_element(response, 'haircolor', 're_haircolor'))) 221 | return '' 222 | 223 | def get_height(self, response): 224 | if 'height' in self.selector_map: 225 | return self.cleanup_text(self.get_element(response, 'height', 're_height')) 226 | return '' 227 | 228 | def get_weight(self, response): 229 | if 'weight' in self.selector_map: 230 | return self.cleanup_text(self.get_element(response, 'weight', 're_weight')) 231 | return '' 232 | 233 | def get_measurements(self, response): 234 | if 'measurements' in self.selector_map: 235 | return self.cleanup_text(self.get_element(response, 'measurements', 're_measurements')).upper() 236 | return '' 237 | 238 | def get_tattoos(self, response): 239 | if 'tattoos' in self.selector_map: 240 | return string.capwords(self.cleanup_text(self.get_element(response, 'tattoos', 're_tattoos'))) 241 | return '' 242 | 243 | def get_piercings(self, response): 244 | if 'piercings' in self.selector_map: 245 | return string.capwords(self.cleanup_text(self.get_element(response, 'piercings', 're_piercings'))) 246 | return '' 247 | 248 | def get_cupsize(self, response): 249 | if 'cupsize' in self.selector_map: 250 | return self.cleanup_text(self.get_element(response, 'cupsize', 're_cupsize')).upper() 251 | return '' 252 | 253 | def get_fakeboobs(self, response): 254 | if 'fakeboobs' in self.selector_map: 255 | return string.capwords(self.cleanup_text(self.get_element(response, 'fakeboobs', 're_fakeboobs'))) 256 | return '' 257 | -------------------------------------------------------------------------------- /tpdb/BaseSceneScraper.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import scrapy 4 | 5 | from tpdb.BaseScraper import BaseScraper 6 | from tpdb.items import SceneItem 7 | 8 | 9 | class BaseSceneScraper(BaseScraper): 10 | custom_tpdb_settings = { 11 | 'ITEM_PIPELINES': { 12 | 
'tpdb.pipelines.TpdbApiScenePipeline': 400, 13 | }, 14 | 'DOWNLOADER_MIDDLEWARES': { 15 | 'tpdb.custommiddlewares.CustomProxyMiddleware': 350, 16 | 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 400, 17 | 'tpdb.helpers.scrapy_dpath.DPathMiddleware': 542, 18 | 'tpdb.middlewares.TpdbSceneDownloaderMiddleware': 543, 19 | } 20 | } 21 | 22 | def parse(self, response, **kwargs): 23 | scenes = self.get_scenes(response) 24 | count = 0 25 | for scene in scenes: 26 | count += 1 27 | yield scene 28 | 29 | if count: 30 | if 'page' in response.meta and response.meta['page'] < self.limit_pages: 31 | meta = response.meta 32 | meta['page'] = meta['page'] + 1 33 | print('NEXT PAGE: ' + str(meta['page'])) 34 | yield scrapy.Request(url=self.get_next_page_url(response.url, meta['page']), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) 35 | 36 | def get_scenes(self, response): 37 | return [] 38 | 39 | def parse_scene(self, response): 40 | item = SceneItem() 41 | 42 | if 'title' in response.meta and response.meta['title']: 43 | item['title'] = response.meta['title'] 44 | else: 45 | item['title'] = self.get_title(response) 46 | 47 | if 'description' in response.meta: 48 | item['description'] = response.meta['description'] 49 | else: 50 | item['description'] = self.get_description(response) 51 | 52 | if hasattr(self, 'site'): 53 | item['site'] = self.site 54 | elif 'site' in response.meta: 55 | item['site'] = response.meta['site'] 56 | else: 57 | item['site'] = self.get_site(response) 58 | 59 | if 'date' in response.meta: 60 | item['date'] = response.meta['date'] 61 | else: 62 | item['date'] = self.get_date(response) 63 | 64 | if 'image' in response.meta: 65 | item['image'] = response.meta['image'] 66 | else: 67 | item['image'] = self.get_image(response) 68 | 69 | if 'image' not in item or not item['image']: 70 | item['image'] = None 71 | 72 | if 'image_blob' in response.meta: 73 | item['image_blob'] = response.meta['image_blob'] 74 | else: 75 | item['image_blob'] = self.get_image_blob(response) 76 | 77 | if ('image_blob' not in item or not item['image_blob']) and item['image']: 78 | item['image_blob'] = self.get_image_blob_from_link(item['image']) 79 | 80 | if 'image_blob' not in item: 81 | item['image_blob'] = None 82 | 83 | if 'performers' in response.meta: 84 | item['performers'] = response.meta['performers'] 85 | else: 86 | item['performers'] = self.get_performers(response) 87 | 88 | if 'tags' in response.meta: 89 | item['tags'] = response.meta['tags'] 90 | else: 91 | item['tags'] = self.get_tags(response) 92 | 93 | if 'markers' in response.meta: 94 | item['markers'] = response.meta['markers'] 95 | else: 96 | item['markers'] = self.get_markers(response) 97 | 98 | if 'id' in response.meta: 99 | item['id'] = response.meta['id'] 100 | else: 101 | item['id'] = self.get_id(response) 102 | 103 | if 'merge_id' in response.meta: 104 | item['merge_id'] = response.meta['merge_id'] 105 | else: 106 | item['merge_id'] = self.get_merge_id(response) 107 | 108 | if 'trailer' in response.meta: 109 | item['trailer'] = response.meta['trailer'] 110 | else: 111 | item['trailer'] = self.get_trailer(response) 112 | 113 | if 'duration' in response.meta: 114 | item['duration'] = response.meta['duration'] 115 | else: 116 | item['duration'] = self.get_duration(response) 117 | 118 | if 'url' in response.meta: 119 | item['url'] = response.meta['url'] 120 | else: 121 | item['url'] = self.get_url(response) 122 | 123 | if hasattr(self, 'network'): 124 | item['network'] = self.network 125 | elif 
'network' in response.meta: 126 | item['network'] = response.meta['network'] 127 | else: 128 | item['network'] = self.get_network(response) 129 | 130 | if hasattr(self, 'parent'): 131 | item['parent'] = self.parent 132 | elif 'parent' in response.meta: 133 | item['parent'] = response.meta['parent'] 134 | else: 135 | item['parent'] = self.get_parent(response) 136 | 137 | # Movie Items 138 | 139 | if 'store' in response.meta: 140 | item['store'] = response.meta['store'] 141 | else: 142 | item['store'] = self.get_store(response) 143 | 144 | if 'director' in response.meta: 145 | item['director'] = response.meta['director'] 146 | else: 147 | item['director'] = self.get_director(response) 148 | 149 | if 'format' in response.meta: 150 | item['format'] = response.meta['format'] 151 | else: 152 | item['format'] = self.get_format(response) 153 | 154 | if 'back' in response.meta: 155 | item['back'] = response.meta['back'] 156 | else: 157 | item['back'] = self.get_back_image(response) 158 | 159 | if 'back' not in item or not item['back']: 160 | item['back'] = None 161 | item['back_blob'] = None 162 | else: 163 | if 'back_blob' in response.meta: 164 | item['back_blob'] = response.meta['back_blob'] 165 | else: 166 | item['back_blob'] = self.get_image_back_blob(response) 167 | 168 | if ('back_blob' not in item or not item['back_blob']) and item['back']: 169 | item['back_blob'] = self.get_image_from_link(item['back']) 170 | 171 | if 'back_blob' not in item: 172 | item['back_blob'] = None 173 | 174 | if 'sku' in response.meta: 175 | item['sku'] = response.meta['sku'] 176 | else: 177 | item['sku'] = self.get_sku(response) 178 | 179 | if hasattr(self, 'type'): 180 | item['type'] = self.type 181 | elif 'type' in response.meta: 182 | item['type'] = response.meta['type'] 183 | elif 'type' in self.get_selector_map(): 184 | item['type'] = self.get_selector_map('type') 185 | else: 186 | item['type'] = 'Scene' 187 | 188 | yield self.check_item(item, self.days) 189 | 190 | def get_date(self, response): 191 | if 'date' in self.get_selector_map(): 192 | scenedate = self.get_element(response, 'date', 're_date') 193 | if scenedate: 194 | if isinstance(scenedate, list): 195 | scenedate = scenedate[0] 196 | date_formats = self.get_selector_map('date_formats') if 'date_formats' in self.get_selector_map() else None 197 | return self.parse_date(self.cleanup_text(scenedate), date_formats=date_formats).isoformat() 198 | return None 199 | 200 | def get_tags(self, response): 201 | if 'tags' in self.get_selector_map(): 202 | tags = self.get_element(response, 'tags', "list") 203 | if tags and isinstance(tags, list): 204 | new_tags = [] 205 | for tag in tags: 206 | if ',' in tag: 207 | new_tags.extend(tag.split(',')) 208 | elif tag: 209 | new_tags.append(tag) 210 | return list(map(lambda x: string.capwords(x.strip()), new_tags)) 211 | return [] 212 | 213 | def get_performers(self, response): 214 | if 'performers' in self.get_selector_map(): 215 | performers = self.get_element(response, 'performers', "list") 216 | if performers and isinstance(performers, list): 217 | return list(map(lambda x: string.capwords(x.strip()), performers)) 218 | return [] 219 | 220 | def get_description(self, response): 221 | if 'description' in self.get_selector_map(): 222 | description = self.get_element(response, 'description', 're_description') 223 | if isinstance(description, list): 224 | description = ' '.join(description) 225 | if description: 226 | return self.cleanup_description(description) 227 | return '' 228 | 229 | def get_trailer(self, 
response, path=None): 230 | if 'trailer' in self.get_selector_map(): 231 | trailer = self.get_element(response, 'trailer', 're_trailer') 232 | if trailer: 233 | if path: 234 | return self.format_url(path, trailer) 235 | else: 236 | return self.format_link(response, trailer) 237 | 238 | return '' 239 | 240 | def get_duration(self, response): 241 | if 'duration' in self.get_selector_map(): 242 | duration = self.get_element(response, 'duration', 're_duration') 243 | if duration: 244 | if ":" in duration or re.search(r'(\d{1,2})M(\d{1,2})S', duration): 245 | duration = self.duration_to_seconds(duration) 246 | return duration 247 | return '' 248 | 249 | def get_store(self, response): 250 | if 'store' in self.get_selector_map(): 251 | return string.capwords(self.cleanup_text(self.get_element(response, 'store', 're_store'))) 252 | return None 253 | 254 | def get_director(self, response): 255 | if 'director' in self.get_selector_map(): 256 | director = self.get_element(response, 'director', 're_director') 257 | if director and isinstance(director, list): 258 | director = ", ".join(director) 259 | return string.capwords(self.cleanup_text(director)) 260 | return None 261 | 262 | def get_format(self, response): 263 | if 'format' in self.get_selector_map(): 264 | return string.capwords(self.cleanup_text(self.get_element(response, 'format', 're_format'))) 265 | return None 266 | 267 | def get_sku(self, response): 268 | if 'sku' in self.get_selector_map(): 269 | return string.capwords(self.cleanup_text(self.get_element(response, 'sku', 're_sku'))) 270 | return None 271 | 272 | def get_markers(self, response): 273 | # Until there's a better feel for Markers, will need to be done in the scraper 274 | return [] 275 | 276 | def get_merge_id(self, response): 277 | # Just a stub 278 | return None 279 | 280 | def get_title(self, response): 281 | if 'title' in self.get_selector_map(): 282 | title = self.get_element(response, 'title', 're_title') 283 | if title: 284 | if isinstance(title, list): 285 | title = title[0] 286 | return string.capwords(self.cleanup_text(title)) 287 | return None 288 | -------------------------------------------------------------------------------- /tpdb/BaseScraper.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from datetime import date, timedelta 3 | import re 4 | from PIL import Image 5 | import base64 6 | from io import BytesIO 7 | import html 8 | import logging 9 | import string 10 | from abc import ABC 11 | from urllib.parse import urlparse, unquote 12 | 13 | import dateparser 14 | import scrapy 15 | import tldextract 16 | 17 | from furl import furl 18 | from tpdb.helpers.http import Http 19 | from scrapy.utils.project import get_project_settings 20 | 21 | 22 | class BaseScraper(scrapy.Spider, ABC): 23 | limit_pages = 1 24 | force = False 25 | debug = False 26 | days = 9999 27 | max_pages = 100 28 | cookies = {} 29 | headers = {} 30 | page = 1 31 | 32 | custom_tpdb_settings = {} 33 | custom_scraper_settings = {} 34 | selector_map = {} 35 | regex = {} 36 | proxy_address = None 37 | 38 | title_trash = [] 39 | description_trash = ['Description:'] 40 | date_trash = ['Released:', 'Added:', 'Published:'] 41 | 42 | def __init__(self, *args, **kwargs): 43 | super(BaseScraper, self).__init__(*args, **kwargs) 44 | 45 | for name in self.get_selector_map(): 46 | if (name == 'external_id' or name.startswith('re_')) and name in self.get_selector_map() and self.get_selector_map()[name]: 47 | regexp, group, mod = 
self.get_regex(self.get_selector_map(name)) 48 | self.regex[name] = (re.compile(regexp, mod), group) 49 | 50 | self.days = int(self.days) 51 | if self.days < 9999: 52 | logging.info(f"Days to retrieve: {self.days}") 53 | self.force = bool(self.force) 54 | self.debug = bool(self.debug) 55 | self.page = int(self.page) 56 | 57 | if self.limit_pages is None: 58 | self.limit_pages = 1 59 | else: 60 | if self.limit_pages == 'all': 61 | self.limit_pages = sys.maxsize 62 | self.limit_pages = int(self.limit_pages) 63 | 64 | @classmethod 65 | def update_settings(cls, settings): 66 | cls.custom_tpdb_settings.update(cls.custom_scraper_settings) 67 | settings.update(cls.custom_tpdb_settings) 68 | cls.headers['User-Agent'] = settings['USER_AGENT'] 69 | if settings['DAYS']: 70 | cls.days = settings['DAYS'] 71 | super(BaseScraper, cls).update_settings(settings) 72 | 73 | def start_requests(self): 74 | settings = get_project_settings() 75 | 76 | if not hasattr(self, 'start_urls'): 77 | raise AttributeError('start_urls missing') 78 | 79 | if not self.start_urls: 80 | raise AttributeError('start_urls selector missing') 81 | 82 | meta = {} 83 | meta['page'] = self.page 84 | if 'USE_PROXY' in self.settings.attributes.keys(): 85 | use_proxy = self.settings.get('USE_PROXY') 86 | elif 'USE_PROXY' in settings.attributes.keys(): 87 | use_proxy = settings.get('USE_PROXY') 88 | else: 89 | use_proxy = None 90 | 91 | if use_proxy: 92 | print(f"Using Settings Defined Proxy: True ({settings.get('PROXY_ADDRESS')})") 93 | else: 94 | if self.proxy_address: 95 | meta['proxy'] = self.proxy_address 96 | print(f"Using Scraper Defined Proxy: True ({meta['proxy']})") 97 | else: 98 | print("Using Proxy: False") 99 | 100 | for link in self.start_urls: 101 | yield scrapy.Request(url=self.get_next_page_url(link, self.page), callback=self.parse, meta=meta, headers=self.headers, cookies=self.cookies) 102 | 103 | def get_selector_map(self, attr=None): 104 | if hasattr(self, 'selector_map'): 105 | if attr is None: 106 | return self.selector_map 107 | if attr not in self.selector_map: 108 | raise AttributeError(f'{attr} missing from selector map') 109 | return self.selector_map[attr] 110 | raise NotImplementedError('selector map missing') 111 | 112 | def get_image(self, response, path=None): 113 | force_update = self.settings.get('force_update') 114 | if force_update: 115 | force_update = True 116 | force_fields = self.settings.get('force_fields') 117 | if force_fields: 118 | force_fields = force_fields.split(",") 119 | 120 | if not force_update or (force_update and "image" in force_fields): 121 | if 'image' in self.get_selector_map(): 122 | image = self.get_element(response, 'image', 're_image') 123 | if isinstance(image, list): 124 | image = image[0] 125 | if path: 126 | return self.format_url(path, image) 127 | else: 128 | return self.format_link(response, image) 129 | return '' 130 | 131 | def get_back_image(self, response): 132 | if 'back' in self.get_selector_map(): 133 | image = self.get_element(response, 'back', 're_back') 134 | if isinstance(image, list): 135 | image = image[0] 136 | return self.format_link(response, image) 137 | return '' 138 | 139 | def get_image_blob(self, response): 140 | if 'image_blob' not in self.get_selector_map(): 141 | image = self.get_image(response) 142 | return self.get_image_blob_from_link(image) 143 | return None 144 | 145 | def get_image_back_blob(self, response): 146 | if 'image_blob' not in self.get_selector_map(): 147 | image = self.get_back_image(response) 148 | return 
self.get_image_blob_from_link(image) 149 | return None 150 | 151 | def get_image_from_link(self, image): 152 | if image: 153 | req = Http.get(image, headers=self.headers, cookies=self.cookies) 154 | if req and req.ok: 155 | return req.content 156 | return None 157 | 158 | def get_image_blob_from_link(self, image): 159 | force_update = self.settings.get('force_update') 160 | if force_update: 161 | force_update = True 162 | force_fields = self.settings.get('force_fields') 163 | if force_fields: 164 | force_fields = force_fields.split(",") 165 | 166 | if (not force_update or (force_update and "image" in force_fields)) and image: 167 | data = self.get_image_from_link(image) 168 | if data: 169 | try: 170 | img = BytesIO(data) 171 | img = Image.open(img) 172 | img = img.convert('RGB') 173 | width, height = img.size 174 | if height > 1080 or width > 1920: 175 | img.thumbnail((1920, 1080)) 176 | buffer = BytesIO() 177 | img.save(buffer, format="JPEG") 178 | data = buffer.getvalue() 179 | except Exception as ex: 180 | print(f"Could not decode image for evaluation: '{image}'. Error: ", ex) 181 | return base64.b64encode(data).decode('utf-8') 182 | return None 183 | 184 | @staticmethod 185 | def duration_to_seconds(time_text): 186 | duration = '' 187 | if ":" in time_text: 188 | time_text = time_text.split(":") 189 | time_text = [i for i in time_text if i] 190 | if len(time_text) == 3: 191 | duration = str(int(time_text[0]) * 3600 + int(time_text[1]) * 60 + int(time_text[2])) 192 | elif len(time_text) == 2: 193 | duration = str(int(time_text[0]) * 60 + int(time_text[1])) 194 | elif len(time_text) == 1: 195 | duration = time_text[0] 196 | elif re.search(r'(\d{1,2})M(\d{1,2})S', time_text): 197 | if "H" in time_text: 198 | duration = re.search(r'(\d{1,2})H(\d{1,2})M(\d{1,2})S', time_text) 199 | hours = int(duration.group(1)) * 3600 200 | minutes = int(duration.group(2)) * 60 201 | seconds = int(duration.group(3)) 202 | duration = str(hours + minutes + seconds) 203 | else: 204 | duration = re.search(r'(\d{1,2})M(\d{1,2})S', time_text) 205 | minutes = int(duration.group(1)) * 60 206 | seconds = int(duration.group(2)) 207 | duration = str(minutes + seconds) 208 | return duration 209 | 210 | def get_url(self, response): 211 | return self.prepare_url(response.url) 212 | 213 | def get_id(self, response): 214 | sceneid = self.get_from_regex(response.url, 'external_id') 215 | if "?nats" in sceneid: 216 | sceneid = re.search(r'(.*)\?nats', sceneid).group(1) 217 | return sceneid 218 | 219 | def get_site(self, response): 220 | return tldextract.extract(response.url).domain 221 | 222 | def get_network(self, response): 223 | return tldextract.extract(response.url).domain 224 | 225 | def get_parent(self, response): 226 | return tldextract.extract(response.url).domain 227 | 228 | def get_studio(self, response): 229 | if 'studio' in self.get_selector_map(): 230 | return string.capwords(self.cleanup_text(self.get_element(response, 'studio', 're_studio'))) 231 | return '' 232 | 233 | @staticmethod 234 | def process_xpath(response, selector: str): 235 | if selector.startswith('//') or selector.startswith('./'): 236 | return response.xpath(selector) 237 | 238 | if selector.startswith('/'): 239 | return response.dpath(selector) 240 | 241 | return response.css(selector) 242 | 243 | def format_link(self, response, link): 244 | return self.format_url(response.url, link) 245 | 246 | @staticmethod 247 | def format_url(base, path): 248 | if path.startswith('http'): 249 | return path 250 | 251 | if path.startswith('//'): 252 |
return 'https:' + path 253 | 254 | new_url = urlparse(path) 255 | url = urlparse(base) 256 | url = url._replace(path=new_url.path, query=new_url.query) 257 | 258 | return BaseScraper.prepare_url(url.geturl()) 259 | 260 | @staticmethod 261 | def prepare_url(url: str) -> str: 262 | if not url: 263 | return '' 264 | 265 | return furl(unquote(url)).url 266 | 267 | def get_next_page_url(self, base, page): 268 | return self.format_url(base, self.get_selector_map('pagination') % page) 269 | 270 | def get_from_regex(self, text, re_name): 271 | if re_name in self.regex and self.regex[re_name]: 272 | regexp, group, mod = self.get_regex(self.regex[re_name]) 273 | 274 | r = regexp.search(text) 275 | if r: 276 | return r.group(group) 277 | return None 278 | 279 | return text 280 | 281 | @staticmethod 282 | def get_regex(regexp, group=1, mod=re.IGNORECASE): 283 | if isinstance(regexp, tuple): 284 | mod = regexp[2] if len(regexp) > 2 else mod 285 | group = regexp[1] if len(regexp) > 1 else group 286 | regexp = regexp[0] 287 | 288 | return regexp, group, mod 289 | 290 | @staticmethod 291 | def cleanup_text(text, trash_words=None): 292 | if trash_words is None: 293 | trash_words = [] 294 | 295 | text = html.unescape(text) 296 | for trash in trash_words: 297 | text = text.replace(trash, '') 298 | 299 | return text.strip() 300 | 301 | def cleanup_title(self, title): 302 | return string.capwords(self.cleanup_text(title, self.title_trash)) 303 | 304 | def cleanup_description(self, description): 305 | return self.cleanup_text(description, self.description_trash) 306 | 307 | def cleanup_date(self, item_date): 308 | return self.cleanup_text(item_date, self.date_trash) 309 | 310 | def parse_date(self, item_date, date_formats=None): 311 | item_date = self.cleanup_date(item_date) 312 | settings = {'TIMEZONE': 'UTC'} 313 | 314 | return dateparser.parse(item_date, date_formats=date_formats, settings=settings) 315 | 316 | def check_item(self, item, days=None): 317 | if 'date' not in item: 318 | return item 319 | if item['date']: 320 | if days: 321 | if days > 27375: 322 | filter_date = '0000-00-00' 323 | else: 324 | days = self.days 325 | filter_date = date.today() - timedelta(days) 326 | filter_date = filter_date.strftime('%Y-%m-%d') 327 | 328 | if self.debug: 329 | if not item['date'] > filter_date: 330 | item['filtered'] = 'Scene filtered due to date restraint' 331 | print(item) 332 | else: 333 | if filter_date: 334 | if item['date'] > filter_date: 335 | return item 336 | return None 337 | return item 338 | 339 | def get_element(self, response, selector, regex=None): 340 | selector = self.get_selector_map(selector) 341 | if selector: 342 | element = self.process_xpath(response, selector) 343 | if element: 344 | if (len(element) > 1 or regex == "list") and "/script" not in selector: 345 | element = list(map(lambda x: x.strip(), element.getall())) 346 | else: 347 | if isinstance(element, list): 348 | element = element.getall() 349 | element = " ".join(element) 350 | else: 351 | element = element.get() 352 | element = self.get_from_regex(element, regex) 353 | if element: 354 | element = element.strip() 355 | if element: 356 | if isinstance(element, list): 357 | element = [i for i in element if i] 358 | return element 359 | return '' 360 | -------------------------------------------------------------------------------- /tpdb/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ThePornDatabase/scrapy/f014440ab1bd74077560323f1a615587ed488afd/tpdb/__init__.py -------------------------------------------------------------------------------- /tpdb/custommiddlewares.py: -------------------------------------------------------------------------------- 1 | from scrapy.utils.project import get_project_settings 2 | 3 | 4 | class CustomProxyMiddleware(object): 5 | settings = get_project_settings() 6 | 7 | if 'PROXY_ADDRESS' in settings.attributes.keys(): 8 | proxy_address = settings.get('PROXY_ADDRESS') 9 | else: 10 | proxy_address = None 11 | 12 | if 'USE_PROXY' in settings.attributes.keys(): 13 | use_proxy = settings.get('USE_PROXY') 14 | else: 15 | use_proxy = None 16 | 17 | def process_request(self, request, spider): 18 | if 'proxy' not in request.meta: 19 | if self.use_proxy or spider.settings.get('USE_PROXY'): 20 | request.meta['proxy'] = self.proxy_address 21 | else: 22 | request.meta['proxy'] = '' 23 | -------------------------------------------------------------------------------- /tpdb/helpers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ThePornDatabase/scrapy/f014440ab1bd74077560323f1a615587ed488afd/tpdb/helpers/__init__.py -------------------------------------------------------------------------------- /tpdb/helpers/flare_solverr.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | 3 | from .http import Http 4 | 5 | 6 | class FlareSolverr: 7 | __session = None 8 | 9 | def __init__(self, base_url: str): 10 | self.__BASE_URL = base_url 11 | self.__API_URL = f'{self.__BASE_URL}/v1' 12 | self.__session = self.__set_session() 13 | 14 | def __del__(self): 15 | if self.__session: 16 | Http.post(self.__API_URL, json={'cmd': 'sessions.destroy', 'session': self.__session}) 17 | 18 | def __set_session(self) -> str: 19 | sessions = self.__get_sessions() 20 | if sessions: 21 | session = sessions[0] 22 | else: 23 | session = self.__create_session() 24 | 25 | return session 26 | 27 | def __create_session(self) -> str: 28 | req = Http.post(self.__API_URL, json={'cmd': 'sessions.create'}) 29 | 30 | session = None 31 | if req and req.ok: 32 | session = req.json()['session'] 33 | 34 | return session 35 | 36 | def __get_sessions(self) -> list: 37 | req = Http.post(self.__API_URL, json={'cmd': 'sessions.list'}) 38 | sessions = None 39 | if req and req.ok: 40 | sessions = req.json()['sessions'] 41 | 42 | return sessions 43 | 44 | def __request(self, url: str, method: str, **kwargs): 45 | cookies = kwargs.pop('cookies', {}) 46 | data = kwargs.pop('data', {}) 47 | method = method.lower() 48 | 49 | if not self.__session: 50 | return 51 | 52 | if method not in ['get', 'post']: 53 | return 54 | 55 | params = { 56 | 'cmd': f'request.{method}', 57 | 'session': self.__session, 58 | 'url': url, 59 | } 60 | 61 | if method == 'post': 62 | params['postData'] = data 63 | 64 | if cookies: 65 | domain = urlparse(url).hostname 66 | if isinstance(cookies, dict): 67 | cookies = [{'name': name, 'value': value, 'domain': domain} for name, value in cookies.items()] 68 | params['cookies'] = cookies 69 | 70 | req = Http.post(self.__API_URL, json=params) 71 | if req and req.ok: 72 | resp = req.json()['solution'] 73 | headers = resp['headers'] 74 | cookies = {cookie['name']: cookie['value'] for cookie in resp['cookies']} 75 | 76 | return Http.fake_response(url, int(resp['status']), resp['response'], headers, cookies) 77 | 
78 | return 79 | 80 | def get(self, url: str, **kwargs): 81 | return self.__request(url, 'GET', **kwargs) 82 | 83 | def post(self, url: str, **kwargs): 84 | return self.__request(url, 'POST', **kwargs) 85 | 86 | def get_api_url(self): 87 | return self.__API_URL 88 | 89 | def get_session(self): 90 | return self.__session 91 | -------------------------------------------------------------------------------- /tpdb/helpers/http.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | import urllib3 5 | from requests.cookies import cookiejar_from_dict 6 | from requests.models import Response 7 | 8 | urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) 9 | 10 | 11 | class Http: 12 | @staticmethod 13 | def request(method: str, url: str, **kwargs): 14 | req = None 15 | try: 16 | req = requests.request(method, url, verify=False, **kwargs) 17 | except Exception as e: 18 | logging.error(e) 19 | pass 20 | 21 | return req 22 | 23 | @staticmethod 24 | def get(url: str, **kwargs): 25 | return Http.request('GET', url, **kwargs) 26 | 27 | @staticmethod 28 | def post(url: str, **kwargs): 29 | return Http.request('POST', url, **kwargs) 30 | 31 | @staticmethod 32 | def head(url: str, **kwargs): 33 | return Http.request('HEAD', url, **kwargs) 34 | 35 | @staticmethod 36 | def fake_response(url: str, status_code: int, content, headers: dict, cookies: dict) -> Response: 37 | content = content if isinstance(content, bytes) else content.encode('UTF-8') 38 | cookies = {} if cookies is None else cookies 39 | headers = {} if headers is None else headers 40 | 41 | response = Response() 42 | response.url = url 43 | response.status_code = status_code 44 | response._content = content 45 | response.headers = headers 46 | response.cookies = cookiejar_from_dict(cookies) 47 | 48 | return response 49 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_dpath/__init__.py: -------------------------------------------------------------------------------- 1 | from .response import DPathResponse 2 | from .middleware import DPathMiddleware 3 | 4 | __all__ = ['DPathResponse', 'DPathMiddleware'] 5 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_dpath/dpath.py: -------------------------------------------------------------------------------- 1 | import dpath.util 2 | 3 | 4 | class ScrapyDPath: 5 | __result = None 6 | 7 | def __init__(self, obj, selector, separator='/'): 8 | _dpath = None 9 | try: 10 | _dpath = dpath.util.values(obj, selector, separator=separator) 11 | except: 12 | pass 13 | 14 | if _dpath: 15 | self.__result = [str(res) for res in _dpath] 16 | 17 | def __repr__(self): 18 | return repr(self.__result) 19 | 20 | def __len__(self): 21 | return len(self.__result) if self.__result else 0 22 | 23 | def __iter__(self): 24 | yield self.__result 25 | 26 | def get(self): 27 | return self.__result[0] if self.__result else None 28 | 29 | def getall(self): 30 | return self.__result if self.__result else None 31 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_dpath/middleware.py: -------------------------------------------------------------------------------- 1 | from scrapy.http import TextResponse 2 | 3 | from tpdb.helpers.scrapy_dpath import DPathResponse 4 | 5 | 6 | class DPathMiddleware(object): 7 | def process_response(self, request, response, spider): 8 | if isinstance(response, 
TextResponse): 9 | return DPathResponse(request, response) 10 | 11 | # Pass non-text responses (e.g. binary downloads) through unchanged 12 | return response 13 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_dpath/response.py: -------------------------------------------------------------------------------- 1 | from scrapy.http import TextResponse 2 | 3 | from tpdb.helpers.scrapy_dpath.dpath import ScrapyDPath 4 | 5 | 6 | class DPathResponse(TextResponse): 7 | request = None 8 | response = None 9 | 10 | def __init__(self, request, response): 11 | self.request = request 12 | self.response = response 13 | 14 | super(DPathResponse, self).__init__(response.url, 15 | status=response.status, 16 | headers=response.headers, 17 | body=response.body, 18 | flags=response.flags, 19 | request=response.request, 20 | certificate=response.certificate, 21 | ip_address=response.ip_address, 22 | protocol=response.protocol) 23 | 24 | def dpath(self, selector): 25 | return ScrapyDPath(self.response.json(), selector) 26 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_flare/__init__.py: -------------------------------------------------------------------------------- 1 | from .response import FlareResponse 2 | from .request import FlareRequest 3 | from .middleware import FlareMiddleware 4 | 5 | __all__ = ['FlareResponse', 'FlareRequest', 'FlareMiddleware'] 6 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_flare/middleware.py: -------------------------------------------------------------------------------- 1 | from tpdb.helpers.flare_solverr import FlareSolverr 2 | from tpdb.helpers.scrapy_flare import FlareRequest, FlareResponse 3 | 4 | 5 | class FlareMiddleware(object): 6 | def __init__(self, crawler, flare_solverr): 7 | self.crawler = crawler 8 | self.flare_solverr = flare_solverr 9 | 10 | @classmethod 11 | def from_crawler(cls, crawler): 12 | s = crawler.settings 13 | flare_url = s.get('FLARE_URL', '') 14 | flare_solverr = FlareSolverr(flare_url) 15 | 16 | return cls(crawler, flare_solverr) 17 | 18 | def process_request(self, request, spider): 19 | if request.url == self.flare_solverr.get_api_url(): 20 | return 21 | 22 | new_request = FlareRequest(request.url, 23 | self.flare_solverr, 24 | method=request.method, 25 | meta=request.meta, 26 | callback=request.callback, 27 | priority=request.priority) 28 | 29 | return new_request 30 | 31 | def process_response(self, request, response, spider): 32 | if request.url != self.flare_solverr.get_api_url(): 33 | return request 34 | 35 | new_response = FlareResponse(response) 36 | return new_response 37 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_flare/request.py: -------------------------------------------------------------------------------- 1 | import json 2 | from urllib.parse import urlparse 3 | from scrapy import Request 4 | 5 | 6 | class FlareRequest(Request): 7 | def __init__(self, 8 | url, 9 | flare_solverr=None, 10 | callback=None, 11 | method='GET', 12 | cookies=None, 13 | meta=None, 14 | **kwargs): 15 | 16 | if not flare_solverr: 17 | return 18 | 19 | method = method.lower() 20 | params = { 21 | 'cmd': f'request.{method}', 22 | 'session': flare_solverr.get_session(), 23 | 'url': url, 24 | } 25 | 26 | if cookies: 27 | domain = urlparse(url).hostname 28 | if isinstance(cookies, dict): 29 | cookies = [{'name': name, 'value': value, 'domain': domain} for name, value in cookies.items()] 30 | params['cookies'] = cookies 31 | 32 | 
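# The original request is rewritten as a POST to the FlareSolverr API: the
# target URL, session, HTTP method and any cookies travel in the JSON body,
# and FlareSolverr performs the actual fetch on the spider's behalf.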
super(FlareRequest, self).__init__(url=flare_solverr.get_api_url(), 33 | method='POST', 34 | callback=callback, 35 | meta=meta, 36 | body=json.dumps(params), 37 | headers={'Content-Type': 'application/json'}, 38 | **kwargs) 39 | -------------------------------------------------------------------------------- /tpdb/helpers/scrapy_flare/response.py: -------------------------------------------------------------------------------- 1 | from scrapy.http import HtmlResponse 2 | 3 | 4 | class FlareResponse(HtmlResponse): 5 | def __init__(self, response): 6 | resp = response.json()['solution'] 7 | url = resp['url'] 8 | status = resp['status'] 9 | body = resp['response'] 10 | 11 | super(FlareResponse, self).__init__(url, status=status, body=body, encoding='UTF-8', request=response.request) 12 | -------------------------------------------------------------------------------- /tpdb/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class SceneItem(scrapy.Item): 10 | back = scrapy.Field() 11 | back_blob = scrapy.Field() 12 | date = scrapy.Field() 13 | description = scrapy.Field() 14 | director = scrapy.Field() 15 | duration = scrapy.Field() 16 | format = scrapy.Field() 17 | id = scrapy.Field() 18 | merge_id = scrapy.Field() 19 | image = scrapy.Field() 20 | image_blob = scrapy.Field() 21 | markers = scrapy.Field() 22 | network = scrapy.Field() 23 | parent = scrapy.Field() 24 | performers = scrapy.Field() 25 | scenes = scrapy.Field() 26 | site = scrapy.Field() 27 | sku = scrapy.Field() 28 | store = scrapy.Field() 29 | tags = scrapy.Field() 30 | title = scrapy.Field() 31 | trailer = scrapy.Field() 32 | type = scrapy.Field() 33 | url = scrapy.Field() 34 | 35 | 36 | class MovieItem(scrapy.Item): 37 | title = scrapy.Field() 38 | description = scrapy.Field() 39 | site = scrapy.Field() 40 | store = scrapy.Field() 41 | date = scrapy.Field() 42 | front = scrapy.Field() 43 | front_blob = scrapy.Field() 44 | back = scrapy.Field() 45 | back_blob = scrapy.Field() 46 | performers = scrapy.Field() 47 | tags = scrapy.Field() 48 | url = scrapy.Field() 49 | markers = scrapy.Field() 50 | id = scrapy.Field() 51 | trailer = scrapy.Field() 52 | studio = scrapy.Field() 53 | director = scrapy.Field() 54 | format = scrapy.Field() 55 | length = scrapy.Field() 56 | year = scrapy.Field() 57 | rating = scrapy.Field() 58 | sku = scrapy.Field() 59 | upc = scrapy.Field() 60 | duration = scrapy.Field() 61 | 62 | 63 | class PerformerItem(scrapy.Item): 64 | name = scrapy.Field() 65 | network = scrapy.Field() 66 | url = scrapy.Field() 67 | image = scrapy.Field() 68 | image_blob = scrapy.Field() 69 | bio = scrapy.Field() 70 | gender = scrapy.Field() 71 | birthday = scrapy.Field() 72 | astrology = scrapy.Field() 73 | birthplace = scrapy.Field() 74 | ethnicity = scrapy.Field() 75 | nationality = scrapy.Field() 76 | haircolor = scrapy.Field() 77 | weight = scrapy.Field() 78 | height = scrapy.Field() 79 | measurements = scrapy.Field() 80 | tattoos = scrapy.Field() 81 | piercings = scrapy.Field() 82 | cupsize = scrapy.Field() 83 | fakeboobs = scrapy.Field() 84 | eyecolor = scrapy.Field() 85 | -------------------------------------------------------------------------------- /tpdb/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 
| # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | import re 6 | 7 | from pymongo import MongoClient 8 | from scrapy import signals 9 | from scrapy.exceptions import IgnoreRequest 10 | 11 | 12 | class TpdbSceneDownloaderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the downloader middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | 22 | cls.crawler = crawler 23 | 24 | if crawler.settings['ENABLE_MONGODB']: 25 | db = MongoClient(crawler.settings['MONGODB_URL']) 26 | cls.db = db['scrapy'] 27 | 28 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 29 | return s 30 | 31 | def process_request(self, request, spider): 32 | if re.search(spider.get_selector_map('external_id'), request.url) is None: 33 | return None 34 | 35 | if spider.force is True: 36 | return None 37 | 38 | # Used in production - we store the scene in MongoDB for caching reasons 39 | if self.crawler.settings['ENABLE_MONGODB']: 40 | result = self.db.scenes.find_one({'url': request.url}) 41 | if result is not None and ('api_response' not in result or not result['api_response']): 42 | raise IgnoreRequest 43 | 44 | return None 45 | 46 | def process_response(self, request, response, spider): 47 | # Called with the response returned from the downloader. 48 | 49 | # Must either; 50 | # - return a Response object 51 | # - return a Request object 52 | # - or raise IgnoreRequest 53 | return response 54 | 55 | def process_exception(self, request, exception, spider): 56 | # Called when a download handler or a process_request() 57 | # (from other downloader middleware) raises an exception. 58 | 59 | # Must either: 60 | # - return None: continue processing this exception 61 | # - return a Response object: stops process_exception() chain 62 | # - return a Request object: stops process_exception() chain 63 | pass 64 | 65 | def spider_opened(self, spider): 66 | spider.logger.info('Spider opened: %s' % spider.name) 67 | 68 | 69 | class TpdbMovieDownloaderMiddleware: 70 | # Not all methods need to be defined. If a method is not defined, 71 | # scrapy acts as if the downloader middleware does not modify the 72 | # passed objects. 73 | 74 | @classmethod 75 | def from_crawler(cls, crawler): 76 | # This method is used by Scrapy to create your spiders. 77 | s = cls() 78 | 79 | cls.crawler = crawler 80 | 81 | if crawler.settings['ENABLE_MONGODB']: 82 | db = MongoClient(crawler.settings['MONGODB_URL']) 83 | cls.db = db['scrapy'] 84 | 85 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 86 | return s 87 | 88 | def process_request(self, request, spider): 89 | 90 | if spider.force is True: 91 | return None 92 | 93 | # Used in production - we store the scene in MongoDB for caching reasons 94 | if self.crawler.settings['ENABLE_MONGODB']: 95 | result = self.db.scenes.find_one({'url': request.url}) 96 | if result is not None and ('api_response' not in result or not result['api_response']): 97 | raise IgnoreRequest 98 | 99 | return None 100 | 101 | def process_response(self, request, response, spider): 102 | # Called with the response returned from the downloader. 
103 | 104 | # Must either; 105 | # - return a Response object 106 | # - return a Request object 107 | # - or raise IgnoreRequest 108 | return response 109 | 110 | def process_exception(self, request, exception, spider): 111 | # Called when a download handler or a process_request() 112 | # (from other downloader middleware) raises an exception. 113 | 114 | # Must either: 115 | # - return None: continue processing this exception 116 | # - return a Response object: stops process_exception() chain 117 | # - return a Request object: stops process_exception() chain 118 | pass 119 | 120 | def spider_opened(self, spider): 121 | spider.logger.info('Spider opened: %s' % spider.name) 122 | 123 | 124 | class TpdbPerformerDownloaderMiddleware: 125 | # Not all methods need to be defined. If a method is not defined, 126 | # scrapy acts as if the downloader middleware does not modify the 127 | # passed objects. 128 | @classmethod 129 | def from_crawler(cls, crawler): 130 | # This method is used by Scrapy to create your spiders. 131 | s = cls() 132 | 133 | cls.crawler = crawler 134 | 135 | if crawler.settings['ENABLE_MONGODB']: 136 | db = MongoClient(crawler.settings['MONGODB_URL']) 137 | cls.db = db['scrapy'] 138 | 139 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 140 | return s 141 | 142 | def process_request(self, request, spider): 143 | if re.search(spider.get_selector_map('external_id'), request.url) is None: 144 | return None 145 | 146 | if spider.force is True: 147 | return None 148 | 149 | # Used in production - we store the scene in MongoDB for caching reasons 150 | if self.crawler.settings['ENABLE_MONGODB']: 151 | result = self.db.performers.find_one({'url': request.url}) 152 | if result is not None and ('api_response' not in result or not result['api_response']): 153 | raise IgnoreRequest 154 | 155 | return None 156 | 157 | def process_response(self, request, response, spider): 158 | # Called with the response returned from the downloader. 159 | 160 | # Must either; 161 | # - return a Response object 162 | # - return a Request object 163 | # - or raise IgnoreRequest 164 | return response 165 | 166 | def process_exception(self, request, exception, spider): 167 | # Called when a download handler or a process_request() 168 | # (from other downloader middleware) raises an exception. 
169 | 170 | # Must either: 171 | # - return None: continue processing this exception 172 | # - return a Response object: stops process_exception() chain 173 | # - return a Request object: stops process_exception() chain 174 | pass 175 | 176 | def spider_opened(self, spider): 177 | spider.logger.info('Spider opened: %s' % spider.name) 178 | -------------------------------------------------------------------------------- /tpdb/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | import hashlib 7 | import html 8 | import re 9 | import json 10 | import logging 11 | import time 12 | 13 | from pathlib import Path 14 | from datetime import datetime 15 | 16 | from pymongo import MongoClient 17 | from scrapy.exporters import JsonItemExporter, JsonLinesItemExporter 18 | 19 | from tpdb.BaseScraper import BaseScraper 20 | from tpdb.helpers.http import Http 21 | 22 | 23 | class TpdbPipeline: 24 | def process_item(self, item, spider): 25 | return item 26 | 27 | 28 | class TpdbApiScenePipeline: 29 | def __init__(self, crawler): 30 | if crawler.settings['ENABLE_MONGODB']: 31 | db = MongoClient(crawler.settings['MONGODB_URL']) 32 | self.db = db['scrapy'] 33 | 34 | self.crawler = crawler 35 | 36 | if crawler.settings.get('path'): 37 | path = crawler.settings.get('path') 38 | else: 39 | path = crawler.settings.get('DEFAULT_EXPORT_PATH') 40 | 41 | if crawler.settings.get('FILTER_TAGS'): 42 | logging.info(f"Loading Scene Tag Alias File: {crawler.settings.get('FILTER_TAG_FILENAME')}") 43 | with open(crawler.settings.get('FILTER_TAG_FILENAME'), encoding='utf-8') as f: 44 | self.tagaliases = json.load(f) 45 | 46 | if crawler.settings.get('file'): 47 | filename = crawler.settings.get('file') 48 | if '\\' not in filename and '/' not in filename: 49 | filename = Path(path, filename) 50 | else: 51 | filename = Path(path, f'{crawler.spidercls.name}_{time.strftime("%Y%m%d-%H%M")}.json') 52 | 53 | if crawler.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 54 | print(f'*** Exporting to file: {filename}') 55 | self.fp = open(filename, 'wb') 56 | self.fp.write('{"scenes":['.encode()) 57 | 58 | if crawler.settings.getbool('oneline'): 59 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8') 60 | else: 61 | self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8', sort_keys=True, indent=2) 62 | 63 | @classmethod 64 | def from_crawler(cls, crawler): 65 | return cls(crawler) 66 | 67 | async def process_item(self, item, spider): 68 | if spider.debug is True: 69 | return item 70 | 71 | # So we don't re-send scenes that have already been scraped 72 | if self.crawler.settings['ENABLE_MONGODB']: 73 | if spider.force is not True: 74 | result = self.db.scenes.find_one({'url': item['url']}) 75 | if result is not None: 76 | return 77 | 78 | if self.crawler.settings['FILTER_TAGS']: 79 | item['tags'] = self.clean_tags(item['tags'], self.tagaliases) 80 | 81 | if item['date']: 82 | item['date'] = re.search(r'(\d{4}-\d{2}-\d{2})', item['date']).group(1) 83 | 84 | if "back" not in item: 85 | item['back'] = '' 86 | if "back_blob" not in item: 87 | item['back_blob'] = '' 88 | if "director" not in item: 89 | item['director'] = '' 90 | if "duration" not in item: 91 | item['duration'] = '' 92 | if "format" not in item: 93 | item['format'] = 
'' 94 | if "markers" not in item: 95 | item['markers'] = [] 96 | if "scenes" not in item: 97 | item['scenes'] = [] 98 | if "sku" not in item: 99 | item['sku'] = '' 100 | if "merge_id" not in item: 101 | item['merge_id'] = '' 102 | if "store" not in item: 103 | item['store'] = '' 104 | if "type" not in item: 105 | item['type'] = 'Scene' 106 | 107 | item['title'] = html.unescape(item['title']) 108 | item['description'] = html.unescape(item['description']) 109 | 110 | payload = { 111 | 'back': BaseScraper.prepare_url(item['back']), 112 | 'back_blob': item['back_blob'], 113 | 'date': item['date'], 114 | 'description': item['description'], 115 | 'director': item['director'], 116 | 'duration': item['duration'], 117 | 'external_id': str(item['id']), 118 | 'merge_id': str(item['merge_id']), 119 | 'force_update': self.crawler.settings.getbool('FORCE_UPDATE'), 120 | 'format': item['format'], 121 | 'image': BaseScraper.prepare_url(item['image']), 122 | 'image_blob': item['image_blob'], 123 | 'markers': item['markers'], 124 | 'network': item['network'], 125 | 'parent': item['parent'], 126 | 'performers': item['performers'], 127 | 'scenes': item['scenes'], 128 | 'site': item['site'], 129 | 'sku': item['sku'], 130 | 'store': item['store'], 131 | 'tags': item['tags'], 132 | 'title': item['title'], 133 | 'trailer': BaseScraper.prepare_url(item['trailer']), 134 | 'type': item['type'], 135 | 'url': BaseScraper.prepare_url(item['url']), 136 | } 137 | 138 | # Post the scene to the API - requires auth with permissions 139 | disp_result = "" 140 | if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get('local'): 141 | headers = { 142 | 'Authorization': f'Bearer {self.crawler.settings["TPDB_API_KEY"]}', 143 | 'Accept': 'application/json', 144 | 'Content-Type': 'application/json', 145 | 'User-Agent': 'tpdb-scraper/1.0.0' 146 | } 147 | 148 | response = Http.post('https://api.metadataapi.net/scenes', json=payload, headers=headers, timeout=2) 149 | if response: 150 | if response.ok: 151 | disp_result = f'{disp_result} Submitted OK' 152 | else: 153 | disp_result = f'{disp_result} Submission Error: Code #{response.status_code}' 154 | else: 155 | disp_result = f'{disp_result} Submission Error: No Response Code' 156 | print(response) 157 | if response: 158 | logging.info(response.content) 159 | url_hash = hashlib.sha1(str(item['url']).encode('utf-8')).hexdigest() 160 | 161 | if self.crawler.settings['MONGODB_ENABLE']: 162 | if not response.ok: 163 | self.db.errors.replace_one({'_id': url_hash}, { 164 | 'url': item['url'], 165 | 'error': 1, 166 | 'when': datetime.now().isoformat(), 167 | 'response': response.json() 168 | }, upsert=True) 169 | else: 170 | self.db.scenes.replace_one( 171 | {'_id': url_hash}, dict(item), upsert=True) 172 | else: 173 | disp_result = 'Local Run, Not Submitted' 174 | 175 | if spider.settings.get('localdump'): 176 | # Toss to local TPDB Instance 177 | headers = { 178 | 'Authorization': f'Bearer {self.crawler.settings["TPDB_TEST_API_KEY"]}', 179 | 'Accept': 'application/json', 180 | 'Content-Type': 'application/json', 181 | 'User-Agent': 'tpdb-scraper/1.0.0' 182 | } 183 | response = Http.post('http://api.tpdb.test/scenes', json=payload, headers=headers) 184 | if response: 185 | if response.ok: 186 | disp_result = disp_result + '\tSubmitted to Local OK' 187 | else: 188 | disp_result = f'{disp_result} \tSubmission to Local Error: Code #%d{response.status_code}' 189 | else: 190 | disp_result = disp_result + '\tSubmission to Local Error: No Response Code' 191 | if response: 192 | 
logging.info(response.content) 193 | # ############################# 194 | 195 | if (spider.settings.getbool('display') or self.crawler.settings['DISPLAY_ITEMS']) and spider.settings.get('LOG_LEVEL') == 'INFO': 196 | if len(item['title']) >= 50: 197 | title_length = 5 198 | else: 199 | title_length = 55 - len(item['title']) 200 | 201 | if len(item['site']) >= 15: 202 | site_length = 5 203 | else: 204 | site_length = 20 - len(item['site']) 205 | 206 | if item['date']: 207 | if "T" in item['date']: 208 | disp_date = re.search(r'(.*)T\d', item['date']).group(1) 209 | else: 210 | disp_date = item['date'] 211 | else: 212 | disp_date = "Calculated" 213 | 214 | logging.info(f"Item: {item['title'][0:50]}" + " " * title_length + f"{item['site'][0:15]}" + " " * site_length + f"\t{str(item['id'])[0:15]}\t{disp_date}\t{item['url']}\t{disp_result}") 215 | 216 | if spider.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 217 | item2 = item.copy() 218 | if not spider.settings.get('showblob'): 219 | if 'image_blob' in item2: 220 | item2.pop('image_blob', None) 221 | item2.pop('back_blob', None) 222 | self.exporter.export_item(item2) 223 | 224 | return item 225 | 226 | def clean_tags(self, tags, aliaslist): 227 | tags2 = [] 228 | if tags: 229 | for tag in tags: 230 | pointer = 0 231 | for alias in aliaslist: 232 | if not pointer and tag.lower().strip() == alias['alias'].lower().strip(): 233 | tags2.append(alias['tag']) 234 | pointer = 1 235 | break 236 | if not pointer: 237 | tags2.append(tag.rstrip(".").rstrip(",").strip()) 238 | 239 | tags2 = [i for n, i in enumerate(tags2) if i not in tags2[:n]] 240 | return tags2 241 | 242 | def close_spider(self, spider): 243 | if spider.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 244 | self.fp.write(']}'.encode()) 245 | self.fp.close() 246 | 247 | 248 | class TpdbApiMoviePipeline: 249 | def __init__(self, crawler): 250 | if crawler.settings['ENABLE_MONGODB']: 251 | db = MongoClient(crawler.settings['MONGODB_URL']) 252 | self.db = db['scrapy'] 253 | 254 | self.crawler = crawler 255 | 256 | if crawler.settings.get('path'): 257 | path = crawler.settings.get('path') 258 | else: 259 | path = crawler.settings.get('DEFAULT_EXPORT_PATH') 260 | 261 | if crawler.settings.get('FILTER_TAGS'): 262 | logging.info(f"Loading Movie Tag Alias File: {crawler.settings.get('FILTER_TAG_FILENAME')}") 263 | with open(crawler.settings.get('FILTER_TAG_FILENAME'), encoding='utf-8') as f: 264 | self.tagaliases = json.load(f) 265 | 266 | if crawler.settings.get('file'): 267 | filename = crawler.settings.get('file') 268 | if '\\' not in filename and '/' not in filename: 269 | filename = Path(path, filename) 270 | else: 271 | filename = Path(path, f'{crawler.spidercls.name}_{time.strftime("%Y%m%d-%H%M")}.json') 272 | 273 | if crawler.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 274 | print(f'*** Exporting to file: {filename}') 275 | self.fp = open(filename, 'wb') 276 | self.fp.write('{"movies":['.encode()) 277 | 278 | if crawler.settings.getbool('oneline'): 279 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8') 280 | else: 281 | self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8', sort_keys=True, indent=2) 282 | 283 | @classmethod 284 | def from_crawler(cls, crawler): 285 | return cls(crawler) 286 | 287 | async def process_item(self, item, spider): 288 | if spider.debug is True: 289 | return item 290 | 291 | # So we don't re-send scenes that have already been scraped 
292 | if self.crawler.settings['ENABLE_MONGODB']: 293 | if spider.force is not True: 294 | result = self.db.scenes.find_one({'url': item['url']}) 295 | if result is not None: 296 | return 297 | 298 | if self.crawler.settings['FILTER_TAGS']: 299 | item['tags'] = self.clean_tags(item['tags'], self.tagaliases) 300 | 301 | if 'length' in item and 'duration' not in item: 302 | item['duration'] = None 303 | if item['length']: 304 | length = int(item['length']) 305 | if length < 450: 306 | length = length * 60 307 | item['length'] = str(length) 308 | item['duration'] = str(length) 309 | 310 | if "markers" not in item: 311 | item['markers'] = [] 312 | 313 | payload = { 314 | 'title': item['title'], 315 | 'description': item['description'], 316 | 'site': item['site'], 317 | 'date': item['date'], 318 | 'front': item['front'], 319 | 'front_blob': item['front_blob'], 320 | 'back': item['back'], 321 | 'back_blob': item['back_blob'], 322 | 'performers': item['performers'], 323 | 'tags': item['tags'], 324 | 'url': item['url'], 325 | 'external_id': str(item['id']), 326 | 'trailer': item['trailer'], 327 | 'markers': item['markers'], 328 | 'studio': item['studio'], 329 | 'director': item['director'], 330 | 'format': item['format'], 331 | 'length': item['length'], 332 | 'duration': item['duration'], 333 | 'year': item['year'], 334 | 'rating': item['rating'], 335 | 'sku': item['sku'], 336 | 'upc': item['upc'], 337 | 'store': item['store'], 338 | 'force_update': self.crawler.settings.getbool('FORCE_UPDATE'), 339 | } 340 | 341 | # Post the scene to the API - requires auth with permissions 342 | disp_result = "" 343 | if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get('local'): 344 | headers = { 345 | 'Authorization': f'Bearer {self.crawler.settings["TPDB_API_KEY"]}', 346 | 'Accept': 'application/json', 347 | 'Content-Type': 'application/json', 348 | 'User-Agent': 'tpdb-scraper/1.0.0' 349 | } 350 | 351 | response = Http.post('https://api.metadataapi.net/movies', json=payload, headers=headers, timeout=2) 352 | if response: 353 | if response.ok: 354 | disp_result = f'{disp_result} Submitted OK' 355 | else: 356 | disp_result = f'{disp_result} Submission Error: Code #{response.status_code}' 357 | else: 358 | disp_result = disp_result + 'Submission Error: No Response Code' 359 | if response: 360 | logging.info(response.content) 361 | url_hash = hashlib.sha1(str(item['url']).encode('utf-8')).hexdigest() 362 | 363 | if self.crawler.settings['MONGODB_ENABLE']: 364 | if not response.ok: 365 | self.db.errors.replace_one({'_id': url_hash}, { 366 | 'url': item['url'], 367 | 'error': 1, 368 | 'when': datetime.now().isoformat(), 369 | 'response': response.json() 370 | }, upsert=True) 371 | else: 372 | self.db.scenes.replace_one( 373 | {'_id': url_hash}, dict(item), upsert=True) 374 | else: 375 | disp_result = 'Local Run, Not Submitted' 376 | 377 | if spider.settings.get('localdump'): 378 | # Toss to local TPDB Instance 379 | headers = { 380 | 'Authorization': f'Bearer {self.crawler.settings["TPDB_TEST_API_KEY"]}', 381 | 'Accept': 'application/json', 382 | 'Content-Type': 'application/json', 383 | 'User-Agent': 'tpdb-scraper/1.0.0' 384 | } 385 | response = Http.post('http://api.tpdb.test/movies', json=payload, headers=headers) 386 | if response: 387 | if response.ok: 388 | disp_result = disp_result + '\tSubmitted to Local OK' 389 | else: 390 | disp_result = disp_result + f'\tSubmission to Local Error: Code #{response.status_code}' 391 | else: 392 | disp_result = disp_result + '\tSubmission to Local Error: No 
Response Code' 393 | if response: 394 | logging.info(response.content) 395 | # ############################# 396 | 397 | if (spider.settings.getbool('display') or self.crawler.settings['DISPLAY_ITEMS']) and spider.settings.get('LOG_LEVEL') == 'INFO': 398 | if len(item['title']) >= 50: 399 | title_length = 5 400 | else: 401 | title_length = 55 - len(item['title']) 402 | 403 | if len(item['site']) >= 15: 404 | site_length = 5 405 | else: 406 | site_length = 20 - len(item['site']) 407 | 408 | if "T" in item['date']: 409 | disp_date = re.search(r'(.*)T\d', item['date']).group(1) 410 | else: 411 | disp_date = item['date'] 412 | 413 | logging.info(f"Item: {item['title'][0:50]}" + " " * title_length + f"{item['site'][0:15]}" + " " * site_length + f"\t{str(item['id'])[0:15]}\t{disp_date}\t{item['url']}\t{disp_result}") 414 | 415 | if spider.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 416 | item2 = item.copy() 417 | if not spider.settings.get('showblob'): 418 | if 'front_blob' in item2: 419 | item2.pop('front_blob', None) 420 | if 'back_blob' in item2: 421 | item2.pop('back_blob', None) 422 | self.exporter.export_item(item2) 423 | 424 | return item 425 | 426 | def clean_tags(self, tags, aliaslist): 427 | tags2 = [] 428 | if tags: 429 | for tag in tags: 430 | pointer = 0 431 | for alias in aliaslist: 432 | if not pointer and tag.lower().strip() == alias['alias'].lower().strip(): 433 | tags2.append(alias['tag']) 434 | pointer = 1 435 | break 436 | if not pointer: 437 | tags2.append(tag.rstrip(".").rstrip(",").strip()) 438 | 439 | tags2 = [i for n, i in enumerate(tags2) if i not in tags2[:n]] 440 | return tags2 441 | 442 | def close_spider(self, spider): 443 | if spider.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 444 | self.fp.write(']}'.encode()) 445 | self.fp.close() 446 | 447 | 448 | class TpdbApiPerformerPipeline: 449 | def __init__(self, crawler): 450 | if crawler.settings['ENABLE_MONGODB']: 451 | db = MongoClient(crawler.settings['MONGODB_URL']) 452 | self.db = db['scrapy'] 453 | 454 | self.crawler = crawler 455 | 456 | if crawler.settings.get('path'): 457 | path = crawler.settings.get('path') 458 | else: 459 | path = crawler.settings.get('DEFAULT_EXPORT_PATH') 460 | 461 | if crawler.settings.get('file'): 462 | filename = crawler.settings.get('file') 463 | if '\\' not in filename and '/' not in filename: 464 | filename = Path(path, filename) 465 | else: 466 | filename = Path(path, f'{crawler.spidercls.name}_{time.strftime("%Y%m%d-%H%M")}-performers.json') 467 | 468 | if crawler.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 469 | print(f"*** Exporting to file: {filename}") 470 | self.fp = open(filename, 'wb') 471 | self.fp.write('{"scenes":['.encode()) 472 | 473 | if crawler.settings.getbool('oneline'): 474 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8') 475 | else: 476 | self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8', sort_keys=True, indent=2) 477 | 478 | @classmethod 479 | def from_crawler(cls, crawler): 480 | return cls(crawler) 481 | 482 | async def process_item(self, item, spider): 483 | if self.crawler.settings['ENABLE_MONGODB']: 484 | if spider.force is not True: 485 | result = self.db.performers.find_one({'url': item['url']}) 486 | if result is not None: 487 | return 488 | 489 | if 'fakeboobs' in item and item['fakeboobs']: 490 | if item['fakeboobs'].lower() == 'yes': 491 | item['fakeboobs'] = True 492 | elif item['fakeboobs'].lower() == 'no': 493 
| item['fakeboobs'] = False 494 | else: 495 | item['fakeboobs'] = None 496 | 497 | payload = { 498 | 'name': item['name'], 499 | 'site': item['network'], 500 | 'network': item['network'], 501 | 'url': item['url'], 502 | 'bio': item['bio'], 503 | 'image': item['image'], 504 | 'image_blob': item['image_blob'], 505 | 'extra': { 506 | 'gender': item['gender'], 507 | 'birthday': item['birthday'], 508 | 'astrology': item['astrology'], 509 | 'birthplace': item['birthplace'], 510 | 'ethnicity': item['ethnicity'], 511 | 'nationality': item['nationality'], 512 | 'haircolor': item['haircolor'], 513 | 'eyecolor': item['eyecolor'], 514 | 'weight': item['weight'], 515 | 'height': item['height'], 516 | 'measurements': item['measurements'], 517 | 'tattoos': item['tattoos'], 518 | 'piercings': item['piercings'], 519 | 'cupsize': item['cupsize'], 520 | 'fakeboobs': item['fakeboobs'] 521 | } 522 | } 523 | 524 | # Post the scene to the API - requires auth with permissions 525 | disp_result = "" 526 | if self.crawler.settings['TPDB_API_KEY'] and not spider.settings.get('local'): 527 | headers = { 528 | 'Authorization': f'Bearer {self.crawler.settings["TPDB_API_KEY"]}', 529 | 'Accept': 'application/json', 530 | 'Content-Type': 'application/json', 531 | 'User-Agent': 'tpdb-scraper/1.0.0' 532 | } 533 | 534 | response = Http.post('https://api.metadataapi.net/performer_sites', json=payload, headers=headers, timeout=2) 535 | if response: 536 | if response.ok: 537 | disp_result = 'Submitted OK' 538 | else: 539 | disp_result = 'Submission Error: Code #' + str(response.status_code) 540 | else: 541 | disp_result = 'Submission Error: No Response Code' 542 | if response: 543 | logging.info(response.content) 544 | 545 | if self.crawler.settings['MONGODB_ENABLE']: 546 | url_hash = hashlib.sha1(str(item['url']).encode('utf-8')).hexdigest() 547 | if not response.ok: 548 | self.db.errors.replace_one({'_id': url_hash}, { 549 | 'url': item['url'], 550 | 'error': 1, 551 | 'when': datetime.now().isoformat(), 552 | 'response': response.json() 553 | }, upsert=True) 554 | else: 555 | self.db.performers.replace_one({'_id': url_hash}, dict(item), upsert=True) 556 | else: 557 | disp_result = 'Local Run, Not Submitted' 558 | 559 | if spider.settings.get('localdump'): 560 | # Toss to local TPDB Instance 561 | headers = { 562 | 'Authorization': f'Bearer {self.crawler.settings["TPDB_TEST_API_KEY"]}', 563 | 'Accept': 'application/json', 564 | 'Content-Type': 'application/json', 565 | 'User-Agent': 'tpdb-scraper/1.0.0' 566 | } 567 | 568 | response = Http.post('http://api.tpdb.test/performer_sites', json=payload, headers=headers) 569 | if response: 570 | if response.ok: 571 | disp_result = disp_result + '\tSubmitted to Local OK' 572 | else: 573 | disp_result = disp_result + f'\tSubmission to Local Error: Code #{response.status_code}' 574 | else: 575 | disp_result = disp_result + '\tSubmission to Local Error: No Response Code' 576 | if response: 577 | logging.info(response.content) 578 | # ############################## 579 | 580 | if (spider.settings.getbool('display') or self.crawler.settings['DISPLAY_ITEMS']) and spider.settings.get('LOG_LEVEL') == 'INFO': 581 | name_length = 50 - len(payload['name']) 582 | if name_length < 1: 583 | name_length = 1 584 | 585 | logging.info(f"Performer: {payload['name']}" + " " * name_length + f"{payload['site']}\t{payload['url']}\t{disp_result}") 586 | 587 | if spider.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 588 | item2 = payload.copy() 589 | if not 
spider.settings.get('showblob'): 590 | if "image_blob" in item2: 591 | item2.pop('image_blob', None) 592 | self.exporter.export_item(item2) 593 | 594 | return item 595 | 596 | def close_spider(self, spider): 597 | if spider.settings.getbool('export') or self.crawler.settings['EXPORT_ITEMS']: 598 | self.fp.write(']}'.encode()) 599 | self.fp.close() 600 | -------------------------------------------------------------------------------- /tpdb/settings.py.example: -------------------------------------------------------------------------------- 1 | # Scrapy settings for tpdb project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | # Custom CLI Options: 11 | # 12 | # --loglevel=(DEBUG, INFO, WARNING, ERROR, CRITICAL) (loglevel = INFO by default) 13 | # -a limit_pages=#### (Number of pages to stop after, limit_pages=all for no limit) 14 | # -a page=### (Page number to being scrape with. Defaults to 1) 15 | # -a days=### (Filters results to only return results within the past ### days. Without flag will return all results found on page) 16 | # -s display=true (Display a single line per entry scraped, DEBUG level will display entire Item object) 17 | # -s export=true (Export to JsonLinesItemExporter file, each line is its own JSON structure. Good for memory with large datasets) 18 | # -s file= (works with '-s export' to set the filename. Defaults to ".json") 19 | # -s path= (works with '-s export' to set the file path. Defaults to whatever "DEFAULT_EXPORT_PATH" is set to in this file) 20 | 21 | 22 | BOT_NAME = 'tpdb' 23 | 24 | SPIDER_MODULES = ['tpdb.spiders'] 25 | NEWSPIDER_MODULE = 'tpdb.spiders' 26 | 27 | # Crawl responsibly by identifying yourself (and your website) on the 28 | # user-agent 29 | USER_AGENT = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86.0) Gecko/20100101 Firefox/86.0' 30 | 31 | # Obey robots.txt rules 32 | ROBOTSTXT_OBEY = False 33 | 34 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 35 | # CONCURRENT_REQUESTS = 32 36 | 37 | # Configure a delay for requests for the same website (default: 0) 38 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 39 | # See also autothrottle settings and docs 40 | # DOWNLOAD_DELAY = 3 41 | # The download delay setting will honor only one of: 42 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 43 | # CONCURRENT_REQUESTS_PER_IP = 16 44 | 45 | # Disable cookies (enabled by default) 46 | COOKIES_ENABLED = True 47 | # COOKIES_DEBUG = True 48 | # Disable Telnet Console (enabled by default) 49 | # TELNETCONSOLE_ENABLED = False 50 | 51 | # Override the default request headers: 52 | # DEFAULT_REQUEST_HEADERS = { 53 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 54 | # 'Accept-Language': 'en', 55 | # } 56 | 57 | # Enable or disable spider middlewares 58 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 59 | SPIDER_MIDDLEWARES = { 60 | # 'tpdb.middlewares.TpdbSpiderMiddleware': 543, 61 | } 62 | 63 | # Enable or disable downloader middlewares 64 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 65 | DOWNLOADER_MIDDLEWARES = { 66 | 'scrapy.downloadermiddlewares.cookies.CookiesMiddleware': 100, 67 | # 
'tpdb.middlewares.TpdbDownloaderMiddleware': 543, 68 | } 69 | 70 | # Enable or disable extensions 71 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 72 | # EXTENSIONS = { 73 | # 'scrapy.extensions.telnet.TelnetConsole': None, 74 | # } 75 | 76 | # Configure item pipelines 77 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 78 | ITEM_PIPELINES = { 79 | # 'tpdb.pipelines.TpdbApiScenePipeline': 400, 80 | } 81 | 82 | # Enable and configure the AutoThrottle extension (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 84 | AUTOTHROTTLE_ENABLED = True 85 | # The initial download delay 86 | AUTOTHROTTLE_START_DELAY = 5 87 | # The maximum download delay to be set in case of high latencies 88 | AUTOTHROTTLE_MAX_DELAY = 60 89 | # The average number of requests Scrapy should be sending in parallel to 90 | # each remote server 91 | AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 92 | # Enable showing throttling stats for every response received: 93 | AUTOTHROTTLE_DEBUG = True 94 | 95 | # Enable and configure HTTP caching (disabled by default) 96 | # See 97 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 98 | HTTPCACHE_ENABLED = True 99 | HTTPCACHE_EXPIRATION_SECS = 720 100 | HTTPCACHE_DIR = 'httpcache' 101 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 102 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 103 | # TWISTED_REACTOR = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor' 104 | # DUPEFILTER_DEBUG = True 105 | 106 | LOG_LEVEL = 'INFO' 107 | 108 | ENABLE_MONGODB = False 109 | MONGODB_URL = '' 110 | TPDB_API_KEY = '' 111 | 112 | FLARE_URL = 'http://127.0.0.1:8191' 113 | SPLASH_URL = 'http://127.0.0.1:8090' 114 | 115 | PROXY_ADDRESS = 'http://127.0.0.1:8118' 116 | USE_PROXY = False 117 | 118 | # To pre-filter tags before submission, based on a JSON file in format like `[ { "tag": "18+ Teens", "alias": "Teen (18+)" }, { "tag": "18+ Teens", "alias": "Teen 18 19" }]` 119 | # Simply if 'alias' matches a scene tag, replace it with 'tag' from file then de-dupe the list 120 | FILTER_TAGS = False 121 | FILTER_TAG_FILENAME = 'tagaliases.json' 122 | 123 | # DISPLAY_ITEMS = True # Display a running list of returned items. Can also be done on command line with '-s display=true' 124 | # EXPORT_ITEMS = True # Export a running list of returned items into a JSON file named for scraper and dated. Can also be done on command line with '-s export=true' 125 | DEFAULT_EXPORT_PATH = "./" # Directory to save exported JSON files into. Relative to where Scrapy is called from 126 | --------------------------------------------------------------------------------
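
For reference, below is a minimal sketch of how the `FILTER_TAGS` aliasing described in `settings.py.example` behaves, mirroring the `clean_tags()` logic in `tpdb/pipelines.py`. The alias entries are illustrative only and follow the documented `tagaliases.json` format.

```
# Sketch of the FILTER_TAGS tag aliasing applied by the scene/movie pipelines,
# mirroring clean_tags() in tpdb/pipelines.py. The alias entries are examples
# in the format documented in settings.py.example.
tagaliases = [
    {'tag': '18+ Teens', 'alias': 'Teen (18+)'},
    {'tag': '18+ Teens', 'alias': 'Teen 18 19'},
]


def clean_tags(tags, aliaslist):
    cleaned = []
    for tag in tags or []:
        for alias in aliaslist:
            # Replace a scraped tag when it matches an alias (case-insensitive)
            if tag.lower().strip() == alias['alias'].lower().strip():
                cleaned.append(alias['tag'])
                break
        else:
            # Otherwise keep it, trimming trailing punctuation
            cleaned.append(tag.rstrip('.').rstrip(',').strip())
    # De-duplicate while preserving order, as the pipelines do
    return [t for n, t in enumerate(cleaned) if t not in cleaned[:n]]


print(clean_tags(['Teen (18+)', 'Blonde,', 'Teen 18 19'], tagaliases))
# -> ['18+ Teens', 'Blonde']
```

With `FILTER_TAGS = True`, the pipelines load the alias list from `FILTER_TAG_FILENAME` at startup and apply this mapping to every item's tags before submission.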