├── .github └── FUNDING.yml ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── readme_images ├── browser.png ├── source.png └── tools.png ├── requirements-shell.txt ├── requirements.txt ├── scrapy_gui ├── __init__.py ├── browser.py ├── browser_window │ ├── __init__.py │ ├── browser.py │ └── images │ │ ├── back.png │ │ ├── empty.png │ │ ├── forward.png │ │ └── loader.gif ├── load_selector.py └── utils_ui │ ├── __init__.py │ ├── errors.py │ ├── parser.py │ ├── text_viewer.py │ └── tools_tab_ui.py └── setup.py /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | patreon: further_reading 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | pip_upload_steps 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | 55 | # Translations 56 | *.mo 57 | *.pot 58 | 59 | # Django stuff: 60 | *.log 61 | local_settings.py 62 | db.sqlite3 63 | db.sqlite3-journal 64 | 65 | # Flask stuff: 66 | instance/ 67 | .webassets-cache 68 | 69 | # Scrapy stuff: 70 | .scrapy 71 | 72 | # Sphinx documentation 73 | docs/_build/ 74 | 75 | # PyBuilder 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | .python-version 87 | 88 | # pipenv 89 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 90 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 91 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 92 | # install all needed dependencies. 93 | #Pipfile.lock 94 | 95 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 96 | __pypackages__/ 97 | 98 | # Celery stuff 99 | celerybeat-schedule 100 | celerybeat.pid 101 | 102 | # SageMath parsed files 103 | *.sage.py 104 | 105 | # Environments 106 | .env 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | 132 | # custom 133 | user_fun.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Roy Healy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include MANIFEST.in 2 | recursive-include scrapy_gui/browser_window/images * -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Requires Python 3.6+ 2 | 3 | # Scrapy GUI 4 | A simple, Qt-Webengine powered web browser with built in functionality for testing scrapy spider code. 5 | 6 | Also includes an addon to enable a GUI for use with the scrapy shell. 7 | 8 | 9 | **Table of Contents** 10 | 11 | - [Installation](#installation) 12 | - [Standalone UI](#standalone-ui) 13 | - [Browser Tab](#browser-tab) 14 | - [Tools Tab](#tools-tab) 15 | - [Query Box](#query-box) 16 | - [Regex Box](#regex-box) 17 | - [Function Box](#function-box) 18 | - [Results Box](#results-box) 19 | - [Source Tab](#source-tab) 20 | - [Notes Tab](#notes-tab) 21 | - [Integration with Scrapy Shell](#integration-with-scrapy-shell) 22 | - [Activation](#activation) 23 | 24 | # Installation 25 | 26 | You can install the package from PyPI using 27 | 28 | > pip install scrapy_gui 29 | 30 | Then you can import it to a shell using `import scrapy_gui`. 31 | 32 | # Standalone UI 33 | The standalone UI can be opened by using `scrapy_gui.open_browser()` from a python shell. This consists of a web browser and a set of tools to analyse its contents. 34 | 35 | ## Browser Tab 36 | Enter any url into search bar and hit return or press the Go button. When the loading animation finishes it will be ready to parse in the Tools tab. 37 | 38 | ![Browser tab](https://raw.githubusercontent.com/further-reading/scraping-browser/master/readme_images/browser.png "Browser Example") 39 | 40 | ## Tools Tab 41 | The tools tab contains various sections for parsing content of the page. 
The purpose of this tab is to make it easy to test queries and code for use in a scrapy spider. 42 | > **NOTE:** This will use the **initial** html response. If additional requests, javascript, etc alter the page later this will not be taken into account. 43 | 44 | It will load the initial html with an additional request using the `requests` package. When running a query it will create a selector object using `Selector` from the parsel package. 45 | 46 | ![Tools tab](https://raw.githubusercontent.com/further-reading/scraping-browser/master/readme_images/tools.png "Tools Example") 47 | 48 | ### Query Box 49 | The query box lets you use [parsel](https://github.com/scrapy/parsel) compatible CSS and XPath queries to extract data from the page. 50 | 51 | It returns results as though `selection.css/xpath('YOUR QUERY').getall()` was called. 52 | 53 | If there are no results or there is an error in the query a dialogue will pop up informing you of the issue. 54 | 55 | ### Regex Box 56 | This box lets you add a regular expression pattern to be used in addition to the previous CSS or XPath query. 57 | 58 | It returns results as though `selection.css/xpath('YOUR QUERY').re(r'YOUR REGEX')` was called. This means that if you use groups it will only return the content within parentheses. 59 | 60 | ### Function Box 61 | This box lets you define additional python code that can run on the results of your query and regex. The code can be as long and complex as you want, including adding additional functions, classes, imports etc. 62 | 63 | The only requirement is you must include a function called `user_fun(results, selector)` that returns a `list`. 64 | 65 | ### Results Box 66 | 67 | This table will list all the results, passed through the regex and function if defined. 68 | 69 | ## Source Tab 70 | 71 | This tab contains the html source that is used in the Tools tab. You can use the text box to search for specific content. Searches are case-insensitive. 
72 | 73 | ![Source Tab](https://raw.githubusercontent.com/further-reading/scraping-browser/master/readme_images/source.png "Source Example") 74 | 75 | ## Notes Tab 76 | 77 | This is just a plain text box. Content in here is not saved when you exit the app. 78 | 79 | # Integration with Scrapy Shell 80 | 81 | It is possible to integrate this tool with the scrapy shell. This will allow you to use it on responses that have been passed through your middlewares, access more complex requests and more specific selectors. 82 | 83 | ## Activation 84 | 85 | To use it in your shell import the load_selector method using: 86 | 87 | `from scrapy_gui import load_selector` 88 | 89 | Then you can write load_selector(YOUR_SELECTOR) to open a window with your selector loaded into it. 90 | 91 | > For example `load_selector(response)` will load your response into the UI. 92 | 93 | When you run the code a window named `Scrapy GUI` will open that contains the `Tools`, `Source` and `Notes` tabs from the standalone window mentioned above. 
94 | -------------------------------------------------------------------------------- /readme_images/browser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/further-reading/scrapy-gui/d19552683d4f0cd7b44574af71505f8e8a7b7c9e/readme_images/browser.png -------------------------------------------------------------------------------- /readme_images/source.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/further-reading/scrapy-gui/d19552683d4f0cd7b44574af71505f8e8a7b7c9e/readme_images/source.png -------------------------------------------------------------------------------- /readme_images/tools.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/further-reading/scrapy-gui/d19552683d4f0cd7b44574af71505f8e8a7b7c9e/readme_images/tools.png -------------------------------------------------------------------------------- /requirements-shell.txt: -------------------------------------------------------------------------------- 1 | parsel==1.5.2 2 | cssselect==1.1.0 3 | beautifulsoup4==4.8.2 4 | PyQt5==5.14.0 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -r requirements-shell.txt 2 | requests==2.31.0 3 | PyQtWebEngine==5.14.0 -------------------------------------------------------------------------------- /scrapy_gui/__init__.py: -------------------------------------------------------------------------------- 1 | from .load_selector import load_selector 2 | from .browser import open_browser 3 | -------------------------------------------------------------------------------- /scrapy_gui/browser.py: -------------------------------------------------------------------------------- 1 | from PyQt5.QtWidgets import * 2 | 3 | from parsel 
class Main(QMainWindow):
    """Main window of the standalone browser.

    Holds four tabs - Browser, Tools, Source and Notes - and refreshes the
    Tools and Source tabs whenever the Browser tab reports a finished load.
    """

    # Seconds to wait for the source download before reporting a failure.
    REQUEST_TIMEOUT = 30

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.init_ui()

    def init_ui(self):
        """Create the four tabs, install them and show the window."""
        self.setWindowTitle('Scrapy GUI - Browser')
        tabs = QTabWidget()
        self.browser = QtBrowser(main=self)
        self.queries = Queries(main=self)
        self.source_viewer = TextViewer()
        self.notes = QPlainTextEdit()
        tabs.addTab(self.browser, 'Browser')
        tabs.addTab(self.queries, 'Tools')
        tabs.addTab(self.source_viewer, 'Source')
        tabs.addTab(self.notes, 'Notes')
        self.setCentralWidget(tabs)
        self.show()

    def update_source(self, url):
        """Download *url* again and load its raw HTML into Tools and Source.

        pyqt5 webengine has the final html including manipulation from
        javascript, etc. For scraping with scrapy the first response is the
        one that matters, so it is fetched again with ``requests``.

        If the download fails (invalid URL, connection error, timeout) an
        error dialog is shown and the previously loaded source is kept.
        This method is called from a Qt signal handler, so the exception is
        handled here rather than being allowed to propagate.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        except requests.RequestException as e:
            QMessageBox.critical(
                self,
                'Source Error',
                f'Could not download source for\n{url}\n\n{e}',
            )
            return
        html = response.text
        selector = Selector(text=html)
        self.queries.update_source(selector)
        self.source_viewer.setPrettyHtml(html)
def go_to_page(self):
    """Load the address typed into the entry box, always over HTTPS.

    An address without a scheme gets ``https://`` prepended and an
    ``http://`` address is upgraded to ``https://``. Nothing is loaded when
    the entry box is empty.
    """
    entered_page = self.entry_box.text().strip()
    if not entered_page:
        # Avoid navigating to a bare "https://".
        return

    # Compare against the full "scheme://" prefix: testing only for 'http'
    # would treat host names such as "httpbin.org" as already having a
    # scheme and mangle them into "httpsbin.org".
    if entered_page.startswith('http://'):
        entered_page = 'https://' + entered_page[len('http://'):]
    elif not entered_page.startswith('https://'):
        entered_page = f'https://{entered_page}'

    self.web.load(QUrl(entered_page))
class BrowserButton(QPushButton):
    """Push button that is drawn as a single image.

    Args:
        *args: positional arguments forwarded to ``QPushButton``.
        image: path of the image file to draw (keyword-only).
    """

    def __init__(self, *args, image):
        super().__init__(*args)
        self.pixmap = QPixmap(image)
        self.setCursor(QCursor(Qt.PointingHandCursor))

    def paintEvent(self, event):
        """Draw the pixmap scaled to the button's full area."""
        painter = QPainter(self)
        # Target the whole widget rather than event.rect(): a partial
        # repaint would otherwise squeeze the full image into the exposed
        # sub-rectangle. Qt clips the drawing to the exposed region itself.
        painter.drawPixmap(self.rect(), self.pixmap)
        painter.end()
        # No self.update() here: requesting a repaint from inside
        # paintEvent schedules another paintEvent, i.e. an endless redraw.
def load_selector(selector):
    """Open the Tools/Source/Notes window for *selector* and block until closed.

    Args:
        selector: object to query. It is handed to ``MiniUI.add_selector``,
            which passes it to the Tools tab and reads ``selector.text``
            for the Source tab.
    """
    print('Shell UI window opened - Close window to regain use of shell')
    # Qt allows only one QApplication per process, so reuse the existing
    # one when the function is called again or the host shell already
    # created it.
    app = QApplication.instance() or QApplication(sys.argv)
    main = MiniUI()
    main.add_selector(selector)
    main.show()
    app.exec_()
class QueryError(Exception):
    """Error raised while running a query, carrying the data for a dialog.

    All three fields are keyword-only and stored unchanged:

    title: caption for the dialog.
    message: body text for the dialog.
    error_type: key selecting the dialog kind (see ``ERROR_TYPES``).

    Positional arguments are forwarded to ``Exception`` untouched.
    """

    def __init__(self, *args, title, message, error_type):
        Exception.__init__(self, *args)
        self.title, self.message, self.error_type = title, message, error_type
class Parser:
    """Run a CSS/XPath query, an optional regex and optional user code.

    Every failure is reported as ``errors.QueryError`` so the UI can show
    it in a dialog.
    """

    def __init__(self, selector):
        # Object exposing .css(query) and .xpath(query).
        self.selector = selector

    def do_query(self, query, query_type, selector, regex=None, function=None):
        """Execute *query* and post-process the matches.

        Args:
            query: the CSS or XPath expression.
            query_type: ``'css'`` or ``'xpath'``.
            selector: passed through unchanged as the second argument of
                the user function.
            regex: optional pattern applied with ``.re()`` to the matches.
            function: optional source code defining ``user_fun``.

        Returns:
            The result of ``.re(regex)`` when a regex is given, otherwise of
            ``.getall()``, passed through the user function if one is given.

        Raises:
            errors.QueryError: for an unknown query type, a query or regex
                that fails, a failing user function, or an empty result at
                any stage.
        """
        if query_type not in ('css', 'xpath'):
            # Without this guard `results` would be unbound below.
            raise errors.QueryError(
                title='Query Error',
                message=f'Unknown query type: {query_type}',
                error_type='critical',
            )

        try:
            if query_type == 'css':
                results = self.selector.css(query)
            else:
                results = self.selector.xpath(query)
        except (ExpressionError, SelectorSyntaxError, ValueError) as e:
            message = f'Error parsing {query_type} query\n\n{e}'
            raise errors.QueryError(
                title=f'{query_type.title()} Error',
                message=message,
                error_type='critical',
            ) from e
        if not results:
            raise errors.QueryError(
                # Name the query type actually used ("CSS Empty" for css).
                title=f'{query_type.upper()} Empty',
                message=f'No results for {query_type} Query\n{query}',
                error_type='info',
            )

        if regex:
            try:
                results = results.re(regex)
            except Exception as e:
                message = f'Error running regex\n\n{e}'
                raise errors.QueryError(
                    title='RegEx Error',
                    message=message,
                    error_type='critical',
                ) from e
            if not results:
                raise errors.QueryError(
                    title='RegEX Empty',
                    message=f'No results for Regular Expression\n{regex}',
                    error_type='info',
                )
        else:
            results = results.getall()

        if function:
            results = self.use_custom_function(results, function, selector)
            if not results:
                raise errors.QueryError(
                    title='Function Empty',
                    message=f'No results when using function\n\n{function}',
                    error_type='critical',
                )
        return results

    def use_custom_function(self, results, function, selector):
        """Execute the user's code and return ``user_fun(results, selector)``.

        Args:
            results: current query results, passed to ``user_fun``.
            function: Python source that must contain
                ``def user_fun(results, selector):``.
            selector: passed to ``user_fun`` as its second argument.

        Raises:
            errors.QueryError: if the required signature is missing or the
                code raises while being defined or called.
        """
        if 'def user_fun(results, selector):' not in function:
            message = f'Custom function needs to be named "user_fun" and have "results" and "selector" as arguments'
            raise errors.QueryError(
                title='Function Error',
                message=message,
                error_type='critical',
            )

        # NOTE: exec() runs arbitrary code typed by the local user; that is
        # the purpose of the Function box. It runs in its own namespace so
        # user definitions cannot overwrite this module's globals and no
        # `user_fun` from an earlier run is picked up by accident.
        namespace = {'__name__': 'user_fun'}
        try:
            exec(function, namespace)
            results = namespace['user_fun'](results, selector)
        except Exception as e:
            message = f'Error running custom function\n\n{type(e).__name__}: {e.args}'
            message += f'\n\n{traceback.format_exc()}'
            raise errors.QueryError(
                title='Function Error',
                message=message,
                error_type='critical',
            ) from e

        return results
def find_indexes(self):
    """Store the offset of every case-insensitive match in ``self.indexes``.

    Both the search term and the source text are lower-cased before
    comparing. Overlapping matches are all reported. ``self.indexes`` is
    always left as a list, empty when the search term or the text is empty,
    so callers can safely take its length or iterate over it.
    """
    search_term = self.search_bar.text().lower()
    all_text = self.source_text.toPlainText().lower()

    matches = []
    if search_term and all_text:
        position = all_text.find(search_term)
        while position != -1:
            matches.append(position)
            # Step by one character, not by the term length, so that
            # overlapping occurrences are still found.
            position = all_text.find(search_term, position + 1)
    self.indexes = matches
self.results.setText(f'{self.current_index} of {self.total_hits} Results') -------------------------------------------------------------------------------- /scrapy_gui/utils_ui/tools_tab_ui.py: -------------------------------------------------------------------------------- 1 | from PyQt5.QtWidgets import * 2 | from PyQt5.QtCore import * 3 | 4 | from .parser import Parser 5 | from . import errors 6 | 7 | 8 | class BigHandleSplitter(QSplitter): 9 | css_sheet = """ 10 | QSplitter::handle { 11 | background-color: #000 12 | ;} 13 | """ 14 | 15 | def __init__(self, *args): 16 | super().__init__(*args) 17 | self.setHandleWidth(1) 18 | self.setStyleSheet(self.css_sheet) 19 | 20 | 21 | class Queries(BigHandleSplitter): 22 | url = None 23 | selector = None 24 | use_re = False 25 | 26 | def __init__(self, *args, main): 27 | super().__init__(*args) 28 | self.main = main 29 | self.initUI() 30 | 31 | def initUI(self): 32 | self.setOrientation(Qt.Vertical) 33 | top = BigHandleSplitter(Qt.Horizontal) 34 | left_frame = BigHandleSplitter(Qt.Vertical) 35 | 36 | self.query_section = QueryChoiceEntry(label='Query') 37 | self.query_section.initUI() 38 | left_frame.addWidget(self.query_section) 39 | 40 | left_bottom = QFrame() 41 | left_bottom_box = QVBoxLayout() 42 | left_bottom.setLayout(left_bottom_box) 43 | 44 | self.re_section = OptionalQuery(label='Regex') 45 | self.re_section.initUI() 46 | left_bottom_box.addWidget(self.re_section) 47 | 48 | run_button = QPushButton('Run Query') 49 | run_button.clicked.connect(self.do_query) 50 | left_bottom_box.addWidget(run_button) 51 | 52 | copy_button = QPushButton('Copy Query') 53 | copy_button.clicked.connect(self.copy_query) 54 | left_bottom_box.addWidget(copy_button) 55 | 56 | left_frame.addWidget(left_bottom) 57 | top.addWidget(left_frame) 58 | 59 | self.function_section = OptionalQuery(label='Function') 60 | self.function_section.initUI() 61 | self.function_section.query.setPlainText( 62 | """# import packages 63 | 64 | # must have 
def copy_query(self):
    """Copy Python code equivalent to the current query to the clipboard.

    The text has the form ``sel.<css|xpath>(<query>)`` followed by
    ``.re(<regex>)`` when the Regex box is in use, or ``.getall()``
    otherwise.
    """
    query, query_type = self.query_section.get_query()
    # repr() produces a valid Python string literal whatever the text
    # contains (quotes, backslashes, newlines). The regex used to be pasted
    # without any quotes at all.
    text = f'sel.{query_type}({query!r})'
    if self.re_section.use:
        text += f'.re({self.re_section.get_query()!r})'
    else:
        text += '.getall()'

    cb = QApplication.clipboard()
    cb.clear(mode=cb.Clipboard)
    cb.setText(text, mode=cb.Clipboard)
class OptionalQuery(QueryEntry):
    """Query entry that is only applied while its check box is ticked.

    ``use`` holds the current state and starts as ``False``; the text box
    is disabled whenever ``use`` is ``False``.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.use = False

    def initUI(self):
        """Lay out the heading, the "Use ..." check box and the text box."""
        layout = QGridLayout()
        self.setLayout(layout)

        layout.addWidget(QLabel(self.label), 0, 0)

        # Unticked to begin with; every click flips the state.
        toggle = QCheckBox(f'Use {self.label.title()}')
        toggle.setChecked(False)
        toggle.clicked.connect(self.check_click)
        layout.addWidget(toggle, 0, 1)

        # Spans both columns and stays disabled until the box is ticked.
        editor = QPlainTextEdit()
        self.query = editor
        layout.addWidget(editor, 2, 0, 1, 2)
        editor.setLineWrapMode(QPlainTextEdit.NoWrap)
        editor.setDisabled(True)

    def check_click(self):
        """Flip ``use`` and enable or disable the text box to match."""
        now_in_use = not self.use
        self.use = now_in_use
        self.query.setDisabled(not now_in_use)
def add_results(self, results):
    """Replace the table contents with *results*, one row per result.

    ``None`` entries are skipped and every other entry is shown as
    ``str(entry)``.
    """
    self.table.clearContents()
    self.table.setColumnCount(1)
    self.table.setRowCount(0)

    # Filter first so row numbers stay contiguous. Using the enumerate()
    # index of the unfiltered list left a gap after each skipped None, and
    # rows inserted past the end of the table were lost.
    rows = [str(result) for result in results if result is not None]
    self.table.setRowCount(len(rows))
    for row, text in enumerate(rows):
        self.table.setItem(row, 0, QTableWidgetItem(text))

    self.table.resizeColumnsToContents()
    self.table.resizeRowsToContents()
--------------------------------------------------------------------------------