├── MANIFEST.in
├── requirements.txt
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       └── python-publish.yml
├── Webtrench
│   ├── __init__.py
│   ├── helper.py
│   ├── VideoScrapper.py
│   ├── TextScrapper.py
│   ├── AudioScrapper.py
│   ├── ImageScrapper.py
│   └── MetaDataScrapper.py
├── setup.py
├── LICENSE
├── .gitignore
└── README.md
/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.2 2 | requests==2.28.2 3 | setuptools>=65.5.1 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: nuhmanpk 4 | ko_fi: nuhmanpk 5 | 6 | -------------------------------------------------------------------------------- /Webtrench/__init__.py: -------------------------------------------------------------------------------- 1 | from .ImageScrapper import ImageScrapper 2 | from .AudioScrapper import AudioScrapper 3 | from .VideoScrapper import VideoScrapper 4 | from .TextScrapper import TextScrapper 5 | from .MetaDataScrapper import MetaDataScrapper 6 | 7 | __all__ = ['ImageScrapper', 'AudioScrapper', 'VideoScrapper', 'TextScrapper', 'MetaDataScrapper'] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * Webtrench version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.9' 21 | 22 | - name: "Installs dependencies" 23 | run: | 24 | python3 -m pip install --upgrade pip 25 | python3 -m pip install setuptools wheel twine 26 | python3 -m pip install requests 27 | python3 -m pip install bs4 28 | 29 | - name: "Builds and uploads to PyPI" 30 | run: | 31 | python3 setup.py sdist bdist_wheel 32 | python3 -m twine upload dist/* 33 | 34 | env: 35 | TWINE_USERNAME: __token__ 36 | TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }} 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import setuptools 3 | 4 | file = pathlib.Path(__file__).parent 5 | 6 | README = (file / "README.md").read_text() 7 | 8 | setuptools.setup( 9 | name="Webtrench", 10 | version="0.1.02", 11 | author="Nuhman Pk", 12 | author_email="nuhmanpk7@gmail.com", 13 | long_description = README, 14 | long_description_content_type = "text/markdown", 15 | description="A powerful and easy-to-use web scrapper for collecting data from the web. Supports scraping of images, text, videos, meta data, and more. Ideal for machine learning and deep learning engineers. Download and extract data with just one line of code", 16 | license="MIT", 17 | url="https://github.com/nuhmanpk/Webtrench", 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | packages=setuptools.find_packages(include=['Webtrench']), 24 | install_requires=[ 25 | 'bs4', 26 | 'requests', 27 | ], 28 | 29 | python_requires=">=3.6", 30 | 31 | ) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nuhman Pk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Webtrench/helper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | def scrapable(url): 5 | try: 6 | # Check robots.txt (a coarse check: any Disallow rule is treated as disallowing scraping) 7 | robots_txt = requests.get(url + '/robots.txt').text 8 | if "Disallow:" in robots_txt: 9 | print("The website does not allow scraping.") 10 | return False 11 | 12 | # Send a request to the URL 13 | response = requests.get(url) 14 | 15 | # Check if the response status code is 200 16 | if response.status_code != 200: 17 | print(f"The website returned a response status code of {response.status_code}, which indicates that the website is not scraping friendly.") 18 | return False 19 | 20 | # Check if the website's terms of use allow scraping 21 | terms_of_use = re.search(r'Terms of Use', response.text) 22 | if terms_of_use is None: 23 | print("The terms of use of the website do not mention anything about web scraping.") 24 | return True 25 | 26 | # Look for an explicit prohibition in the text following the "Terms of Use" mention (a heuristic 2000-character window; the match object itself only contains the phrase) 27 | if "prohibited" in response.text[terms_of_use.start():terms_of_use.start() + 2000].lower(): 28 | print("Web scraping is prohibited according to the terms of use of the website.") 29 | return False 30 | 31 | print("The website is scraping friendly.") 32 | return True 33 | 34 | except Exception as e: 35 | print(f"An error occurred while checking the scraping friendliness of the website: {e}") 36 | return False 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | dist/ 13 | downloads/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | pip-wheel-metadata/ 21 | share/python-wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | db.sqlite3-journal 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 92 | __pypackages__/ 93 | 94 | # Celery stuff 95 | celerybeat-schedule 96 | celerybeat.pid 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | -------------------------------------------------------------------------------- /Webtrench/VideoScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import random 5 | # from helper import is_scrapable 6 | 7 | class VideoScrapper: 8 | 9 | def from_url(url,folder_path=None): 10 | if not folder_path: 11 | folder_path='.' 12 | try: 13 | request=requests.get(url) 14 | if not os.path.exists(folder_path): 15 | os.makedirs(folder_path) 16 | if request.status_code==200: 17 | with open(f'{folder_path}/{random.randint(1,40000)}.mp4','wb') as f: 18 | f.write(request.content) 19 | else: 20 | pass 21 | except Exception as e: 22 | raise e 23 | 24 | def all_video_from_url(url,folder_path=None): 25 | if not folder_path: 26 | folder_path='.' 27 | try: 28 | request=requests.get(url) 29 | html_content = request.content 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | elements = soup.find_all("video") 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | for i, element in enumerate(elements): 35 | response = requests.get(element["src"]) 36 | if response.status_code == 200: 37 | try: 38 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 39 | f.write(response.content) 40 | except Exception as err: 41 | print(err) 42 | else: 43 | pass 44 | except Exception as e: 45 | raise e 46 | 47 | def with_url_pattern(url,pattern,folder_path=None): 48 | if not folder_path: 49 | folder_path='.' 50 | try: 51 | request=requests.get(url) 52 | html_content = request.content 53 | soup = BeautifulSoup(html_content, "html.parser") 54 | elements = soup.find_all("video") 55 | if not os.path.exists(folder_path): 56 | os.makedirs(folder_path) 57 | for i, element in enumerate(elements): 58 | if pattern in element["src"]: 59 | response = requests.get(element["src"]) 60 | if response.status_code == 200: 61 | try: 62 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 63 | f.write(response.content) 64 | except Exception as err: 65 | print(err) 66 | else: 67 | pass 68 | except Exception as e: 69 | raise e 70 | 71 | def with_class(url,classname,folder_path=None): 72 | if not folder_path: 73 | folder_path='.' 
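# Fetch the page, select <video> tags carrying the given class, and save each tag's src as a randomly named .mp4 inside folder_path.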
74 | try: 75 | request=requests.get(url) 76 | html_content = request.content 77 | soup = BeautifulSoup(html_content, "html.parser") 78 | elements = soup.find_all("video",class_=classname) 79 | if not os.path.exists(folder_path): 80 | os.makedirs(folder_path) 81 | for i, element in enumerate(elements): 82 | response = requests.get(element["src"]) 83 | if response.status_code == 200: 84 | try: 85 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 86 | f.write(response.content) 87 | except Exception as err: 88 | print(err) 89 | else: 90 | pass 91 | except Exception as e: 92 | raise e 93 | 94 | def with_id(url,idname,folder_path=None): 95 | if not folder_path: 96 | folder_path='.' 97 | try: 98 | request=requests.get(url) 99 | html_content = request.content 100 | soup = BeautifulSoup(html_content, "html.parser") 101 | elements = soup.find_all("video",id=idname) 102 | if not os.path.exists(folder_path): 103 | os.makedirs(folder_path) 104 | for i, element in enumerate(elements): 105 | response = requests.get(element["src"]) 106 | if response.status_code == 200: 107 | try: 108 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 109 | f.write(response.content) 110 | except Exception as err: 111 | print(err) 112 | else: 113 | pass 114 | except Exception as e: 115 | raise e 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Webtrench 2 | 3 | 4 | WebTrench provides a comprehensive and powerful toolkit for web scraping. Whether you're working on a machine learning project, conducting research, or simply need to gather data from the web, WebTrench is the perfect tool for the job. So why wait? Start using WebTrench today and streamline your data collection process! 5 | ```python 6 | pip install Webtrench 7 | ``` 8 | ----- 9 | ### Check Documentation [Here](https://github.com/nuhmanpk/Webtrench/wiki) 10 | ------ 11 | [![Downloads](https://static.pepy.tech/personalized-badge/webtrench?period=total&units=international_system&left_color=grey&right_color=yellow&left_text=Total-Downloads)](https://pepy.tech/project/webtrench) 12 | ![PyPI - Format](https://img.shields.io/pypi/format/Webtrench) 13 | [![GitHub license](https://img.shields.io/github/license/nuhmanpk/webtrench.svg)](https://github.com/nuhmanpk/webtrench/blob/main/LICENSE) 14 | [![Upload Python Package](https://github.com/nuhmanpk/Webtrench/actions/workflows/python-publish.yml/badge.svg)](https://github.com/nuhmanpk/Webtrench/actions/workflows/python-publish.yml) 15 | [![Supported Versions](https://img.shields.io/pypi/pyversions/Webtrench.svg)](https://pypi.org/project/Webtrench) 16 | ![PyPI](https://img.shields.io/pypi/v/Webtrench) 17 | [![Documentation Status](https://readthedocs.org/projects/webtrench/badge/?version=latest)](https://webtrench.readthedocs.io/en/latest/?badge=latest) 18 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/Webtrench) 19 | [![Downloads](https://static.pepy.tech/personalized-badge/Webtrench?period=week&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads/Week)](https://pepy.tech/project/Webtrench) 20 | 21 | 22 | 23 | ## Why WebTrench 24 | Easy to use: With its simple and intuitive interface, WebTrench makes it easy to extract data from the web. 25 | Comprehensive: WebTrench includes functions for extracting a wide range of data, from images to tables and beyond. 
26 | Fast and efficient: WebTrench is designed to be fast and efficient, so you can quickly gather the data you need. 27 | Suitable for a variety of use cases: Whether you're working on a machine learning project, conducting research, or simply need to gather data from the web, WebTrench is a versatile tool that can meet your needs. 28 | ```python 29 | from Webtrench import ImageScrapper 30 | url = 'https://example.com' 31 | folder_path = './images' 32 | ImageScrapper.all_image_from_url(url, folder_path) 33 | ``` 34 | This code snippet fetches https://example.com, downloads every image found on the page, and saves each one in the ./images folder with a random number in the file name. 35 | ## Limitations of WebTrench 36 | Depends on website structure: The success of web scraping with WebTrench depends on the structure of the website being scraped. If the website's structure changes, WebTrench may not work as expected. 37 | Legal restrictions: There may be legal restrictions on the use of web scraping, so it's important to familiarize yourself with the laws in your jurisdiction before using WebTrench. 38 | 39 | ## Privacy Policy 40 | WebTrench respects the privacy of its users and is committed to protecting their data. We do not collect or store any personal information, and all data collected through the use of WebTrench is kept confidential. 41 | 42 | ## Web Scraping Ethics 43 | When using WebTrench or any other web scraping tool, it's important to follow ethical guidelines and avoid scraping websites without the owner's permission. This includes websites that explicitly prohibit scraping, as well as websites that contain sensitive or confidential information. 44 | 45 | ## Legal Warning 46 | The use of web scraping may be subject to legal restrictions, and the legality of web scraping depends on the jurisdiction in which it is being used. Before using WebTrench, it's important to familiarize yourself with the laws in your jurisdiction and ensure that your use of the tool complies with all applicable laws. WebTrench cannot be held responsible for any illegal use of the tool. 47 | 48 | ## Contributing Guide 49 | We welcome contributions from the community! If you are interested in contributing to the WebTrench project, here are some guidelines to get started: 50 | 51 | - Check the [issues](https://github.com/nuhmanpk/Webtrench/issues) page to see if there are any open bugs or features that you would like to work on. 52 | - Fork the repository and make your changes in a separate branch. 53 | - Once you have made your changes, submit a pull request for review. 54 | - The project maintainers will review your pull request and provide feedback. If necessary, make any requested changes and resubmit your pull request. 55 | - Once your pull request is approved and merged, you will become a contributor to the WebTrench project! 56 | 57 | ### Project Clone Guide 58 | If you would like to clone the WebTrench repository, follow these steps: 59 | - Install Git on your computer. 60 | - Open a terminal window and navigate to the directory where you would like to clone the repository. 61 | - Run the following command: 62 | ```git clone https://github.com/nuhmanpk/WebTrench.git``` 63 | - The repository will be cloned to your computer, and you can now make changes to the code and contribute to the project. 64 | 65 | ## Reminder 66 | Please note that WebTrench is currently in the pre-release stage and is not yet finished. 
If you encounter any issues, please check the [issues](https://github.com/nuhmanpk/Webtrench/issues) page, or consider contributing to make a better version of WebTrench! 67 | -------------------------------------------------------------------------------- /Webtrench/TextScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | # from helper import is_scrapable 4 | 5 | class TextScrapper: 6 | 7 | def from_url(url): 8 | try: 9 | 10 | response = requests.get(url) 11 | soup = BeautifulSoup(response.content, 'html.parser') 12 | text = soup.get_text() 13 | return text 14 | except Exception as e: 15 | print(f"An error occurred while trying to extract text from the URL {url}: {e}") 16 | return None 17 | 18 | def from_file(file): 19 | try: 20 | with open(file, 'r') as f: 21 | text = f.read() 22 | return text 23 | except Exception as e: 24 | print(f"An error occurred while trying to extract text from the file {file}: {e}") 25 | return None 26 | 27 | def from_html(html): 28 | try: 29 | soup = BeautifulSoup(html, 'html.parser') 30 | text = soup.get_text() 31 | return text 32 | except Exception as e: 33 | print(f"An error occurred while trying to extract text from the HTML: {e}") 34 | return None 35 | 36 | def paragraph_from_url(url): 37 | try: 38 | response = requests.get(url) 39 | soup = BeautifulSoup(response.content, 'html.parser') 40 | paragraphs = soup.find_all('p') 41 | return paragraphs 42 | except Exception as e: 43 | print(f"An error occurred while trying to extract paragraphs from the URL {url}: {e}") 44 | return None 45 | 46 | def link_from_url(url): 47 | try: 48 | response = requests.get(url) 49 | soup = BeautifulSoup(response.content, 'html.parser') 50 | links = soup.find_all('a') 51 | return links 52 | except Exception as e: 53 | print(f"An error occurred while trying to extract links from the URL {url}: {e}") 54 | return None 55 | 56 | def from_class(url, class_name): 57 | try: 58 | response = requests.get(url) 59 | soup = BeautifulSoup(response.content, 'html.parser') 60 | elem = soup.find({'class': class_name}) 61 | if elem is not None: 62 | return elem.get_text().strip() 63 | else: 64 | return None 65 | except Exception as e: 66 | print(f"An error occurred while trying to extract text from the class {class_name} of the URL {url}: {e}") 67 | return None 68 | 69 | def from_id(url, id_name): 70 | try: 71 | response = requests.get(url) 72 | soup = BeautifulSoup(response.content, 'html.parser') 73 | elem = soup.find(id=id_name) 74 | if elem is not None: 75 | return elem.get_text().strip() 76 | else: 77 | return None 78 | except Exception as e: 79 | print(f"An error occurred while trying to extract text from the id {id_name} of the URL {url}: {e}") 80 | return None 81 | 82 | def heading_from_url(url, heading_tag): 83 | try: 84 | response = requests.get(url) 85 | soup = BeautifulSoup(response.content, 'html.parser') 86 | elem = soup.find(heading_tag) 87 | if elem is not None: 88 | return elem.get_text().strip() 89 | else: 90 | return None 91 | except Exception as e: 92 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 93 | None 94 | 95 | def all_headings_from_url(url): 96 | try: 97 | response = requests.get(url) 98 | soup = BeautifulSoup(response.content, 'html.parser') 99 | all_headings = [] 100 | for heading_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: 101 | heading = soup.find(heading_tag) 102 | if heading is not None: 103 | all_headings.append((heading_tag, 
heading.get_text().strip())) 104 | return all_headings 105 | except Exception as e: 106 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 107 | None 108 | 109 | def list_from_url(url): 110 | try: 111 | response = requests.get(url) 112 | soup = BeautifulSoup(response.content, 'html.parser') 113 | lists = soup.find_all(['ul', 'ol']) 114 | return lists 115 | except Exception as e: 116 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 117 | None 118 | 119 | def list_item_from_url(url): 120 | try: 121 | response = requests.get(url) 122 | soup = BeautifulSoup(response.content, 'html.parser') 123 | list_items = soup.find_all('li') 124 | return list_items 125 | except Exception as e: 126 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 127 | None 128 | 129 | def table_from_url(url): 130 | try: 131 | response = requests.get(url) 132 | soup = BeautifulSoup(response.content, 'html.parser') 133 | tables = soup.find_all('table') 134 | return tables 135 | except Exception as e: 136 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 137 | None 138 | 139 | def table_row_from_url(url): 140 | try: 141 | response = requests.get(url) 142 | soup = BeautifulSoup(response.content, 'html.parser') 143 | table_rows = soup.find_all('tr') 144 | return table_rows 145 | except Exception as e: 146 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 147 | None 148 | 149 | def table_data_from_url(url): 150 | try: 151 | response = requests.get(url) 152 | soup = BeautifulSoup(response.content, 'html.parser') 153 | table_cells = soup.find_all(['th', 'td']) 154 | return table_cells 155 | except Exception as e: 156 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 157 | None -------------------------------------------------------------------------------- /Webtrench/AudioScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import random 5 | # from helper import is_scrapable 6 | 7 | class AudioScrapper: 8 | 9 | def from_url(url,folder_path=None): 10 | if not folder_path: 11 | folder_path='.' 12 | try: 13 | request=requests.get(url) 14 | if not os.path.exists(folder_path): 15 | os.makedirs(folder_path) 16 | if request.status_code==200: 17 | with open(f'{folder_path}/{random.randint(1,40000)}.mp3','wb') as f: 18 | f.write(request.content) 19 | else: 20 | pass 21 | except Exception as e: 22 | raise e 23 | 24 | def all_audio_from_url(url,folder_path=None): 25 | if not folder_path: 26 | folder_path='.' 27 | try: 28 | request=requests.get(url) 29 | html_content = request.content 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | elements = soup.find_all("audio") 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | for i, element in enumerate(elements): 35 | response = requests.get(element["src"]) 36 | if response.status_code == 200: 37 | try: 38 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 39 | f.write(response.content) 40 | except Exception as err: 41 | print(err) 42 | else: 43 | pass 44 | except Exception as e: 45 | raise e 46 | 47 | def with_url_pattern(url,pattern,folder_path=None): 48 | if not folder_path: 49 | folder_path='.' 
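# Fetch the page, find every <audio> tag whose src contains the given pattern, and save each match as a randomly named .mp3 inside folder_path.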
50 | try: 51 | request=requests.get(url) 52 | html_content = request.content 53 | soup = BeautifulSoup(html_content, "html.parser") 54 | elements = soup.find_all("audio") 55 | if not os.path.exists(folder_path): 56 | os.makedirs(folder_path) 57 | for i, element in enumerate(elements): 58 | if pattern in element["src"]: 59 | response = requests.get(element["src"]) 60 | if response.status_code == 200: 61 | try: 62 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 63 | f.write(response.content) 64 | except Exception as err: 65 | print(err) 66 | else: 67 | pass 68 | except Exception as e: 69 | raise e 70 | 71 | def with_class(url,cls,folder_path=None): 72 | if not folder_path: 73 | folder_path='.' 74 | try: 75 | request=requests.get(url) 76 | html_content = request.content 77 | soup = BeautifulSoup(html_content, "html.parser") 78 | elements = soup.find_all("audio",{"class":cls}) 79 | if not os.path.exists(folder_path): 80 | os.makedirs(folder_path) 81 | for i, element in enumerate(elements): 82 | response = requests.get(element["src"]) 83 | if response.status_code == 200: 84 | try: 85 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 86 | f.write(response.content) 87 | except Exception as err: 88 | print(err) 89 | else: 90 | pass 91 | except Exception as e: 92 | raise e 93 | 94 | def with_id(url,cls,folder_path=None): 95 | if not folder_path: 96 | folder_path='.' 97 | try: 98 | request=requests.get(url) 99 | html_content = request.content 100 | soup = BeautifulSoup(html_content, "html.parser") 101 | elements = soup.find_all("audio",{"id":cls}) 102 | if not os.path.exists(folder_path): 103 | os.makedirs(folder_path) 104 | for i, element in enumerate(elements): 105 | response = requests.get(element["src"]) 106 | if response.status_code == 200: 107 | try: 108 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 109 | f.write(response.content) 110 | except Exception as err: 111 | print(err) 112 | else: 113 | pass 114 | except Exception as e: 115 | raise e 116 | 117 | def with_attribute(url,attr,folder_path=None): 118 | if not folder_path: 119 | folder_path='.' 120 | try: 121 | request=requests.get(url) 122 | html_content = request.content 123 | soup = BeautifulSoup(html_content, "html.parser") 124 | elements = soup.find_all("audio",attrs=attr) 125 | if not os.path.exists(folder_path): 126 | os.makedirs(folder_path) 127 | for i, element in enumerate(elements): 128 | response = requests.get(element["src"]) 129 | if response.status_code == 200: 130 | try: 131 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 132 | f.write(response.content) 133 | except Exception as err: 134 | print(err) 135 | else: 136 | pass 137 | except Exception as e: 138 | raise e 139 | 140 | def with_attribute_value(url,attr,value,folder_path=None): 141 | if not folder_path: 142 | folder_path='.' 
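# Select <audio> tags whose given attribute equals the given value and save each tag's src as a randomly named .mp3 inside folder_path.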
143 | try: 144 | request=requests.get(url) 145 | html_content = request.content 146 | soup = BeautifulSoup(html_content, "html.parser") 147 | elements = soup.find_all("audio",attrs={attr:value}) 148 | if not os.path.exists(folder_path): 149 | os.makedirs(folder_path) 150 | for i, element in enumerate(elements): 151 | response = requests.get(element["src"]) 152 | if response.status_code == 200: 153 | try: 154 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 155 | f.write(response.content) 156 | except Exception as err: 157 | print(err) 158 | else: 159 | pass 160 | except Exception as e: 161 | raise e 162 | 163 | def with_attribute_value_pattern(url,attr,value,folder_path=None): 164 | if not folder_path: 165 | folder_path='.' 166 | try: 167 | request=requests.get(url) 168 | html_content = request.content 169 | soup = BeautifulSoup(html_content, "html.parser") 170 | elements = soup.find_all("audio",attrs={attr:value}) 171 | if not os.path.exists(folder_path): 172 | os.makedirs(folder_path) 173 | for i, element in enumerate(elements): 174 | if value in element[attr]: 175 | response = requests.get(element["src"]) 176 | if response.status_code == 200: 177 | try: 178 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 179 | f.write(response.content) 180 | except Exception as err: 181 | print(err) 182 | else: 183 | pass 184 | except Exception as e: 185 | raise e 186 | 187 | -------------------------------------------------------------------------------- /Webtrench/ImageScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | # from helper import is_scrapable 4 | import os 5 | import random 6 | 7 | class ImageScrapper: 8 | 9 | def from_url(url,folder_path=None): 10 | if not folder_path: 11 | folder_path='.' 12 | try: 13 | request=requests.get(url) 14 | if not os.path.exists(folder_path): 15 | os.makedirs(folder_path) 16 | if request.status_code==200: 17 | with open(f'{folder_path}/{random.randint(1,40000)}.jpg','wb') as f: 18 | f.write(request.content) 19 | else: 20 | pass 21 | except Exception as e: 22 | raise e 23 | 24 | def all_image_from_url(url,folder_path=None): 25 | if not folder_path: 26 | folder_path='.' 27 | try: 28 | request=requests.get(url) 29 | html_content = request.content 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | elements = soup.find_all("img") 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | for i, element in enumerate(elements): 35 | response = requests.get(element["src"]) 36 | if response.status_code == 200: 37 | try: 38 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 39 | f.write(response.content) 40 | except Exception as err: 41 | print(err) 42 | else: 43 | pass 44 | except Exception as e: 45 | raise e 46 | 47 | def with_url_pattern(url,pattern,folder_path=None): 48 | if not folder_path: 49 | folder_path='.' 
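# Fetch the page, find every <img> whose src contains the given pattern, and save each match as a randomly named .png inside folder_path.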
50 | try: 51 | request=requests.get(url) 52 | html_content = request.content 53 | soup = BeautifulSoup(html_content, "html.parser") 54 | elements = soup.find_all("img") 55 | if not os.path.exists(folder_path): 56 | os.makedirs(folder_path) 57 | for i, element in enumerate(elements): 58 | if pattern in element["src"]: 59 | response = requests.get(element["src"]) 60 | if response.status_code == 200: 61 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 62 | f.write(response.content) 63 | else: 64 | pass 65 | except Exception as e: 66 | raise e 67 | 68 | def with_class(url,class_name,folder_path=None): 69 | if not folder_path: 70 | folder_path='.' 71 | try: 72 | request=requests.get(url) 73 | html_content = request.content 74 | soup = BeautifulSoup(html_content, "html.parser") 75 | elements = soup.find_all("img",class_=class_name) 76 | if not os.path.exists(folder_path): 77 | os.makedirs(folder_path) 78 | for i, element in enumerate(elements): 79 | response = requests.get(element["src"]) 80 | if response.status_code == 200: 81 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 82 | f.write(response.content) 83 | else: 84 | pass 85 | except Exception as e: 86 | raise e 87 | 88 | def with_id(url,id_name,folder_path=None): 89 | if not folder_path: 90 | folder_path='.' 91 | try: 92 | request=requests.get(url) 93 | html_content = request.content 94 | soup = BeautifulSoup(html_content, "html.parser") 95 | elements = soup.find_all("img",id=id_name) 96 | if not os.path.exists(folder_path): 97 | os.makedirs(folder_path) 98 | for i, element in enumerate(elements): 99 | response = requests.get(element["src"]) 100 | if response.status_code == 200: 101 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 102 | f.write(response.content) 103 | else: 104 | pass 105 | except Exception as e: 106 | raise e 107 | 108 | def with_alt(url,alt_name,folder_path=None): 109 | if not folder_path: 110 | folder_path='.' 111 | try: 112 | request=requests.get(url) 113 | html_content = request.content 114 | soup = BeautifulSoup(html_content, "html.parser") 115 | elements = soup.find_all("img",alt=alt_name) 116 | if not os.path.exists(folder_path): 117 | os.makedirs(folder_path) 118 | for i, element in enumerate(elements): 119 | response = requests.get(element["src"]) 120 | if response.status_code == 200: 121 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 122 | f.write(response.content) 123 | else: 124 | pass 125 | except Exception as e: 126 | raise e 127 | 128 | def with_title(url,title_name,folder_path=None): 129 | if not folder_path: 130 | folder_path='.' 131 | try: 132 | request=requests.get(url) 133 | html_content = request.content 134 | soup = BeautifulSoup(html_content, "html.parser") 135 | elements = soup.find_all("img",title=title_name) 136 | if not os.path.exists(folder_path): 137 | os.makedirs(folder_path) 138 | for i, element in enumerate(elements): 139 | response = requests.get(element["src"]) 140 | if response.status_code == 200: 141 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 142 | f.write(response.content) 143 | else: 144 | pass 145 | except Exception as e: 146 | raise e 147 | 148 | def with_height(url,height,folder_path=None): 149 | if not folder_path: 150 | folder_path='.' 
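# Fetch the page, find every <img> whose height attribute equals the given value, and save each match as a randomly named .png inside folder_path.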
151 | try: 152 | request=requests.get(url) 153 | html_content = request.content 154 | soup = BeautifulSoup(html_content, "html.parser") 155 | elements = soup.find_all("img",height=height) 156 | if not os.path.exists(folder_path): 157 | os.makedirs(folder_path) 158 | for i, element in enumerate(elements): 159 | response = requests.get(element["src"]) 160 | if response.status_code == 200: 161 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 162 | f.write(response.content) 163 | else: 164 | pass 165 | except Exception as e: 166 | raise e 167 | 168 | def with_width(url,width,folder_path=None): 169 | if not folder_path: 170 | folder_path='.' 171 | try: 172 | request=requests.get(url) 173 | html_content = request.content 174 | soup = BeautifulSoup(html_content, "html.parser") 175 | elements = soup.find_all("img",width=width) 176 | if not os.path.exists(folder_path): 177 | os.makedirs(folder_path) 178 | for i, element in enumerate(elements): 179 | response = requests.get(element["src"]) 180 | if response.status_code == 200: 181 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 182 | f.write(response.content) 183 | else: 184 | pass 185 | except Exception as e: 186 | raise e 187 | 188 | def with_style(url,style,folder_path=None): 189 | if not folder_path: 190 | folder_path='.' 191 | try: 192 | request=requests.get(url) 193 | html_content = request.content 194 | soup = BeautifulSoup(html_content, "html.parser") 195 | elements = soup.find_all("img",style=style) 196 | if not os.path.exists(folder_path): 197 | os.makedirs(folder_path) 198 | for i, element in enumerate(elements): 199 | response = requests.get(element["src"]) 200 | if response.status_code == 200: 201 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 202 | f.write(response.content) 203 | else: 204 | pass 205 | except Exception as e: 206 | raise e 207 | -------------------------------------------------------------------------------- /Webtrench/MetaDataScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | # from helper import is_scrapable 5 | 6 | class MetaDataScrapper: 7 | def get_website_title(url): 8 | try: 9 | request=requests.get(url) 10 | html_content = request.content 11 | soup = BeautifulSoup(html_content, "html.parser") 12 | title = soup.title.string 13 | return title 14 | except Exception as e: 15 | raise e 16 | 17 | def get_website_description(url): 18 | try: 19 | request=requests.get(url) 20 | html_content = request.content 21 | soup = BeautifulSoup(html_content, "html.parser") 22 | description = soup.find("meta", property="og:description") 23 | return description["content"] 24 | except Exception as e: 25 | raise e 26 | 27 | def get_website_keywords(url): 28 | try: 29 | request=requests.get(url) 30 | html_content = request.content 31 | soup = BeautifulSoup(html_content, "html.parser") 32 | keywords = soup.find("meta", property="og:keywords") 33 | return keywords["content"] 34 | except Exception as e: 35 | raise e 36 | 37 | def get_website_image(url): 38 | try: 39 | request=requests.get(url) 40 | html_content = request.content 41 | soup = BeautifulSoup(html_content, "html.parser") 42 | image = soup.find("meta", property="og:image") 43 | return image["content"] 44 | except Exception as e: 45 | raise e 46 | 47 | def get_website_url(url): 48 | try: 49 | request=requests.get(url) 50 | html_content = request.content 51 | soup = 
BeautifulSoup(html_content, "html.parser") 52 | url = soup.find("meta", property="og:url") 53 | return url["content"] 54 | except Exception as e: 55 | raise e 56 | 57 | def get_website_type(url): 58 | try: 59 | request=requests.get(url) 60 | html_content = request.content 61 | soup = BeautifulSoup(html_content, "html.parser") 62 | type = soup.find("meta", property="og:type") 63 | return type["content"] 64 | except Exception as e: 65 | raise e 66 | 67 | def get_website_site_name(url): 68 | try: 69 | request=requests.get(url) 70 | html_content = request.content 71 | soup = BeautifulSoup(html_content, "html.parser") 72 | site_name = soup.find("meta", property="og:site_name") 73 | return site_name["content"] 74 | except Exception as e: 75 | raise e 76 | 77 | def get_website_locale(url): 78 | try: 79 | request=requests.get(url) 80 | html_content = request.content 81 | soup = BeautifulSoup(html_content, "html.parser") 82 | locale = soup.find("meta", property="og:locale") 83 | return locale["content"] 84 | except Exception as e: 85 | raise e 86 | 87 | def meta_data(url): 88 | try: 89 | request=requests.get(url) 90 | html_content = request.content 91 | soup = BeautifulSoup(html_content, "html.parser") 92 | meta_data = soup.find_all("meta") 93 | return meta_data 94 | except Exception as e: 95 | raise e 96 | 97 | def with_property(url): 98 | try: 99 | request=requests.get(url) 100 | html_content = request.content 101 | soup = BeautifulSoup(html_content, "html.parser") 102 | meta_data = soup.find_all("meta", property=True) 103 | return meta_data 104 | except Exception as e: 105 | raise e 106 | 107 | def with_name(url): 108 | try: 109 | request=requests.get(url) 110 | html_content = request.content 111 | soup = BeautifulSoup(html_content, "html.parser") 112 | meta_data = soup.find_all("meta", name=True) 113 | return meta_data 114 | except Exception as e: 115 | raise e 116 | 117 | def with_http_equiv(url): 118 | try: 119 | request=requests.get(url) 120 | html_content = request.content 121 | soup = BeautifulSoup(html_content, "html.parser") 122 | meta_data = soup.find_all("meta", http_equiv=True) 123 | return meta_data 124 | except Exception as e: 125 | raise e 126 | 127 | def with_content(url): 128 | try: 129 | request=requests.get(url) 130 | html_content = request.content 131 | soup = BeautifulSoup(html_content, "html.parser") 132 | meta_data = soup.find_all("meta", content=True) 133 | return meta_data 134 | except Exception as e: 135 | raise e 136 | 137 | def with_charset(url): 138 | try: 139 | request=requests.get(url) 140 | html_content = request.content 141 | soup = BeautifulSoup(html_content, "html.parser") 142 | meta_data = soup.find_all("meta", charset=True) 143 | return meta_data 144 | except Exception as e: 145 | raise e 146 | 147 | def with_itemprop(url): 148 | try: 149 | request=requests.get(url) 150 | html_content = request.content 151 | soup = BeautifulSoup(html_content, "html.parser") 152 | meta_data = soup.find_all("meta", itemprop=True) 153 | return meta_data 154 | except Exception as e: 155 | raise e 156 | 157 | def with_scheme(url): 158 | try: 159 | request=requests.get(url) 160 | html_content = request.content 161 | soup = BeautifulSoup(html_content, "html.parser") 162 | meta_data = soup.find_all("meta", scheme=True) 163 | return meta_data 164 | except Exception as e: 165 | raise e 166 | 167 | def with_lang(url): 168 | try: 169 | request=requests.get(url) 170 | html_content = request.content 171 | soup = BeautifulSoup(html_content, "html.parser") 172 | meta_data = 
soup.find_all("meta", lang=True) 173 | return meta_data 174 | except Exception as e: 175 | raise e 176 | 177 | def with_dir(url): 178 | try: 179 | request=requests.get(url) 180 | html_content = request.content 181 | soup = BeautifulSoup(html_content, "html.parser") 182 | meta_data = soup.find_all("meta", dir=True) 183 | return meta_data 184 | except Exception as e: 185 | raise e 186 | 187 | def with_xml_lang(url): 188 | try: 189 | request=requests.get(url) 190 | html_content = request.content 191 | soup = BeautifulSoup(html_content, "html.parser") 192 | meta_data = soup.find_all("meta", xml_lang=True) 193 | return meta_data 194 | except Exception as e: 195 | raise e 196 | 197 | def with_xmlns(url): 198 | try: 199 | request=requests.get(url) 200 | html_content = request.content 201 | soup = BeautifulSoup(html_content, "html.parser") 202 | meta_data = soup.find_all("meta", xmlns=True) 203 | return meta_data 204 | except Exception as e: 205 | raise e 206 | 207 | def with_xmlns_xsi(url): 208 | try: 209 | request=requests.get(url) 210 | html_content = request.content 211 | soup = BeautifulSoup(html_content, "html.parser") 212 | meta_data = soup.find_all("meta", xmlns_xsi=True) 213 | return meta_data 214 | except Exception as e: 215 | raise e 216 | 217 | def with_xsi_schemaLocation(url): 218 | try: 219 | request=requests.get(url) 220 | html_content = request.content 221 | soup = BeautifulSoup(html_content, "html.parser") 222 | meta_data = soup.find_all("meta", xsi_schemaLocation=True) 223 | return meta_data 224 | except Exception as e: 225 | raise e 226 | 227 | def with_xmlns_og(url): 228 | try: 229 | request=requests.get(url) 230 | html_content = request.content 231 | soup = BeautifulSoup(html_content, "html.parser") 232 | meta_data = soup.find_all("meta", xmlns_og=True) 233 | return meta_data 234 | except Exception as e: 235 | raise e 236 | 237 | def with_xmlns_fb(url): 238 | try: 239 | request=requests.get(url) 240 | html_content = request.content 241 | soup = BeautifulSoup(html_content, "html.parser") 242 | meta_data = soup.find_all("meta", xmlns_fb=True) 243 | return meta_data 244 | except Exception as e: 245 | raise e 246 | 247 | def with_xmlns_article(url): 248 | try: 249 | request=requests.get(url) 250 | html_content = request.content 251 | soup = BeautifulSoup(html_content, "html.parser") 252 | meta_data = soup.find_all("meta", xmlns_article=True) 253 | return meta_data 254 | except Exception as e: 255 | raise e 256 | 257 | def with_xmlns_profile(url): 258 | try: 259 | request=requests.get(url) 260 | html_content = request.content 261 | soup = BeautifulSoup(html_content, "html.parser") 262 | meta_data = soup.find_all("meta", xmlns_profile=True) 263 | return meta_data 264 | except Exception as e: 265 | raise e 266 | 267 | def with_xmlns_book(url): 268 | try: 269 | request=requests.get(url) 270 | html_content = request.content 271 | soup = BeautifulSoup(html_content, "html.parser") 272 | meta_data = soup.find_all("meta", xmlns_book=True) 273 | return meta_data 274 | except Exception as e: 275 | raise e 276 | 277 | def with_xmlns_video(url): 278 | try: 279 | request=requests.get(url) 280 | html_content = request.content 281 | soup = BeautifulSoup(html_content, "html.parser") 282 | meta_data = soup.find_all("meta", xmlns_video=True) 283 | return meta_data 284 | except Exception as e: 285 | raise e 286 | 287 | def with_xmlns_music(url): 288 | try: 289 | request=requests.get(url) 290 | html_content = request.content 291 | soup = BeautifulSoup(html_content, "html.parser") 292 | meta_data = 
soup.find_all("meta", xmlns_music=True) 293 | return meta_data 294 | except Exception as e: 295 | raise e 296 | 297 | 298 | def with_xmlns_place(url): 299 | try: 300 | request=requests.get(url) 301 | html_content = request.content 302 | soup = BeautifulSoup(html_content, "html.parser") 303 | meta_data = soup.find_all("meta", xmlns_place=True) 304 | return meta_data 305 | except Exception as e: 306 | raise e 307 | 308 | def get_keyword_density(url, keyword): 309 | response = requests.get(url) 310 | content = response.content.decode("utf-8").lower() 311 | word_count = len(re.findall(re.escape(keyword.lower()), content)) 312 | total_words = len(re.findall(r"\w+", content)) 313 | density = (word_count / total_words) * 100 314 | return density 315 | 316 | def get_meta_data(url): 317 | response = requests.get(url) 318 | content = response.content.decode("utf-8") 319 | meta_data = {} 320 | meta_tags = re.findall(r"<meta[^>]*>", content) 321 | for meta_tag in meta_tags: 322 | name = re.search("name=['\"](.*?)['\"]", meta_tag) 323 | if name: 324 | name = name.group(1) 325 | else: 326 | name = re.search("property=['\"](.*?)['\"]", meta_tag) 327 | if name: 328 | name = name.group(1) 329 | else: 330 | continue 331 | value = re.search("content=['\"](.*?)['\"]", meta_tag) 332 | if value: 333 | value = value.group(1) 334 | meta_data[name] = value 335 | return meta_data --------------------------------------------------------------------------------
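Quick usage sketch: the snippet below ties the modules above together. It is illustrative only; https://example.com is a placeholder URL, ./images is an arbitrary destination folder, and it assumes the package is installed with pip install Webtrench and that the target page permits scraping. Every call it makes is defined in the files above.

```python
# Minimal usage sketch; replace the placeholder URL with a page whose
# robots.txt and terms of use permit scraping.
from Webtrench import ImageScrapper, TextScrapper, MetaDataScrapper

url = "https://example.com"  # placeholder target page

# Page title from the <title> tag
print(MetaDataScrapper.get_website_title(url))

# All visible text on the page as a single string
page_text = TextScrapper.from_url(url)

# Every <img> on the page, saved into ./images with random numeric file names
ImageScrapper.all_image_from_url(url, folder_path="./images")
```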