├── MANIFEST.in
├── requirements.txt
├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE.md
│   └── workflows
│       └── python-publish.yml
├── Webtrench
│   ├── __init__.py
│   ├── helper.py
│   ├── VideoScrapper.py
│   ├── TextScrapper.py
│   ├── AudioScrapper.py
│   ├── ImageScrapper.py
│   └── MetaDataScrapper.py
├── setup.py
├── LICENSE
├── .gitignore
└── README.md
/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.2 2 | requests==2.28.2 3 | setuptools>=65.5.1 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: nuhmanpk 4 | ko_fi: nuhmanpk 5 | 6 | -------------------------------------------------------------------------------- /Webtrench/__init__.py: -------------------------------------------------------------------------------- 1 | from .ImageScrapper import ImageScrapper 2 | from .AudioScrapper import AudioScrapper 3 | from .VideoScrapper import VideoScrapper 4 | from .TextScrapper import TextScrapper 5 | from .MetaDataScrapper import MetaDataScrapper 6 | 7 | __all__ = ['ImageScrapper', 'AudioScrapper', 'VideoScrapper', 'TextScrapper', 'MetaDataScrapper'] -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | * Webtrench version: 2 | * Python version: 3 | * Operating System: 4 | 5 | ### Description 6 | 7 | Describe what you were trying to get done. 8 | Tell us what happened, what went wrong, and what you expected to happen. 9 | 10 | ### What I Did 11 | 12 | ``` 13 | Paste the command(s) you ran and the output. 14 | If there was a crash, please include the traceback here. 
15 | ``` 16 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | name: Upload Python Package 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.9' 21 | 22 | - name: "Installs dependencies" 23 | run: | 24 | python3 -m pip install --upgrade pip 25 | python3 -m pip install setuptools wheel twine 26 | python3 -m pip install requests 27 | python3 -m pip install bs4 28 | 29 | - name: "Builds and uploads to PyPI" 30 | run: | 31 | python3 setup.py sdist bdist_wheel 32 | python3 -m twine upload dist/* 33 | 34 | env: 35 | TWINE_USERNAME: __token__ 36 | TWINE_PASSWORD: ${{ secrets.TWINE_TOKEN }} 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import setuptools 3 | 4 | file = pathlib.Path(__file__).parent 5 | 6 | README = (file / "README.md").read_text() 7 | 8 | setuptools.setup( 9 | name="Webtrench", 10 | version="0.1.02", 11 | author="Nuhman Pk", 12 | author_email="nuhmanpk7@gmail.com", 13 | long_description = README, 14 | long_description_content_type = "text/markdown", 15 | description="A powerful and easy-to-use web scrapper for collecting data from the web. Supports scraping of images, text, videos, meta data, and more. Ideal for machine learning and deep learning engineers. Download and extract data with just one line of code", 16 | license="MIT", 17 | url="https://github.com/nuhmanpk/Webtrench", 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | packages=setuptools.find_packages(include=['Webtrench']), 24 | install_requires=[ 25 | 'bs4', 26 | 'requests', 27 | ], 28 | 29 | python_requires=">=3.6", 30 | 31 | ) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Nuhman Pk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Webtrench/helper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | def scrapable(url): 5 | try: 6 | # Check robots.txt (a coarse check: any Disallow rule is treated as disallowing scraping) 7 | robots_txt = requests.get(url + '/robots.txt').text 8 | if "Disallow:" in robots_txt: 9 | print("The website does not allow scraping.") 10 | return False 11 | 12 | # Send a request to the URL 13 | response = requests.get(url) 14 | 15 | # Check if the response status code is 200 16 | if response.status_code != 200: 17 | print(f"The website returned a response status code of {response.status_code}, which indicates that the website is not scraping friendly.") 18 | return False 19 | 20 | # Check if the website's terms of use allow scraping 21 | terms_of_use = re.search(r'Terms of Use', response.text) 22 | if terms_of_use is None: 23 | print("The terms of use of the website do not mention anything about web scraping.") 24 | return True 25 | 26 | # Look for an explicit prohibition in the text following the "Terms of Use" mention (a heuristic 2000-character window; the match object itself only contains the phrase) 27 | if "prohibited" in response.text[terms_of_use.start():terms_of_use.start() + 2000].lower(): 28 | print("Web scraping is prohibited according to the terms of use of the website.") 29 | return False 30 | 31 | print("The website is scraping friendly.") 32 | return True 33 | 34 | except Exception as e: 35 | print(f"An error occurred while checking the scraping friendliness of the website: {e}") 36 | return False 37 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | dist/ 13 | downloads/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | wheels/ 20 | pip-wheel-metadata/ 21 | share/python-wheels/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | MANIFEST 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .nox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | *.py,cover 48 | .hypothesis/ 49 | .pytest_cache/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | local_settings.py 58 | db.sqlite3 59 | db.sqlite3-journal 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # IPython 78 | profile_default/ 79 | ipython_config.py 80 | 81 | # pyenv 82 | .python-version 83 | 84 | # pipenv 85 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 86 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 87 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 88 | # install all needed dependencies. 89 | #Pipfile.lock 90 | 91 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 92 | __pypackages__/ 93 | 94 | # Celery stuff 95 | celerybeat-schedule 96 | celerybeat.pid 97 | 98 | # SageMath parsed files 99 | *.sage.py 100 | 101 | # Environments 102 | .env 103 | .venv 104 | env/ 105 | venv/ 106 | ENV/ 107 | env.bak/ 108 | venv.bak/ 109 | 110 | # Spyder project settings 111 | .spyderproject 112 | .spyproject 113 | 114 | # Rope project settings 115 | .ropeproject 116 | 117 | # mkdocs documentation 118 | /site 119 | 120 | # mypy 121 | .mypy_cache/ 122 | .dmypy.json 123 | dmypy.json 124 | 125 | # Pyre type checker 126 | .pyre/ 127 | -------------------------------------------------------------------------------- /Webtrench/VideoScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import random 5 | # from helper import is_scrapable 6 | 7 | class VideoScrapper: 8 | 9 | def from_url(url,folder_path=None): 10 | if not folder_path: 11 | folder_path='.' 12 | try: 13 | request=requests.get(url) 14 | if not os.path.exists(folder_path): 15 | os.makedirs(folder_path) 16 | if request.status_code==200: 17 | with open(f'{folder_path}/{random.randint(1,40000)}.mp4','wb') as f: 18 | f.write(request.content) 19 | else: 20 | pass 21 | except Exception as e: 22 | raise e 23 | 24 | def all_video_from_url(url,folder_path=None): 25 | if not folder_path: 26 | folder_path='.' 27 | try: 28 | request=requests.get(url) 29 | html_content = request.content 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | elements = soup.find_all("video") 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | for i, element in enumerate(elements): 35 | response = requests.get(element["src"]) 36 | if response.status_code == 200: 37 | try: 38 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 39 | f.write(response.content) 40 | except Exception as err: 41 | print(err) 42 | else: 43 | pass 44 | except Exception as e: 45 | raise e 46 | 47 | def with_url_pattern(url,pattern,folder_path=None): 48 | if not folder_path: 49 | folder_path='.' 50 | try: 51 | request=requests.get(url) 52 | html_content = request.content 53 | soup = BeautifulSoup(html_content, "html.parser") 54 | elements = soup.find_all("video") 55 | if not os.path.exists(folder_path): 56 | os.makedirs(folder_path) 57 | for i, element in enumerate(elements): 58 | if pattern in element["src"]: 59 | response = requests.get(element["src"]) 60 | if response.status_code == 200: 61 | try: 62 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 63 | f.write(response.content) 64 | except Exception as err: 65 | print(err) 66 | else: 67 | pass 68 | except Exception as e: 69 | raise e 70 | 71 | def with_class(url,classname,folder_path=None): 72 | if not folder_path: 73 | folder_path='.' 
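# Fetch the page, select <video> tags carrying the given class, and save each tag's src as a randomly named .mp4 inside folder_path.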
74 | try: 75 | request=requests.get(url) 76 | html_content = request.content 77 | soup = BeautifulSoup(html_content, "html.parser") 78 | elements = soup.find_all("video",class_=classname) 79 | if not os.path.exists(folder_path): 80 | os.makedirs(folder_path) 81 | for i, element in enumerate(elements): 82 | response = requests.get(element["src"]) 83 | if response.status_code == 200: 84 | try: 85 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 86 | f.write(response.content) 87 | except Exception as err: 88 | print(err) 89 | else: 90 | pass 91 | except Exception as e: 92 | raise e 93 | 94 | def with_id(url,idname,folder_path=None): 95 | if not folder_path: 96 | folder_path='.' 97 | try: 98 | request=requests.get(url) 99 | html_content = request.content 100 | soup = BeautifulSoup(html_content, "html.parser") 101 | elements = soup.find_all("video",id=idname) 102 | if not os.path.exists(folder_path): 103 | os.makedirs(folder_path) 104 | for i, element in enumerate(elements): 105 | response = requests.get(element["src"]) 106 | if response.status_code == 200: 107 | try: 108 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp4", "wb") as f: 109 | f.write(response.content) 110 | except Exception as err: 111 | print(err) 112 | else: 113 | pass 114 | except Exception as e: 115 | raise e 116 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Webtrench 2 | 3 | 4 | WebTrench provides a comprehensive and powerful toolkit for web scraping. Whether you're working on a machine learning project, conducting research, or simply need to gather data from the web, WebTrench is the perfect tool for the job. So why wait? Start using WebTrench today and streamline your data collection process! 5 | ```python 6 | pip install Webtrench 7 | ``` 8 | ----- 9 | ### Check Documentation [Here](https://github.com/nuhmanpk/Webtrench/wiki) 10 | ------ 11 | [![Downloads](https://static.pepy.tech/personalized-badge/webtrench?period=total&units=international_system&left_color=grey&right_color=yellow&left_text=Total-Downloads)](https://pepy.tech/project/webtrench) 12 | ![PyPI - Format](https://img.shields.io/pypi/format/Webtrench) 13 | [![GitHub license](https://img.shields.io/github/license/nuhmanpk/webtrench.svg)](https://github.com/nuhmanpk/webtrench/blob/main/LICENSE) 14 | [![Upload Python Package](https://github.com/nuhmanpk/Webtrench/actions/workflows/python-publish.yml/badge.svg)](https://github.com/nuhmanpk/Webtrench/actions/workflows/python-publish.yml) 15 | [![Supported Versions](https://img.shields.io/pypi/pyversions/Webtrench.svg)](https://pypi.org/project/Webtrench) 16 | ![PyPI](https://img.shields.io/pypi/v/Webtrench) 17 | [![Documentation Status](https://readthedocs.org/projects/webtrench/badge/?version=latest)](https://webtrench.readthedocs.io/en/latest/?badge=latest) 18 | ![PyPI - Downloads](https://img.shields.io/pypi/dm/Webtrench) 19 | [![Downloads](https://static.pepy.tech/personalized-badge/Webtrench?period=week&units=international_system&left_color=grey&right_color=brightgreen&left_text=Downloads/Week)](https://pepy.tech/project/Webtrench) 20 | 21 | 22 | 23 | ## Why WebTrench 24 | Easy to use: With its simple and intuitive interface, WebTrench makes it easy to extract data from the web. 25 | Comprehensive: WebTrench includes functions for extracting a wide range of data, from images to tables and beyond. 
26 | Fast and efficient: WebTrench is designed to be fast and efficient, so you can quickly gather the data you need. 27 | Suitable for a variety of use cases: Whether you're working on a machine learning project, conducting research, or simply need to gather data from the web, WebTrench is a versatile tool that can meet your needs. 28 | ```python 29 | from Webtrench import ImageScrapper 30 | url = 'https://example.com' 31 | folder_path = './images' 32 | ImageScrapper.all_image_from_url(url, folder_path) 33 | ``` 34 | This code snippet fetches https://example.com, downloads every image found on the page, and saves each one in the ./images folder with a random number in the file name. 35 | ## Limitations of WebTrench 36 | Depends on website structure: The success of web scraping with WebTrench depends on the structure of the website being scraped. If the website's structure changes, WebTrench may not work as expected. 37 | Legal restrictions: There may be legal restrictions on the use of web scraping, so it's important to familiarize yourself with the laws in your jurisdiction before using WebTrench. 38 | 39 | ## Privacy Policy 40 | WebTrench respects the privacy of its users and is committed to protecting their data. We do not collect or store any personal information, and all data collected through the use of WebTrench is kept confidential. 41 | 42 | ## Web Scraping Ethics 43 | When using WebTrench or any other web scraping tool, it's important to follow ethical guidelines and avoid scraping websites without the owner's permission. This includes websites that explicitly prohibit scraping, as well as websites that contain sensitive or confidential information. 44 | 45 | ## Legal Warning 46 | The use of web scraping may be subject to legal restrictions, and the legality of web scraping depends on the jurisdiction in which it is being used. Before using WebTrench, it's important to familiarize yourself with the laws in your jurisdiction and ensure that your use of the tool complies with all applicable laws. WebTrench cannot be held responsible for any illegal use of the tool. 47 | 48 | ## Contributing Guide 49 | We welcome contributions from the community! If you are interested in contributing to the WebTrench project, here are some guidelines to get started: 50 | 51 | - Check the [issues](https://github.com/nuhmanpk/Webtrench/issues) page to see if there are any open bugs or features that you would like to work on. 52 | - Fork the repository and make your changes in a separate branch. 53 | - Once you have made your changes, submit a pull request for review. 54 | - The project maintainers will review your pull request and provide feedback. If necessary, make any requested changes and resubmit your pull request. 55 | - Once your pull request is approved and merged, you will become a contributor to the WebTrench project! 56 | 57 | ### Project Clone Guide 58 | If you would like to clone the WebTrench repository, follow these steps: 59 | - Install Git on your computer. 60 | - Open a terminal window and navigate to the directory where you would like to clone the repository. 61 | - Run the following command: 62 | ```git clone https://github.com/nuhmanpk/WebTrench.git``` 63 | - The repository will be cloned to your computer, and you can now make changes to the code and contribute to the project. 64 | 65 | ## Reminder 66 | Please note that WebTrench is currently in the pre-release stage and is not yet finished. 
If you encounter any issues, please check the [issues](https://github.com/nuhmanpk/Webtrench/issues) page, or consider contributing to make a better version of WebTrench! 67 | -------------------------------------------------------------------------------- /Webtrench/TextScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | # from helper import is_scrapable 4 | 5 | class TextScrapper: 6 | 7 | def from_url(url): 8 | try: 9 | 10 | response = requests.get(url) 11 | soup = BeautifulSoup(response.content, 'html.parser') 12 | text = soup.get_text() 13 | return text 14 | except Exception as e: 15 | print(f"An error occurred while trying to extract text from the URL {url}: {e}") 16 | return None 17 | 18 | def from_file(file): 19 | try: 20 | with open(file, 'r') as f: 21 | text = f.read() 22 | return text 23 | except Exception as e: 24 | print(f"An error occurred while trying to extract text from the file {file}: {e}") 25 | return None 26 | 27 | def from_html(html): 28 | try: 29 | soup = BeautifulSoup(html, 'html.parser') 30 | text = soup.get_text() 31 | return text 32 | except Exception as e: 33 | print(f"An error occurred while trying to extract text from the HTML: {e}") 34 | return None 35 | 36 | def paragraph_from_url(url): 37 | try: 38 | response = requests.get(url) 39 | soup = BeautifulSoup(response.content, 'html.parser') 40 | paragraphs = soup.find_all('p') 41 | return paragraphs 42 | except Exception as e: 43 | print(f"An error occurred while trying to extract paragraphs from the URL {url}: {e}") 44 | return None 45 | 46 | def link_from_url(url): 47 | try: 48 | response = requests.get(url) 49 | soup = BeautifulSoup(response.content, 'html.parser') 50 | links = soup.find_all('a') 51 | return links 52 | except Exception as e: 53 | print(f"An error occurred while trying to extract links from the URL {url}: {e}") 54 | return None 55 | 56 | def from_class(url, class_name): 57 | try: 58 | response = requests.get(url) 59 | soup = BeautifulSoup(response.content, 'html.parser') 60 | elem = soup.find({'class': class_name}) 61 | if elem is not None: 62 | return elem.get_text().strip() 63 | else: 64 | return None 65 | except Exception as e: 66 | print(f"An error occurred while trying to extract text from the class {class_name} of the URL {url}: {e}") 67 | return None 68 | 69 | def from_id(url, id_name): 70 | try: 71 | response = requests.get(url) 72 | soup = BeautifulSoup(response.content, 'html.parser') 73 | elem = soup.find(id=id_name) 74 | if elem is not None: 75 | return elem.get_text().strip() 76 | else: 77 | return None 78 | except Exception as e: 79 | print(f"An error occurred while trying to extract text from the id {id_name} of the URL {url}: {e}") 80 | return None 81 | 82 | def heading_from_url(url, heading_tag): 83 | try: 84 | response = requests.get(url) 85 | soup = BeautifulSoup(response.content, 'html.parser') 86 | elem = soup.find(heading_tag) 87 | if elem is not None: 88 | return elem.get_text().strip() 89 | else: 90 | return None 91 | except Exception as e: 92 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 93 | None 94 | 95 | def all_headings_from_url(url): 96 | try: 97 | response = requests.get(url) 98 | soup = BeautifulSoup(response.content, 'html.parser') 99 | all_headings = [] 100 | for heading_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: 101 | heading = soup.find(heading_tag) 102 | if heading is not None: 103 | all_headings.append((heading_tag, 
heading.get_text().strip())) 104 | return all_headings 105 | except Exception as e: 106 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 107 | None 108 | 109 | def list_from_url(url): 110 | try: 111 | response = requests.get(url) 112 | soup = BeautifulSoup(response.content, 'html.parser') 113 | lists = soup.find_all(['ul', 'ol']) 114 | return lists 115 | except Exception as e: 116 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 117 | None 118 | 119 | def list_item_from_url(url): 120 | try: 121 | response = requests.get(url) 122 | soup = BeautifulSoup(response.content, 'html.parser') 123 | list_items = soup.find_all('li') 124 | return list_items 125 | except Exception as e: 126 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 127 | None 128 | 129 | def table_from_url(url): 130 | try: 131 | response = requests.get(url) 132 | soup = BeautifulSoup(response.content, 'html.parser') 133 | tables = soup.find_all('table') 134 | return tables 135 | except Exception as e: 136 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 137 | None 138 | 139 | def table_row_from_url(url): 140 | try: 141 | response = requests.get(url) 142 | soup = BeautifulSoup(response.content, 'html.parser') 143 | table_rows = soup.find_all('tr') 144 | return table_rows 145 | except Exception as e: 146 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 147 | None 148 | 149 | def table_data_from_url(url): 150 | try: 151 | response = requests.get(url) 152 | soup = BeautifulSoup(response.content, 'html.parser') 153 | table_cells = soup.find_all(['th', 'td']) 154 | return table_cells 155 | except Exception as e: 156 | print(f"An error occurred while trying to extract text from the url {url}: {e}") 157 | None -------------------------------------------------------------------------------- /Webtrench/AudioScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import os 4 | import random 5 | # from helper import is_scrapable 6 | 7 | class AudioScrapper: 8 | 9 | def from_url(url,folder_path=None): 10 | if not folder_path: 11 | folder_path='.' 12 | try: 13 | request=requests.get(url) 14 | if not os.path.exists(folder_path): 15 | os.makedirs(folder_path) 16 | if request.status_code==200: 17 | with open(f'{folder_path}/{random.randint(1,40000)}.mp3','wb') as f: 18 | f.write(request.content) 19 | else: 20 | pass 21 | except Exception as e: 22 | raise e 23 | 24 | def all_audio_from_url(url,folder_path=None): 25 | if not folder_path: 26 | folder_path='.' 27 | try: 28 | request=requests.get(url) 29 | html_content = request.content 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | elements = soup.find_all("audio") 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | for i, element in enumerate(elements): 35 | response = requests.get(element["src"]) 36 | if response.status_code == 200: 37 | try: 38 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 39 | f.write(response.content) 40 | except Exception as err: 41 | print(err) 42 | else: 43 | pass 44 | except Exception as e: 45 | raise e 46 | 47 | def with_url_pattern(url,pattern,folder_path=None): 48 | if not folder_path: 49 | folder_path='.' 
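# Fetch the page, find every <audio> tag whose src contains the given pattern, and save each match as a randomly named .mp3 inside folder_path.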
50 | try: 51 | request=requests.get(url) 52 | html_content = request.content 53 | soup = BeautifulSoup(html_content, "html.parser") 54 | elements = soup.find_all("audio") 55 | if not os.path.exists(folder_path): 56 | os.makedirs(folder_path) 57 | for i, element in enumerate(elements): 58 | if pattern in element["src"]: 59 | response = requests.get(element["src"]) 60 | if response.status_code == 200: 61 | try: 62 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 63 | f.write(response.content) 64 | except Exception as err: 65 | print(err) 66 | else: 67 | pass 68 | except Exception as e: 69 | raise e 70 | 71 | def with_class(url,cls,folder_path=None): 72 | if not folder_path: 73 | folder_path='.' 74 | try: 75 | request=requests.get(url) 76 | html_content = request.content 77 | soup = BeautifulSoup(html_content, "html.parser") 78 | elements = soup.find_all("audio",{"class":cls}) 79 | if not os.path.exists(folder_path): 80 | os.makedirs(folder_path) 81 | for i, element in enumerate(elements): 82 | response = requests.get(element["src"]) 83 | if response.status_code == 200: 84 | try: 85 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 86 | f.write(response.content) 87 | except Exception as err: 88 | print(err) 89 | else: 90 | pass 91 | except Exception as e: 92 | raise e 93 | 94 | def with_id(url,cls,folder_path=None): 95 | if not folder_path: 96 | folder_path='.' 97 | try: 98 | request=requests.get(url) 99 | html_content = request.content 100 | soup = BeautifulSoup(html_content, "html.parser") 101 | elements = soup.find_all("audio",{"id":cls}) 102 | if not os.path.exists(folder_path): 103 | os.makedirs(folder_path) 104 | for i, element in enumerate(elements): 105 | response = requests.get(element["src"]) 106 | if response.status_code == 200: 107 | try: 108 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 109 | f.write(response.content) 110 | except Exception as err: 111 | print(err) 112 | else: 113 | pass 114 | except Exception as e: 115 | raise e 116 | 117 | def with_attribute(url,attr,folder_path=None): 118 | if not folder_path: 119 | folder_path='.' 120 | try: 121 | request=requests.get(url) 122 | html_content = request.content 123 | soup = BeautifulSoup(html_content, "html.parser") 124 | elements = soup.find_all("audio",attrs=attr) 125 | if not os.path.exists(folder_path): 126 | os.makedirs(folder_path) 127 | for i, element in enumerate(elements): 128 | response = requests.get(element["src"]) 129 | if response.status_code == 200: 130 | try: 131 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 132 | f.write(response.content) 133 | except Exception as err: 134 | print(err) 135 | else: 136 | pass 137 | except Exception as e: 138 | raise e 139 | 140 | def with_attribute_value(url,attr,value,folder_path=None): 141 | if not folder_path: 142 | folder_path='.' 
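# Select <audio> tags whose given attribute equals the given value and save each tag's src as a randomly named .mp3 inside folder_path.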
143 | try: 144 | request=requests.get(url) 145 | html_content = request.content 146 | soup = BeautifulSoup(html_content, "html.parser") 147 | elements = soup.find_all("audio",attrs={attr:value}) 148 | if not os.path.exists(folder_path): 149 | os.makedirs(folder_path) 150 | for i, element in enumerate(elements): 151 | response = requests.get(element["src"]) 152 | if response.status_code == 200: 153 | try: 154 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 155 | f.write(response.content) 156 | except Exception as err: 157 | print(err) 158 | else: 159 | pass 160 | except Exception as e: 161 | raise e 162 | 163 | def with_attribute_value_pattern(url,attr,value,folder_path=None): 164 | if not folder_path: 165 | folder_path='.' 166 | try: 167 | request=requests.get(url) 168 | html_content = request.content 169 | soup = BeautifulSoup(html_content, "html.parser") 170 | elements = soup.find_all("audio",attrs={attr:value}) 171 | if not os.path.exists(folder_path): 172 | os.makedirs(folder_path) 173 | for i, element in enumerate(elements): 174 | if value in element[attr]: 175 | response = requests.get(element["src"]) 176 | if response.status_code == 200: 177 | try: 178 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.mp3", "wb") as f: 179 | f.write(response.content) 180 | except Exception as err: 181 | print(err) 182 | else: 183 | pass 184 | except Exception as e: 185 | raise e 186 | 187 | -------------------------------------------------------------------------------- /Webtrench/ImageScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | # from helper import is_scrapable 4 | import os 5 | import random 6 | 7 | class ImageScrapper: 8 | 9 | def from_url(url,folder_path=None): 10 | if not folder_path: 11 | folder_path='.' 12 | try: 13 | request=requests.get(url) 14 | if not os.path.exists(folder_path): 15 | os.makedirs(folder_path) 16 | if request.status_code==200: 17 | with open(f'{folder_path}/{random.randint(1,40000)}.jpg','wb') as f: 18 | f.write(request.content) 19 | else: 20 | pass 21 | except Exception as e: 22 | raise e 23 | 24 | def all_image_from_url(url,folder_path=None): 25 | if not folder_path: 26 | folder_path='.' 27 | try: 28 | request=requests.get(url) 29 | html_content = request.content 30 | soup = BeautifulSoup(html_content, "html.parser") 31 | elements = soup.find_all("img") 32 | if not os.path.exists(folder_path): 33 | os.makedirs(folder_path) 34 | for i, element in enumerate(elements): 35 | response = requests.get(element["src"]) 36 | if response.status_code == 200: 37 | try: 38 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 39 | f.write(response.content) 40 | except Exception as err: 41 | print(err) 42 | else: 43 | pass 44 | except Exception as e: 45 | raise e 46 | 47 | def with_url_pattern(url,pattern,folder_path=None): 48 | if not folder_path: 49 | folder_path='.' 
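# Fetch the page, find every <img> whose src contains the given pattern, and save each match as a randomly named .png inside folder_path.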
50 | try: 51 | request=requests.get(url) 52 | html_content = request.content 53 | soup = BeautifulSoup(html_content, "html.parser") 54 | elements = soup.find_all("img") 55 | if not os.path.exists(folder_path): 56 | os.makedirs(folder_path) 57 | for i, element in enumerate(elements): 58 | if pattern in element["src"]: 59 | response = requests.get(element["src"]) 60 | if response.status_code == 200: 61 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 62 | f.write(response.content) 63 | else: 64 | pass 65 | except Exception as e: 66 | raise e 67 | 68 | def with_class(url,class_name,folder_path=None): 69 | if not folder_path: 70 | folder_path='.' 71 | try: 72 | request=requests.get(url) 73 | html_content = request.content 74 | soup = BeautifulSoup(html_content, "html.parser") 75 | elements = soup.find_all("img",class_=class_name) 76 | if not os.path.exists(folder_path): 77 | os.makedirs(folder_path) 78 | for i, element in enumerate(elements): 79 | response = requests.get(element["src"]) 80 | if response.status_code == 200: 81 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 82 | f.write(response.content) 83 | else: 84 | pass 85 | except Exception as e: 86 | raise e 87 | 88 | def with_id(url,id_name,folder_path=None): 89 | if not folder_path: 90 | folder_path='.' 91 | try: 92 | request=requests.get(url) 93 | html_content = request.content 94 | soup = BeautifulSoup(html_content, "html.parser") 95 | elements = soup.find_all("img",id=id_name) 96 | if not os.path.exists(folder_path): 97 | os.makedirs(folder_path) 98 | for i, element in enumerate(elements): 99 | response = requests.get(element["src"]) 100 | if response.status_code == 200: 101 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 102 | f.write(response.content) 103 | else: 104 | pass 105 | except Exception as e: 106 | raise e 107 | 108 | def with_alt(url,alt_name,folder_path=None): 109 | if not folder_path: 110 | folder_path='.' 111 | try: 112 | request=requests.get(url) 113 | html_content = request.content 114 | soup = BeautifulSoup(html_content, "html.parser") 115 | elements = soup.find_all("img",alt=alt_name) 116 | if not os.path.exists(folder_path): 117 | os.makedirs(folder_path) 118 | for i, element in enumerate(elements): 119 | response = requests.get(element["src"]) 120 | if response.status_code == 200: 121 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 122 | f.write(response.content) 123 | else: 124 | pass 125 | except Exception as e: 126 | raise e 127 | 128 | def with_title(url,title_name,folder_path=None): 129 | if not folder_path: 130 | folder_path='.' 131 | try: 132 | request=requests.get(url) 133 | html_content = request.content 134 | soup = BeautifulSoup(html_content, "html.parser") 135 | elements = soup.find_all("img",title=title_name) 136 | if not os.path.exists(folder_path): 137 | os.makedirs(folder_path) 138 | for i, element in enumerate(elements): 139 | response = requests.get(element["src"]) 140 | if response.status_code == 200: 141 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 142 | f.write(response.content) 143 | else: 144 | pass 145 | except Exception as e: 146 | raise e 147 | 148 | def with_height(url,height,folder_path=None): 149 | if not folder_path: 150 | folder_path='.' 
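# Fetch the page, find every <img> whose height attribute equals the given value, and save each match as a randomly named .png inside folder_path.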
151 | try: 152 | request=requests.get(url) 153 | html_content = request.content 154 | soup = BeautifulSoup(html_content, "html.parser") 155 | elements = soup.find_all("img",height=height) 156 | if not os.path.exists(folder_path): 157 | os.makedirs(folder_path) 158 | for i, element in enumerate(elements): 159 | response = requests.get(element["src"]) 160 | if response.status_code == 200: 161 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 162 | f.write(response.content) 163 | else: 164 | pass 165 | except Exception as e: 166 | raise e 167 | 168 | def with_width(url,width,folder_path=None): 169 | if not folder_path: 170 | folder_path='.' 171 | try: 172 | request=requests.get(url) 173 | html_content = request.content 174 | soup = BeautifulSoup(html_content, "html.parser") 175 | elements = soup.find_all("img",width=width) 176 | if not os.path.exists(folder_path): 177 | os.makedirs(folder_path) 178 | for i, element in enumerate(elements): 179 | response = requests.get(element["src"]) 180 | if response.status_code == 200: 181 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 182 | f.write(response.content) 183 | else: 184 | pass 185 | except Exception as e: 186 | raise e 187 | 188 | def with_style(url,style,folder_path=None): 189 | if not folder_path: 190 | folder_path='.' 191 | try: 192 | request=requests.get(url) 193 | html_content = request.content 194 | soup = BeautifulSoup(html_content, "html.parser") 195 | elements = soup.find_all("img",style=style) 196 | if not os.path.exists(folder_path): 197 | os.makedirs(folder_path) 198 | for i, element in enumerate(elements): 199 | response = requests.get(element["src"]) 200 | if response.status_code == 200: 201 | with open(f"{folder_path}/{i}-{random.randint(1,30000)}.png", "wb") as f: 202 | f.write(response.content) 203 | else: 204 | pass 205 | except Exception as e: 206 | raise e 207 | -------------------------------------------------------------------------------- /Webtrench/MetaDataScrapper.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | # from helper import is_scrapable 5 | 6 | class MetaDataScrapper: 7 | def get_website_title(url): 8 | try: 9 | request=requests.get(url) 10 | html_content = request.content 11 | soup = BeautifulSoup(html_content, "html.parser") 12 | title = soup.title.string 13 | return title 14 | except Exception as e: 15 | raise e 16 | 17 | def get_website_description(url): 18 | try: 19 | request=requests.get(url) 20 | html_content = request.content 21 | soup = BeautifulSoup(html_content, "html.parser") 22 | description = soup.find("meta", property="og:description") 23 | return description["content"] 24 | except Exception as e: 25 | raise e 26 | 27 | def get_website_keywords(url): 28 | try: 29 | request=requests.get(url) 30 | html_content = request.content 31 | soup = BeautifulSoup(html_content, "html.parser") 32 | keywords = soup.find("meta", property="og:keywords") 33 | return keywords["content"] 34 | except Exception as e: 35 | raise e 36 | 37 | def get_website_image(url): 38 | try: 39 | request=requests.get(url) 40 | html_content = request.content 41 | soup = BeautifulSoup(html_content, "html.parser") 42 | image = soup.find("meta", property="og:image") 43 | return image["content"] 44 | except Exception as e: 45 | raise e 46 | 47 | def get_website_url(url): 48 | try: 49 | request=requests.get(url) 50 | html_content = request.content 51 | soup = 
BeautifulSoup(html_content, "html.parser") 52 | url = soup.find("meta", property="og:url") 53 | return url["content"] 54 | except Exception as e: 55 | raise e 56 | 57 | def get_website_type(url): 58 | try: 59 | request=requests.get(url) 60 | html_content = request.content 61 | soup = BeautifulSoup(html_content, "html.parser") 62 | type = soup.find("meta", property="og:type") 63 | return type["content"] 64 | except Exception as e: 65 | raise e 66 | 67 | def get_website_site_name(url): 68 | try: 69 | request=requests.get(url) 70 | html_content = request.content 71 | soup = BeautifulSoup(html_content, "html.parser") 72 | site_name = soup.find("meta", property="og:site_name") 73 | return site_name["content"] 74 | except Exception as e: 75 | raise e 76 | 77 | def get_website_locale(url): 78 | try: 79 | request=requests.get(url) 80 | html_content = request.content 81 | soup = BeautifulSoup(html_content, "html.parser") 82 | locale = soup.find("meta", property="og:locale") 83 | return locale["content"] 84 | except Exception as e: 85 | raise e 86 | 87 | def meta_data(url): 88 | try: 89 | request=requests.get(url) 90 | html_content = request.content 91 | soup = BeautifulSoup(html_content, "html.parser") 92 | meta_data = soup.find_all("meta") 93 | return meta_data 94 | except Exception as e: 95 | raise e 96 | 97 | def with_property(url): 98 | try: 99 | request=requests.get(url) 100 | html_content = request.content 101 | soup = BeautifulSoup(html_content, "html.parser") 102 | meta_data = soup.find_all("meta", property=True) 103 | return meta_data 104 | except Exception as e: 105 | raise e 106 | 107 | def with_name(url): 108 | try: 109 | request=requests.get(url) 110 | html_content = request.content 111 | soup = BeautifulSoup(html_content, "html.parser") 112 | meta_data = soup.find_all("meta", name=True) 113 | return meta_data 114 | except Exception as e: 115 | raise e 116 | 117 | def with_http_equiv(url): 118 | try: 119 | request=requests.get(url) 120 | html_content = request.content 121 | soup = BeautifulSoup(html_content, "html.parser") 122 | meta_data = soup.find_all("meta", http_equiv=True) 123 | return meta_data 124 | except Exception as e: 125 | raise e 126 | 127 | def with_content(url): 128 | try: 129 | request=requests.get(url) 130 | html_content = request.content 131 | soup = BeautifulSoup(html_content, "html.parser") 132 | meta_data = soup.find_all("meta", content=True) 133 | return meta_data 134 | except Exception as e: 135 | raise e 136 | 137 | def with_charset(url): 138 | try: 139 | request=requests.get(url) 140 | html_content = request.content 141 | soup = BeautifulSoup(html_content, "html.parser") 142 | meta_data = soup.find_all("meta", charset=True) 143 | return meta_data 144 | except Exception as e: 145 | raise e 146 | 147 | def with_itemprop(url): 148 | try: 149 | request=requests.get(url) 150 | html_content = request.content 151 | soup = BeautifulSoup(html_content, "html.parser") 152 | meta_data = soup.find_all("meta", itemprop=True) 153 | return meta_data 154 | except Exception as e: 155 | raise e 156 | 157 | def with_scheme(url): 158 | try: 159 | request=requests.get(url) 160 | html_content = request.content 161 | soup = BeautifulSoup(html_content, "html.parser") 162 | meta_data = soup.find_all("meta", scheme=True) 163 | return meta_data 164 | except Exception as e: 165 | raise e 166 | 167 | def with_lang(url): 168 | try: 169 | request=requests.get(url) 170 | html_content = request.content 171 | soup = BeautifulSoup(html_content, "html.parser") 172 | meta_data = 
soup.find_all("meta", lang=True) 173 | return meta_data 174 | except Exception as e: 175 | raise e 176 | 177 | def with_dir(url): 178 | try: 179 | request=requests.get(url) 180 | html_content = request.content 181 | soup = BeautifulSoup(html_content, "html.parser") 182 | meta_data = soup.find_all("meta", dir=True) 183 | return meta_data 184 | except Exception as e: 185 | raise e 186 | 187 | def with_xml_lang(url): 188 | try: 189 | request=requests.get(url) 190 | html_content = request.content 191 | soup = BeautifulSoup(html_content, "html.parser") 192 | meta_data = soup.find_all("meta", xml_lang=True) 193 | return meta_data 194 | except Exception as e: 195 | raise e 196 | 197 | def with_xmlns(url): 198 | try: 199 | request=requests.get(url) 200 | html_content = request.content 201 | soup = BeautifulSoup(html_content, "html.parser") 202 | meta_data = soup.find_all("meta", xmlns=True) 203 | return meta_data 204 | except Exception as e: 205 | raise e 206 | 207 | def with_xmlns_xsi(url): 208 | try: 209 | request=requests.get(url) 210 | html_content = request.content 211 | soup = BeautifulSoup(html_content, "html.parser") 212 | meta_data = soup.find_all("meta", xmlns_xsi=True) 213 | return meta_data 214 | except Exception as e: 215 | raise e 216 | 217 | def with_xsi_schemaLocation(url): 218 | try: 219 | request=requests.get(url) 220 | html_content = request.content 221 | soup = BeautifulSoup(html_content, "html.parser") 222 | meta_data = soup.find_all("meta", xsi_schemaLocation=True) 223 | return meta_data 224 | except Exception as e: 225 | raise e 226 | 227 | def with_xmlns_og(url): 228 | try: 229 | request=requests.get(url) 230 | html_content = request.content 231 | soup = BeautifulSoup(html_content, "html.parser") 232 | meta_data = soup.find_all("meta", xmlns_og=True) 233 | return meta_data 234 | except Exception as e: 235 | raise e 236 | 237 | def with_xmlns_fb(url): 238 | try: 239 | request=requests.get(url) 240 | html_content = request.content 241 | soup = BeautifulSoup(html_content, "html.parser") 242 | meta_data = soup.find_all("meta", xmlns_fb=True) 243 | return meta_data 244 | except Exception as e: 245 | raise e 246 | 247 | def with_xmlns_article(url): 248 | try: 249 | request=requests.get(url) 250 | html_content = request.content 251 | soup = BeautifulSoup(html_content, "html.parser") 252 | meta_data = soup.find_all("meta", xmlns_article=True) 253 | return meta_data 254 | except Exception as e: 255 | raise e 256 | 257 | def with_xmlns_profile(url): 258 | try: 259 | request=requests.get(url) 260 | html_content = request.content 261 | soup = BeautifulSoup(html_content, "html.parser") 262 | meta_data = soup.find_all("meta", xmlns_profile=True) 263 | return meta_data 264 | except Exception as e: 265 | raise e 266 | 267 | def with_xmlns_book(url): 268 | try: 269 | request=requests.get(url) 270 | html_content = request.content 271 | soup = BeautifulSoup(html_content, "html.parser") 272 | meta_data = soup.find_all("meta", xmlns_book=True) 273 | return meta_data 274 | except Exception as e: 275 | raise e 276 | 277 | def with_xmlns_video(url): 278 | try: 279 | request=requests.get(url) 280 | html_content = request.content 281 | soup = BeautifulSoup(html_content, "html.parser") 282 | meta_data = soup.find_all("meta", xmlns_video=True) 283 | return meta_data 284 | except Exception as e: 285 | raise e 286 | 287 | def with_xmlns_music(url): 288 | try: 289 | request=requests.get(url) 290 | html_content = request.content 291 | soup = BeautifulSoup(html_content, "html.parser") 292 | meta_data = 
soup.find_all("meta", xmlns_music=True) 293 | return meta_data 294 | except Exception as e: 295 | raise e 296 | 297 | 298 | def with_xmlns_place(url): 299 | try: 300 | request=requests.get(url) 301 | html_content = request.content 302 | soup = BeautifulSoup(html_content, "html.parser") 303 | meta_data = soup.find_all("meta", xmlns_place=True) 304 | return meta_data 305 | except Exception as e: 306 | raise e 307 | 308 | def get_keyword_density(url, keyword): 309 | response = requests.get(url) 310 | content = response.content.decode("utf-8").lower() 311 | word_count = len(re.findall(re.escape(keyword.lower()), content)) 312 | total_words = len(re.findall(r"\w+", content)) 313 | density = (word_count / total_words) * 100 314 | return density 315 | 316 | def get_meta_data(url): 317 | response = requests.get(url) 318 | content = response.content.decode("utf-8") 319 | meta_data = {} 320 | meta_tags = re.findall(r"<meta[^>]*>", content) 321 | for meta_tag in meta_tags: 322 | name = re.search("name=['\"](.*?)['\"]", meta_tag) 323 | if name: 324 | name = name.group(1) 325 | else: 326 | name = re.search("property=['\"](.*?)['\"]", meta_tag) 327 | if name: 328 | name = name.group(1) 329 | else: 330 | continue 331 | value = re.search("content=['\"](.*?)['\"]", meta_tag) 332 | if value: 333 | value = value.group(1) 334 | meta_data[name] = value 335 | return meta_data --------------------------------------------------------------------------------
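Quick usage sketch: the snippet below ties the modules above together. It is illustrative only; https://example.com is a placeholder URL, ./images is an arbitrary destination folder, and it assumes the package is installed with pip install Webtrench and that the target page permits scraping. Every call it makes is defined in the files above.

```python
# Minimal usage sketch; replace the placeholder URL with a page whose
# robots.txt and terms of use permit scraping.
from Webtrench import ImageScrapper, TextScrapper, MetaDataScrapper

url = "https://example.com"  # placeholder target page

# Page title from the <title> tag
print(MetaDataScrapper.get_website_title(url))

# All visible text on the page as a single string
page_text = TextScrapper.from_url(url)

# Every <img> on the page, saved into ./images with random numeric file names
ImageScrapper.all_image_from_url(url, folder_path="./images")
```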