├── .dockerignore ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── custom.md │ └── feature_request.md ├── dependabot.yml └── workflows │ ├── ci.yml │ └── codeql-analysis.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── test-requirements.txt └── unit │ ├── __init__.py │ ├── data_duplicates_negative.json │ ├── data_html_negative.json │ ├── data_html_positive.json │ ├── data_sitemap_negative.json │ ├── data_sitemap_positive.json │ ├── data_url_negative.json │ ├── data_url_positive.json │ ├── data_visible_tags.json │ ├── data_webpage.json │ ├── test_stop_words.py │ ├── test_webpage_analysis.py │ └── test_website_analysis.py └── webedge ├── __init__.py ├── cli_output.py ├── social_websites.py ├── stop_words.py ├── warnings.py ├── webedge.py ├── webpage_analysis.py └── website_analysis.py /.dockerignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | #docker should ignore the tests folder 132 | tests -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | exclude = 3 | # don't traverse git directory 4 | .git, 5 | # don't traverse cached files 6 | __pycache__, 7 | # don't traverse venv files 8 | bin, 9 | lib, 10 | share, 11 | local, 12 | # don't traverse autogenerated scripts 13 | migrations 14 | max-line-length = 99 15 | 16 | # Specify a list of codes to ignore. 17 | ignore = 18 | E722, W503, E251, E501 -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/custom.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Custom issue template 3 | about: Describe this issue template's purpose here. 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 
18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "pip" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | open-pull-requests-limit: 10 8 | reviewers: 9 | - "HarshCasper" 10 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Python CI Workflow 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: 16 | - "3.6" 17 | - "3.7" 18 | - "3.8" 19 | - "3.9" 20 | 21 | steps: 22 | - uses: actions/checkout@v2 23 | 24 | - name: Set up Python ${{ matrix.python-version }} 25 | uses: actions/setup-python@v2 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | 29 | - name: Upgrade pip version 30 | run: | 31 | python3 -m pip install --upgrade pip 32 | - name: Installs all the Dependencies 33 | run: | 34 | python3 setup.py install 35 | - name: Checks the Application Build 36 | run: | 37 | pip3 install wheel 38 | python3 setup.py sdist bdist_wheel 39 | - name: Tests the Application 40 | run: | 41 | pip3 install -r tests/test-requirements.txt 42 | nosetests --with-coverage --cover-package=webedge tests.unit 43 | - name: Lint with flake8 44 | run: | 45 | pip3 install flake8 46 | flake8 . -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL Analysis - Python" 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | branches: [main] 8 | 9 | jobs: 10 | CodeQL-Build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | language: ['python'] 16 | 17 | runs-on: ubuntu-latest 18 | 19 | steps: 20 | - name: Checkout repository 21 | uses: actions/checkout@v2 22 | 23 | - name: Initialize CodeQL 24 | uses: github/codeql-action/init@v1 25 | with: 26 | languages: python 27 | setup-python-dependencies: false 28 | 29 | - name: Perform CodeQL Analysis 30 | uses: github/codeql-action/analyze@v1 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt ./ 6 | 7 | RUN pip install -r requirements.txt 8 | 9 | COPY . . 10 | 11 | RUN python setup.py install 12 | 13 | CMD ["webedge", "-d", "https://ajitesh13.github.io"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Harsh Bardhan Mishra 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![WebEdge](https://socialify.git.ci/HarshCasper/WebEdge/image?description=1&descriptionEditable=Bringing%20Edge%20to%20your%20Web%20Performance%20%F0%9F%94%A5%E2%9C%A8&forks=1&issues=1&language=1&pulls=1&stargazers=1&theme=Light) 2 | 3 |

4 | We all have inborn talent and also inborn failings,
5 | So often scorn a balance, chained to our own railings,
6 | And our world misses a website that deserved to be a star,
7 | But not unfurled in all its might, merely cowering from afar,
8 | Why not take your Van Dyke, or Rembrandt seen by few,
9 | And let us make it see the light, exposed to global view,
10 | Don't hide them in shadows behind barriers of your mind,
11 | Where pride and jealous arrows make them hard to find,
12 | Instead turn to experts just as good as you would like to be,
13 | Who you'll learn to trust, and who will set your website free.
14 |

15 |

16 | Developed with ❤️ by your friends at MLH Fellowship Team-1. 17 |

18 |

19 | version 1.0.2 20 | license MIT 21 | MLH Fellowship Team 1 22 | GitHub-Actions-Build 23 | Code Format: Black 24 | PyPI version 25 | PyPi downloads 26 |

27 | 28 | ## 💥 Introduction 29 | 30 | > Bringing Edge to your Web Performance 31 | 32 | Rise of Web has heralded the increasing ways in which we optimize Digital Performance. With SEO and Web Performance playing an important part, Developers feel lost around Performance needs. WebEdge aims to fix this 🌐 33 | 34 | WebEdge have been introduced to suggest Web Optimizations for the App that can speed up operations and boost productivity ⚡ 35 | 36 | ## 💡 Why did we build it? 37 | 38 | As Frontend Developers, Performance plays an important part for Ranking and User Experience. The priority is such that it cannot be avoided any longer. WebEdge provides a Python Package for you to scrap you Website and auto-suggest improvements you can make to improve your Optimization Ranking ♾️ 39 | 40 | With this Package, we aim to have a unified tool to improve your SEO Ranking with real-time optimizations, that you can fix as a Developer. Sounds interesting? Well it is 🔥 41 | 42 | ## 🚀 Installation 43 | 44 | To install WebEdge, we can use `pip`: 45 | 46 | ```sh 47 | pip3 install webedge 48 | ``` 49 | 50 | The standard Python package will setup the CLI and you can use the same for local testing and analysis of your website and webpages. 51 | 52 | ```sh 53 | _ __ __ ______ __ 54 | | | / /__ / /_ / ____/___/ /___ ____ 55 | | | /| / / _ \/ __ \/ __/ / __ / __ `/ _ \ 56 | | |/ |/ / __/ /_/ / /___/ /_/ / /_/ / __/ 57 | |__/|__/\___/_.___/_____/\__,_/\__, /\___/ 58 | /____/ 59 | 60 | 61 | usage: webedge [-h] -d DOMAIN [-s SITEMAP] [-p PAGE] 62 | ``` 63 | 64 | ## 🛠️ Local development 65 | 66 | That's pretty easy. To ensure that you are able to install everything properly, we would recommend you to have Git, Python and pip installed. You should ideally work with a Virtual Environment, such as `venv` or the `virtualenv` module, to get the best out of the package. 67 | 68 | We will first start with setting up the Local Project Environment: 69 | 70 | ```sh 71 | git clone https://github.com/HarshCasper/WebEdge.git 72 | cd WebEdge 73 | virtualenv venv 74 | source venv/bin/activate 75 | pip3 install -r requirements.txt 76 | python3 setup.py install 77 | ``` 78 | 79 | Once you run the Commands and get everything fine, we are all set to run the tool ✔️ 80 | 81 | Let's run the tool now: 82 | 83 | ```sh 84 | webedge -d http://[DOMAIN_NAME]/ 85 | ``` 86 | 87 | * For example if your domain is `https://fastcoder.netlify.app/` then your command should be (you can use `http` or `https` in the command according to your needs): 88 | 89 | ```sh 90 | webedge -d https://fastcoder.netlify.app/ 91 | ``` 92 | 93 | Pass your Website to the tool and you will get a generated JSON highlighting all the achievements you have made in SEO Optimization or the warnings being displayed by the same 🔑 94 | 95 | To run the tests, simply push: 96 | 97 | ```sh 98 | nosetests --with-coverage --cover-package=webedge tests.unit 99 | ``` 100 | 101 | To build with Docker, simply push: 102 | 103 | **Building using docker** 104 | ```bash 105 | $ docker build -t 'app:webedge' . 106 | $ docker run app:webedge 107 | ``` 108 | 109 | ## 🛑 External Tools 110 | 111 | The Python Files have been linted using [flake8](https://flake8.pycqa.org/) which automatically suggests linting errors and issues with formatting and styling. You can run the `flake8` command with the given configuration in the Project 🍀 112 | 113 | We are also making use of CodeQL Analysis, which can be viewed [here](.github/workflows/codeql-analysis.yml). 
This allows us to identify potential bugs and anti-patterns with each push to the repository, and potentially fix it 🐛 114 | 115 | For setting up CI/CD, we are making use of [GitHub Actions](https://github.com/features/actions). With a simple configuration set-up, we were able to test each build for specific issues, which can be viewed [here](.github/workflows/ci.yml) 🌱 116 | 117 | ## 📜 LICENSE 118 | 119 | [MIT License](https://github.com/HarshCasper/WebEdge/blob/main/LICENSE) 120 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.3 2 | beautifulsoup4==4.9.3 3 | bs4==0.0.1 4 | CacheControl==0.12.6 5 | certifi==2019.11.28 6 | chardet==3.0.4 7 | clanimate==0.0.1 8 | click==7.1.2 9 | colorama==0.4.4 10 | contextlib2==0.6.0 11 | distlib==0.3.0 12 | distro==1.4.0 13 | html5lib==1.0.1 14 | idna==2.8 15 | ipaddr==2.2.0 16 | joblib==0.17.0 17 | lockfile==0.12.2 18 | msgpack==0.6.2 19 | nltk==3.5 20 | packaging==20.3 21 | pep517==0.8.2 22 | progress==1.5 23 | prompt-toolkit==1.0.14 24 | pyfiglet==0.8.post1 25 | Pygments==2.7.2 26 | PyInquirer==1.0.3 27 | pyparsing==2.4.6 28 | pytoml==0.1.21 29 | pyyaml==5.3.1 30 | regex==2020.11.13 31 | requests==2.25.0 32 | retrying==1.3.3 33 | SentimentAnalysis==0.8 34 | six==1.15.0 35 | soupsieve==2.0.1 36 | tqdm==4.53.0 37 | urllib3==1.26.2 38 | vaderSentiment==3.3.2 39 | wcwidth==0.2.5 40 | webencodings==0.5.1 41 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [nosetests] 2 | cover-branches = true 3 | cover-erase = true 4 | cover-inclusive = true 5 | cover-min-percentage = 90 6 | cover-package = webedge 7 | match = ^test 8 | where = tests 9 | 10 | with-doctest = true -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open('requirements.txt') as f: 4 | REQUIREMENTS = f.read().splitlines() 5 | with open('README.md', encoding='utf8') as f: 6 | README = f.read() 7 | 8 | setup( 9 | name='WebEdge', 10 | version='1.0.2', 11 | license='MIT License', 12 | author='MLH Fellowship Team 1', 13 | author_email='erbeusgriffincasper@gmail.com', 14 | description='Bringing Edge to your Web Performance', 15 | long_description=README, 16 | long_description_content_type='text/markdown', 17 | url='https://github.com/HarshCasper/WebEdge', 18 | install_requires=REQUIREMENTS, 19 | packages=find_packages(exclude = ["*.tests", "*.tests.*", "tests.*", "tests"]), 20 | entry_points={ 21 | 'console_scripts': [ 22 | 'webedge = webedge.webedge:main' 23 | ] 24 | } 25 | ) 26 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarshCasper/WebEdge/3175e89a1753c9ef9a5e69766d355319206f84a3/tests/__init__.py -------------------------------------------------------------------------------- /tests/test-requirements.txt: -------------------------------------------------------------------------------- 1 | nose==1.3.7 2 | coverage==5.3 3 | mock==4.0.2 4 | tox==3.20.1 5 | testtools==2.4.0 6 | ddt==1.4.1 7 | jsonschema==3.2.0 
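
The README above documents the `webedge` console script (wired to `webedge.webedge:main` in `setup.py`), and the unit tests below drive the same analysis objects directly from Python. For orientation, here is a minimal sketch of that programmatic use, assuming only the constructors and methods exercised in `tests/unit/test_website_analysis.py` and `tests/unit/test_webpage_analysis.py`; it is not a documented public API, and the supported entry point remains the `webedge` CLI.

```python
# Sketch based on how the unit tests exercise the package; not a documented API.
from webedge import website_analysis, webpage_analysis

# Whole-site crawl, as in test_website_analysis.py:
# Spider(site_url, sitemap_url_or_None), then crawl(), then inspect results.
spider = website_analysis.Spider("https://fastcoder.netlify.app/", None)
spider.crawl()
print(spider.issues)    # warnings collected across the crawl
print(spider.achieved)  # badges earned across the crawl

# Single-page analysis, as in test_webpage_analysis.py:
# Webpage(url, html, titles_seen, descriptions_seen), then report().
sample_html = "<html><head><title>Sample page</title></head><body></body></html>"  # placeholder markup
page = webpage_analysis.Webpage(
    "https://fastcoder.netlify.app/",
    sample_html,
    {},  # titles seen so far (shared across pages to flag duplicates)
    {},  # descriptions seen so far
)
page_report = page.report()
print(page_report)
```
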
-------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarshCasper/WebEdge/3175e89a1753c9ef9a5e69766d355319206f84a3/tests/unit/__init__.py -------------------------------------------------------------------------------- /tests/unit/data_duplicates_negative.json: -------------------------------------------------------------------------------- 1 | { 2 | "TITLE_DUPLICATED": ["The cat in the hat", "TITLE_DUPLICATED"], 3 | "DESCRIPTION_DUPLICATED": ["", "DESCRIPTION_DUPLICATED"] 4 | } 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/unit/data_html_negative.json: -------------------------------------------------------------------------------- 1 | { 2 | "TITLE_MISSING": ["", "TITLE_MISSING"], 3 | "TITLE_MISSING_EMPTY": ["", "TITLE_MISSING"], 4 | "TITLE_TOO_SHORT": ["Short", "TITLE_TOO_SHORT"], 5 | "TITLE_TOO_LONG": ["This title is way too long to be a useful title. You should really try to keep the length to a reasonable size", "TITLE_TOO_LONG"], 6 | "TITLE_TOO_GENERIC": ["This is a Page", "TITLE_TOO_GENERIC"], 7 | "TITLE_TOO_GENERIC_UNTITLED": ["Untitled Page", "TITLE_TOO_GENERIC"], 8 | "TITLE_KEYWORD_STUFFED": ["Web Design, Design, Website Design, Design Websites in Atlanta", "TITLE_KEYWORD_STUFFED"], 9 | "DESCRIPTION_MISSING_EMPTY": ["", "DESCRIPTION_MISSING"], 10 | "DESCRIPTION_MISSING_CONTENT_TAG": ["", "DESCRIPTION_MISSING"], 11 | "DESCRIPTION_MISSING_ABSENT": ["", "DESCRIPTION_MISSING"], 12 | "DESCRIPTION_TOO_SHORT": ["", "DESCRIPTION_TOO_SHORT"], 13 | "DESCRIPTION_TOO_LONG": ["", "DESCRIPTION_TOO_LONG"], 14 | "DESCRIPTION_TOO_GENERIC_PAGE": ["", "DESCRIPTION_TOO_GENERIC"], 15 | "DESCRIPTION_TOO_GENERIC_UNTITLED": ["", "DESCRIPTION_TOO_GENERIC"], 16 | "DESCRIPTION_KEYWORD_STUFFED": ["", "DESCRIPTION_KEYWORD_STUFFED"], 17 | "URL_NOT_CANONICAL": ["", "URL_NOT_CANONICAL"], 18 | "IMAGE_LINK_ALT_MISSING": ["", "IMAGE_LINK_ALT_MISSING"], 19 | "IMAGE_LINK_ALT_MISSING_EMPTY": ["", "IMAGE_LINK_ALT_MISSING"], 20 | "ANCHOR_TEXT_MISSING": ["", "ANCHOR_TEXT_MISSING"], 21 | "ANCHOR_TEXT_TOO_SHORT": ["as", "ANCHOR_TEXT_TOO_SHORT"], 22 | "ANCHOR_TEXT_TOO_LONG": ["Click here if you want to see something really cool. We will do a bunch of magic and you may get spammed.", "ANCHOR_TEXT_TOO_LONG"], 23 | "ANCHOR_TEXT_TOO_GENERIC_PAGE1": ["Page 1", "ANCHOR_TEXT_TOO_GENERIC"], 24 | "ANCHOR_TEXT_TOO_GENERIC_CLICKHERE": ["Click Here!", "ANCHOR_TEXT_TOO_GENERIC"], 25 | "ANCHOR_TEXT_TOO_GENERIC_ARTICLE": ["Article One", "ANCHOR_TEXT_TOO_GENERIC"], 26 | "ANCHOR_HREF_TOO_LONG": ["Lengthy Link", "ANCHOR_HREF_TOO_LONG"], 27 | "ANCHOR_HREF_EQUALS_TEXT": ["404.html", "ANCHOR_HREF_EQUALS_TEXT"], 28 | "BROKEN_LINK_RELATIVE": ["404.html", "BROKEN_LINK"], 29 | "BROKEN_LINK_ABSOLUTE": ["404.html", "BROKEN_LINK"], 30 | "BROKEN_LINK_DUPLICATE": ["404.html404.html", "BROKEN_LINK"], 31 | "ANCHOR_NO_FOLLOW": ["Go External", "ANCHOR_NO_FOLLOW"], 32 | "IMAGE_SRC_MISSING": ["", "IMAGE_SRC_MISSING"], 33 | "IMAGE_ALT_MISSING": ["", "IMAGE_ALT_MISSING"], 34 | "IMAGE_ALT_MISSING_EMPTY": ["", "IMAGE_ALT_MISSING"], 35 | "IMAGE_ALT_TOO_LONG": ["This description is way too long to be a useful description.  You should really try to keep the length to a reasonable size.This description is way too long to be a useful description.  You should really try to keep the length to a reasonable size. 
Its just way too long and try not to do this.", "IMAGE_ALT_TOO_LONG"], 36 | "H1_ONE_PER_PAGE_MISSING": ["

This is a second level heading

", "H1_ONE_PER_PAGE"], 37 | "H1_ONE_PER_PAGE_TWOFOUND": ["

Heading One

Heading Two

", "H1_ONE_PER_PAGE"], 38 | "H1_TOO_SHORT_EMPTY": ["

", "H1_TOO_SHORT"], 39 | "H1_TOO_SHORT": ["

Eg

", "H1_TOO_SHORT"], 40 | "KEYWORDS_META": ["", "KEYWORDS_META"], 41 | "WORDCOUNT_TOO_SHORT": ["

This is a good header

but not enough text

", "WORDCOUNT_TOO_SHORT"] 42 | } 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /tests/unit/data_html_positive.json: -------------------------------------------------------------------------------- 1 | { 2 | "TITLE_LENGTH": ["This is a good length title that is well optimized for SEO", "TITLE_LENGTH"], 3 | "TITLE_INFORMATIVE": ["This is a good length title that is well optimized for SEO", "TITLE_INFORMATIVE"], 4 | "TITLE_UNIQUE": ["This is a good length title that is well optimized for SEO", "TITLE_UNIQUE"], 5 | "DESCRIPTION_LENGTH": ["", "DESCRIPTION_LENGTH"], 6 | "DESCRIPTION_INFORMATIVE": ["", "DESCRIPTION_INFORMATIVE"], 7 | "URL_CANONICAL": ["", "URL_CANONICAL"], 8 | "IMAGE_LINK_ALT": ["Some like to run in the hot hot sun", "IMAGE_LINK_ALT"], 9 | "IMAGE_SRC_TOO_LONG_EXTERNAL": ["", ""], 10 | "ANCHOR_NO_FOLLOW": ["Go External", "ANCHOR_NO_FOLLOW"], 11 | "H1_ONE_PER_PAGE": ["

This is a good header

", "H1_ONE_PER_PAGE"], 12 | "H1_LENGTH": ["

This is a good header

", "H1_LENGTH"], 13 | "MAIL_LINKS": ["Message Me", ""], 14 | "WORDCOUNT": ["

Lorem ipsum dolor sit amet, consectetur adipiscing elit. Aliquam in elit augue. Ut dolor ex, pretium a eros eget, interdum congue eros. Nulla eget vehicula eros. In efficitur sapien vitae vehicula suscipit. Aenean dictum elit eget turpis ornare maximus. Fusce at volutpat dolor. Sed quam ante, volutpat cursus massa nec, vehicula volutpat nulla. Etiam feugiat aliquam lectus et efficitur. Vestibulum porta metus interdum blandit hendrerit. Duis ultrices eu erat sit amet hendrerit. Ut ut diam maximus, luctus eros a, molestie ipsum.

Praesent ultricies orci sit amet lobortis ultrices. Donec consectetur nisl quis dignissim rutrum. Vestibulum vulputate massa dui, id elementum mi sagittis sed. Integer volutpat quis tellus et porta. Cras diam justo, volutpat at sodales at, venenatis et quam. Quisque a magna malesuada eros varius porta vel at nibh. Aliquam ac erat magna. Vestibulum sodales aliquam nulla at gravida. Vivamus ac ullamcorper nibh.

Aenean scelerisque dolor a arcu viverra, quis convallis nulla tincidunt. Curabitur posuere vestibulum mollis. Nunc tincidunt ipsum tellus, ut sollicitudin orci ornare non. Etiam sapien neque, cursus sed purus ut, tincidunt elementum nulla. Sed at gravida magna. Duis euismod blandit placerat. Sed eget efficitur quam. Sed vel scelerisque urna. Praesent nulla sem, eleifend eget luctus eget, vestibulum vel purus. Etiam aliquet feugiat felis, quis convallis sapien commodo sit amet. Duis vel diam eros.

Nunc malesuada et sem ac fringilla. Phasellus pulvinar molestie turpis vel feugiat. Nam consequat congue odio aliquet dictum. Nam rhoncus elementum leo, in varius dolor molestie et. Pellentesque vestibulum ante gravida, malesuada lectus et, scelerisque metus. Curabitur dapibus dolor sem, efficitur bibendum sem dignissim id. Curabitur rhoncus tempor elit, vel ultrices nibh tincidunt vitae. Phasellus imperdiet vel justo non tincidunt. Integer ultrices luctus purus ut eleifend. Duis et sem vel quam dignissim placerat at id erat. Ut a mattis felis.

Proin ut mi sed quam efficitur aliquet. Fusce in velit id diam dignissim facilisis sit amet nec eros. Etiam non est nibh. Mauris ultrices scelerisque mauris, nec dapibus ligula. Nulla eu posuere lectus. Sed accumsan eros eget nulla finibus dignissim. Fusce eleifend congue ipsum, vitae condimentum libero bibendum sit amet. Sed vulputate tincidunt dui, in vehicula odio mattis id. Etiam condimentum venenatis lacus, ut porttitor neque rutrum sit amet. Nulla sit amet urna at nunc gravida convallis eget sed justo. Nulla posuere ultrices eros, molestie placerat enim molestie in. Integer rutrum orci felis, et pulvinar odio fringilla eget. In vehicula sit amet massa et varius. Etiam non ex quis tortor laoreet aliquam.

Pellentesque nec arcu suscipit, facilisis leo quis, viverra velit. Fusce non ultricies nisi, eu auctor felis. Integer vehicula dui a quam tempus, ut venenatis tortor tempus. Donec turpis erat, consequat varius lorem in, fermentum scelerisque enim. Sed laoreet a sapien vel pellentesque. Aliquam nulla ligula, malesuada sit amet enim non, tempus dictum mauris. Fusce congue rutrum ex ut posuere. Interdum et malesuada fames ac ante ipsum primis in faucibus. Proin a nibh porta, consectetur tellus at, lacinia tellus. In consequat massa enim, non condimentum purus tincidunt nec. Proin sapien ipsum, consectetur laoreet mauris in, sagittis ornare turpis.

Morbi interdum rutrum mi ac vehicula. Aenean eget nulla a turpis suscipit accumsan. Nam vehicula eu diam mattis pharetra. In rutrum mi ac ipsum gravida, egestas consequat dui viverra. Cras eleifend sapien sed odio suscipit euismod. Praesent suscipit nec justo convallis gravida. Curabitur ullamcorper, velit cursus placerat facilisis, enim leo interdum tortor, sit amet blandit neque felis sed lectus. Phasellus non ex sed libero finibus rutrum eget molestie turpis. Quisque vehicula vehicula hendrerit.

Aenean ultricies porttitor lobortis. Sed vitae enim ipsum. Praesent feugiat vel dolor eget lacinia. In in porttitor arcu. Quisque consequat augue ut dolor semper vulputate. Donec auctor vulputate lectus quis maximus. Mauris ultricies molestie porta. Nulla facilisi. Suspendisse dignissim diam sollicitudin, imperdiet odio sed, rhoncus arcu.

Fusce sapien sem, blandit ac tellus pulvinar, volutpat accumsan metus. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Quisque vehicula, tortor at lobortis interdum, turpis erat sollicitudin orci, vel egestas dolor ligula at felis. Proin ac vestibulum nisl. Vivamus fermentum tellus volutpat sem tristique, a venenatis dolor volutpat. Nunc a mauris tincidunt, consequat dolor in, bibendum nisi. Quisque venenatis magna massa, et dictum ligula accumsan sed. Sed eget nulla vitae justo ultrices rutrum. Cras sagittis congue congue.

Fusce interdum, metus a volutpat molestie, lacus metus aliquet odio, eget suscipit nibh eros efficitur mi. Nullam ut tristique libero. Duis commodo, nulla at dapibus vulputate, ipsum turpis faucibus ipsum, eu egestas elit dolor sed leo. Nullam porttitor gravida dolor vel tincidunt. Mauris non ante elementum, luctus nisl quis, fermentum dui. Phasellus in fringilla odio. Proin sed rutrum nunc. Curabitur sed suscipit libero, fermentum consequat ligula. Vestibulum consectetur lacus sed mi interdum ullamcorper. Sed non ultrices arcu, sed tincidunt sem. Sed et ante ac eros vulputate luctus.

In vitae ipsum quis justo elementum vestibulum. Aenean finibus, magna vel dapibus molestie, nibh lectus faucibus tellus, ut imperdiet purus nisl ut leo. Ut laoreet nisi vel vehicula pretium. Mauris mi lectus, maximus sed libero id, sollicitudin tristique massa. Aliquam sollicitudin felis eget quam tempus, vel blandit mi facilisis. Nunc facilisis ullamcorper nisi non facilisis. Etiam et porttitor ligula, sed sagittis tellus. Mauris facilisis diam libero, ut fermentum nunc lobortis scelerisque. Etiam a commodo est, aliquet commodo justo.

Integer facilisis hendrerit massa a posuere. Vestibulum accumsan orci nec lorem auctor aliquam. Aliquam erat volutpat. Mauris ligula purus, lacinia nec elit vitae, fermentum porttitor nisl. Nunc finibus non sapien at sagittis. Morbi elementum elit nec justo varius, eu tristique risus egestas. Fusce ipsum purus, eleifend malesuada leo a, auctor vulputate arcu. Pellentesque in dui vitae massa dignissim hendrerit nec quis elit. Quisque dapibus, ex eu molestie vestibulum, dolor orci congue dui, non porta nisi purus in leo. Etiam ac lacus sit amet sem varius mollis. Donec id tempor mauris. Cras vel finibus lorem. Fusce tempor consectetur metus eget pharetra. Pellentesque aliquam bibendum cursus. Pellentesque feugiat tempus sollicitudin.

Donec vitae imperdiet lacus. Duis ac vulputate dolor. Praesent nec mauris luctus, eleifend justo quis, suscipit ipsum. Aenean dui nulla, dapibus nec consectetur eu, semper sit amet libero. Nunc vitae lacus tortor. Nam a commodo ipsum, ut accumsan lorem. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nullam eget sem est. Vivamus non vestibulum mauris. Nullam et ultricies est.

Nullam convallis risus eget ligula maximus aliquam. Pellentesque varius, enim sed volutpat congue, leo augue porttitor arcu, ac hendrerit ex leo ac neque. Curabitur sodales consequat volutpat. Cras ultrices magna non sollicitudin suscipit. Cras dignissim consectetur lacus eu imperdiet. Maecenas est ex, pulvinar quis aliquet a, consectetur vitae ex. Duis sem magna, facilisis vel lacinia auctor, pharetra vel arcu. Sed sit amet velit id ante sagittis blandit. Suspendisse venenatis vehicula massa, et auctor tortor laoreet mattis. Etiam tincidunt arcu vel erat dapibus feugiat. Cras augue metus, suscipit et felis nec, luctus scelerisque augue. Aliquam eu luctus erat. Maecenas. Aliquam eu luctus erat. Maecenas. Suspendisse venenatis vehicula massa, et auctor tortor laoreet mattis. Etiam tincidunt arcu vel erat dapibus feugiat. Cras augue metus, suscipit et felis nec, luctus scelerisque augue. Aliquam eu luctus erat. Maecenas. Aliquam eu luctus erat. Maecenas In rutrum mi ac ipsum gravida, egestas consequat dui viverra. Cras eleifend sapien sed odio suscipit euismod. Praesent suscipit nec justo convallis gravida. Curabitur ullamcorper, velit cursus placerat facilisis, enim leo interdum tortor, sit amet blandit neque felis sed lectus. Phasellus non ex sed libero finibus rutrum eget molestie turpis. Quisque vehicula vehicula hendrerit.

Aenean ultricies porttitor lobortis. Sed vitae enim ipsum. Praesent feugiat vel dolor eget lacinia. In in porttitor arcu. Quisque consequat augue ut dolor semper vulputate. Donec auctor vulputate lectus quis maximus. Mauris ultricies molestie porta. Nulla facilisi. Suspendisse dignissim diam sollicitudin, imperdiet odio sed, rhoncus arcu.

Fusce sapien sem, blandit ac tellus pulvinar, volutpat accumsan metus. Vestibulum ante ipsum primis in faucibus orci luctus et ultrices posuere cubilia Curae; Quisque vehicula, tortor at lobortis interdum, turpis erat sollicitudin orci, vel egestas dolor ligula at felis. Proin ac vestibulum nisl. Vivamus fermentum tellus volutpat sem tristique, a venenatis dolor volutpat. Nunc a mauris tincidunt, consequat dolor in, bibendum nisi. Quisque venenatis magna massa, et dictum ligula accumsan sed. Sed eget nulla vitae justo ultrices rutrum. Cras sagittis congue congue.

Fusce interdum, metus a volutpat molestie, lacus metus aliquet odio, eget suscipit nibh eros efficitur mi. Nullam ut tristique libero. Duis commodo, nulla at dapibus vulputate, ipsum turpis faucibus ipsum, eu egestas elit dolor sed leo. Nullam porttitor gravida dolor vel tincidunt. Mauris non ante elementum, luctus nisl quis, fermentum dui. Phasellus in fringilla odio. Proin sed rutrum nunc. Curabitur sed suscipit libero, fermentum consequat ligula. Vestibulum consectetur lacus sed mi interdum ullamcorper. Sed non ultrices arcu, sed tincidunt sem. Sed et ante ac eros vulputate luctus.

In vitae ipsum quis justo elementum vestibulum. Aenean finibus, magna vel dapibus molestie, nibh lectus faucibus tellus, ut imperdiet purus nisl ut leo. Ut laoreet nisi vel vehicula pretium. Mauris mi lectus, maximus sed libero id, sollicitudin tristique massa. Aliquam sollicitudin felis eget quam tempus, vel blandit mi facilisis. Nunc facilisis ullamcorper nisi non facilisis. Etiam et porttitor ligula, sed sagittis tellus. Mauris facilisis diam libero, ut fermentum nunc lobortis scelerisque. Etiam a commodo est, aliquet commodo justo.

Integer facilisis hendrerit massa a posuere. Vestibulum accumsan orci nec lorem auctor aliquam. Aliquam erat volutpat. Mauris ligula purus, lacinia nec elit vitae, fermentum porttitor nisl. Nunc finibus non sapien at sagittis. Morbi elementum elit nec justo varius, eu tristique risus egestas. Fusce ipsum purus, eleifend malesuada leo a, auctor vulputate arcu. Pellentesque in dui vitae massa dignissim hendrerit nec quis elit. Quisque dapibus, ex eu molestie vestibulum, dolor orci congue dui, non porta nisi purus in leo. Etiam ac lacus sit amet sem varius mollis. Donec id tempor mauris. Cras vel finibus lorem. Fusce tempor consectetur metus eget pharetra. Pellentesque aliquam bibendum cursus. Pellentesque feugiat tempus sollicitudin.

Donec vitae imperdiet lacus. Duis ac vulputate dolor. Praesent nec mauris luctus, eleifend justo quis, suscipit ipsum. Aenean dui nulla, dapibus nec consectetur eu, semper sit amet libero. Nunc vitae lacus tortor. Nam a commodo ipsum, ut accumsan lorem. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Nullam eget sem est. Vivamus non vestibulum mauris. Nullam et ultricies est.

Nullam convallis risus eget ligula maximus aliquam. Pellentesque varius, enim sed volutpat congue, leo augue porttitor arcu, ac hendrerit ex leo ac neque. Curabitur sodales consequat volutpat. Cras ultrices magna non sollicitudin suscipit. Cras dignissim consectetur lacus eu imperdiet. Maecenas est ex, pulvinar quis aliquet a, consectetur vitae ex. Duis sem magna, facilisis vel lacinia auctor, pharetra vel arcu. Sed sit amet velit id ante sagittis blandit. Suspendisse venenatis vehicula massa, et auctor tortor laoreet mattis. Etiam tincidunt arcu vel erat dapibus feugiat. Cras augue metus, suscipit et felis nec, luctus scelerisque augue. Aliquam eu luctus erat. Maecenas. Aliquam eu luctus erat. Maecenas. Suspendisse venenatis vehicula massa, et auctor tortor laoreet mattis. Etiam tincidunt arcu vel erat dapibus feugiat. Cras augue metus, suscipit et felis nec, luctus scelerisque augue. Aliquam eu luctus erat. Maecenas. Aliquam eu luctus erat. Maecenas

", "WORDCOUNT"], 15 | "ANCHOR_INTERNAL_HREF": ["Portfolio", ""], 16 | "ANCHOR_INTERNAL_HREF_SLASH": ["About Me", ""], 17 | "ANCHOR_INTERNAL_HREF_NOSLASH": ["About Me", ""], 18 | "ANCHOR_SOCIAL_HREF": ["Follow me on Twitter", ""] 19 | } 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /tests/unit/data_sitemap_negative.json: -------------------------------------------------------------------------------- 1 | { 2 | "invalid_sitemap": "" 3 | } 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /tests/unit/data_sitemap_positive.json: -------------------------------------------------------------------------------- 1 | { 2 | "positive": "https://harshcasper.github.iohttps://harshcasper.github.io/portfolio.html", 3 | "nolocations": "" 4 | } -------------------------------------------------------------------------------- /tests/unit/data_url_negative.json: -------------------------------------------------------------------------------- 1 | { 2 | "URL_TOO_LONG": ["http://www.amazon.com/gp/product/B0007TJ5OG/102-8372974-4064145?v=glance&n=502394&m=ATVPDKIKX0DER&n=3031001&s=photo&v=glance'", "URL_TOO_LONG"], 3 | "URL_TOO_GENERIC": ["http://www.domain.com/page1.html", "URL_TOO_GENERIC"], 4 | "URL_KEYWORD_STUFFED": ["http://www.domain.com/baseball-cards-baseball-cards-baseballcards.htm", "URL_KEYWORD_STUFFED"], 5 | "URL_TOO_DEEP": ["http://www.domain.com/redfish/bluefish/blackfish/bluefish/oldfish/newfish/little_star.html", "URL_TOO_DEEP"], 6 | "URL_CAPITALIZED": ["http://www.domain.com/SomeWhoRun.html", "URL_CAPITALIZED"] 7 | } 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/unit/data_url_positive.json: -------------------------------------------------------------------------------- 1 | { 2 | "URL_CORRECTLY_CASED": ["http://www.domain.com/page1.html", "URL_CORRECTLY_CASED"] 3 | } 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /tests/unit/data_visible_tags.json: -------------------------------------------------------------------------------- 1 | { 2 | "style": ["", false], 3 | "script": ["", false], 4 | "document": ["[document]", false], 5 | "head": ["", false], 6 | "title": ["Title", false], 7 | "meta": ["", false], 8 | "comment": ["
", false], 9 | "p": ["

paragraph

", true], 10 | "a": ["link", true], 11 | "b": ["bold", true], 12 | "strong": ["strong", true], 13 | "em": ["em", true], 14 | "i": ["italic", true], 15 | "h1": ["

header 1

", true], 16 | "h2": ["

header 2

", true] 17 | } 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /tests/unit/data_webpage.json: -------------------------------------------------------------------------------- 1 | { 2 | "good_website": "200|This is a title that is good.", 3 | "not_found_website": "404|This is a title that is not found.", 4 | "crashed_website": "500|This is a title for a broken site." 5 | } 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /tests/unit/test_stop_words.py: -------------------------------------------------------------------------------- 1 | import testtools 2 | from webedge import stop_words 3 | 4 | 5 | class StopWordsTests(testtools.TestCase): 6 | 7 | def setUp(self): 8 | super(StopWordsTests, self).setUp() 9 | pass 10 | 11 | def test_stopwords(self): 12 | words = stop_words.ENGLISH_STOP_WORDS 13 | self.assertTrue("able" in words) 14 | self.assertTrue("about" in words) 15 | self.assertTrue("looks" in words) 16 | self.assertTrue("zero" in words) 17 | self.assertEqual(len(words), 635) 18 | -------------------------------------------------------------------------------- /tests/unit/test_webpage_analysis.py: -------------------------------------------------------------------------------- 1 | import bs4 2 | import ddt 3 | import testtools 4 | from webedge import webpage_analysis 5 | from webedge.warnings import BADGES 6 | from webedge.warnings import WARNINGS 7 | 8 | 9 | @ddt.ddt 10 | class WebpageTests(testtools.TestCase): 11 | 12 | def setUp(self): 13 | super(WebpageTests, self).setUp() 14 | self.titles = {} 15 | self.descriptions = {} 16 | 17 | def soup_file(self, html): 18 | soup = bs4.BeautifulSoup(html, "html.parser") 19 | return soup 20 | 21 | @ddt.file_data('data_html_positive.json') 22 | def test_analyze_positive(self, data): 23 | html = data[0] 24 | # badge = data[1] 25 | self.wp = webpage_analysis.Webpage( 26 | "https://harshcasper.github.io", 27 | html, 28 | self.titles, 29 | self.descriptions) 30 | self.wp.report() 31 | 32 | @ddt.file_data('data_html_negative.json') 33 | def test_analyze_negative(self, data): 34 | html = data[0] 35 | expected_error = data[1] 36 | self.wp = webpage_analysis.Webpage( 37 | "https://harshcasper.github.io", 38 | html, 39 | self.titles, 40 | self.descriptions) 41 | self.wp.report() 42 | self.assertTrue(any(issue["warning"] == WARNINGS[expected_error] 43 | for issue in self.wp.issues), 44 | "{0} not raised.".format(WARNINGS[expected_error])) 45 | 46 | @ddt.file_data('data_url_negative.json') 47 | def test_analyze_negative_url(self, data): 48 | url = data[0] 49 | expected_error = data[1] 50 | html = "" 51 | self.wp = webpage_analysis.Webpage( 52 | url, html, self.titles, self.descriptions) 53 | self.wp.report() 54 | self.assertTrue(any(issue["warning"] == WARNINGS[expected_error] 55 | for issue in self.wp.issues), 56 | "{0} not raised.".format(WARNINGS[expected_error])) 57 | 58 | @ddt.file_data('data_url_positive.json') 59 | def test_analyze_positive_url(self, data): 60 | url = data[0] 61 | badge = data[1] 62 | html = "" 63 | self.wp = webpage_analysis.Webpage( 64 | url, html, self.titles, self.descriptions) 65 | self.wp.report() 66 | if badge != "": 67 | self.assertTrue(any(earned["achievement"] == BADGES[badge] 68 | for earned in self.wp.achieved), 69 | "{0} not earned".format(BADGES[badge])) 70 | 71 | @ddt.file_data('data_visible_tags.json') 72 | def test_visible_tags(self, data): 73 | html = "" 74 | self.wp = webpage_analysis.Webpage( 75 | 
"https://harshcasper.github.io", 76 | html, 77 | self.titles, 78 | self.descriptions) 79 | soup = self.soup_file(data[0]) 80 | elements = soup.findAll(text=True) 81 | for tag in elements: 82 | result = self.wp.visible_tags(tag) 83 | self.assertEqual(result, data[1]) 84 | 85 | @ddt.file_data('data_duplicates_negative.json') 86 | def test_analyze_duplicates_negative(self, page): 87 | html = page[0] 88 | expected_error = page[1] 89 | report = {"pages": []} 90 | for i in range(0, 2): 91 | self.wp = webpage_analysis.Webpage( 92 | "https://harshcasper.github.io/page{0}.html".format(i), 93 | html, 94 | self.titles, 95 | self.descriptions) 96 | 97 | page_report = self.wp.report() 98 | report['pages'].append(page_report) 99 | self.assertTrue(any(issue["warning"] == WARNINGS[expected_error] 100 | for p in report['pages'] for issue in p['issues']), 101 | "{0} not raised. {1} {2}".format( 102 | WARNINGS[expected_error], 103 | self.titles, 104 | self.descriptions)) 105 | -------------------------------------------------------------------------------- /tests/unit/test_website_analysis.py: -------------------------------------------------------------------------------- 1 | import uuid 2 | import mock 3 | import requests 4 | from bs4 import BeautifulSoup as Soup 5 | from webedge import website_analysis 6 | from webedge.warnings import BADGES 7 | from webedge.warnings import WARNINGS 8 | import ddt 9 | import testtools 10 | 11 | 12 | @ddt.ddt 13 | class WebsiteTests(testtools.TestCase): 14 | 15 | def setUp(self): 16 | super(WebsiteTests, self).setUp() 17 | self.site_url = "http://www.mock{0}.com".format(uuid.uuid4()) 18 | 19 | def test_init_url(self): 20 | web_page = website_analysis.Spider(self.site_url, None) 21 | self.assertEqual(len(web_page.pages_to_crawl), 1) 22 | self.assertEqual(web_page.pages_to_crawl[0], self.site_url) 23 | 24 | @ddt.file_data("data_sitemap_positive.json") 25 | @mock.patch('webedge.website_analysis.requests.get') 26 | def test_init_sitemap_positive(self, sitemap_content, mock_requests): 27 | sitemap_url = "/sitemap.xml" 28 | mock_requests.return_value.status_code = requests.codes.ok 29 | mock_requests.return_value.content = sitemap_content 30 | web_page = website_analysis.Spider(self.site_url, self.site_url + sitemap_url) 31 | self.assertTrue(self.site_url in web_page.pages_to_crawl) 32 | 33 | @ddt.file_data("data_sitemap_negative.json") 34 | @mock.patch('webedge.website_analysis.requests.get') 35 | def test_init_sitemap_negative(self, sitemap_content, mock_requests): 36 | sitemap_url = "/sitemap.xml" 37 | mock_requests.return_value.status_code = requests.codes.not_found 38 | mock_requests.return_value.content = sitemap_content 39 | web_page = website_analysis.Spider(self.site_url, self.site_url + sitemap_url) 40 | self.assertTrue(self.site_url in web_page.pages_to_crawl) 41 | 42 | @ddt.file_data("data_sitemap_positive.json") 43 | def test_parse_sitemap(self, sitemap_content): 44 | web_page = website_analysis.Spider(self.site_url, None) 45 | locations = web_page._parse_sitemap(sitemap_content) 46 | soup = Soup(sitemap_content, "html.parser") 47 | urls = soup.findAll('url') 48 | self.assertEqual(len(locations), len(urls)) 49 | 50 | @ddt.file_data("data_webpage.json") 51 | @mock.patch('webedge.website_analysis.requests.get') 52 | def test_crawl(self, data, mock_requests): 53 | web_page = website_analysis.Spider(self.site_url, None) 54 | web_page._analyze_crawlers = mock.MagicMock(name="_analyze_crawlers") 55 | resp_code, content = data.split("|") 56 | 
mock_requests.return_value.status_code = int(resp_code) 57 | mock_requests.return_value.content = content 58 | web_page.crawl() 59 | if int(resp_code) == requests.codes.ok: 60 | self.assertEqual(len(web_page.issues), 0) 61 | elif int(resp_code) == requests.codes.not_found: 62 | self.assertTrue(any(issue["warning"] == WARNINGS["BROKEN_LINK"] 63 | for issue in web_page.issues), 64 | "{0} not raised.".format(WARNINGS["BROKEN_LINK"])) 65 | else: 66 | self.assertTrue(any(issue["warning"] == WARNINGS["SERVER_ERROR"] 67 | for issue in web_page.issues), 68 | "{0} not raised.".format(WARNINGS["SERVER_ERROR"])) 69 | 70 | @ddt.data("200", "404", "500") 71 | @mock.patch('webedge.website_analysis.requests.get') 72 | def test_analyze_crawlers(self, resp_code, mock_requests): 73 | mock_requests.return_value.status_code = int(resp_code) 74 | web_page = website_analysis.Spider(self.site_url, None) 75 | web_page._analyze_crawlers() 76 | if int(resp_code) == requests.codes.ok: 77 | self.assertTrue(any(earned["achievement"] == BADGES["ROBOTS.TXT"] 78 | for earned in web_page.achieved), 79 | "{0} not earned".format(BADGES["ROBOTS.TXT"])) 80 | else: 81 | self.assertTrue(any(issue["warning"] == WARNINGS["ROBOTS.TXT"] 82 | for issue in web_page.issues), 83 | "{0} not raised.".format(WARNINGS["ROBOTS.TXT"])) 84 | 85 | @ddt.data("200", "404", "500") 86 | @mock.patch('webedge.website_analysis.requests.get') 87 | def test_analyze_blog(self, resp_code, mock_requests): 88 | mock_requests.return_value.status_code = int(resp_code) 89 | web_page = website_analysis.Spider(self.site_url, None) 90 | web_page._analyze_blog() 91 | if int(resp_code) == requests.codes.ok: 92 | self.assertTrue( 93 | any(earned["achievement"] == BADGES["BLOG_DETECTED"] 94 | for earned in web_page.achieved), 95 | "{0} not earned".format(BADGES["BLOG_DETECTED"])) 96 | else: 97 | self.assertTrue( 98 | any(issue["warning"] == WARNINGS["BLOG_MISSING"] 99 | for issue in web_page.issues), 100 | "{0} not raised.".format(WARNINGS["BLOG_MISSING"])) 101 | -------------------------------------------------------------------------------- /webedge/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/HarshCasper/WebEdge/3175e89a1753c9ef9a5e69766d355319206f84a3/webedge/__init__.py -------------------------------------------------------------------------------- /webedge/cli_output.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function, unicode_literals 2 | from PyInquirer import style_from_dict, Token, prompt 3 | from pyfiglet import Figlet 4 | from colorama import Fore, Style 5 | import json 6 | import clanimate 7 | import yaml 8 | 9 | jsonData = ["emptyData"] 10 | loadingAnim = clanimate.Animator( 11 | "scroll_text", 12 | 10, 13 | name=" => WebEdge Is Scrapping Your Website ", 14 | animation_frames="===============", 15 | ) 16 | 17 | style = style_from_dict( 18 | { 19 | Token.Separator: "#cc5454", 20 | Token.QuestionMark: "#00ff00 bold", 21 | Token.Selected: "#cc5454", # default 22 | Token.Pointer: "#673ab7 bold", 23 | Token.Instruction: "", # default 24 | Token.Answer: "#0000ff bold", 25 | Token.Question: "", 26 | } 27 | ) 28 | 29 | 30 | def shouldc2(answers): 31 | check1 = answers["c1"] 32 | if check1 != "site": 33 | return True 34 | return False 35 | 36 | 37 | def getm2(answers): 38 | m2 = "Which " + str(answers["c1"] + " you want to review?") 39 | return m2 40 | 41 | 42 | def getc2(answers): 43 | options = [] 44 | 
if answers["c1"] != "pages": 45 | options.append("no pages") 46 | return options 47 | for i in jsonData[answers["c1"]]: 48 | options.append(i["url"]) 49 | if not options: 50 | options.append("no pages") 51 | return options 52 | 53 | 54 | def filterc2(val): 55 | m = 0 56 | for i in jsonData["pages"]: 57 | if i["url"] == val: 58 | return m 59 | m = m + 1 60 | 61 | 62 | def filterc3(val): 63 | if val[0] == "I": 64 | return "issues" 65 | return "achieved" 66 | 67 | 68 | def outputJson(jsonValue): 69 | global jsonData # skipcq PYL-W0603 70 | jsonData = json.loads(jsonValue) 71 | options = [] 72 | for i in jsonData.keys(): 73 | options.append(i) 74 | print() 75 | questions = [ 76 | { 77 | "type": "list", 78 | "name": "c1", 79 | "message": "What do you want to check first ?", 80 | "choices": options, 81 | }, 82 | { 83 | "type": "list", 84 | "name": "c2", 85 | "message": "Which page do you want to review ?", 86 | "choices": getc2, 87 | "filter": filterc2, 88 | "when": shouldc2, 89 | }, 90 | { 91 | "type": "list", 92 | "name": "c3", 93 | "message": "Issues or Achievements?", 94 | "choices": ["Issues", "Achievements"], 95 | "filter": filterc3, 96 | }, 97 | { 98 | "type": "list", 99 | "name": "c4", 100 | "message": "See them all at once or one by one", 101 | "choices": ["All at Once", "One by One"], 102 | }, 103 | ] 104 | answers = prompt(questions, style=style) 105 | k1 = "warning" 106 | if answers["c3"] == "achieved": 107 | k1 = "achievement" 108 | 109 | if answers["c1"] == "pages": 110 | li = jsonData[answers["c1"]][answers["c2"]][answers["c3"]] 111 | else: 112 | li = jsonData[answers["c1"]][answers["c3"]] 113 | 114 | no = 0 115 | didBreak = False 116 | allAtOnce = False 117 | if answers["c4"] == "All at Once": 118 | allAtOnce = True 119 | for i in li: 120 | no = no + 1 121 | ivalue = str(i["value"]) 122 | message = ( 123 | "Point - " + str(no) + "\n Label : " + i[k1] + "\n Current : " + ivalue 124 | ) 125 | if allAtOnce is False: 126 | qn = [ 127 | { 128 | "type": "confirm", 129 | "name": "forward", 130 | "message": message + "\n Go to next?", 131 | "default": True, 132 | } 133 | ] 134 | a = prompt(qn, style=style) 135 | if a["forward"] is False: 136 | didBreak = True 137 | break 138 | else: 139 | if no % 2 == 1: 140 | print(Fore.BLUE + Style.BRIGHT + message + "\n" + Style.RESET_ALL) 141 | else: 142 | print(Fore.CYAN + Style.BRIGHT + message + "\n" + Style.RESET_ALL) 143 | 144 | if didBreak is False and allAtOnce is False: 145 | print("List Ended") 146 | 147 | retry = [ 148 | { 149 | "type": "confirm", 150 | "name": "again", 151 | "message": "Do you want to check other things?", 152 | "default": True, 153 | } 154 | ] 155 | res = prompt(retry, style=style) 156 | if res["again"] is True: 157 | outputJson(jsonValue) 158 | else: 159 | saveFile = [ 160 | { 161 | "type": "confirm", 162 | "name": "fileSave", 163 | "message": "Do you want to save your analysis in a file?", 164 | "default": True, 165 | } 166 | ] 167 | isFileSaved = prompt(saveFile, style=style) 168 | if isFileSaved["fileSave"] is True: 169 | filename = str(jsonData["pages"][0]["url"] + "_webedge_analysis.yaml") 170 | bad_chars = ["/", ":", "\\"] 171 | for i in bad_chars: 172 | filename = filename.replace(i, "") 173 | with open(filename, "w+") as f: 174 | f.write(yaml.dump(yaml.safe_load(json.dumps(json.loads(jsonValue))))) 175 | print(filename + " saved") 176 | 177 | print( 178 | Fore.GREEN 179 | + Style.BRIGHT 180 | + "=====================\nWebEdge Analysis Done\n=====================" 181 | + Style.RESET_ALL 182 | ) 183 | 184 | 185 | 
def outputName(name): 186 | f = Figlet(font="slant") 187 | print(Style.RESET_ALL) 188 | print(Fore.GREEN + Style.BRIGHT + f.renderText(name)) 189 | print(Style.RESET_ALL) 190 | 191 | 192 | def startLoading(): 193 | print(Fore.GREEN + Style.BRIGHT, end="") 194 | loadingAnim.start_animation() 195 | 196 | 197 | def endLoading(): 198 | loadingAnim.end_animation() 199 | print(Style.RESET_ALL, end="") 200 | 201 | 202 | def outputError(): 203 | catsay("WebEdge Couldn't Parse Your Website") 204 | 205 | 206 | def printError(errMessage): 207 | print( 208 | Style.RESET_ALL 209 | + Fore.RED 210 | + Style.BRIGHT 211 | + "\nERROR => " 212 | + errMessage 213 | + Style.RESET_ALL 214 | ) 215 | 216 | 217 | def exitError(): 218 | catsay("Unexpected Exit By User") 219 | 220 | 221 | def catsay(message): 222 | space = len(message) + 4 223 | upBlock = " " + "_" * space + "\n " + "/" + " " * space + "\\ \n |< " 224 | downBlock = " >|\n \\" + "_" * space + "/\n " 225 | catStr = ( 226 | " \\ \n " 227 | + " \\ /\\_/\\ ___\n " 228 | + " \\ = o_o =_______ \\ \\ \n " 229 | + " __^ __( \\.__) )\n " 230 | + " (@)<_____>__(_____)____/\n" 231 | ) 232 | print( 233 | Style.RESET_ALL 234 | + Fore.YELLOW 235 | + Style.BRIGHT 236 | + upBlock 237 | + message 238 | + downBlock 239 | + catStr 240 | + Style.RESET_ALL 241 | ) 242 | -------------------------------------------------------------------------------- /webedge/social_websites.py: -------------------------------------------------------------------------------- 1 | SOCIAL_WEBSITES = [ 2 | "www.facebook.com", 3 | "twitter.com", 4 | "plus.google.com", 5 | "www.instagram.com", 6 | "www.pinterest.com", 7 | "apple.com", 8 | "youtube.com", 9 | "www.google.com", 10 | "play.google.com", 11 | "microsoft.com", 12 | "www.blogger.com", 13 | "en.wikipedia.org", 14 | "wordpress.org", 15 | "maps.google.com", 16 | "docs.google.com", 17 | "linkedin.com", 18 | "mozilla.org", 19 | "youtu.be", 20 | "amazon.com", 21 | "github.com", 22 | "medium.com", 23 | "www.yahoo.com", 24 | "t.me", 25 | "paypal.com", 26 | "slideshare.net", 27 | "whatsapp.com", 28 | "telegram.me", 29 | "bit.ly", 30 | "quora.com", 31 | "discord.gg", 32 | "calendar.google.com", 33 | "outlook.com", 34 | "canva.com", 35 | "ieee.org", 36 | ] 37 | -------------------------------------------------------------------------------- /webedge/stop_words.py: -------------------------------------------------------------------------------- 1 | ENGLISH_STOP_WORDS = [ 2 | "able", 3 | "about", 4 | "above", 5 | "abroad", 6 | "according", 7 | "accordingly", 8 | "across", 9 | "actually", 10 | "adj", 11 | "after", 12 | "afterwards", 13 | "again", 14 | "against", 15 | "ago", 16 | "ahead", 17 | "ain't", 18 | "all", 19 | "allow", 20 | "allows", 21 | "almost", 22 | "alone", 23 | "along", 24 | "alongside", 25 | "already", 26 | "also", 27 | "although", 28 | "always", 29 | "am", 30 | "amid", 31 | "amidst", 32 | "among", 33 | "amongst", 34 | "an", 35 | "and", 36 | "another", 37 | "any", 38 | "anybody", 39 | "anyhow", 40 | "anyone", 41 | "anything", 42 | "anyway", 43 | "anyways", 44 | "anywhere", 45 | "apart", 46 | "appear", 47 | "appreciate", 48 | "appropriate", 49 | "are", 50 | "aren't", 51 | "around", 52 | "as", 53 | "a's", 54 | "aside", 55 | "ask", 56 | "asking", 57 | "associated", 58 | "at", 59 | "available", 60 | "away", 61 | "awfully", 62 | "back", 63 | "backward", 64 | "backwards", 65 | "be", 66 | "became", 67 | "because", 68 | "become", 69 | "becomes", 70 | "becoming", 71 | "been", 72 | "before", 73 | "beforehand", 74 | "begin", 75 | "behind", 76 | 
"being", 77 | "believe", 78 | "below", 79 | "beside", 80 | "besides", 81 | "best", 82 | "better", 83 | "between", 84 | "beyond", 85 | "both", 86 | "brief", 87 | "but", 88 | "by", 89 | "came", 90 | "can", 91 | "cannot", 92 | "cant", 93 | "can't", 94 | "caption", 95 | "cause", 96 | "causes", 97 | "certain", 98 | "certainly", 99 | "changes", 100 | "clearly", 101 | "c'mon", 102 | "co", 103 | "co.", 104 | "com", 105 | "come", 106 | "comes", 107 | "concerning", 108 | "consequently", 109 | "consider", 110 | "considering", 111 | "contain", 112 | "containing", 113 | "contains", 114 | "corresponding", 115 | "could", 116 | "couldn't", 117 | "course", 118 | "c's", 119 | "currently", 120 | "dare", 121 | "daren't", 122 | "definitely", 123 | "described", 124 | "despite", 125 | "did", 126 | "didn't", 127 | "different", 128 | "directly", 129 | "do", 130 | "does", 131 | "doesn't", 132 | "doing", 133 | "done", 134 | "don't", 135 | "down", 136 | "downwards", 137 | "during", 138 | "each", 139 | "edu", 140 | "eg", 141 | "eight", 142 | "eighty", 143 | "either", 144 | "else", 145 | "elsewhere", 146 | "end", 147 | "ending", 148 | "enough", 149 | "entirely", 150 | "especially", 151 | "et", 152 | "etc", 153 | "even", 154 | "ever", 155 | "evermore", 156 | "every", 157 | "everybody", 158 | "everyone", 159 | "everything", 160 | "everywhere", 161 | "ex", 162 | "exactly", 163 | "example", 164 | "except", 165 | "fairly", 166 | "far", 167 | "farther", 168 | "few", 169 | "fewer", 170 | "fifth", 171 | "first", 172 | "five", 173 | "followed", 174 | "following", 175 | "follows", 176 | "for", 177 | "forever", 178 | "former", 179 | "formerly", 180 | "forth", 181 | "forward", 182 | "found", 183 | "four", 184 | "from", 185 | "further", 186 | "furthermore", 187 | "get", 188 | "gets", 189 | "getting", 190 | "given", 191 | "gives", 192 | "go", 193 | "goes", 194 | "going", 195 | "gone", 196 | "got", 197 | "gotten", 198 | "greetings", 199 | "had", 200 | "hadn't", 201 | "half", 202 | "happens", 203 | "hardly", 204 | "has", 205 | "hasn't", 206 | "have", 207 | "haven't", 208 | "having", 209 | "he", 210 | "he'd", 211 | "he'll", 212 | "hello", 213 | "help", 214 | "hence", 215 | "her", 216 | "here", 217 | "hereafter", 218 | "hereby", 219 | "herein", 220 | "here's", 221 | "hereupon", 222 | "hers", 223 | "herself", 224 | "he's", 225 | "hi", 226 | "him", 227 | "himself", 228 | "his", 229 | "hither", 230 | "hopefully", 231 | "how", 232 | "howbeit", 233 | "however", 234 | "hundred", 235 | "i'd", 236 | "ie", 237 | "if", 238 | "ignored", 239 | "i'll", 240 | "i'm", 241 | "immediate", 242 | "in", 243 | "inasmuch", 244 | "inc", 245 | "inc.", 246 | "indeed", 247 | "indicate", 248 | "indicated", 249 | "indicates", 250 | "inner", 251 | "inside", 252 | "insofar", 253 | "instead", 254 | "into", 255 | "inward", 256 | "is", 257 | "isn't", 258 | "it", 259 | "it'd", 260 | "it'll", 261 | "its", 262 | "it's", 263 | "itself", 264 | "i've", 265 | "just", 266 | "k", 267 | "keep", 268 | "keeps", 269 | "kept", 270 | "know", 271 | "known", 272 | "knows", 273 | "last", 274 | "lately", 275 | "later", 276 | "latter", 277 | "latterly", 278 | "least", 279 | "less", 280 | "lest", 281 | "let", 282 | "let's", 283 | "like", 284 | "liked", 285 | "likely", 286 | "likewise", 287 | "little", 288 | "look", 289 | "looking", 290 | "looks", 291 | "low", 292 | "lower", 293 | "ltd", 294 | "made", 295 | "mainly", 296 | "make", 297 | "makes", 298 | "many", 299 | "may", 300 | "maybe", 301 | "mayn't", 302 | "me", 303 | "mean", 304 | "meantime", 305 | "meanwhile", 306 | "merely", 307 | 
"might", 308 | "mightn't", 309 | "mine", 310 | "minus", 311 | "miss", 312 | "more", 313 | "moreover", 314 | "most", 315 | "mostly", 316 | "mr", 317 | "mrs", 318 | "much", 319 | "must", 320 | "mustn't", 321 | "my", 322 | "myself", 323 | "name", 324 | "namely", 325 | "nd", 326 | "near", 327 | "nearly", 328 | "necessary", 329 | "need", 330 | "needn't", 331 | "needs", 332 | "neither", 333 | "never", 334 | "neverf", 335 | "neverless", 336 | "nevertheless", 337 | "new", 338 | "next", 339 | "nine", 340 | "ninety", 341 | "no", 342 | "nobody", 343 | "non", 344 | "none", 345 | "nonetheless", 346 | "noone", 347 | "no-one", 348 | "nor", 349 | "normally", 350 | "not", 351 | "nothing", 352 | "notwithstanding", 353 | "novel", 354 | "now", 355 | "nowhere", 356 | "obviously", 357 | "of", 358 | "off", 359 | "often", 360 | "oh", 361 | "ok", 362 | "okay", 363 | "old", 364 | "on", 365 | "once", 366 | "one", 367 | "ones", 368 | "one's", 369 | "only", 370 | "onto", 371 | "opposite", 372 | "or", 373 | "other", 374 | "others", 375 | "otherwise", 376 | "ought", 377 | "oughtn't", 378 | "our", 379 | "ours", 380 | "ourselves", 381 | "out", 382 | "outside", 383 | "over", 384 | "overall", 385 | "own", 386 | "particular", 387 | "particularly", 388 | "past", 389 | "per", 390 | "perhaps", 391 | "placed", 392 | "please", 393 | "plus", 394 | "possible", 395 | "presumably", 396 | "probably", 397 | "provided", 398 | "provides", 399 | "que", 400 | "quite", 401 | "qv", 402 | "rather", 403 | "rd", 404 | "re", 405 | "really", 406 | "reasonably", 407 | "recent", 408 | "recently", 409 | "regarding", 410 | "regardless", 411 | "regards", 412 | "relatively", 413 | "respectively", 414 | "right", 415 | "round", 416 | "said", 417 | "same", 418 | "saw", 419 | "say", 420 | "saying", 421 | "says", 422 | "second", 423 | "secondly", 424 | "see", 425 | "seeing", 426 | "seem", 427 | "seemed", 428 | "seeming", 429 | "seems", 430 | "seen", 431 | "self", 432 | "selves", 433 | "sensible", 434 | "sent", 435 | "serious", 436 | "seriously", 437 | "seven", 438 | "several", 439 | "shall", 440 | "shan't", 441 | "she", 442 | "she'd", 443 | "she'll", 444 | "she's", 445 | "should", 446 | "shouldn't", 447 | "since", 448 | "six", 449 | "so", 450 | "some", 451 | "somebody", 452 | "someday", 453 | "somehow", 454 | "someone", 455 | "something", 456 | "sometime", 457 | "sometimes", 458 | "somewhat", 459 | "somewhere", 460 | "soon", 461 | "sorry", 462 | "specified", 463 | "specify", 464 | "specifying", 465 | "still", 466 | "sub", 467 | "such", 468 | "sup", 469 | "sure", 470 | "take", 471 | "taken", 472 | "taking", 473 | "tell", 474 | "tends", 475 | "th", 476 | "than", 477 | "thank", 478 | "thanks", 479 | "thanx", 480 | "that", 481 | "that'll", 482 | "thats", 483 | "that's", 484 | "that've", 485 | "the", 486 | "their", 487 | "theirs", 488 | "them", 489 | "themselves", 490 | "then", 491 | "thence", 492 | "there", 493 | "thereafter", 494 | "thereby", 495 | "there'd", 496 | "therefore", 497 | "therein", 498 | "there'll", 499 | "there're", 500 | "theres", 501 | "there's", 502 | "thereupon", 503 | "there've", 504 | "these", 505 | "they", 506 | "they'd", 507 | "they'll", 508 | "they're", 509 | "they've", 510 | "thing", 511 | "things", 512 | "think", 513 | "third", 514 | "thirty", 515 | "this", 516 | "thorough", 517 | "thoroughly", 518 | "those", 519 | "though", 520 | "three", 521 | "through", 522 | "throughout", 523 | "thru", 524 | "thus", 525 | "till", 526 | "to", 527 | "together", 528 | "too", 529 | "took", 530 | "toward", 531 | "towards", 532 | "tried", 533 | "tries", 
534 | "truly", 535 | "try", 536 | "trying", 537 | "t's", 538 | "twice", 539 | "two", 540 | "un", 541 | "under", 542 | "underneath", 543 | "undoing", 544 | "unfortunately", 545 | "unless", 546 | "unlike", 547 | "unlikely", 548 | "until", 549 | "unto", 550 | "up", 551 | "upon", 552 | "upwards", 553 | "us", 554 | "use", 555 | "used", 556 | "useful", 557 | "uses", 558 | "using", 559 | "usually", 560 | "v", 561 | "value", 562 | "various", 563 | "versus", 564 | "very", 565 | "via", 566 | "viz", 567 | "vs", 568 | "want", 569 | "wants", 570 | "was", 571 | "wasn't", 572 | "way", 573 | "we", 574 | "we'd", 575 | "welcome", 576 | "well", 577 | "we'll", 578 | "went", 579 | "were", 580 | "we're", 581 | "weren't", 582 | "we've", 583 | "what", 584 | "whatever", 585 | "what'll", 586 | "what's", 587 | "what've", 588 | "when", 589 | "whence", 590 | "whenever", 591 | "where", 592 | "whereafter", 593 | "whereas", 594 | "whereby", 595 | "wherein", 596 | "where's", 597 | "whereupon", 598 | "wherever", 599 | "whether", 600 | "which", 601 | "whichever", 602 | "while", 603 | "whilst", 604 | "whither", 605 | "who", 606 | "who'd", 607 | "whoever", 608 | "whole", 609 | "who'll", 610 | "whom", 611 | "whomever", 612 | "who's", 613 | "whose", 614 | "why", 615 | "will", 616 | "willing", 617 | "wish", 618 | "with", 619 | "within", 620 | "without", 621 | "wonder", 622 | "won't", 623 | "would", 624 | "wouldn't", 625 | "yes", 626 | "yet", 627 | "you", 628 | "you'd", 629 | "you'll", 630 | "your", 631 | "you're", 632 | "yours", 633 | "yourself", 634 | "yourselves", 635 | "you've", 636 | "zero", 637 | ] 638 | -------------------------------------------------------------------------------- /webedge/warnings.py: -------------------------------------------------------------------------------- 1 | WARNINGS = { 2 | "NEGATIVE_DESCRIPTION": u"Description is too negative", 3 | "NEGATIVE_TITLE": u"Title is too negative", 4 | "TITLE_MISSING": u"Title tag is missing or empty.", 5 | "TITLE_TOO_SHORT": u"Avoid using extremely short titles " 6 | u"that are unhelpful to users (less than 10 characters).", 7 | "TITLE_TOO_LONG": u"Avoid using extremely lengthy titles " 8 | u"that are unhelpful to users (more than 70 characters).", 9 | "TITLE_TOO_GENERIC": u"Avoid using default or vague titles like 'Untitled' or 'New Page 1'.", 10 | "TITLE_KEYWORD_STUFFED": u"Avoid stuffing unneeded keywords in your title tags.", 11 | "TITLE_DUPLICATED": u"Avoid using a duplicate title tag across your website.", 12 | "DESCRIPTION_MISSING": u"Description is missing.", 13 | "DESCRIPTION_TOO_SHORT": u"Description is too short (less than 140 characters). " 14 | u"Descriptions are important as Google may use them as page snippets.", 15 | "DESCRIPTION_TOO_LONG": u"Description is too long (more than 255 characters). 
" 16 | u"Descriptions are important as Google may use them as page snippets.", 17 | "DESCRIPTION_TOO_GENERIC": u"Description is too generic.", 18 | "DESCRIPTION_KEYWORD_STUFFED": u"Avoid keyword stuffing in the description.", 19 | "DESCRIPTION_DUPLICATED": u"Avoid using a duplicate description across your website.", 20 | "URL_TOO_LONG": u"Avoid using URLs with unnecessary parameters and IDs.", 21 | "URL_TOO_GENERIC": u"Avoid choosing generic page names like 'page1.html'.", 22 | "URL_KEYWORD_STUFFED": u"Avoid keyword stuffing in the url.", 23 | "URL_TOO_DEEP": u"Avoid having deep nesting of subdirectories (more than 3 levels deep) " 24 | u"like '.../dir1/dir2/dir3/dir4/dir5/dir6/page.html'.", 25 | "URL_NOT_CANONICAL": u"Only one version of a URL (Canonical URL) " 26 | u"should be used to reach a document", 27 | "URL_CAPITALIZED": u"Avoid using uppercase characters in the URL. " 28 | u"Many users expect lower-case URLs and remember them better.", 29 | "IMAGE_LINK_ALT_MISSING": u"Image link missing Alt tag.", 30 | "ANCHOR_TEXT_MISSING": u"Anchor missing title tag or text.", 31 | "ANCHOR_TEXT_TOO_SHORT": u"Anchor text too short (less than 3 characters).", 32 | "ANCHOR_TEXT_TOO_LONG": u"Avoid using lengthy links with unnecessary parameters " 33 | u"(more than 80 characters).", 34 | "ANCHOR_TEXT_TOO_GENERIC": u"Anchor text contains generic text.", 35 | "ANCHOR_HREF_TOO_LONG": u"Avoid using lengthy links with unnecessary parameters " 36 | u"(more than 100 characters).", 37 | "ANCHOR_HREF_EQUALS_TEXT": u"Avoid using the page URL as the anchor text.", 38 | "ANCHOR_NO_FOLLOW": u"Avoid passing your reputation to low ranking or non relevant websites.", 39 | "IMAGE_SRC_MISSING": u"Image missing src tag.", 40 | "IMAGE_SRC_TOO_LONG": u"Avoid using long filenames in links (more than 15 characters).", 41 | "IMAGE_ALT_MISSING": u"Image missing alt tag.", 42 | "IMAGE_ALT_TOO_LONG": u"Avoid writing excessively long alt text that could be spammy.", 43 | "H1_ONE_PER_PAGE": u"Each page should have only one h1 tag", 44 | "H1_TOO_SHORT": u"Avoid using H1 Tags that are too short (less than 3 characters).", 45 | "KEYWORDS_META": u"The Keywords Metatag should be avoided as they are a spam indicator " 46 | u"and no longer used by Search Engines.", 47 | "WORDCOUNT_TOO_SHORT": u"The average word count for top-ranking content is 1,140 - 1,285 words.", 48 | "ROBOTS.TXT": u"robots.txt is missing. " 49 | u"A 'robots.txt' file tells search engines whether they can " 50 | u"access, and therefore crawl parts of your site", 51 | "BROKEN_LINK": u"Avoid referencing broken links on your site.", 52 | "SERVER_ERROR": u"Avoid referencing pages that error out on your site.", 53 | "BLOG_MISSING": u"Blog was not found on this domain. " 54 | u"Blogging about your expertise helps build trust and relationships. 
" 55 | u"Ensure your blog exists on this domain to build your domain authority.", 56 | } 57 | BADGES = { 58 | "POSITIVE_TITLE": u"Title has positive sentiments", 59 | "NEUTRAL_TITLE": u"Title has neutral sentiments", 60 | "TITLE_LENGTH": u"Title length is between 10 and 70 characters.", 61 | "TITLE_INFORMATIVE": u"Title is informative.", 62 | "TITLE_UNIQUE": u"This page has a unique title tag.", 63 | "NEUTRAL_DESCRIPTION": u"Description has neutral sentiments", 64 | "POSITIVE_DESCRIPTION": u"Description has positive sentiments", 65 | "DESCRIPTION_LENGTH": u"Descriptions are important as Google may use them as page snippets.", 66 | "DESCRIPTION_INFORMATIVE": u"Description is informative and helps give context to " 67 | u"customers trying to get to your page.", 68 | "URL_CANONICAL": u"Using canonical URLs helps avoid duplicate content.", 69 | "URL_CORRECTLY_CASED": u"URL is lowercase. Many users expect lower-case URLs and " 70 | u"remember them better.", 71 | "IMAGE_LINK_ALT": u"Image link contains an alt tag.", 72 | "ANCHOR_NO_FOLLOW": u"Good use of nofollow to nonrelevant websites.", 73 | "H1_ONE_PER_PAGE": u"Page contains a single H1 Heading", 74 | "H1_LENGTH": u"Page contains an H1 with a good length", 75 | "WORDCOUNT": u"You have provided great comprehensive coverage of your topic.", 76 | "ROBOTS.TXT": u"Robots.txt file detected. Robots.txt helps search engines navigate " 77 | u"pages that should be indexed.", 78 | "BLOG_DETECTED": u"Blog was found on this domain. " 79 | u"Blogging about your expertise helps build trust and relationships.", 80 | } 81 | -------------------------------------------------------------------------------- /webedge/webedge.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from webedge import website_analysis 4 | from webedge import cli_output 5 | import sys 6 | 7 | 8 | def create_parser(): 9 | """ 10 | Creates a Parser to pass Arguement Parser. 11 | Returns: 12 | parser: Arguement Parser through which the User can pass the Website 13 | """ 14 | parser = argparse.ArgumentParser( 15 | description="Search and Analyze the Search Engine Optimization of a Website" 16 | ) 17 | parser.add_argument( 18 | "-d", 19 | "--domain", 20 | type=str, 21 | required=True, 22 | help="Share the Website Domain to analyze", 23 | ) 24 | parser.add_argument( 25 | "-s", "--sitemap", type=str, required=False, help="Sitemap.xml file to use" 26 | ) 27 | 28 | parser.add_argument( 29 | "-p", "--page", type=str, required=False, help="Single Page to analyze" 30 | ) 31 | return parser 32 | 33 | 34 | def analyze(domain, sitemap, page): 35 | """ 36 | Analyzes the Domain/Sitemap/Page passed by the User. 37 | Args: 38 | domain: Uniform Resource Locator of the Web Application 39 | sitempap: An XML Sitemap for a Web Application 40 | page: Uniform Resource Locator for a single Webpage 41 | Returns: 42 | report: JSON Document consisting of all achievements and warnings 43 | """ 44 | spider = website_analysis.Spider(domain, sitemap, page) 45 | raw_report = spider.crawl() 46 | report = json.dumps(raw_report, indent=4, separators=(",", ": ")) 47 | return report 48 | 49 | 50 | def main(): 51 | """ 52 | Main Function to run the Parser and invoke the Scripts. 
53 | Returns: 54 | report: JSON Report of the whole Website/Webpage/Sitemap 55 | """ 56 | cli_output.outputName("WebEdge") 57 | parser = create_parser() 58 | args = parser.parse_args() 59 | err = False 60 | cli_output.startLoading() 61 | try: 62 | report = analyze(args.domain, args.sitemap, args.page) 63 | except (SystemExit, KeyError): 64 | cli_output.exitError() 65 | err = True 66 | except: # skipcq FLK-E722 67 | cli_output.printError(str(sys.exc_info()[0]) + "\n" + str(sys.exc_info()[1])) 68 | cli_output.outputError() 69 | err = True 70 | try: 71 | cli_output.endLoading() 72 | except: # skipcq FLK-E722 73 | sys.exit() 74 | try: 75 | if err is False: 76 | cli_output.outputJson(report) 77 | except (SystemExit, KeyError): 78 | cli_output.exitError() 79 | except: # skipcq FLK-E722 80 | cli_output.printError(str(sys.exc_info()[0]) + "\n" + str(sys.exc_info()[1])) 81 | cli_output.outputError() 82 | 83 | 84 | if __name__ == "__main__": 85 | main() 86 | -------------------------------------------------------------------------------- /webedge/webpage_analysis.py: -------------------------------------------------------------------------------- 1 | import re 2 | import bs4 3 | import requests 4 | from six.moves.urllib import parse 5 | from webedge.stop_words import ENGLISH_STOP_WORDS 6 | from webedge.warnings import BADGES 7 | from webedge.warnings import WARNINGS 8 | from webedge.social_websites import SOCIAL_WEBSITES 9 | from AnalyseSentiment.AnalyseSentiment import AnalyseSentiment 10 | 11 | # REGEX to match the Words on the Markup Document 12 | TOKEN_REGEX = re.compile(r"(?u)\b\w\w+\b") 13 | 14 | 15 | class Webpage: 16 | url = None 17 | title = None 18 | description = None 19 | 20 | website_titles = {} 21 | website_descriptions = {} 22 | 23 | def __init__(self, page_url, html, website_titles, website_descriptions): 24 | self.url = page_url 25 | self.netloc = parse.urlparse(page_url).netloc 26 | self.html = html 27 | self.title = None 28 | self.description = None 29 | self.keywords = {} 30 | self.issues = [] 31 | self.achieved = [] 32 | 33 | self.website_titles = website_titles 34 | self.website_descriptions = website_descriptions 35 | 36 | def report(self): 37 | """ 38 | Analyzes and verifies the Optimizations on the Page.
39 | """ 40 | soup = bs4.BeautifulSoup(self.html, "html.parser") 41 | 42 | # per page analysis 43 | self._analyze_title(soup) 44 | self._analyze_description(soup) 45 | self._analyze_url_structure(soup) 46 | self._analyze_anchors(soup) 47 | self._analyze_images(soup) 48 | self._analyze_headings(soup) 49 | self._analyze_keywords(soup) 50 | self._analyze_wordcount(soup) 51 | 52 | return self._render() 53 | 54 | def _analyze_title(self, doc): 55 | """ 56 | Validate the title 57 | Args: 58 | doc: Beautful Soup Object 59 | Returns: 60 | earned/warn: Returns if the Document Title fall among the prerequisties set 61 | """ 62 | self.title = t = u"" 63 | if doc.title: 64 | self.title = t = doc.title.text 65 | 66 | length = len(t) 67 | if length == 0: 68 | self.warn(WARNINGS["TITLE_MISSING"], self.title) 69 | return 70 | if length < 10: 71 | self.warn(WARNINGS["TITLE_TOO_SHORT"], self.title) 72 | elif length > 70: 73 | self.warn(WARNINGS["TITLE_TOO_LONG"], self.title) 74 | else: 75 | self.earned(BADGES["TITLE_LENGTH"], self.title) 76 | 77 | if any(vague_words in t.lower() for vague_words in ["untitled", "page"]): 78 | self.warn(WARNINGS["TITLE_TOO_GENERIC"], self.title) 79 | else: 80 | self.earned(BADGES["TITLE_INFORMATIVE"], self.title) 81 | 82 | sentimentobj = AnalyseSentiment() 83 | sentimentdata = sentimentobj.Analyse(self.title) 84 | if sentimentdata.get("overall_sentiment") == "Negative": 85 | self.warn(WARNINGS["NEGATIVE_TITLE"], self.title) 86 | elif sentimentdata.get("overall_sentiment") == "Neutral": 87 | self.earned(BADGES["NEUTRAL_TITLE"], self.title) 88 | else: 89 | self.earned(BADGES["POSITIVE_TITLE"], self.title) 90 | 91 | title_words = self.grouped(self.tokenize(t)) 92 | for word, count in title_words: 93 | if count > 3: 94 | self.warn(WARNINGS["TITLE_KEYWORD_STUFFED"], self.title) 95 | 96 | if t in self.website_titles: 97 | self.warn( 98 | WARNINGS["TITLE_DUPLICATED"], 99 | u'"{0}" previously used on pages: {1}'.format( 100 | t, self.website_titles[t] 101 | ), 102 | ) 103 | else: 104 | self.earned(BADGES["TITLE_UNIQUE"], self.title) 105 | self.website_titles[t] = self.url 106 | 107 | def _analyze_description(self, doc): 108 | """ 109 | Analyzes and Validates the description present in the Markup Document. 
110 | Args: 111 | doc: Beautiful Soup Object 112 | Returns: 113 | earned/warn: Returns if Description falls among the prerequisites set 114 | """ 115 | desc = doc.findAll("meta", attrs={"name": "description"}) 116 | 117 | self.description = d = u"" 118 | if len(desc) > 0: 119 | self.description = d = desc[0].get("content", "") 120 | 121 | length = len(d) 122 | if length == 0: 123 | self.warn(WARNINGS["DESCRIPTION_MISSING"]) 124 | return 125 | if length < 140: 126 | self.warn(WARNINGS["DESCRIPTION_TOO_SHORT"], self.description) 127 | elif length > 255: 128 | self.warn(WARNINGS["DESCRIPTION_TOO_LONG"], self.description) 129 | else: 130 | self.earned(BADGES["DESCRIPTION_LENGTH"], self.description) 131 | 132 | if any(vague_words in d.lower() for vague_words in ["web page", "page about"]): 133 | self.warn(WARNINGS["DESCRIPTION_TOO_GENERIC"], self.description) 134 | else: 135 | self.earned(BADGES["DESCRIPTION_INFORMATIVE"], self.description) 136 | 137 | sentimentobj = AnalyseSentiment() 138 | sentimentdata = sentimentobj.Analyse(self.description) 139 | if sentimentdata.get("overall_sentiment") == "Negative": 140 | self.warn(WARNINGS["NEGATIVE_DESCRIPTION"], self.description) 141 | elif sentimentdata.get("overall_sentiment") == "Neutral": 142 | self.earned(BADGES["NEUTRAL_DESCRIPTION"], self.description) 143 | else: 144 | self.earned(BADGES["POSITIVE_DESCRIPTION"], self.description) 145 | 146 | desc_words = self.grouped(self.tokenize(d)) 147 | for word, count in desc_words: 148 | if count > 3: 149 | self.warn(WARNINGS["DESCRIPTION_KEYWORD_STUFFED"], self.description) 150 | 151 | if d in self.website_descriptions: 152 | self.warn( 153 | WARNINGS["DESCRIPTION_DUPLICATED"], 154 | u'"{0}" previously used on pages: {1}'.format( 155 | d, self.website_descriptions[d] 156 | ), 157 | ) 158 | else: 159 | self.website_descriptions[d] = self.url 160 | 161 | def _analyze_url_structure(self, doc): 162 | """ 163 | Analyzes and verifies the URL Structure of the Website. 164 | Args: 165 | doc: Beautiful Soup Object 166 | Returns: 167 | earned/warn: Returns if URL Structure falls in the prerequisites set 168 | """ 169 | 170 | parsed_url = parse.urlparse(self.url) 171 | path = parsed_url.path.split("/") 172 | 173 | if len(self.url) > 100: 174 | self.warn(WARNINGS["URL_TOO_LONG"], self.url) 175 | 176 | if any(vague_words in self.url.lower() for vague_words in ["page"]): 177 | self.warn(WARNINGS["URL_TOO_GENERIC"], self.url) 178 | 179 | url_words = self.grouped(self.tokenize(path[-1])) 180 | for word, count in url_words: 181 | if count >= 2: 182 | self.warn(WARNINGS["URL_KEYWORD_STUFFED"], self.url) 183 | 184 | if len(path) > 3: 185 | self.warn(WARNINGS["URL_TOO_DEEP"], self.url) 186 | 187 | canonical = doc.find("link", rel="canonical") 188 | if canonical: 189 | canonical_url = canonical["href"] 190 | 191 | if canonical_url != self.url: 192 | self.warn(WARNINGS["URL_NOT_CANONICAL"], canonical_url) 193 | else: 194 | self.earned(BADGES["URL_CANONICAL"], self.url) 195 | 196 | if any(x.isupper() for x in self.url): 197 | self.warn(WARNINGS["URL_CAPITALIZED"], self.url) 198 | else: 199 | self.earned(BADGES["URL_CORRECTLY_CASED"], self.url) 200 | 201 | def _analyze_anchors(self, doc): 202 | """ 203 | Analyzes and verifies the Anchor Tags on the Markup. 204 | Args: 205 | doc: Beautiful Soup Object 206 | Returns: 207 | earned/warn: Returns if Anchors are defined and the prerequisites are set.
208 | """ 209 | anchors = doc.find_all("a", href=True) 210 | verified_pages = [] 211 | 212 | for tag in anchors: 213 | tag_href = tag["href"] 214 | tag_text = tag.text.lower().strip() 215 | 216 | image_link = tag.find("img") 217 | 218 | if image_link is not None: 219 | if len(image_link.get("alt", "")) == 0: 220 | self.warn(WARNINGS["IMAGE_LINK_ALT_MISSING"], tag_href) 221 | else: 222 | self.earned(BADGES["IMAGE_LINK_ALT"], image_link.get("alt", "")) 223 | 224 | else: 225 | if len(tag.get("title", "")) == 0 and len(tag_text) == 0: 226 | self.warn(WARNINGS["ANCHOR_TEXT_MISSING"], tag_href) 227 | elif len(tag_text) < 3: 228 | self.warn(WARNINGS["ANCHOR_TEXT_TOO_SHORT"], tag_text) 229 | elif len(tag_text) > 100: 230 | self.warn(WARNINGS["ANCHOR_TEXT_TOO_LONG"], tag_text) 231 | 232 | if any( 233 | vague_words in tag_text.lower() 234 | for vague_words in ["click here", "page", "article"] 235 | ): 236 | self.warn(WARNINGS["ANCHOR_TEXT_TOO_GENERIC"], tag_text) 237 | 238 | if len(tag_href) > 100: 239 | self.warn(WARNINGS["ANCHOR_HREF_TOO_LONG"], tag_href) 240 | 241 | if tag_text == tag_href: 242 | self.warn(WARNINGS["ANCHOR_HREF_EQUALS_TEXT"], tag_text) 243 | 244 | if len(parse.urlparse(tag_href).netloc) > 0: 245 | if self.netloc not in tag_href: 246 | if not ( 247 | any(social_site in tag_href for social_site in SOCIAL_WEBSITES) 248 | ): 249 | if tag.get("rel") is None or "nofollow" not in tag.get("rel"): 250 | self.warn(WARNINGS["ANCHOR_NO_FOLLOW"], tag_href) 251 | else: 252 | self.earned(BADGES["ANCHOR_NO_FOLLOW"], tag_href) 253 | 254 | if not tag_href.startswith("mailto:"): 255 | referenced_href = tag_href 256 | if len(parse.urlparse(tag_href).netloc) == 0: 257 | referenced_href = parse.urljoin(self.url, tag_href) 258 | 259 | if referenced_href not in verified_pages: 260 | resp = requests.head(referenced_href) 261 | if resp.status_code == requests.codes.not_found: 262 | self.warn(WARNINGS["BROKEN_LINK"], referenced_href) 263 | 264 | verified_pages.append(referenced_href) 265 | 266 | def _analyze_images(self, doc): 267 | """ 268 | Analyzes and verifies that each image has an alt and title. 269 | Args: 270 | doc: Beautful Soup Object 271 | Returns: 272 | earned/warn: Returns if Images Alt and Title tag fall in the prerequisties set 273 | """ 274 | images = doc.find_all("img") 275 | 276 | for image in images: 277 | src = image.get("src", image.get("data-src", "")) 278 | 279 | if len(src) == 0: 280 | self.warn(WARNINGS["IMAGE_SRC_MISSING"], str(image)) 281 | else: 282 | if len(image.get("alt", "")) == 0: 283 | self.warn(WARNINGS["IMAGE_ALT_MISSING"], str(image)) 284 | 285 | if len(parse.urlparse(src).netloc) == 0 or self.netloc in src: 286 | if len(src) > 15: 287 | self.warn(WARNINGS["IMAGE_SRC_TOO_LONG"], src) 288 | if len(image.get("alt", "")) > 40: 289 | self.warn(WARNINGS["IMAGE_ALT_TOO_LONG"], image.get("alt", "")) 290 | 291 | def _analyze_headings(self, doc): 292 | """ 293 | Analyzes Headings on the Website and makes sure of atleast one heading tag. 
294 | Args: 295 | doc: Beautiful Soup Object 296 | Returns: 297 | earned/warn: Returns if Headings fall in the prerequisites set 298 | """ 299 | h1tags = doc.find_all("h1") 300 | 301 | self.headers = [] 302 | for h in h1tags: 303 | self.headers.append(h.text) 304 | 305 | if len(h.text) < 3: 306 | self.warn(WARNINGS["H1_TOO_SHORT"], h.text) 307 | else: 308 | self.earned(BADGES["H1_LENGTH"], h.text) 309 | 310 | if len(h1tags) != 1: 311 | self.warn(WARNINGS["H1_ONE_PER_PAGE"], self.headers) 312 | else: 313 | self.earned(BADGES["H1_ONE_PER_PAGE"], self.headers) 314 | 315 | def _analyze_keywords(self, doc): 316 | """ 317 | Analyzes the Keywords on the Website. 318 | Args: 319 | doc: Beautiful Soup Object 320 | Returns: 321 | earned/warn: Returns if Keyword Count falls in the prerequisites set 322 | """ 323 | kw_meta = doc.findAll("meta", attrs={"name": "keywords"}) 324 | 325 | if len(kw_meta) > 0: 326 | self.warn(WARNINGS["KEYWORDS_META"], kw_meta) 327 | 328 | self.keywords = self._get_keywords(doc) 329 | 330 | del self.keywords[5:] 331 | 332 | def _analyze_wordcount(self, doc): 333 | """ 334 | Analyzes the Wordcount on the Website. 335 | Args: 336 | doc: Beautiful Soup Object 337 | Returns: 338 | earned/warn: Returns if Wordcount falls in the prerequisite limit 339 | """ 340 | page_content = self._get_keywords(doc) 341 | count = 0 342 | for word, freq in page_content: 343 | count += freq 344 | 345 | if count < 2416: 346 | self.warn( 347 | WARNINGS["WORDCOUNT_TOO_SHORT"], u"You have {0} words.".format(count) 348 | ) 349 | else: 350 | self.earned(BADGES["WORDCOUNT"], u"You have {0} words.".format(count)) 351 | 352 | def _render(self): 353 | """ 354 | Renders the Result of SEO Analysis 355 | """ 356 | keywords_result = [] 357 | 358 | for word, count in self.keywords: 359 | kw = { 360 | "keyword": word, 361 | "frequency": count, 362 | "in_title": word in self.title.lower(), 363 | "in_description": word in self.description.lower(), 364 | "in_header": word in self.headers, 365 | } 366 | keywords_result.append(kw) 367 | 368 | result = { 369 | "url": self.url, 370 | "keywords": keywords_result, 371 | "issues": self.issues, 372 | "achieved": self.achieved, 373 | "title": self.title, 374 | "description": self.description, 375 | } 376 | 377 | return result 378 | 379 | def warn(self, message, value=None): 380 | """ 381 | Value lost through improper SEO Optimization on the Website. 382 | """ 383 | self.issues.append({"warning": message, "value": value}) 384 | 385 | def earned(self, message, value=None): 386 | """ 387 | Value earned through proper SEO Optimization on the Website. 388 | """ 389 | self.achieved.append({"achievement": message, "value": value}) 390 | 391 | def visible_tags(self, element): 392 | """ 393 | Checks whether an element is visible text in the Markup Document. 394 | Args: 395 | element: Elements from the Markup Document 396 | Returns: 397 | boolean: True/False depending on whether the Element is visible 398 | """ 399 | non_visible_elements = [ 400 | "style", 401 | "script", 402 | "[document]", 403 | "head", 404 | "title", 405 | "meta", 406 | ] 407 | 408 | if element.parent.name in non_visible_elements: 409 | return False 410 | if isinstance(element, bs4.element.Comment): 411 | return False 412 | 413 | return True 414 | 415 | def tokenize(self, rawtext): 416 | """ 417 | Tokenizes the Raw Text passed to it by passing through Regex and removing Stop Words.
418 | Args: 419 | rawtext: Markup Text 420 | Returns: 421 | word: Tokenized Text after removing Stop Words and passing through Regex 422 | """ 423 | return [ 424 | word 425 | for word in TOKEN_REGEX.findall(rawtext.lower()) 426 | if word not in ENGLISH_STOP_WORDS 427 | ] 428 | 429 | def grouped(self, token_list): 430 | """ 431 | Groups the Token List passed to it into Word Frequencies. 432 | Args: 433 | token_list: List Data Structure 434 | Returns: 435 | grouped_list: List of (word, frequency) tuples sorted by descending frequency 436 | """ 437 | grouped_list = {} 438 | for word in token_list: 439 | if word in grouped_list: 440 | grouped_list[word] += 1 441 | else: 442 | grouped_list[word] = 1 443 | 444 | grouped_list = sorted(grouped_list.items(), key=lambda x: x[1], reverse=True) 445 | return grouped_list 446 | 447 | def _get_keywords(self, doc): 448 | """ 449 | Fetches the Keywords present in the given Webpage. 450 | Args: 451 | doc: Beautiful Soup Object 452 | 453 | Returns: 454 | keywords: List of (Keyword, Frequency) tuples 455 | """ 456 | keywords = {} 457 | text_elements = filter(self.visible_tags, doc.findAll(text=True)) 458 | page_text = "" 459 | for element in text_elements: 460 | page_text += element.lower() + " " 461 | 462 | tokens = self.tokenize(page_text) 463 | keywords = self.grouped(tokens) 464 | 465 | return keywords 466 | -------------------------------------------------------------------------------- /webedge/website_analysis.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup as Soup 2 | import requests 3 | from six.moves.urllib import parse 4 | from webedge.warnings import BADGES 5 | from webedge.warnings import WARNINGS 6 | from webedge import webpage_analysis 7 | 8 | 9 | class Spider: 10 | report = {"pages": []} 11 | 12 | def __init__(self, site, sitemap=None, page=None): 13 | parsed_url = parse.urlparse(site) 14 | 15 | self.domain = "{0}://{1}".format(parsed_url.scheme, parsed_url.netloc) 16 | self.pages_crawled = [] 17 | self.pages_to_crawl = [] 18 | self.titles = {} 19 | self.descriptions = {} 20 | self.issues = [] 21 | self.achieved = [] 22 | 23 | if sitemap is not None: 24 | locations = [] 25 | resp = requests.get(self.domain + sitemap) 26 | if resp.status_code == requests.codes.ok: 27 | locations = self._parse_sitemap(resp.content) 28 | 29 | self.pages_to_crawl.append(site) 30 | self.pages_to_crawl.extend(locations) 31 | elif page is not None: 32 | self.pages_to_crawl.append(site + page) 33 | else: 34 | self.pages_to_crawl.append(site) 35 | 36 | def _parse_sitemap(self, sitemap): 37 | """ 38 | Parse the Sitemap for Locations. 39 | Args: 40 | sitemap: XML Sitemap 41 | Returns: 42 | locations 43 | """ 44 | locations = [] 45 | 46 | soup = Soup(sitemap, "html.parser") 47 | urls = soup.findAll("url") 48 | 49 | if len(urls) > 0: 50 | for u in urls: 51 | loc = u.find("loc").string 52 | locations.append(loc) 53 | 54 | return locations 55 | 56 | def _analyze_crawlers(self): 57 | """ 58 | Analyzes Crawlers in the form of a robots.txt file. 59 | Returns: 60 | Badges/Warnings: Depending on whether a Robots.txt exists. 61 | """ 62 | resp = requests.get(self.domain + "/robots.txt") 63 | if resp.status_code == requests.codes.ok: 64 | self.earned(BADGES["ROBOTS.TXT"]) 65 | else: 66 | self.warn(WARNINGS["ROBOTS.TXT"]) 67 | 68 | def _analyze_blog(self): 69 | """ 70 | Analyzes Blogs in the form of a /blog section on the Domain 71 | Returns: 72 | Badges/Warnings: Depending on whether a Blog exists or not.
73 | """ 74 | resp = requests.get(self.domain + "/blog") 75 | if resp.status_code == requests.codes.ok: 76 | self.earned(BADGES["BLOG_DETECTED"], self.domain + u"/blog") 77 | else: 78 | self.warn(WARNINGS["BLOG_MISSING"]) 79 | 80 | def warn(self, message, value=None): 81 | """ 82 | Value lost through improper SEO Optimization on the Website. 83 | """ 84 | self.issues.append({"warning": message, "value": value}) 85 | 86 | def earned(self, message, value=None): 87 | """ 88 | Value earned through proper SEO Optimization on the Website. 89 | """ 90 | self.achieved.append({"achievement": message, "value": value}) 91 | 92 | def crawl(self): 93 | """ 94 | Crawl the Website and analyze different things. 95 | """ 96 | self._analyze_crawlers() 97 | self._analyze_blog() 98 | for page_url in self.pages_to_crawl: 99 | resp = requests.get(page_url) 100 | if resp.status_code == requests.codes.ok: 101 | html = webpage_analysis.Webpage( 102 | page_url, resp.content, self.titles, self.descriptions 103 | ) 104 | page_report = html.report() 105 | self.report["pages"].append(page_report) 106 | self.pages_crawled.append(page_url.strip().lower()) 107 | # print("Crawled {0} Pages of {1}: {2}".format( 108 | # len(self.pages_crawled), len(self.pages_to_crawl), page_url)) 109 | elif resp.status_code == requests.codes.not_found: 110 | self.warn(WARNINGS["BROKEN_LINK"], page_url) 111 | else: 112 | self.warn( 113 | WARNINGS["SERVER_ERROR"], 114 | "HTTP{0} received for {1}".format(resp.status_code, page_url), 115 | ) 116 | self.report["site"] = {} 117 | self.report["site"]["issues"] = self.issues 118 | self.report["site"]["achieved"] = self.achieved 119 | return self.report 120 | --------------------------------------------------------------------------------