├── mwc ├── __init__.py ├── counter.py └── cli.py ├── tests ├── __init__.py └── test_mwc.py ├── .travis.yml ├── .circleci └── config.yml ├── setup.py ├── LICENSE ├── .github └── workflows │ ├── python-publish.yml │ └── python-package.yml ├── README.md └── .gitignore /mwc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | script: 5 | - python -m unittest discover 6 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | 3 | orbs: 4 | python: circleci/python@0.2.1 5 | 6 | jobs: 7 | build-and-test: 8 | executor: python/default 9 | steps: 10 | - checkout 11 | - run: 12 | command: python -m unittest discover 13 | name: Test 14 | 15 | workflows: 16 | main: 17 | jobs: 18 | - build-and-test 19 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="markdown-word-count", 8 | version="0.1.0", 9 | author="Georgios Andreadis", 10 | author_email="info@gandreadis.com", 11 | description="Word counter for raw Markdown files", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | url="https://github.com/gandreadis/markdown-word-count", 15 | packages=['mwc'], 16 | classifiers=[ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ], 21 | python_requires='>=3.6', 22 | entry_points={ 23 | 'console_scripts': ['mwc=mwc.cli:main'], 24 | } 25 | ) 26 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Georgios Andreadis 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /mwc/counter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def count_words_in_markdown(markdown): 5 | text = markdown 6 | 7 | # Comments 8 | text = re.sub(r'', '', text, flags=re.MULTILINE) 9 | # Tabs to spaces 10 | text = text.replace('\t', ' ') 11 | # More than 1 space to 4 spaces 12 | text = re.sub(r'[ ]{2,}', ' ', text) 13 | # Footnotes 14 | text = re.sub(r'^\[[^]]*\][^(].*', '', text, flags=re.MULTILINE) 15 | # Indented blocks of code 16 | text = re.sub(r'^( {4,}[^-*]).*', '', text, flags=re.MULTILINE) 17 | # Custom header IDs 18 | text = re.sub(r'{#.*}', '', text) 19 | # Replace newlines with spaces for uniform handling 20 | text = text.replace('\n', ' ') 21 | # Remove images 22 | text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text) 23 | # Remove HTML tags 24 | text = re.sub(r']*>', '', text) 25 | # Remove special characters 26 | text = re.sub(r'[#*`~\-–^=<>+|/:]', '', text) 27 | # Remove footnote references 28 | text = re.sub(r'\[[0-9]*\]', '', text) 29 | # Remove enumerations 30 | text = re.sub(r'[0-9#]*\.', '', text) 31 | 32 | return len(text.split()) 33 | -------------------------------------------------------------------------------- /mwc/cli.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | import os 4 | import sys 5 | 6 | from mwc.counter import count_words_in_markdown 7 | 8 | 9 | def get_count(files): 10 | count = 0 11 | for file in files: 12 | if not os.path.isfile(file): 13 | print('The file at the given location {file} could not be opened') 14 | sys.exit(1) 15 | with open(file, 'r', encoding='utf8') as f: 16 | count += count_words_in_markdown(f.read()) 17 | 18 | return count 19 | 20 | 21 | def main(): 22 | if sys.version_info < (3,): 23 | print( 24 | 'Python 3 is required. You are using Python 2. You should probably run this script as follows:') 25 | print('python3 mwc.py') 26 | sys.exit(1) 27 | 28 | if len(sys.argv) < 2: 29 | print('Provide the Markdown file to parse as first argument') 30 | sys.exit(1) 31 | 32 | files = sys.argv[1:] 33 | 34 | count = get_count(files) 35 | 36 | if len(files) == 1: 37 | print(f"Number of words in file {files[0]}") 38 | print(count) 39 | else: 40 | print(f"Words across {len(files)} files") 41 | print(count) 42 | 43 | return count 44 | 45 | 46 | if __name__ == '__main__': 47 | main() 48 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | runs-on: ubuntu-latest 21 | environment: Env 22 | 23 | steps: 24 | - uses: actions/checkout@v3 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_KEY }} 40 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⬇ Markdown Word Count 2 | 3 | [![Python package](https://github.com/gandreadis/markdown-word-count/actions/workflows/python-package.yml/badge.svg)](https://github.com/gandreadis/markdown-word-count/actions/workflows/python-package.yml) 4 | 5 | A word counter for raw Markdown files, excluding punctuation, footnotes, and special Markdown or HTML tag syntax. 6 | 7 | ## 💻 Installation 8 | 9 | You will need... 10 | 11 | - 🐍 Python 3 12 | - 🐑 PIP3 or a [clone](https://github.com/gandreadis/markdown-word-count.git) of this repo. 13 | 14 | ## ▶ Usage 15 | 16 | ### Through PIP 17 | 18 | The easiest way is to run: 19 | 20 | ``` 21 | pip install markdown-word-count 22 | ``` 23 | 24 | Then, you'll be able to analyze any file by passing its name (relative path) to the `mwc` script: 25 | 26 | ``` 27 | mwc yourfile.md 28 | ``` 29 | 30 | You can also pass in multiple files or a blob if your shell supports it. This allows for checking all files in a folder, for example. 31 | 32 | ``` 33 | mwc text1.md text2.md 34 | mwc test/*.md 35 | ``` 36 | 37 | ### Manually 38 | 39 | If you want to clone the repo and run the Python script manually, run: 40 | 41 | ``` 42 | python mwc/cli.py myfile.md 43 | ``` 44 | 45 | If this doesn't work, try `python3` instead of `python`. 46 | 47 | ## ⛏ Development 48 | 49 | Run this to execute all tests: 50 | 51 | ``` 52 | python -m unittest discover 53 | ``` 54 | 55 | ## 💬 Ports to Other Programming Languages 56 | 57 | - A PHP port can be found [here](https://github.com/Arcesilas/md-word-count), with thanks to [@Arcesilas](https://github.com/Arcesilas)! 58 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: Python package 5 | 6 | on: 7 | push: 8 | branches: [ "master" ] 9 | pull_request: 10 | branches: [ "master" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # stop the build if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 38 | - name: Test with unittest 39 | run: | 40 | python -m unittest discover 41 | - name: Build the package locally 42 | run: | 43 | python -m pip install . 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### Python template 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | -------------------------------------------------------------------------------- /tests/test_mwc.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import shutil 4 | 5 | import textwrap 6 | from unittest import TestCase 7 | 8 | from mwc.counter import count_words_in_markdown 9 | from mwc.cli import main 10 | 11 | try: 12 | # Python 3.4+ should use builtin unittest.mock not mock package 13 | from unittest.mock import patch 14 | except ImportError: 15 | from mock import patch 16 | 17 | 18 | class TestMWC(TestCase): 19 | 20 | def test_single_markdown_file(self): 21 | # Test single markdown file 22 | with open("test.md", "w+") as f: 23 | f.write("this is a markdown file!") 24 | testargs = ["mwc.cli", "test.md"] 25 | with patch.object(sys, 'argv', testargs): 26 | test = main() 27 | self.assertEqual(test, 5) 28 | os.remove("test.md") 29 | 30 | def test_multiple_markdown_files(self): 31 | # Test multiple files in folder 32 | if os.path.exists("test"): 33 | shutil.rmtree("test") 34 | os.mkdir("test") 35 | with open("test/test1.md", "w+") as f: 36 | f.write("this is a markdown file!") 37 | with open("test/test2.md", "w+") as f: 38 | f.write("this is a markdown file number 2!") 39 | testargs = ["mwc.cli", "test/test1.md", "test/test2.md"] 40 | with patch.object(sys, 'argv', testargs): 41 | test = main() 42 | self.assertEqual(test, 12) 43 | shutil.rmtree("test") 44 | 45 | def test_file_does_not_exist(self): 46 | # Test if program works when file or folder doesn't exist 47 | testargs = ["mwc.cli", "something.md"] 48 | with patch.object(sys, 'argv', testargs): 49 | with self.assertRaises(SystemExit): 50 | main() 51 | 52 | def test_simple_text(self): 53 | text = textwrap.dedent(""" 54 | test a b c 55 | """) 56 | self.assertEqual(count_words_in_markdown(text), 4) 57 | 58 | def test_headings(self): 59 | text = textwrap.dedent(""" 60 | # H1 61 | ## H2 62 | ### H3 63 | 64 | H1 65 | ----- 66 | 67 | H1 68 | ===== 69 | 70 | ### My Great Heading {#custom-id} 71 | """) 72 | self.assertEqual(count_words_in_markdown(text), 8) 73 | 74 | def test_inline(self): 75 | text = textwrap.dedent(""" 76 | **bold text** 77 | *italicized text* 78 | `test` 79 | ~~test~~ 80 | """) 81 | self.assertEqual(count_words_in_markdown(text), 6) 82 | 83 | def test_comments(self): 84 | text = textwrap.dedent(""" 85 | 86 | 87 | 92 | 93 | Test 94 | """) 95 | self.assertEqual(count_words_in_markdown(text), 1) 96 | 97 | def test_quote(self): 98 | text = textwrap.dedent(""" 99 | > blockquote 100 | """) 101 | self.assertEqual(count_words_in_markdown(text), 1) 102 | 103 | def test_enumeration(self): 104 | text = textwrap.dedent(""" 105 | 1. foo 106 | 2. bar 107 | #. smart item 108 | """) 109 | self.assertEqual(count_words_in_markdown(text), 4) 110 | 111 | def test_bullet_points(self): 112 | text = textwrap.dedent(""" 113 | - foo 114 | - bar 115 | """) 116 | self.assertEqual(count_words_in_markdown(text), 2) 117 | 118 | def test_nested_bullet_points(self): 119 | text = textwrap.dedent(""" 120 | - foo 121 | - bar 122 | - test 123 | """) 124 | self.assertEqual(count_words_in_markdown(text), 3) 125 | 126 | def test_nested_star_bullet_points(self): 127 | text = textwrap.dedent(""" 128 | - foo 129 | - bar 130 | * test 131 | * baz 132 | """) 133 | self.assertEqual(count_words_in_markdown(text), 4) 134 | 135 | def test_indented_code_block(self): 136 | text = textwrap.dedent(""" 137 | foo bar 138 | 139 | test code 140 | """) 141 | self.assertEqual(count_words_in_markdown(text), 2) 142 | 143 | def test_code_block(self): 144 | text = textwrap.dedent(""" 145 | ``` 146 | test 147 | ``` 148 | """) 149 | self.assertEqual(count_words_in_markdown(text), 1) 150 | 151 | def test_link(self): 152 | text = textwrap.dedent(""" 153 | Some [linked text](https://google.com/). 154 | """) 155 | self.assertEqual(count_words_in_markdown(text), 3) 156 | 157 | def test_image(self): 158 | text = textwrap.dedent(""" 159 | test 160 | 161 | ![test](images1) 162 | 163 | ![blah](images2) 164 | 165 | test 166 | """) 167 | self.assertEqual(count_words_in_markdown(text), 2) 168 | 169 | def test_footnote(self): 170 | text = textwrap.dedent(""" 171 | MWC is great [1]. 172 | 173 | [1] source footnote 174 | [1](do count this one please) 175 | 176 | Followup text 177 | """) 178 | self.assertEqual(count_words_in_markdown(text), 10) 179 | 180 | def test_html_tags(self): 181 | text = textwrap.dedent(""" 182 | test 183 | 184 |
185 | test 186 | 187 | test 188 | """) 189 | self.assertEqual(count_words_in_markdown(text), 3) 190 | 191 | def test_custom_header_tags(self): 192 | text = textwrap.dedent(""" 193 | ## header1 {#header1} 194 | foo bar 195 | ## header2 {#header2} 196 | """) 197 | self.assertEqual(count_words_in_markdown(text), 4) 198 | --------------------------------------------------------------------------------