├── mwc
├── __init__.py
├── counter.py
└── cli.py
├── tests
├── __init__.py
└── test_mwc.py
├── .travis.yml
├── .circleci
└── config.yml
├── setup.py
├── LICENSE
├── .github
└── workflows
│ ├── python-publish.yml
│ └── python-package.yml
├── README.md
└── .gitignore
/mwc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 | script:
5 | - python -m unittest discover
6 |
--------------------------------------------------------------------------------
/.circleci/config.yml:
--------------------------------------------------------------------------------
1 | version: 2.1
2 |
3 | orbs:
4 | python: circleci/python@0.2.1
5 |
6 | jobs:
7 | build-and-test:
8 | executor: python/default
9 | steps:
10 | - checkout
11 | - run:
12 | command: python -m unittest discover
13 | name: Test
14 |
15 | workflows:
16 | main:
17 | jobs:
18 | - build-and-test
19 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
import setuptools

# README.md contains non-ASCII characters (emoji in its headings), so decode
# it explicitly as UTF-8. Relying on the platform default encoding can raise
# UnicodeDecodeError on systems whose locale encoding is not UTF-8.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

setuptools.setup(
    name="markdown-word-count",
    version="0.1.0",
    author="Georgios Andreadis",
    author_email="info@gandreadis.com",
    description="Word counter for raw Markdown files",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/gandreadis/markdown-word-count",
    packages=['mwc'],
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    # Installs the `mwc` command, which dispatches to mwc.cli.main().
    entry_points={
        'console_scripts': ['mwc=mwc.cli:main'],
    }
)
26 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Georgios Andreadis
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/mwc/counter.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 |
def count_words_in_markdown(markdown):
    """Count the words in a raw Markdown string, ignoring markup.

    The text is passed through a fixed sequence of substitutions that strip
    HTML comments, footnote definitions, indented code blocks, custom header
    IDs, images, HTML tags, Markdown punctuation, footnote references and
    enumeration markers. Whatever remains is split on whitespace.

    The order of the steps matters: the line-anchored rules (footnotes,
    indented code) must run before newlines are flattened into spaces, and
    HTML tags must be removed before ``<``, ``>`` and ``/`` are stripped as
    special characters.

    Args:
        markdown: The Markdown source text.

    Returns:
        The number of whitespace-separated tokens left after stripping.
    """
    text = markdown

    # HTML comments. DOTALL lets a single comment span several lines.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Tabs to spaces, so tab-indented code is caught by the rule below
    text = text.replace('\t', '    ')
    # Normalize every run of 2+ spaces to exactly 4 spaces, so any indented
    # line has a uniform 4-space prefix for the code-block rule
    text = re.sub(r'[ ]{2,}', '    ', text)
    # Footnote definitions: a line starting with "[label]" that is not
    # followed by "(" (which would make it a link)
    text = re.sub(r'^\[[^]]*\][^(].*', '', text, flags=re.MULTILINE)
    # Indented blocks of code; nested list items ("-" / "*") are kept
    text = re.sub(r'^( {4,}[^-*]).*', '', text, flags=re.MULTILINE)
    # Custom header IDs such as "{#my-id}"
    text = re.sub(r'{#.*}', '', text)
    # Replace newlines with spaces for uniform handling
    text = text.replace('\n', ' ')
    # Remove images: ![alt](target)
    text = re.sub(r'!\[[^\]]*\]\([^)]*\)', '', text)
    # Remove HTML tags (opening and closing)
    text = re.sub(r'</?[^>]*>', '', text)
    # Remove special characters
    text = re.sub(r'[#*`~\-–^=<>+|/:]', '', text)
    # Remove footnote references such as "[1]"
    text = re.sub(r'\[[0-9]*\]', '', text)
    # Remove enumeration markers ("1.", "#.") and remaining periods
    text = re.sub(r'[0-9#]*\.', '', text)

    return len(text.split())
33 |
--------------------------------------------------------------------------------
/mwc/cli.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | import os
4 | import sys
5 |
6 | from mwc.counter import count_words_in_markdown
7 |
8 |
def get_count(files):
    """Return the total word count across the given Markdown files.

    Args:
        files: Iterable of file paths to read (decoded as UTF-8).

    Returns:
        The sum of the word counts of all files; 0 for an empty iterable.

    Raises:
        SystemExit: With exit code 1, after printing a message naming the
            offending path, as soon as a path is not an existing file.
    """
    count = 0
    for path in files:
        if not os.path.isfile(path):
            # f-string so the message names the actual path instead of
            # printing the placeholder text verbatim.
            print(f'The file at the given location {path} could not be opened')
            sys.exit(1)
        with open(path, 'r', encoding='utf8') as f:
            count += count_words_in_markdown(f.read())

    return count
19 |
20 |
def main():
    """Command-line entry point: count the words in the files in sys.argv.

    Every argument after the program name is treated as a Markdown file
    path. A one-line header followed by the combined count is printed.

    No Python 2 guard is needed here: this module uses f-strings, so it
    cannot even be parsed by a Python 2 interpreter.

    Returns:
        The combined word count of all given files.

    Raises:
        SystemExit: With exit code 1 when no file argument was given (and
            from get_count when a given path is not a file).
    """
    if len(sys.argv) < 2:
        print('Provide the Markdown file to parse as first argument')
        sys.exit(1)

    files = sys.argv[1:]
    count = get_count(files)

    if len(files) == 1:
        print(f"Number of words in file {files[0]}")
    else:
        print(f"Words across {len(files)} files")
    print(count)

    return count
44 |
45 |
# Run the CLI only when this module is executed as a script, not on import.
if __name__ == '__main__':
    main()
48 |
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 |
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 |
9 | name: Upload Python Package
10 |
11 | on:
12 | release:
13 | types: [published]
14 |
15 | permissions:
16 | contents: read
17 |
18 | jobs:
19 | deploy:
20 | runs-on: ubuntu-latest
21 | environment: Env
22 |
23 | steps:
24 | - uses: actions/checkout@v3
25 | - name: Set up Python
26 | uses: actions/setup-python@v3
27 | with:
28 | python-version: '3.x'
29 | - name: Install dependencies
30 | run: |
31 | python -m pip install --upgrade pip
32 | pip install build
33 | - name: Build package
34 | run: python -m build
35 | - name: Publish package
36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_API_KEY }}
40 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ⬇ Markdown Word Count
2 |
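3 | [![Python package](https://github.com/gandreadis/markdown-word-count/actions/workflows/python-package.yml/badge.svg)](https://github.com/gandreadis/markdown-word-count/actions/workflows/python-package.yml)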
4 |
5 | A word counter for raw Markdown files, excluding punctuation, footnotes, and special Markdown or HTML tag syntax.
6 |
7 | ## 💻 Installation
8 |
9 | You will need...
10 |
11 | - 🐍 Python 3
12 | - 🐑 PIP3 or a [clone](https://github.com/gandreadis/markdown-word-count.git) of this repo.
13 |
14 | ## ▶ Usage
15 |
16 | ### Through PIP
17 |
18 | The easiest way is to run:
19 |
20 | ```
21 | pip install markdown-word-count
22 | ```
23 |
24 | Then, you'll be able to analyze any file by passing its name (relative path) to the `mwc` script:
25 |
26 | ```
27 | mwc yourfile.md
28 | ```
29 |
30 | You can also pass in multiple files or a glob if your shell supports it. This allows for checking all files in a folder, for example.
31 |
32 | ```
33 | mwc text1.md text2.md
34 | mwc test/*.md
35 | ```
36 |
37 | ### Manually
38 |
39 | If you want to clone the repo and run the tool without installing it, run the following from the repository root (so that the `mwc` package can be imported):
40 |
41 | ```
42 | python -m mwc.cli myfile.md
43 | ```
44 |
45 | If this doesn't work, try `python3` instead of `python`.
46 |
47 | ## ⛏ Development
48 |
49 | Run this to execute all tests:
50 |
51 | ```
52 | python -m unittest discover
53 | ```
54 |
55 | ## 💬 Ports to Other Programming Languages
56 |
57 | - A PHP port can be found [here](https://github.com/Arcesilas/md-word-count), with thanks to [@Arcesilas](https://github.com/Arcesilas)!
58 |
--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: Python package
5 |
6 | on:
7 | push:
8 | branches: [ "master" ]
9 | pull_request:
10 | branches: [ "master" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.9", "3.10", "3.11"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v3
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Lint with flake8
33 | run: |
34 | # stop the build if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: Test with unittest
39 | run: |
40 | python -m unittest discover
41 | - name: Build the package locally
42 | run: |
43 | python -m pip install .
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### Python template
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 |
--------------------------------------------------------------------------------
/tests/test_mwc.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import shutil
4 |
5 | import textwrap
6 | from unittest import TestCase
7 |
8 | from mwc.counter import count_words_in_markdown
9 | from mwc.cli import main
10 |
11 | try:
12 | # Python 3.4+ should use builtin unittest.mock not mock package
13 | from unittest.mock import patch
14 | except ImportError:
15 | from mock import patch
16 |
17 |
class TestMWC(TestCase):
    """Tests for the ``mwc`` command line and the Markdown word counter.

    The CLI tests write their fixture files into a per-test temporary
    directory that is removed even when an assertion fails, so they never
    touch (or delete) anything in the current working directory.
    """

    def _make_workdir(self):
        """Create a scratch directory that is deleted when the test ends."""
        import tempfile  # local import: only the CLI tests need it

        workdir = tempfile.TemporaryDirectory()
        self.addCleanup(workdir.cleanup)
        return workdir.name

    def _write_markdown(self, directory, name, content):
        """Write ``content`` to ``directory/name`` and return the path."""
        path = os.path.join(directory, name)
        with open(path, "w", encoding="utf8") as f:
            f.write(content)
        return path

    def _run_cli(self, *paths):
        """Run ``main()`` with ``paths`` as its command-line arguments."""
        with patch.object(sys, 'argv', ["mwc.cli", *paths]):
            return main()

    def test_single_markdown_file(self):
        # Test single markdown file
        workdir = self._make_workdir()
        path = self._write_markdown(
            workdir, "test.md", "this is a markdown file!")
        self.assertEqual(self._run_cli(path), 5)

    def test_multiple_markdown_files(self):
        # Test multiple files in folder
        workdir = self._make_workdir()
        first = self._write_markdown(
            workdir, "test1.md", "this is a markdown file!")
        second = self._write_markdown(
            workdir, "test2.md", "this is a markdown file number 2!")
        self.assertEqual(self._run_cli(first, second), 12)

    def test_file_does_not_exist(self):
        # Test that the program exits when the file doesn't exist
        missing = os.path.join(self._make_workdir(), "something.md")
        with self.assertRaises(SystemExit):
            self._run_cli(missing)

    def test_simple_text(self):
        text = textwrap.dedent("""
            test a b c
            """)
        self.assertEqual(count_words_in_markdown(text), 4)

    def test_headings(self):
        text = textwrap.dedent("""
            # H1
            ## H2
            ### H3

            H1
            -----

            H1
            =====

            ### My Great Heading {#custom-id}
            """)
        self.assertEqual(count_words_in_markdown(text), 8)

    def test_inline(self):
        text = textwrap.dedent("""
            **bold text**
            *italicized text*
            `test`
            ~~test~~
            """)
        self.assertEqual(count_words_in_markdown(text), 6)

    def test_comments(self):
        # Both single-line and multi-line HTML comments are ignored
        text = textwrap.dedent("""
            <!-- This is a comment -->

            <!--
                This is
                a multiline
                comment
            -->

            Test
            """)
        self.assertEqual(count_words_in_markdown(text), 1)

    def test_quote(self):
        text = textwrap.dedent("""
            > blockquote
            """)
        self.assertEqual(count_words_in_markdown(text), 1)

    def test_enumeration(self):
        text = textwrap.dedent("""
            1. foo
            2. bar
            #. smart item
            """)
        self.assertEqual(count_words_in_markdown(text), 4)

    def test_bullet_points(self):
        text = textwrap.dedent("""
            - foo
            - bar
            """)
        self.assertEqual(count_words_in_markdown(text), 2)

    def test_nested_bullet_points(self):
        # Indented list items must not be mistaken for indented code
        text = textwrap.dedent("""
            - foo
              - bar
                - test
            """)
        self.assertEqual(count_words_in_markdown(text), 3)

    def test_nested_star_bullet_points(self):
        text = textwrap.dedent("""
            - foo
            - bar
                * test
                * baz
            """)
        self.assertEqual(count_words_in_markdown(text), 4)

    def test_indented_code_block(self):
        # The 4-space indented line is code and is not counted
        text = textwrap.dedent("""
            foo bar

                test code
            """)
        self.assertEqual(count_words_in_markdown(text), 2)

    def test_code_block(self):
        # Only the fence markers are stripped; fenced content is counted
        text = textwrap.dedent("""
            ```
            test
            ```
            """)
        self.assertEqual(count_words_in_markdown(text), 1)

    def test_link(self):
        text = textwrap.dedent("""
            Some [linked text](https://google.com/).
            """)
        self.assertEqual(count_words_in_markdown(text), 3)

    def test_image(self):
        # Images are removed entirely, including alt text and title
        text = textwrap.dedent("""
            test

            ![image](image.png)

            ![alt text](image.png "Image title")

            test
            """)
        self.assertEqual(count_words_in_markdown(text), 2)

    def test_footnote(self):
        text = textwrap.dedent("""
            MWC is great [1].

            [1] source footnote
            [1](do count this one please)

            Followup text
            """)
        self.assertEqual(count_words_in_markdown(text), 10)

    def test_html_tags(self):
        # Tags are dropped, the text between them is counted
        text = textwrap.dedent("""
            <span>test</span>

            <div>
            test
            </div>
            <b>test</b>
            """)
        self.assertEqual(count_words_in_markdown(text), 3)

    def test_custom_header_tags(self):
        text = textwrap.dedent("""
            ## header1 {#header1}
            foo bar
            ## header2 {#header2}
            """)
        self.assertEqual(count_words_in_markdown(text), 4)
198 |
--------------------------------------------------------------------------------