├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── html2markdown.py ├── requirements.txt ├── setup.cfg ├── setup.py └── tests.py /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.7 4 | - 3.6 5 | cache: pip 6 | install: 7 | - pip install -r requirements.txt 8 | - pip install markdown 9 | script: 10 | - python tests.py -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2017 David L (dlon) 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ============= 2 | html2markdown 3 | ============= 4 | 5 | .. image:: https://travis-ci.com/dlon/html2markdown.svg?branch=master 6 | :target: https://travis-ci.com/dlon/html2markdown 7 | 8 | **Experimental** 9 | 10 | **Purpose**: Converts html to markdown while preserving unsupported html markup. The goal is to generate markdown that can be converted back into html. This is the major difference between html2markdown and html2text. The latter doesn't purport to be reversible. 11 | 12 | Usage example 13 | ============= 14 | :: 15 | 16 | import html2markdown 17 | print html2markdown.convert('
<h2>Test</h2><pre><code>Here is some code</code></pre>')
18 |
19 | Output::
20 |
21 | ## Test
22 |
23 | Here is some code
24 |
25 | Information and caveats
26 | =======================
27 |
28 | Does not convert the content of block-type tags other than ```` -- such as ``
this is stuff. stuff
`` 38 | 39 | **Result**: ``this is stuff. __stuff__`` (surrounded by a newline on either side) 40 | 41 | **Input**: ``strike through some text here`` 42 | 43 | **Result**: ``strike __through__ some text here`` 44 | 45 | Except in unprocessed block-type tags, formatting characters are escaped 46 | ------------------------------------------------------------------------ 47 | 48 | **Input**: ``**escape me?**
`` (in html, we would use \ here) 49 | 50 | **Result**: ``\*\*escape me?\*\*`` 51 | 52 | **Input**: ``**escape me?**`` 53 | 54 | **Result**: ``\*\*escape me?\*\*`` 55 | 56 | **Input**: ``126 | # does not:asdf
asdf
)""" 18 | testStr = '
's content should be converted""" 24 | testStr = '
this is stuff. stuff
' 25 | expectedStr = 'this is stuff. __stuff__' 26 | mdStr = html2markdown.convert(testStr) 27 | self.assertEqual(mdStr, expectedStr) 28 | 29 | def test_inline_tag_break(self): 30 | """inline-type tags should not cause line breaks""" 31 | emptyElements = self.emptyElements 32 | for tag in html2markdown._inlineTags: 33 | if tag not in emptyElements: 34 | testStr = 'test <%s>test%s> test
' % (tag, tag) 35 | else: 36 | testStr = 'test <%s /> test
' % tag 37 | mdStr = html2markdown.convert(testStr) 38 | bs = bs4.BeautifulSoup(markdown.markdown(mdStr), 'html.parser') 39 | 40 | self.assertEqual(len(bs.find_all('p')), 1) 41 | 42 | def test_inline_tag_content(self): 43 | """content of inline-type tags should be converted""" 44 | emptyElements = self.emptyElements 45 | for tag in html2markdown._inlineTags: 46 | if tag in emptyElements: 47 | continue 48 | 49 | testStr = '<%s style="text-decoration:line-through;">strike through some text%s> here' % (tag, tag) 50 | expectedStr = '<%s style="text-decoration:line-through;">strike __through__ some text%s> here' % (tag, tag) 51 | 52 | mdStr = html2markdown.convert(testStr) 53 | 54 | self.assertEqual(mdStr, expectedStr, 'Tag: {}'.format(tag)) 55 | 56 | bs = bs4.BeautifulSoup(markdown.markdown(mdStr), 'html.parser') 57 | self.assertEqual( 58 | len(bs.find_all('strong')), 1 if tag != 'strong' else 2, 59 | 'Tag: {}. Conversion: {}'.format(tag, mdStr) 60 | ) 61 | 62 | class TestEscaping(unittest.TestCase): 63 | 64 | escapableChars = r'\`*_{}[]()#+-.!' 65 | 66 | @classmethod 67 | def setUpClass(cls): 68 | cls.escapedChars = html2markdown._escapeCharSequence 69 | 70 | def test_block_tag_escaping(self): 71 | """formatting characters should NOT be escaped for block-type tags (except)""" 72 | for escChar in self.escapableChars: 73 | testStr = '
**escape me**
'.replace('*', escChar) 82 | expectedStr = '\*\*escape me\*\*'.replace('*', escChar) 83 | mdStr = html2markdown.convert(testStr) 84 | self.assertEqual(mdStr, expectedStr) 85 | 86 | def test_p_escaping_2(self): 87 | """ensure all escapable characters are retained for""" 88 | for escChar in self.escapableChars: 89 | testStr = '
**escape me**
'.replace('*', escChar) 90 | mdStr = html2markdown.convert(testStr) 91 | reconstructedStr = markdown.markdown(mdStr) 92 | self.assertEqual(reconstructedStr, testStr) 93 | 94 | def test_inline_tag_escaping(self): 95 | """formatting characters should be escaped for inline-type tags""" 96 | for escChar in self.escapedChars: 97 | testStr = '**escape me**' 98 | expectedStr = '\*\*escape me\*\*' 99 | mdStr = html2markdown.convert(testStr) 100 | self.assertEqual(mdStr, expectedStr) 101 | 102 | def test_inline_tag_escaping_2(self): 103 | """ensure all escapable characters are retained for inline-type tags""" 104 | for escChar in self.escapableChars: 105 | testStr = '**escape me**
' 106 | mdStr = html2markdown.convert(testStr) 107 | reconstructedStr = markdown.markdown(mdStr) 108 | self.assertEqual(reconstructedStr, testStr) 109 | 110 | def test_header(self): 111 | result = html2markdown.convert('# test
') 112 | bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser') 113 | self.assertEqual(len(bs.find_all('h1')), 0) 114 | 115 | result = html2markdown.convert('[http://google.com](test)
') 121 | bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser') 122 | self.assertEqual(len(bs.find_all('a')), 0) 123 | 124 | result = html2markdown.convert('') 125 | bs = bs4.BeautifulSoup(markdown.markdown(result), 'html.parser') 126 | self.assertEqual(len(bs.find_all('a')), 1) 127 | 128 | class TestTags(unittest.TestCase): 129 | 130 | genericStr = 'asdf
Here is some code
'
131 | problematic_a_string_1 = "before test after"
132 | problematic_a_string_2 = "before test after"
133 | problematic_a_string_3 = ""
134 | problematic_a_string_4 = "test"
135 | problematic_a_string_5 = "test"
136 | problematic_a_string_6 = "test"
137 |
138 | def test_h2(self):  # round-trip check: HTML -> Markdown -> HTML must yield an h2 whose text is 'Test'
139 | mdStr = html2markdown.convert(self.genericStr)  # convert the shared HTML fixture to Markdown
140 | reconstructedStr = markdown.markdown(mdStr)  # render that Markdown back to HTML
141 |
142 | bs = bs4.BeautifulSoup(reconstructedStr, 'html.parser')
143 | childTags = bs.find_all(recursive=False)  # direct children of the parsed document only
144 |
145 | self.assertEqual(childTags[1].name, 'h2')  # index 1: presumably the fixture's leading paragraph is index 0 -- TODO confirm
146 | self.assertEqual(childTags[1].string, 'Test')  # heading text must survive the round trip
147 |
148 | def test_a(self):
149 | mdStr = html2markdown.convert(self.problematic_a_string_1)
150 | self.assertEqual(mdStr, self.problematic_a_string_1,
151 | " tag without an href attribute should be left alone")
152 |
153 | mdStr = html2markdown.convert(self.problematic_a_string_2)
154 | self.assertEqual(mdStr, self.problematic_a_string_2,
155 | " tag without an href attribute should be left alone")
156 |
157 | mdStr = html2markdown.convert(self.problematic_a_string_3)
158 | self.assertEqual(mdStr, self.problematic_a_string_3,
159 | " tag without an href attribute should be left alone")
160 |
161 | mdStr = html2markdown.convert(self.problematic_a_string_4)
162 | self.assertEqual(mdStr, '[test](test "test")')
163 |
164 | mdStr = html2markdown.convert(self.problematic_a_string_5)
165 | self.assertEqual(mdStr, '