.
675 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # tomd
2 |
3 | ![License](https://img.shields.io/pypi/l/tomd.svg)
4 | ![PyPI version](https://img.shields.io/pypi/v/tomd.svg)
5 | ![Python versions](https://img.shields.io/pypi/pyversions/tomd.svg)
6 |
7 | When crawling online articles (news, blogs, etc.), I want to save them as Markdown files rather than in a database.
8 | Tomd converts HTML back into Markdown. It works best on HTML that was originally generated from Markdown; if an HTML construct cannot be expressed in Markdown, tomd cannot convert it correctly.
9 | Tomd is a Python tool.
10 |
11 |
12 | ## Road map
13 |
14 | - [x] Basic support
15 | - [ ] Full support (nested lists)
16 | - [ ] Command line tool
17 |
18 | ## Installation
19 |
20 | `pip install tomd`
21 |
22 | ## Getting Started
23 |
24 | Input
25 |
26 | ```python
27 | import tomd
28 |
29 | tomd.Tomd('<h1>h1</h1>').markdown
30 | # or
31 | tomd.convert('<h1>h1</h1>')
32 | ```
33 |
34 | Output
35 |
36 | ```markdown
37 | # h1
38 | ```
39 |
40 | ## Usage
41 |
42 | ```python
43 | from tomd import Tomd
44 |
45 |
46 | html="""
47 | h1
48 | h2
49 | h3
50 | h4
51 | h5
52 | h6
53 | paragraph
54 | link
55 |
img
56 |
57 |
58 | - 1
59 | - 2
60 | - 3
61 |
62 |
63 | - 1
64 | - 2
65 | - 3
66 |
67 | blockquote
68 | inline code
69 | block code
70 |
71 | del
72 | bold
73 | italic
74 | bold italic
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 | th1 |
83 | th2 |
84 |
85 |
86 |
87 |
88 | td |
89 | td |
90 |
91 |
92 | td |
93 | td |
94 |
95 | """
96 |
97 |
98 | Tomd(html).markdown
99 | ```
100 |
101 | ## Result
102 |
103 | ```markdown
104 | # h1
105 |
106 | ## h2
107 |
108 | ### h3
109 |
110 | #### h4
111 |
112 | ##### h5
113 |
114 | ###### h6
115 |
116 | paragraph
117 | [link](https://github.com)
118 | 
119 |
120 |
121 | - 1
122 | - 2
123 | - 3
124 |
125 | 1. 1
126 | 1. 2
127 | 1. 3
128 |
129 | > blockquote
130 |
131 | `inline code`
132 |
133 |
134 | block code
135 |
136 |
137 | ~~del~~
138 | **bold**
139 | *italic*
140 | ***bold italic***
141 |
142 |
143 | ---
144 |
145 |
146 | |th1|th2
147 | |------
148 | |td|td
149 | |td|td
150 |
151 | ```
152 |
--------------------------------------------------------------------------------
/ex.md:
--------------------------------------------------------------------------------
1 | # h1
2 |
3 | ## h2
4 |
5 | ### h3
6 |
7 | #### h4
8 |
9 | ##### h5
10 |
11 | ###### h6
12 |
13 | paragraph
14 | [link](https://github.com)
15 | 
16 |
17 |
18 | - 1
19 | - 2
20 | - 3
21 |
22 | 1. 1
23 | 1. 2
24 | 1. 3
25 |
26 | > blockquote
27 |
28 | `inline code`
29 |
30 | ```
31 | block code
32 | ```
33 |
34 |
35 | ~~del~~
36 | **bold**
37 | *italic*
38 | ***bold italic***
39 | **em**
40 | **strong**
41 |
42 |
43 | ---
44 |
45 |
46 | |th1|th2
47 | |------
48 | |td|td
49 | |td|td
50 |
51 |
52 | # h1
53 |
54 | ## h2
55 |
56 | ### h3
57 |
58 | #### h4
59 |
60 | ##### h5
61 |
62 | ###### h6
63 |
64 | paragraph
65 | [link](https://github.com)
66 | 
67 |
68 |
69 | - 1
70 | - 2
71 | - 3
72 |
73 | 1. 1
74 | 1. 2
75 | 1. 3
76 |
77 | > blockquote
78 |
79 | `inline code`
80 |
81 | ```
82 | block code
83 | ```
84 |
85 |
86 | ~~del~~
87 | **bold**
88 | *italic*
89 | ***bold italic***
90 | **em**
91 | **strong**
92 |
93 |
94 | ---
95 |
96 |
97 | |th1|th2
98 | |------
99 | |td|td
100 | |td|td
101 |
102 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
# Package metadata for tomd, the HTML-to-Markdown converter.
setup(
    name="tomd",
    version="0.1.4",
    description="Convert HTML to Markdown.",
    author="Gaojiuli",
    author_email="gaojiuli@gmail.com",
    url='https://github.com/gaojiuli/tomd',
    classifiers=[
        'Development Status :: 5 - Production/Stable',
        'Intended Audience :: Developers',
        'License :: OSI Approved :: GNU General Public License (GPL)',
        # tomd.py is written with f-strings, so interpreters older than
        # Python 3.6 cannot even import it; only advertise what works.
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
    ],
    # Make pip refuse to install on interpreters that cannot parse f-strings
    # instead of failing later with a SyntaxError at import time.
    python_requires='>=3.6',
    install_requires=[
        'pyquery',
    ],
    license='GNU GPL 3',
    packages=find_packages(),
    py_modules=['tomd'],
    include_package_data=True,
    zip_safe=False,
)
27 |
--------------------------------------------------------------------------------
/test_tomd.py:
--------------------------------------------------------------------------------
1 | import tomd
2 | import requests
3 |
# Smoke test: download a real page and run it through the converter.
# The explicit timeout keeps the script from blocking forever when the host
# is unreachable, because requests applies no timeout by default.
r = requests.get('https://github.com/gaojiuli/toapi', timeout=10)
# With no declared encoding, ``r.text`` is decoded using the encoding that
# requests detects from the response body.
r.encoding = None
tomd.convert(r.text)
7 |
8 |
--------------------------------------------------------------------------------
/tomd.py:
--------------------------------------------------------------------------------
1 | from pyquery import PyQuery as pq
2 |
3 | __all__ = ['Tomd', 'convert']
4 |
# Block-level tags mapped to the Markdown prefix written before their text.
# Headings h1..h6 use one '#' per level.
MARKDOWN = dict(
    [('h%d' % level, '#' * level) for level in range(1, 7)]
    + [
        ('blockquote', '>'),
        ('li', '-'),
        ('hr', '---'),
        ('p', '\n'),
    ]
)

# Inline tags mapped to the (opening, closing) markers wrapped around their
# text; every supported tag uses the same marker on both sides.
INLINE = {
    tag: (marker, marker)
    for tag, marker in (
        ('em', '*'),
        ('strong', '**'),
        ('b', '**'),
        ('i', '*'),
        ('del', '~~'),
        ('code', '`'),
    )
}

# Placeholder inserted where a line break is wanted; it is swapped for a
# newline once the remaining markup has been reduced to plain text.
split_str = "++++++++++++++++++"
28 |
29 |
class Tomd:
    """Convert an HTML string to Markdown.

    The conversion is a fixed sequence of text substitutions. For each
    supported tag, every matching element is serialized and its occurrence
    in the HTML string is replaced by the equivalent Markdown. Line breaks
    are represented by ``split_str`` during processing and turned into real
    newlines at the very end.
    """

    def __init__(self, html=''):
        """Remember *html* so it can be converted through ``markdown``."""
        self.html = html
        self._markdown = ""

    @staticmethod
    def _substitute(html, selector, render):
        """Replace each element matching *selector* with ``render(node, tag)``.

        *html* is parsed once, before any replacement is made. ``render``
        receives the element wrapped in PyQuery plus its tag name and returns
        the replacement text, or ``None`` to leave the element untouched.
        Returns the updated HTML string.
        """
        for element in pq(html)(selector):
            node = pq(element)
            replacement = render(node, element.tag)
            if replacement is None:
                continue
            # NOTE(review): this relies on the serialized element appearing
            # verbatim in ``html``; when it does not, ``str.replace`` finds
            # nothing and the element is silently left as it was.
            html = html.replace(str(node), replacement)
        return html

    @staticmethod
    def _render_link(node, tag):
        """Render ``<a>`` as ``[text](href)``; skip links without an http URL."""
        href = node.attr('href')
        # ``attr`` yields None for an anchor without ``href``; testing
        # ``"http" in None`` would raise TypeError.
        if not href or "http" not in href:
            return None
        return f"[{node.text()}]({href})"

    @staticmethod
    def _render_image(node, tag):
        """Render ``<img>`` as ``![alt](src)``, using '' for missing attributes."""
        alt = node.attr('alt') or ''
        src = node.attr('src') or ''
        return f"![{alt}]({src})"

    @staticmethod
    def _render_table_head(node, tag):
        """Keep ``<thead>`` and append the ``|------`` header separator cells."""
        return node.outer_html() + '|------' * (node('th').length - 1)

    @staticmethod
    def _render_pre(node, tag):
        """Render ``<pre>`` as a fenced code block on its own lines."""
        body = node.html() or ''  # an empty <pre> has no inner HTML
        return "```" + split_str + body + split_str + "```" + split_str

    @staticmethod
    def _render_inline(node, tag):
        """Wrap the element text in the markers registered in ``INLINE``."""
        opening, closing = INLINE.get(tag)
        return opening + node.text() + closing

    @staticmethod
    def _render_block(node, tag):
        """Prefix the element text with its ``MARKDOWN`` mark on its own line."""
        return split_str + MARKDOWN.get(tag) + " " + node.text() + split_str

    def convert(self, html=""):
        """Convert *html* to Markdown, store the result and return it.

        Empty or whitespace-only input yields an empty string. The order of
        the substitution stages matters: later stages work on the output of
        earlier ones.
        """
        if not html or not html.strip():
            self._markdown = ""
            return self._markdown

        # Drop <head> so only the document body contributes to the output.
        d = pq(html)
        d('head').remove()
        html = d.html()

        stages = (
            ('span', lambda node, tag: node.text()),
            ('a', self._render_link),
            ('img', self._render_image),
            ('thead', self._render_table_head),
            ('th,td', lambda node, tag: "|" + node.text()),
            ('pre', self._render_pre),
            (','.join(INLINE.keys()), self._render_inline),
            (','.join(MARKDOWN.keys()), self._render_block),
        )
        for selector, render in stages:
            html = self._substitute(html, selector, render)

        # Strip whatever markup is left, then materialize the line breaks.
        self._markdown = pq(html).text().replace(split_str, '\n')

        # NOTE(review): printing from a library call looks like leftover
        # debugging; kept because existing callers may rely on the output.
        print(self._markdown)
        return self._markdown

    @property
    def markdown(self):
        """Markdown for the HTML given to the constructor (converted on access)."""
        self.convert(self.html)
        return self._markdown


# Module-level shortcut: ``tomd.convert(html)`` runs on one shared instance.
_inst = Tomd()
convert = _inst.convert
97 |
--------------------------------------------------------------------------------