├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.rst
├── htmllistparse
│   ├── __init__.py
│   ├── htmllistparse.py
│   └── rehttpfs.py
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016-2017 Dingyuan Wang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
3 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | htmllisting-parser
2 | ==================
3 | Python parser for Apache/nginx-style HTML directory listing
4 |
5 | .. code-block:: python
6 |
7 | import htmllistparse
8 | cwd, listing = htmllistparse.fetch_listing(some_url, timeout=30)
9 |
10 | # or you can get the url and make a BeautifulSoup yourself, then use
11 | # cwd, listing = htmllistparse.parse(soup)
12 |
13 | where ``cwd`` is the current directory and ``listing`` is a list of ``FileEntry`` named tuples:
14 | 
15 | * ``name``: File name, ``str``. Has a trailing ``/`` if it's a directory.
16 | * ``modified``: Last modification time, ``time.struct_time`` or ``None``. Timezone is not known.
17 | * ``size``: File size, ``int`` or ``None``. May be an estimate derived from a size suffix such as "K" or "M".
18 | * ``description``: File description, file type, or anything else found. ``str`` containing HTML, or ``None``.
19 |
20 | Supports:
21 |
22 | * Vanilla Apache/nginx/lighttpd/darkhttpd autoindex
23 | * Most ``<pre>``-style index
24 | * Many other ``<table>``-style index
25 | * ``<ul>``-style
26 |
27 | .. note::
28 |     Please wrap the functions in a general ``try ... except`` block; they may raise exceptions unexpectedly on unusual pages.
29 |
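30 | A minimal end-to-end sketch, wrapped per the note above (it uses the same demo mirror as this module's ``__main__``):
31 | 
32 | .. code-block:: python
33 | 
34 |     import htmllistparse
35 | 
36 |     try:
37 |         cwd, listing = htmllistparse.fetch_listing(
38 |             'http://httpredir.debian.org/debian/', timeout=30)
39 |     except Exception as ex:
40 |         print('listing failed:', ex)
41 |     else:
42 |         print('Index of', cwd)
43 |         for f in listing:
44 |             print(f.name, f.size, f.modified, f.description)
45 | 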
30 | ReHTTPFS
31 | --------
32 |
33 | Reinvented HTTP Filesystem.
34 |
35 | * Mounts most HTTP file listings with FUSE.
36 | * Gets directory tree and file stats with less overhead.
37 | * Supports Range requests.
38 | * Supports Keep-Alive.
39 |
40 | ::
41 |
42 | usage: rehttpfs.py [-h] [-o OPTIONS] [-t TIMEOUT] [-u USER_AGENT] [-v] [-d]
43 | url mountpoint
44 |
45 | Mount HTML directory listings.
46 |
47 | positional arguments:
48 | url URL to mount
49 | mountpoint filesystem mount point
50 |
51 | optional arguments:
52 | -h, --help show this help message and exit
53 | -o OPTIONS comma separated FUSE options
54 | -t TIMEOUT, --timeout TIMEOUT
55 | HTTP request timeout
56 | -u USER_AGENT, --user-agent USER_AGENT
57 | HTTP User-Agent
58 | -v, --verbose enable debug logging
59 | -d, --daemon run in background
60 |
61 |
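62 | For example, mounting the same Debian mirror and detaching it afterwards (the mount point is illustrative)::
63 | 
64 |     python3 rehttpfs.py -t 60 http://httpredir.debian.org/debian/ /mnt/debian
65 |     fusermount -u /mnt/debian   # unmount when done
66 | 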
--------------------------------------------------------------------------------
/htmllistparse/__init__.py:
--------------------------------------------------------------------------------
1 | from .htmllistparse import *
2 |
3 | __version__ = '0.6.1'
4 |
--------------------------------------------------------------------------------
/htmllistparse/htmllistparse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import re
6 | import time
7 | import collections
8 | import urllib.parse
9 |
10 | import bs4
11 |
12 | RE_ISO8601 = re.compile(r'\d{4}-\d+-\d+T\d+:\d{2}:\d{2}Z')
13 | DATETIME_FMTs = (
14 | (re.compile(r'\d+-[A-S][a-y]{2}-\d{4} \d+:\d{2}:\d{2}'), "%d-%b-%Y %H:%M:%S"),
15 | (re.compile(r'\d+-[A-S][a-y]{2}-\d{4} \d+:\d{2}'), "%d-%b-%Y %H:%M"),
16 | (re.compile(r'\d{4}-\d+-\d+ \d+:\d{2}:\d{2}'), "%Y-%m-%d %H:%M:%S"),
17 | (RE_ISO8601, "%Y-%m-%dT%H:%M:%SZ"),
18 | (re.compile(r'\d{4}-\d+-\d+ \d+:\d{2}'), "%Y-%m-%d %H:%M"),
19 | (re.compile(r'\d{4}-[A-S][a-y]{2}-\d+ \d+:\d{2}:\d{2}'), "%Y-%b-%d %H:%M:%S"),
20 | (re.compile(r'\d{4}-[A-S][a-y]{2}-\d+ \d+:\d{2}'), "%Y-%b-%d %H:%M"),
21 | (re.compile(r'[F-W][a-u]{2} [A-S][a-y]{2} +\d+ \d{2}:\d{2}:\d{2} \d{4}'), "%a %b %d %H:%M:%S %Y"),
22 | (re.compile(r'[F-W][a-u]{2}, \d+ [A-S][a-y]{2} \d{4} \d{2}:\d{2}:\d{2} .+'), "%a, %d %b %Y %H:%M:%S %Z"),
23 | (re.compile(r'\d{4}-\d+-\d+'), "%Y-%m-%d"),
24 | (re.compile(r'\d+/\d+/\d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}'), "%d/%m/%Y %H:%M:%S %z"),
25 | (re.compile(r'\d{2} [A-S][a-y]{2} \d{4}'), "%d %b %Y")
26 | )
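27 | # NOTE: patterns are tried in order and the first match is used, so variants
28 | # with seconds precede their minutes-only counterparts.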
27 |
28 | RE_FILESIZE = re.compile(r'\d+(\.\d+)? ?[BKMGTPEZY]|\d+|-', re.I)
29 | RE_ABSPATH = re.compile(r'^((ht|f)tps?:/)?/')
30 | RE_COMMONHEAD = re.compile('Name|(Last )?modifi(ed|cation)|date|Size|Description|Metadata|Type|Parent Directory', re.I)
31 | RE_HASTEXT = re.compile('.+')
32 | RE_HEAD_NAME = re.compile('name$|^file|^download')
33 | RE_HEAD_MOD = re.compile('modifi|^uploaded|date|time')
34 | RE_HEAD_SIZE = re.compile('size|bytes$')
35 |
36 | FileEntry = collections.namedtuple('FileEntry', 'name modified size description')
37 |
38 | def human2bytes(s):
39 | """
40 | >>> human2bytes('1M')
41 | 1048576
42 | >>> human2bytes('1G')
43 | 1073741824
44 | """
45 | if s is None:
46 | return None
47 | try:
48 | return int(s)
49 | except ValueError:
50 | symbols = 'BKMGTPEZY'
51 | letter = s[-1:].strip().upper()
52 | num = float(s[:-1])
53 | prefix = {symbols[0]: 1}
54 |         for i, sym in enumerate(symbols[1:]):
55 |             prefix[sym] = 1 << (i+1)*10
56 | return int(num * prefix[letter])
57 |
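58 | # Convert an <a href> value to a display name: percent-decode, take the path
59 | # basename, and re-append a trailing '/' for directories.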
58 | def aherf2filename(a_href):
59 | isdir = ('/' if a_href[-1] == '/' else '')
60 | return os.path.basename(urllib.parse.unquote(a_href.rstrip('/'))) + isdir
61 |
62 | def parse(soup):
63 | '''
64 | Try to parse apache/nginx-style directory listing with all kinds of tricks.
65 |
66 |     Exceptions or an empty listing suggest a failure.
67 | We strongly recommend generating the `soup` with 'html5lib'.
68 |
69 | Returns: Current directory, Directory listing
70 | '''
71 | cwd = None
72 | listing = []
73 | if soup.title and soup.title.string and soup.title.string.startswith('Index of '):
74 | cwd = soup.title.string[9:]
75 | elif soup.h1:
76 | title = soup.h1.get_text().strip()
77 | if title.startswith('Index of '):
78 | cwd = title[9:]
79 |     for img in soup.find_all('img'): img.decompose()  # strip icons so they don't leak into names/descriptions
80 | file_name = file_mod = file_size = file_desc = None
81 | pres = [x for x in soup.find_all('pre') if
82 | x.find('a', string=RE_HASTEXT)]
83 | tables = [x for x in soup.find_all('table') if
84 | x.find(string=RE_COMMONHEAD)] if not pres else ()
85 | heads = []
86 | if pres:
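87 |         # <pre>-style listing (classic Apache autoindex): each entry is an
88 |         # <a> tag, and its date/size/description live in the bare text node
89 |         # that follows the link.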
87 | pre = pres[0]
88 | started = False
89 | for element in (pre.hr.next_siblings if pre.hr else pre.children):
90 | if element.name == 'a':
91 | if not element.string or not element.string.strip():
92 | continue
93 | elif started:
94 | if file_name:
95 | listing.append(FileEntry(
96 | file_name, file_mod, file_size, file_desc))
97 | file_name = aherf2filename(element['href'])
98 | file_mod = file_size = file_desc = None
99 | elif element.string in ('Parent Directory', '..', '../'):
100 | # start with next a
101 | started = True
102 | elif element['href'][0] not in '?/':
103 | # start right away
104 | file_name = aherf2filename(element['href'])
105 | file_mod = file_size = file_desc = None
106 | started = True
107 | elif not element.name:
108 | line = element.string.replace('\r', '').split('\n', 1)[0].lstrip()
109 | for regex, fmt in DATETIME_FMTs:
110 | match = regex.match(line)
111 | if match:
112 | file_mod = time.strptime(match.group(0), fmt)
113 | line = line[match.end():].lstrip()
114 | break
115 | match = RE_FILESIZE.match(line)
116 | if match:
117 | sizestr = match.group(0)
118 | if sizestr == '-':
119 | file_size = None
120 | else:
121 | file_size = human2bytes(sizestr.replace(' ', '').replace(',', ''))
122 | line = line[match.end():].lstrip()
123 | if line:
124 | file_desc = line.rstrip()
125 | if file_name and file_desc == '/':
126 | file_name += '/'
127 | file_desc = None
128 | else:
129 | continue
130 | if file_name:
131 | listing.append(FileEntry(file_name, file_mod, file_size, file_desc))
132 | elif tables:
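133 |         # <table>-style listing: first infer each column's role from the
134 |         # header row (building `heads`), then walk the data rows and fill
135 |         # name/modified/size/description by column position.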
133 | started = False
134 | for tr in tables[0].find_all('tr'):
135 | status = 0
136 | file_name = file_mod = file_size = file_desc = None
137 | if started:
138 | if tr.parent.name in ('thead', 'tfoot') or tr.th:
139 | continue
140 | for td in tr.find_all('td'):
141 | if status >= len(heads):
142 | raise AssertionError("can't detect table column number")
143 | if td.get('colspan'):
144 | continue
145 | elif heads[status] == 'name':
146 | if not td.a:
147 | continue
148 | a_str = td.a.get_text().strip()
149 | a_href = td.a['href']
150 | if not a_str or not a_href or a_href[0] == '#':
151 | continue
152 | elif a_str == 'Parent Directory' or a_href == '../':
153 | break
154 | else:
155 | file_name = aherf2filename(a_href)
156 | status = 1
157 | elif heads[status] == 'modified':
158 | if td.time:
159 | timestr = td.time.get('datetime', '')
160 | if RE_ISO8601.match(timestr):
161 | file_mod = time.strptime(timestr, "%Y-%m-%dT%H:%M:%SZ")
162 | status += 1
163 | continue
164 | timestr = td.get_text().strip()
165 | if timestr:
166 | for regex, fmt in DATETIME_FMTs:
167 | match = regex.match(timestr)
168 | if match:
169 | file_mod = time.strptime(match.group(0), fmt)
170 | break
171 | else:
172 | if td.get('data-sort-value'):
173 | file_mod = time.gmtime(int(td['data-sort-value']))
174 | # else:
175 | # raise AssertionError(
176 | # "can't identify date/time format")
177 | status += 1
178 | elif heads[status] == 'size':
179 | sizestr = td.get_text().strip().replace(',', '')
180 | if sizestr == '-' or not sizestr:
181 | file_size = None
182 | elif td.get('data-sort-value'):
183 | file_size = int(td['data-sort-value'])
184 | else:
185 | match = RE_FILESIZE.match(sizestr)
186 | if match:
187 | file_size = human2bytes(
188 | match.group(0).replace(' ', ''))
189 | else:
190 | file_size = None
191 | status += 1
192 | elif heads[status] == 'description':
193 | file_desc = file_desc or ''.join(map(str, td.children)
194 | ).strip(' \t\n\r\x0b\x0c\xa0') or None
195 | status += 1
196 | elif status:
197 | # unknown header
198 | status += 1
199 | if file_name:
200 | listing.append(FileEntry(
201 | file_name, file_mod, file_size, file_desc))
202 | elif tr.hr:
203 | started = True
204 | continue
205 | elif tr.find(string=RE_COMMONHEAD):
206 | namefound = False
207 | colspan = False
208 | for th in (tr.find_all('th') if tr.th else tr.find_all('td')):
209 | if th.get('colspan'):
210 | colspan = True
211 | continue
212 | name = th.get_text().strip(' \t\n\r\x0b\x0c\xa0↑↓').lower()
213 | if not name:
214 | continue
215 | elif not namefound and RE_HEAD_NAME.search(name):
216 | heads.append('name')
217 | namefound = True
218 | elif name in ('size', 'description'):
219 | heads.append(name)
220 | elif RE_HEAD_MOD.search(name):
221 | heads.append('modified')
222 | elif RE_HEAD_SIZE.search(name):
223 | heads.append('size')
224 | elif name.endswith('signature'):
225 | heads.append('signature')
226 | else:
227 | heads.append('description')
228 | if colspan:
229 | continue
230 | if not heads:
231 | heads = ('name', 'modified', 'size', 'description')
232 | elif not namefound:
233 | heads[0] = 'name'
234 | started = True
235 | continue
236 | elif soup.ul:
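237 |         # Bare <ul> listing: only file names are available; modified, size
238 |         # and description stay None.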
237 | for li in soup.ul.find_all('li'):
238 | a = li.a
239 | if not a or not a.get('href'):
240 | continue
241 | file_name = urllib.parse.unquote(a['href'])
242 | if (file_name in {'Parent Directory', '.', './', '..', '../', '#'}
243 | or RE_ABSPATH.match(file_name)):
244 | continue
245 | else:
246 | listing.append(FileEntry(file_name, None, None, None))
247 | return cwd, listing
248 |
249 | def fetch_listing(url, timeout=30, **requests_kwargs):
250 |     import requests  # lazy import: parsing alone doesn't require requests
251 | req = requests.get(url, timeout=timeout, **requests_kwargs)
252 | req.raise_for_status()
253 | soup = bs4.BeautifulSoup(req.content, 'html5lib')
254 | return parse(soup)
255 |
256 | if __name__ == '__main__':
257 | import sys
258 | import requests
259 | for url in sys.argv[1:] or ('http://httpredir.debian.org/debian/',):
260 | req = requests.get(url, timeout=30)
261 | req.raise_for_status()
262 | print(req.url)
263 | soup = bs4.BeautifulSoup(req.content, 'html5lib')
264 | cwd, listing = parse(soup)
265 | print('Cwd:', cwd)
266 | for f in listing:
267 | print(f)
268 | print()
269 |
--------------------------------------------------------------------------------
/htmllistparse/rehttpfs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import io
6 | import stat
7 | import time
8 | import logging
9 | import argparse
10 | import calendar
11 | import urllib.parse
12 | from errno import EACCES, ENOENT, EIO
13 | from email.utils import parsedate
14 |
15 | import bs4
16 | import requests
17 | import htmllistparse
18 | try:
19 | import fusepy as fuse
20 | except ImportError:
21 | import fuse
22 |
23 | CONFIG = {
24 | 'timeout': None,
25 | 'user_agent': None,
26 | }
27 | SESSION = requests.Session()
28 | CONTENT_CHUNK_SIZE = 10 * 1024
29 |
30 |
31 | def parse_dir(html):
32 | return htmllistparse.parse(bs4.BeautifulSoup(html, 'html5lib'))
33 |
34 |
35 | def make_url(urlbase, name):
36 | return urllib.parse.urljoin(urlbase, urllib.parse.quote(name.lstrip('/')))
37 |
38 |
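39 | # Human-readable size for FileStat.__repr__; returns a bare int below 1 KiB
40 | # and a formatted string with a unit suffix otherwise.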
39 | def sizeof_fmt(num):
40 | for unit in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
41 | if abs(num) < 1024:
42 | if unit:
43 | return "%3.1f%s" % (num, unit)
44 | else:
45 | return int(num)
46 | num /= 1024.0
47 | return "%.1f%s" % (num, 'Y')
48 |
49 |
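50 | # Parse a comma-separated "-o" option string into kwargs for fuse.FUSE, e.g.
51 | # "allow_other,uid=1000" -> {'allow_other': True, 'uid': '1000'}.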
50 | def convert_fuse_options(options):
51 | kwargs = {}
52 | if options is not None:
53 | for opt in options.split(','):
54 | kv = opt.split('=', 1)
55 | if len(kv) == 1:
56 | kwargs[kv[0]] = True
57 | else:
58 | kwargs[kv[0]] = kv[1]
59 | return kwargs
60 |
61 |
62 | class IsADirectory(ValueError):
63 | pass
64 |
65 |
66 | class FileStat:
67 | __slots__ = (
68 | 'st_mode', 'st_nlink', 'st_uid', 'st_gid', 'st_size',
69 | 'st_atime', 'st_mtime', 'st_ctime'
70 | )
71 |
72 | def __init__(self):
73 | self.st_mode = stat.S_IFREG | 0o444
74 | self.st_nlink = 1
75 | self.st_uid = 0
76 | self.st_gid = 0
77 | self.st_size = 0
78 | self.st_atime = 0
79 | self.st_mtime = 0
80 | self.st_ctime = 0
81 |
82 | def settime(self, value):
83 | self.st_atime = self.st_mtime = self.st_ctime = value
84 |
85 | def setmode(self, value, isdir=False):
86 | if isdir:
87 | self.st_mode = stat.S_IFDIR | value
88 | else:
89 | self.st_mode = stat.S_IFREG | value
90 |
91 | def __getitem__(self, key):
92 | return getattr(self, key)
93 |
94 | def items(self):
95 | for key in self.__slots__:
96 | yield key, getattr(self, key)
97 |
98 | def __repr__(self):
99 |         return '<FileStat mode=%s size=%s mtime=%s>' % (
100 | self.st_mode, sizeof_fmt(self.st_size), self.st_mtime)
101 |
102 |
103 | class File(io.IOBase):
104 | __slots__ = (
105 | 'baseurl', 'path', 'url', 'stat', 'init',
106 | 'exist', '_readable', '_seekable', 'offset'
107 | )
108 |
109 | def __init__(self, baseurl, path):
110 | self.baseurl = baseurl
111 | self.path = path
112 | self.url = make_url(baseurl, path)
113 | self.stat = FileStat()
114 | self.init = 0
115 | self.exist = True
116 | self._readable = True
117 | self._seekable = False
118 | self.offset = 0
119 |
120 | def get_stat(self):
121 |         req = SESSION.head(self.url, timeout=CONFIG['timeout'],
122 |                            allow_redirects=False)
123 | req.close()
124 | if 400 <= req.status_code <= 499:
125 | self.stat.setmode(0o000)
126 | self.init = 2
127 | self._readable = False
128 | if req.status_code == 404:
129 | self.exist = False
130 | return self.stat
131 | elif req.status_code in (301, 302):
132 | raise IsADirectory()
133 | else:
134 | req.raise_for_status()
135 | self.stat.st_size = int(req.headers.get('Content-Length', 0))
136 | lm = req.headers.get('Last-Modified')
137 | if lm:
138 |             self.stat.settime(calendar.timegm(parsedate(lm)))  # Last-Modified is GMT
139 | else:
140 | self.stat.settime(time.time())
141 | if req.headers.get('Accept-Ranges') == 'bytes':
142 | self._seekable = True
143 | self.init = 2
144 | return self.stat
145 |
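146 |     # Read `size` bytes from `offset` with an HTTP Range request; a plain
147 |     # 200 response means the server ignored Range, so only offset-0 reads
148 |     # can be served in that case.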
146 | def read(self, size=None, offset=None):
147 | if not self.init or not self.stat.st_size:
148 | self.get_stat()
149 | if not self.exist:
150 | raise fuse.FuseOSError(ENOENT)
151 | elif not self.readable():
152 | raise fuse.FuseOSError(EIO)
153 | if offset is None:
154 | offset = self.offset
155 | end = min(self.stat.st_size, offset + size - 1)
156 | brange = '%d-%d' % (offset, end)
157 | headers = {'range': 'bytes=' + brange}
158 | req = SESSION.get(self.url, headers=headers,
159 | stream=True, timeout=CONFIG['timeout'])
160 | if req.status_code == 206:
161 | self._seekable = True
162 | elif req.status_code == 416:
163 | # we may have a wrong size
164 | self.get_stat()
165 | raise fuse.FuseOSError(EIO)
166 | elif req.status_code == 200:
167 | self._seekable = False
168 | if offset != 0:
169 | raise fuse.FuseOSError(EIO)
170 | elif req.status_code == 403:
171 | self._readable = False
172 | raise fuse.FuseOSError(EACCES)
173 | elif req.status_code == 404:
174 | self.exist = False
175 | self._readable = False
176 | raise fuse.FuseOSError(ENOENT)
177 | else:
178 | self._readable = False
179 | raise fuse.FuseOSError(EIO)
180 | content = bytes()
181 | for chunk in req.iter_content(CONTENT_CHUNK_SIZE, False):
182 | content += chunk
183 | if len(content) > size:
184 | content = content[:size]
185 | break
186 | req.close()
187 | if self._seekable:
188 | self.offset = end
189 | return content
190 |
191 | def readable(self):
192 | return self._readable
193 |
194 | def seekable(self):
195 | return self._seekable
196 |
197 | def seek(self, offset):
198 | if self._seekable:
199 | self.offset = offset
200 |
201 | def tell(self):
202 | return self.offset
203 |
204 |
205 | class Directory:
206 | __slots__ = (
207 | 'baseurl', 'path', 'url', 'stat', 'content', 'init', 'exist', '_readable'
208 | )
209 |
210 | def __init__(self, baseurl, path):
211 | self.baseurl = baseurl
212 | self.path = path
213 | self.url = make_url(baseurl, path)
214 | self.stat = FileStat()
215 | self.stat.setmode(0o555, True)
216 | self.stat.st_nlink = 2
217 | self.content = ['.', '..']
218 | self.init = 0
219 | self.exist = True
220 | self._readable = True
221 |
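222 |     # Fetch and parse this directory's HTML listing; returns a map of
223 |     # {path: File|Directory} objects for the filesystem's metacache.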
222 | def read(self):
223 | try:
224 | req = SESSION.get(self.url, timeout=CONFIG['timeout'])
225 | except Exception:
226 | raise fuse.FuseOSError(EIO)
227 | try:
228 | req.raise_for_status()
229 | except requests.exceptions.HTTPError:
230 | self.stat.setmode(0o000, True)
231 | self.init = 2
232 | self._readable = False
233 | if req.status_code == 403:
234 | raise fuse.FuseOSError(EACCES)
235 | elif req.status_code == 404:
236 | self.exist = False
237 | raise fuse.FuseOSError(ENOENT)
238 | else:
239 | raise fuse.FuseOSError(EIO)
240 | lm = req.headers.get('Last-Modified')
241 | if lm:
242 |             self.stat.settime(calendar.timegm(parsedate(lm)))  # Last-Modified is GMT
243 | else:
244 | self.stat.settime(time.time())
245 | try:
246 | cwd, listing = parse_dir(req.content)
247 | except Exception:
248 |             logging.exception('failed to parse listing: %s', self.url)
249 | listing = []
250 | content = ['.', '..']
251 | objmap = {}
252 | for name, modified, size, description in listing:
253 | fpath = os.path.join(self.path, name)
254 | if name[-1] == '/':
255 | fileobj = Directory(self.baseurl, fpath)
256 | fpath = fpath.rstrip('/')
257 | else:
258 | fileobj = File(self.baseurl, fpath)
259 | if size is None:
260 | fileobj.get_stat()
261 | else:
262 | fileobj.stat.st_size = size
263 | if modified:
264 | fileobj.stat.settime(calendar.timegm(modified))
265 | else:
266 | fileobj.stat.settime(self.stat.st_mtime)
267 | fileobj.init = fileobj.init or 1
268 | content.append(name.rstrip('/'))
269 | objmap[fpath] = fileobj
270 | self.content = content
271 | self.stat.st_nlink = len(content)
272 | self.init = 2
273 | self._readable = True
274 | return objmap
275 |
276 | def readable(self):
277 | return self._readable
278 |
279 |
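280 | # Cached objects track an `init` level: 0 = never fetched, 1 = stats taken
281 | # from a parent directory listing, 2 = confirmed by a direct request.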
280 | class rehttpfs(fuse.LoggingMixIn, fuse.Operations):
281 | '''Reinvented HTTP Filesystem'''
282 |
283 | def __init__(self, url):
284 | self.url = url
285 | if url[-1] != '/':
286 | self.url += '/'
287 | self.fd = 0
288 | self.metacache = {'/': Directory(self.url, '/')}
289 |
290 | def _getpath(self, path, refresh=False):
291 | pathobj = self.metacache.get(path)
292 | if isinstance(pathobj, Directory):
293 | return self._getdirobj(path, refresh)
294 | else:
295 | return self._getfileobj(path, refresh)
296 |
297 | def _makeparents(self, path):
298 | while path != '/':
299 | path = os.path.dirname(path)
300 | if path not in self.metacache:
301 | self.metacache[path] = Directory(self.url, path + '/')
302 | else:
303 | break
304 |
305 | def _getfileobj(self, path, refresh=False):
306 | logging.debug('_getfileobj: %s', path)
307 | fileobj = self.metacache.get(path)
308 | try:
309 | if fileobj:
310 | if not fileobj.init or refresh:
311 | fileobj.get_stat()
312 | else:
313 | self._makeparents(path)
314 | fileobj = File(self.url, path)
315 | fileobj.get_stat()
316 | self.metacache[path] = fileobj
317 | except IsADirectory:
318 | logging.info('IsADirectory: %s', path)
319 | return self._getdirobj(path, refresh)
320 | return fileobj
321 |
322 | def _getdirobj(self, path, refresh=False):
323 | logging.debug('_getdirobj: %s', path)
324 | path = path.rstrip('/')
325 | dirobj = self.metacache.get(path)
326 | if dirobj:
327 | if not dirobj.init or refresh:
328 | objmap = dirobj.read()
329 | self._update_metacache(objmap)
330 | else:
331 | self._makeparents(path)
332 | dirobj = Directory(self.url, path + '/')
333 | objmap = dirobj.read()
334 | self._update_metacache(objmap)
335 | self.metacache[path] = dirobj
336 | return dirobj
337 |
338 | def _update_metacache(self, objmap):
339 | for name, obj in objmap.items():
340 | cached = self.metacache.get(name)
341 |             if not (cached and cached.init and type(cached) is type(obj)):
342 | self.metacache[name] = obj
343 |
344 | def access(self, path, amode):
345 | if amode & os.W_OK:
346 | raise fuse.FuseOSError(EACCES)
347 | obj = self._getpath(path)
348 | if not obj.exist:
349 | raise fuse.FuseOSError(ENOENT)
350 | elif (obj.stat.st_mode & amode) != amode:
351 | raise fuse.FuseOSError(EACCES)
352 | return 0
353 |
354 | def getattr(self, path, fh=None):
355 | logging.debug('getattr: %s', path)
356 | obj = self._getpath(path)
357 | if not obj.exist:
358 | raise fuse.FuseOSError(ENOENT)
359 | return obj.stat
360 |
361 | def open(self, path, flags):
362 | self.fd += 1
363 | return self.fd
364 |
365 | def opendir(self, path):
366 | self.fd += 1
367 | return self.fd
368 |
369 | def read(self, path, size, offset, fh):
370 | fileobj = self._getfileobj(path, False)
371 | return fileobj.read(size, offset)
372 |
373 | def readdir(self, path, fh):
374 | logging.debug('readdir: %s', path)
375 | dirobj = self._getdirobj(path)
376 | if dirobj.init != 2:
377 | objmap = dirobj.read()
378 | self._update_metacache(objmap)
379 | content = []
380 | for name in dirobj.content:
381 | fpath = os.path.normpath(os.path.join(path, name))
382 | content.append((name, self.metacache[fpath].stat, 0))
383 | return content
384 |
385 |
386 | def main():
387 | parser = argparse.ArgumentParser(description="Mount HTML directory listings.")
388 | parser.add_argument("-o", help="comma separated FUSE options", metavar='OPTIONS')
389 | parser.add_argument("-t", "--timeout", help="HTTP request timeout", type=int, default=30)
390 | parser.add_argument("-u", "--user-agent", help="HTTP User-Agent")
391 | parser.add_argument("-v", "--verbose", help="enable debug logging", action='store_true')
392 | parser.add_argument("-d", "--daemon", help="run in background", action='store_true')
393 | parser.add_argument("url", help="URL to mount")
394 | parser.add_argument("mountpoint", help="filesystem mount point")
395 | args = parser.parse_args()
396 | logging.basicConfig(
397 | format='%(levelname)s:%(name)s %(message)s',
398 | level=logging.DEBUG if args.verbose else logging.INFO
399 | )
400 | CONFIG['timeout'] = args.timeout
401 |     CONFIG['user_agent'] = args.user_agent
402 |     if args.user_agent:
403 |         # apply the configured User-Agent to the shared session
404 |         SESSION.headers['User-Agent'] = args.user_agent
405 |     fuse.FUSE(
403 | rehttpfs(args.url),
404 | args.mountpoint,
405 | foreground=(not args.daemon),
406 | **convert_fuse_options(args.o)
407 | )
408 |
409 |
410 | if __name__ == '__main__':
411 | main()
412 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | html5lib
3 | requests
4 | fusepy
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | try:
5 | from setuptools import setup
6 | except ImportError:
7 | from distutils.core import setup
8 |
9 | if sys.version_info < (3, 3):
10 | raise NotImplementedError("You need at least Python 3.3.")
11 |
12 | setup(
13 | name='htmllistparse',
14 | version='0.6.1',
15 | description='Python parser for Apache/nginx-style HTML directory listing.',
16 | long_description=open('README.rst', 'r').read(),
17 | author='Dingyuan Wang',
18 | author_email='gumblex@aosc.io',
19 | url='https://github.com/gumblex/htmllisting-parser',
20 | packages=['htmllistparse'],
21 | install_requires=[
22 | 'beautifulsoup4',
23 | 'html5lib',
24 | 'requests',
25 | 'fusepy'
26 | ],
27 |     entry_points={
28 | 'console_scripts': ['rehttpfs=htmllistparse.rehttpfs:main'],
29 | },
30 | license='MIT',
31 | platforms='any',
32 | classifiers=[
33 | 'Development Status :: 4 - Beta',
34 | 'Intended Audience :: Developers',
35 | 'License :: OSI Approved :: MIT License',
36 | 'Topic :: Internet :: WWW/HTTP',
37 | 'Programming Language :: Python :: 3',
38 | 'Programming Language :: Python :: 3 :: Only',
39 | ],
40 | keywords='apache nginx listing fuse'
41 | )
42 |
--------------------------------------------------------------------------------