├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.rst
├── htmllistparse
│   ├── __init__.py
│   ├── htmllistparse.py
│   └── rehttpfs.py
├── requirements.txt
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016-2017 Dingyuan Wang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include README.rst
2 | include LICENSE
3 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | htmllisting-parser
2 | ==================
3 | Python parser for Apache/nginx-style HTML directory listing
4 |
5 | .. code-block:: python
6 |
7 | import htmllistparse
8 | cwd, listing = htmllistparse.fetch_listing(some_url, timeout=30)
9 |
10 | # or you can get the url and make a BeautifulSoup yourself, then use
11 | # cwd, listing = htmllistparse.parse(soup)
12 |
13 | where ``cwd`` is the current directory and ``listing`` is a list of ``FileEntry`` named tuples:
14 | 
15 | * ``name``: File name, ``str``. Has a trailing ``/`` if it's a directory.
16 | * ``modified``: Last modification time, ``time.struct_time`` or ``None``. Timezone is not known.
17 | * ``size``: File size, ``int`` or ``None``. May be an estimate derived from a size suffix such as "K" or "M".
18 | * ``description``: File description, file type, or anything else found. ``str`` containing HTML, or ``None``.
19 |
20 | Supports:
21 |
22 | * Vanilla Apache/nginx/lighttpd/darkhttpd autoindex
23 | * Most ``<pre>``-style index
24 | * Many other ``<table>``-style index
25 | * ``<ul>``-style
26 |
27 | .. note::
28 |     Please wrap the functions in a general ``try ... except`` block; they may raise exceptions unexpectedly on unusual pages.
29 |
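30 | A minimal end-to-end sketch, wrapped per the note above (it uses the same demo mirror as this module's ``__main__``):
31 | 
32 | .. code-block:: python
33 | 
34 |     import htmllistparse
35 | 
36 |     try:
37 |         cwd, listing = htmllistparse.fetch_listing(
38 |             'http://httpredir.debian.org/debian/', timeout=30)
39 |     except Exception as ex:
40 |         print('listing failed:', ex)
41 |     else:
42 |         print('Index of', cwd)
43 |         for f in listing:
44 |             print(f.name, f.size, f.modified, f.description)
45 | 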
30 | ReHTTPFS
31 | --------
32 |
33 | Reinvented HTTP Filesystem.
34 |
35 | * Mounts most HTTP file listings with FUSE.
36 | * Gets directory tree and file stats with less overhead.
37 | * Supports Range requests.
38 | * Supports Keep-Alive.
39 |
40 | ::
41 |
42 | usage: rehttpfs.py [-h] [-o OPTIONS] [-t TIMEOUT] [-u USER_AGENT] [-v] [-d]
43 | url mountpoint
44 |
45 | Mount HTML directory listings.
46 |
47 | positional arguments:
48 | url URL to mount
49 | mountpoint filesystem mount point
50 |
51 | optional arguments:
52 | -h, --help show this help message and exit
53 | -o OPTIONS comma separated FUSE options
54 | -t TIMEOUT, --timeout TIMEOUT
55 | HTTP request timeout
56 | -u USER_AGENT, --user-agent USER_AGENT
57 | HTTP User-Agent
58 | -v, --verbose enable debug logging
59 | -d, --daemon run in background
60 |
61 |
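62 | For example, mounting the same Debian mirror and detaching it afterwards (the mount point is illustrative)::
63 | 
64 |     python3 rehttpfs.py -t 60 http://httpredir.debian.org/debian/ /mnt/debian
65 |     fusermount -u /mnt/debian   # unmount when done
66 | 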
--------------------------------------------------------------------------------
/htmllistparse/__init__.py:
--------------------------------------------------------------------------------
1 | from .htmllistparse import *
2 |
3 | __version__ = '0.6.1'
4 |
--------------------------------------------------------------------------------
/htmllistparse/htmllistparse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import re
6 | import time
7 | import collections
8 | import urllib.parse
9 |
10 | import bs4
11 |
12 | RE_ISO8601 = re.compile(r'\d{4}-\d+-\d+T\d+:\d{2}:\d{2}Z')
13 | DATETIME_FMTs = (
14 | (re.compile(r'\d+-[A-S][a-y]{2}-\d{4} \d+:\d{2}:\d{2}'), "%d-%b-%Y %H:%M:%S"),
15 | (re.compile(r'\d+-[A-S][a-y]{2}-\d{4} \d+:\d{2}'), "%d-%b-%Y %H:%M"),
16 | (re.compile(r'\d{4}-\d+-\d+ \d+:\d{2}:\d{2}'), "%Y-%m-%d %H:%M:%S"),
17 | (RE_ISO8601, "%Y-%m-%dT%H:%M:%SZ"),
18 | (re.compile(r'\d{4}-\d+-\d+ \d+:\d{2}'), "%Y-%m-%d %H:%M"),
19 | (re.compile(r'\d{4}-[A-S][a-y]{2}-\d+ \d+:\d{2}:\d{2}'), "%Y-%b-%d %H:%M:%S"),
20 | (re.compile(r'\d{4}-[A-S][a-y]{2}-\d+ \d+:\d{2}'), "%Y-%b-%d %H:%M"),
21 | (re.compile(r'[F-W][a-u]{2} [A-S][a-y]{2} +\d+ \d{2}:\d{2}:\d{2} \d{4}'), "%a %b %d %H:%M:%S %Y"),
22 | (re.compile(r'[F-W][a-u]{2}, \d+ [A-S][a-y]{2} \d{4} \d{2}:\d{2}:\d{2} .+'), "%a, %d %b %Y %H:%M:%S %Z"),
23 | (re.compile(r'\d{4}-\d+-\d+'), "%Y-%m-%d"),
24 | (re.compile(r'\d+/\d+/\d{4} \d{2}:\d{2}:\d{2} [+-]\d{4}'), "%d/%m/%Y %H:%M:%S %z"),
25 | (re.compile(r'\d{2} [A-S][a-y]{2} \d{4}'), "%d %b %Y")
26 | )
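27 | # NOTE: patterns are tried in order and the first match is used, so variants
28 | # with seconds precede their minutes-only counterparts.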
27 |
28 | RE_FILESIZE = re.compile(r'\d+(\.\d+)? ?[BKMGTPEZY]|\d+|-', re.I)
29 | RE_ABSPATH = re.compile(r'^((ht|f)tps?:/)?/')
30 | RE_COMMONHEAD = re.compile('Name|(Last )?modifi(ed|cation)|date|Size|Description|Metadata|Type|Parent Directory', re.I)
31 | RE_HASTEXT = re.compile('.+')
32 | RE_HEAD_NAME = re.compile('name$|^file|^download')
33 | RE_HEAD_MOD = re.compile('modifi|^uploaded|date|time')
34 | RE_HEAD_SIZE = re.compile('size|bytes$')
35 |
36 | FileEntry = collections.namedtuple('FileEntry', 'name modified size description')
37 |
38 | def human2bytes(s):
39 | """
40 | >>> human2bytes('1M')
41 | 1048576
42 | >>> human2bytes('1G')
43 | 1073741824
44 | """
45 | if s is None:
46 | return None
47 | try:
48 | return int(s)
49 | except ValueError:
50 | symbols = 'BKMGTPEZY'
51 | letter = s[-1:].strip().upper()
52 | num = float(s[:-1])
53 | prefix = {symbols[0]: 1}
54 |         for i, sym in enumerate(symbols[1:]):
55 |             prefix[sym] = 1 << (i+1)*10
56 | return int(num * prefix[letter])
57 |
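58 | # Convert an <a href> value to a display name: percent-decode, take the path
59 | # basename, and re-append a trailing '/' for directories.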
58 | def aherf2filename(a_href):
59 | isdir = ('/' if a_href[-1] == '/' else '')
60 | return os.path.basename(urllib.parse.unquote(a_href.rstrip('/'))) + isdir
61 |
62 | def parse(soup):
63 | '''
64 | Try to parse apache/nginx-style directory listing with all kinds of tricks.
65 |
66 |     Exceptions or an empty listing suggest a failure.
67 | We strongly recommend generating the `soup` with 'html5lib'.
68 |
69 | Returns: Current directory, Directory listing
70 | '''
71 | cwd = None
72 | listing = []
73 | if soup.title and soup.title.string and soup.title.string.startswith('Index of '):
74 | cwd = soup.title.string[9:]
75 | elif soup.h1:
76 | title = soup.h1.get_text().strip()
77 | if title.startswith('Index of '):
78 | cwd = title[9:]
79 |     for img in soup.find_all('img'): img.decompose()  # strip icons so they don't leak into names/descriptions
80 | file_name = file_mod = file_size = file_desc = None
81 | pres = [x for x in soup.find_all('pre') if
82 | x.find('a', string=RE_HASTEXT)]
83 | tables = [x for x in soup.find_all('table') if
84 | x.find(string=RE_COMMONHEAD)] if not pres else ()
85 | heads = []
86 | if pres:
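87 |         # <pre>-style listing (classic Apache autoindex): each entry is an
88 |         # <a> tag, and its date/size/description live in the bare text node
89 |         # that follows the link.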
87 | pre = pres[0]
88 | started = False
89 | for element in (pre.hr.next_siblings if pre.hr else pre.children):
90 | if element.name == 'a':
91 | if not element.string or not element.string.strip():
92 | continue
93 | elif started:
94 | if file_name:
95 | listing.append(FileEntry(
96 | file_name, file_mod, file_size, file_desc))
97 | file_name = aherf2filename(element['href'])
98 | file_mod = file_size = file_desc = None
99 | elif element.string in ('Parent Directory', '..', '../'):
100 | # start with next a
101 | started = True
102 | elif element['href'][0] not in '?/':
103 | # start right away
104 | file_name = aherf2filename(element['href'])
105 | file_mod = file_size = file_desc = None
106 | started = True
107 | elif not element.name:
108 | line = element.string.replace('\r', '').split('\n', 1)[0].lstrip()
109 | for regex, fmt in DATETIME_FMTs:
110 | match = regex.match(line)
111 | if match:
112 | file_mod = time.strptime(match.group(0), fmt)
113 | line = line[match.end():].lstrip()
114 | break
115 | match = RE_FILESIZE.match(line)
116 | if match:
117 | sizestr = match.group(0)
118 | if sizestr == '-':
119 | file_size = None
120 | else:
121 | file_size = human2bytes(sizestr.replace(' ', '').replace(',', ''))
122 | line = line[match.end():].lstrip()
123 | if line:
124 | file_desc = line.rstrip()
125 | if file_name and file_desc == '/':
126 | file_name += '/'
127 | file_desc = None
128 | else:
129 | continue
130 | if file_name:
131 | listing.append(FileEntry(file_name, file_mod, file_size, file_desc))
132 | elif tables:
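133 |         # <table>-style listing: first infer each column's role from the
134 |         # header row (building `heads`), then walk the data rows and fill
135 |         # name/modified/size/description by column position.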
133 | started = False
134 | for tr in tables[0].find_all('tr'):
135 | status = 0
136 | file_name = file_mod = file_size = file_desc = None
137 | if started:
138 | if tr.parent.name in ('thead', 'tfoot') or tr.th:
139 | continue
140 | for td in tr.find_all('td'):
141 | if status >= len(heads):
142 | raise AssertionError("can't detect table column number")
143 | if td.get('colspan'):
144 | continue
145 | elif heads[status] == 'name':
146 | if not td.a:
147 | continue
148 | a_str = td.a.get_text().strip()
149 | a_href = td.a['href']
150 | if not a_str or not a_href or a_href[0] == '#':
151 | continue
152 | elif a_str == 'Parent Directory' or a_href == '../':
153 | break
154 | else:
155 | file_name = aherf2filename(a_href)
156 | status = 1
157 | elif heads[status] == 'modified':
158 | if td.time:
159 | timestr = td.time.get('datetime', '')
160 | if RE_ISO8601.match(timestr):
161 | file_mod = time.strptime(timestr, "%Y-%m-%dT%H:%M:%SZ")
162 | status += 1
163 | continue
164 | timestr = td.get_text().strip()
165 | if timestr:
166 | for regex, fmt in DATETIME_FMTs:
167 | match = regex.match(timestr)
168 | if match:
169 | file_mod = time.strptime(match.group(0), fmt)
170 | break
171 | else:
172 | if td.get('data-sort-value'):
173 | file_mod = time.gmtime(int(td['data-sort-value']))
174 | # else:
175 | # raise AssertionError(
176 | # "can't identify date/time format")
177 | status += 1
178 | elif heads[status] == 'size':
179 | sizestr = td.get_text().strip().replace(',', '')
180 | if sizestr == '-' or not sizestr:
181 | file_size = None
182 | elif td.get('data-sort-value'):
183 | file_size = int(td['data-sort-value'])
184 | else:
185 | match = RE_FILESIZE.match(sizestr)
186 | if match:
187 | file_size = human2bytes(
188 | match.group(0).replace(' ', ''))
189 | else:
190 | file_size = None
191 | status += 1
192 | elif heads[status] == 'description':
193 | file_desc = file_desc or ''.join(map(str, td.children)
194 | ).strip(' \t\n\r\x0b\x0c\xa0') or None
195 | status += 1
196 | elif status:
197 | # unknown header
198 | status += 1
199 | if file_name:
200 | listing.append(FileEntry(
201 | file_name, file_mod, file_size, file_desc))
202 | elif tr.hr:
203 | started = True
204 | continue
205 | elif tr.find(string=RE_COMMONHEAD):
206 | namefound = False
207 | colspan = False
208 | for th in (tr.find_all('th') if tr.th else tr.find_all('td')):
209 | if th.get('colspan'):
210 | colspan = True
211 | continue
212 | name = th.get_text().strip(' \t\n\r\x0b\x0c\xa0↑↓').lower()
213 | if not name:
214 | continue
215 | elif not namefound and RE_HEAD_NAME.search(name):
216 | heads.append('name')
217 | namefound = True
218 | elif name in ('size', 'description'):
219 | heads.append(name)
220 | elif RE_HEAD_MOD.search(name):
221 | heads.append('modified')
222 | elif RE_HEAD_SIZE.search(name):
223 | heads.append('size')
224 | elif name.endswith('signature'):
225 | heads.append('signature')
226 | else:
227 | heads.append('description')
228 | if colspan:
229 | continue
230 | if not heads:
231 | heads = ('name', 'modified', 'size', 'description')
232 | elif not namefound:
233 | heads[0] = 'name'
234 | started = True
235 | continue
236 | elif soup.ul:
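237 |         # Bare <ul> listing: only file names are available; modified, size
238 |         # and description stay None.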
237 | for li in soup.ul.find_all('li'):
238 | a = li.a
239 | if not a or not a.get('href'):
240 | continue
241 | file_name = urllib.parse.unquote(a['href'])
242 | if (file_name in {'Parent Directory', '.', './', '..', '../', '#'}
243 | or RE_ABSPATH.match(file_name)):
244 | continue
245 | else:
246 | listing.append(FileEntry(file_name, None, None, None))
247 | return cwd, listing
248 |
249 | def fetch_listing(url, timeout=30, **requests_kwargs):
250 |     import requests  # lazy import: parsing alone doesn't require requests
251 | req = requests.get(url, timeout=timeout, **requests_kwargs)
252 | req.raise_for_status()
253 | soup = bs4.BeautifulSoup(req.content, 'html5lib')
254 | return parse(soup)
255 |
256 | if __name__ == '__main__':
257 | import sys
258 | import requests
259 | for url in sys.argv[1:] or ('http://httpredir.debian.org/debian/',):
260 | req = requests.get(url, timeout=30)
261 | req.raise_for_status()
262 | print(req.url)
263 | soup = bs4.BeautifulSoup(req.content, 'html5lib')
264 | cwd, listing = parse(soup)
265 | print('Cwd:', cwd)
266 | for f in listing:
267 | print(f)
268 | print()
269 |
--------------------------------------------------------------------------------
/htmllistparse/rehttpfs.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 |
4 | import os
5 | import io
6 | import stat
7 | import time
8 | import logging
9 | import argparse
10 | import calendar
11 | import urllib.parse
12 | from errno import EACCES, ENOENT, EIO
13 | from email.utils import parsedate
14 |
15 | import bs4
16 | import requests
17 | import htmllistparse
18 | try:
19 | import fusepy as fuse
20 | except ImportError:
21 | import fuse
22 |
23 | CONFIG = {
24 | 'timeout': None,
25 | 'user_agent': None,
26 | }
27 | SESSION = requests.Session()
28 | CONTENT_CHUNK_SIZE = 10 * 1024
29 |
30 |
31 | def parse_dir(html):
32 | return htmllistparse.parse(bs4.BeautifulSoup(html, 'html5lib'))
33 |
34 |
35 | def make_url(urlbase, name):
36 | return urllib.parse.urljoin(urlbase, urllib.parse.quote(name.lstrip('/')))
37 |
38 |
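39 | # Human-readable size for FileStat.__repr__; returns a bare int below 1 KiB
40 | # and a formatted string with a unit suffix otherwise.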
39 | def sizeof_fmt(num):
40 | for unit in ('', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'):
41 | if abs(num) < 1024:
42 | if unit:
43 | return "%3.1f%s" % (num, unit)
44 | else:
45 | return int(num)
46 | num /= 1024.0
47 | return "%.1f%s" % (num, 'Y')
48 |
49 |
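50 | # Parse a comma-separated "-o" option string into kwargs for fuse.FUSE, e.g.
51 | # "allow_other,uid=1000" -> {'allow_other': True, 'uid': '1000'}.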
50 | def convert_fuse_options(options):
51 | kwargs = {}
52 | if options is not None:
53 | for opt in options.split(','):
54 | kv = opt.split('=', 1)
55 | if len(kv) == 1:
56 | kwargs[kv[0]] = True
57 | else:
58 | kwargs[kv[0]] = kv[1]
59 | return kwargs
60 |
61 |
62 | class IsADirectory(ValueError):
63 | pass
64 |
65 |
66 | class FileStat:
67 | __slots__ = (
68 | 'st_mode', 'st_nlink', 'st_uid', 'st_gid', 'st_size',
69 | 'st_atime', 'st_mtime', 'st_ctime'
70 | )
71 |
72 | def __init__(self):
73 | self.st_mode = stat.S_IFREG | 0o444
74 | self.st_nlink = 1
75 | self.st_uid = 0
76 | self.st_gid = 0
77 | self.st_size = 0
78 | self.st_atime = 0
79 | self.st_mtime = 0
80 | self.st_ctime = 0
81 |
82 | def settime(self, value):
83 | self.st_atime = self.st_mtime = self.st_ctime = value
84 |
85 | def setmode(self, value, isdir=False):
86 | if isdir:
87 | self.st_mode = stat.S_IFDIR | value
88 | else:
89 | self.st_mode = stat.S_IFREG | value
90 |
91 | def __getitem__(self, key):
92 | return getattr(self, key)
93 |
94 | def items(self):
95 | for key in self.__slots__:
96 | yield key, getattr(self, key)
97 |
98 | def __repr__(self):
99 |         return '<FileStat mode=%s size=%s mtime=%s>' % (
100 | self.st_mode, sizeof_fmt(self.st_size), self.st_mtime)
101 |
102 |
103 | class File(io.IOBase):
104 | __slots__ = (
105 | 'baseurl', 'path', 'url', 'stat', 'init',
106 | 'exist', '_readable', '_seekable', 'offset'
107 | )
108 |
109 | def __init__(self, baseurl, path):
110 | self.baseurl = baseurl
111 | self.path = path
112 | self.url = make_url(baseurl, path)
113 | self.stat = FileStat()
114 | self.init = 0
115 | self.exist = True
116 | self._readable = True
117 | self._seekable = False
118 | self.offset = 0
119 |
120 | def get_stat(self):
121 |         req = SESSION.head(self.url, timeout=CONFIG['timeout'],
122 |                            allow_redirects=False)
123 | req.close()
124 | if 400 <= req.status_code <= 499:
125 | self.stat.setmode(0o000)
126 | self.init = 2
127 | self._readable = False
128 | if req.status_code == 404:
129 | self.exist = False
130 | return self.stat
131 | elif req.status_code in (301, 302):
132 | raise IsADirectory()
133 | else:
134 | req.raise_for_status()
135 | self.stat.st_size = int(req.headers.get('Content-Length', 0))
136 | lm = req.headers.get('Last-Modified')
137 | if lm:
138 |             self.stat.settime(calendar.timegm(parsedate(lm)))  # Last-Modified is GMT
139 | else:
140 | self.stat.settime(time.time())
141 | if req.headers.get('Accept-Ranges') == 'bytes':
142 | self._seekable = True
143 | self.init = 2
144 | return self.stat
145 |
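146 |     # Read `size` bytes from `offset` with an HTTP Range request; a plain
147 |     # 200 response means the server ignored Range, so only offset-0 reads
148 |     # can be served in that case.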
146 | def read(self, size=None, offset=None):
147 | if not self.init or not self.stat.st_size:
148 | self.get_stat()
149 | if not self.exist:
150 | raise fuse.FuseOSError(ENOENT)
151 | elif not self.readable():
152 | raise fuse.FuseOSError(EIO)
153 | if offset is None:
154 | offset = self.offset
155 | end = min(self.stat.st_size, offset + size - 1)
156 | brange = '%d-%d' % (offset, end)
157 | headers = {'range': 'bytes=' + brange}
158 | req = SESSION.get(self.url, headers=headers,
159 | stream=True, timeout=CONFIG['timeout'])
160 | if req.status_code == 206:
161 | self._seekable = True
162 | elif req.status_code == 416:
163 | # we may have a wrong size
164 | self.get_stat()
165 | raise fuse.FuseOSError(EIO)
166 | elif req.status_code == 200:
167 | self._seekable = False
168 | if offset != 0:
169 | raise fuse.FuseOSError(EIO)
170 | elif req.status_code == 403:
171 | self._readable = False
172 | raise fuse.FuseOSError(EACCES)
173 | elif req.status_code == 404:
174 | self.exist = False
175 | self._readable = False
176 | raise fuse.FuseOSError(ENOENT)
177 | else:
178 | self._readable = False
179 | raise fuse.FuseOSError(EIO)
180 | content = bytes()
181 | for chunk in req.iter_content(CONTENT_CHUNK_SIZE, False):
182 | content += chunk
183 | if len(content) > size:
184 | content = content[:size]
185 | break
186 | req.close()
187 | if self._seekable:
188 | self.offset = end
189 | return content
190 |
191 | def readable(self):
192 | return self._readable
193 |
194 | def seekable(self):
195 | return self._seekable
196 |
197 | def seek(self, offset):
198 | if self._seekable:
199 | self.offset = offset
200 |
201 | def tell(self):
202 | return self.offset
203 |
204 |
205 | class Directory:
206 | __slots__ = (
207 | 'baseurl', 'path', 'url', 'stat', 'content', 'init', 'exist', '_readable'
208 | )
209 |
210 | def __init__(self, baseurl, path):
211 | self.baseurl = baseurl
212 | self.path = path
213 | self.url = make_url(baseurl, path)
214 | self.stat = FileStat()
215 | self.stat.setmode(0o555, True)
216 | self.stat.st_nlink = 2
217 | self.content = ['.', '..']
218 | self.init = 0
219 | self.exist = True
220 | self._readable = True
221 |
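222 |     # Fetch and parse this directory's HTML listing; returns a map of
223 |     # {path: File|Directory} objects for the filesystem's metacache.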
222 | def read(self):
223 | try:
224 | req = SESSION.get(self.url, timeout=CONFIG['timeout'])
225 | except Exception:
226 | raise fuse.FuseOSError(EIO)
227 | try:
228 | req.raise_for_status()
229 | except requests.exceptions.HTTPError:
230 | self.stat.setmode(0o000, True)
231 | self.init = 2
232 | self._readable = False
233 | if req.status_code == 403:
234 | raise fuse.FuseOSError(EACCES)
235 | elif req.status_code == 404:
236 | self.exist = False
237 | raise fuse.FuseOSError(ENOENT)
238 | else:
239 | raise fuse.FuseOSError(EIO)
240 | lm = req.headers.get('Last-Modified')
241 | if lm:
242 |             self.stat.settime(calendar.timegm(parsedate(lm)))  # Last-Modified is GMT
243 | else:
244 | self.stat.settime(time.time())
245 | try:
246 | cwd, listing = parse_dir(req.content)
247 | except Exception:
248 |             logging.exception('failed to parse listing: %s', self.url)
249 | listing = []
250 | content = ['.', '..']
251 | objmap = {}
252 | for name, modified, size, description in listing:
253 | fpath = os.path.join(self.path, name)
254 | if name[-1] == '/':
255 | fileobj = Directory(self.baseurl, fpath)
256 | fpath = fpath.rstrip('/')
257 | else:
258 | fileobj = File(self.baseurl, fpath)
259 | if size is None:
260 | fileobj.get_stat()
261 | else:
262 | fileobj.stat.st_size = size
263 | if modified:
264 | fileobj.stat.settime(calendar.timegm(modified))
265 | else:
266 | fileobj.stat.settime(self.stat.st_mtime)
267 | fileobj.init = fileobj.init or 1
268 | content.append(name.rstrip('/'))
269 | objmap[fpath] = fileobj
270 | self.content = content
271 | self.stat.st_nlink = len(content)
272 | self.init = 2
273 | self._readable = True
274 | return objmap
275 |
276 | def readable(self):
277 | return self._readable
278 |
279 |
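280 | # Cached objects track an `init` level: 0 = never fetched, 1 = stats taken
281 | # from a parent directory listing, 2 = confirmed by a direct request.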
280 | class rehttpfs(fuse.LoggingMixIn, fuse.Operations):
281 | '''Reinvented HTTP Filesystem'''
282 |
283 | def __init__(self, url):
284 | self.url = url
285 | if url[-1] != '/':
286 | self.url += '/'
287 | self.fd = 0
288 | self.metacache = {'/': Directory(self.url, '/')}
289 |
290 | def _getpath(self, path, refresh=False):
291 | pathobj = self.metacache.get(path)
292 | if isinstance(pathobj, Directory):
293 | return self._getdirobj(path, refresh)
294 | else:
295 | return self._getfileobj(path, refresh)
296 |
297 | def _makeparents(self, path):
298 | while path != '/':
299 | path = os.path.dirname(path)
300 | if path not in self.metacache:
301 | self.metacache[path] = Directory(self.url, path + '/')
302 | else:
303 | break
304 |
305 | def _getfileobj(self, path, refresh=False):
306 | logging.debug('_getfileobj: %s', path)
307 | fileobj = self.metacache.get(path)
308 | try:
309 | if fileobj:
310 | if not fileobj.init or refresh:
311 | fileobj.get_stat()
312 | else:
313 | self._makeparents(path)
314 | fileobj = File(self.url, path)
315 | fileobj.get_stat()
316 | self.metacache[path] = fileobj
317 | except IsADirectory:
318 | logging.info('IsADirectory: %s', path)
319 | return self._getdirobj(path, refresh)
320 | return fileobj
321 |
322 | def _getdirobj(self, path, refresh=False):
323 | logging.debug('_getdirobj: %s', path)
324 | path = path.rstrip('/')
325 | dirobj = self.metacache.get(path)
326 | if dirobj:
327 | if not dirobj.init or refresh:
328 | objmap = dirobj.read()
329 | self._update_metacache(objmap)
330 | else:
331 | self._makeparents(path)
332 | dirobj = Directory(self.url, path + '/')
333 | objmap = dirobj.read()
334 | self._update_metacache(objmap)
335 | self.metacache[path] = dirobj
336 | return dirobj
337 |
338 | def _update_metacache(self, objmap):
339 | for name, obj in objmap.items():
340 | cached = self.metacache.get(name)
341 |             if not (cached and cached.init and type(cached) is type(obj)):
342 | self.metacache[name] = obj
343 |
344 | def access(self, path, amode):
345 | if amode & os.W_OK:
346 | raise fuse.FuseOSError(EACCES)
347 | obj = self._getpath(path)
348 | if not obj.exist:
349 | raise fuse.FuseOSError(ENOENT)
350 | elif (obj.stat.st_mode & amode) != amode:
351 | raise fuse.FuseOSError(EACCES)
352 | return 0
353 |
354 | def getattr(self, path, fh=None):
355 | logging.debug('getattr: %s', path)
356 | obj = self._getpath(path)
357 | if not obj.exist:
358 | raise fuse.FuseOSError(ENOENT)
359 | return obj.stat
360 |
361 | def open(self, path, flags):
362 | self.fd += 1
363 | return self.fd
364 |
365 | def opendir(self, path):
366 | self.fd += 1
367 | return self.fd
368 |
369 | def read(self, path, size, offset, fh):
370 | fileobj = self._getfileobj(path, False)
371 | return fileobj.read(size, offset)
372 |
373 | def readdir(self, path, fh):
374 | logging.debug('readdir: %s', path)
375 | dirobj = self._getdirobj(path)
376 | if dirobj.init != 2:
377 | objmap = dirobj.read()
378 | self._update_metacache(objmap)
379 | content = []
380 | for name in dirobj.content:
381 | fpath = os.path.normpath(os.path.join(path, name))
382 | content.append((name, self.metacache[fpath].stat, 0))
383 | return content
384 |
385 |
386 | def main():
387 | parser = argparse.ArgumentParser(description="Mount HTML directory listings.")
388 | parser.add_argument("-o", help="comma separated FUSE options", metavar='OPTIONS')
389 | parser.add_argument("-t", "--timeout", help="HTTP request timeout", type=int, default=30)
390 | parser.add_argument("-u", "--user-agent", help="HTTP User-Agent")
391 | parser.add_argument("-v", "--verbose", help="enable debug logging", action='store_true')
392 | parser.add_argument("-d", "--daemon", help="run in background", action='store_true')
393 | parser.add_argument("url", help="URL to mount")
394 | parser.add_argument("mountpoint", help="filesystem mount point")
395 | args = parser.parse_args()
396 | logging.basicConfig(
397 | format='%(levelname)s:%(name)s %(message)s',
398 | level=logging.DEBUG if args.verbose else logging.INFO
399 | )
400 | CONFIG['timeout'] = args.timeout
401 |     CONFIG['user_agent'] = args.user_agent
402 |     if args.user_agent:
403 |         # apply the configured User-Agent to the shared session
404 |         SESSION.headers['User-Agent'] = args.user_agent
405 |     fuse.FUSE(
403 | rehttpfs(args.url),
404 | args.mountpoint,
405 | foreground=(not args.daemon),
406 | **convert_fuse_options(args.o)
407 | )
408 |
409 |
410 | if __name__ == '__main__':
411 | main()
412 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | html5lib
3 | requests
4 | fusepy
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | try:
5 | from setuptools import setup
6 | except ImportError:
7 | from distutils.core import setup
8 |
9 | if sys.version_info < (3, 3):
10 | raise NotImplementedError("You need at least Python 3.3.")
11 |
12 | setup(
13 | name='htmllistparse',
14 | version='0.6.1',
15 | description='Python parser for Apache/nginx-style HTML directory listing.',
16 | long_description=open('README.rst', 'r').read(),
17 | author='Dingyuan Wang',
18 | author_email='gumblex@aosc.io',
19 | url='https://github.com/gumblex/htmllisting-parser',
20 | packages=['htmllistparse'],
21 | install_requires=[
22 | 'beautifulsoup4',
23 | 'html5lib',
24 | 'requests',
25 | 'fusepy'
26 | ],
27 |     entry_points={
28 | 'console_scripts': ['rehttpfs=htmllistparse.rehttpfs:main'],
29 | },
30 | license='MIT',
31 | platforms='any',
32 | classifiers=[
33 | 'Development Status :: 4 - Beta',
34 | 'Intended Audience :: Developers',
35 | 'License :: OSI Approved :: MIT License',
36 | 'Topic :: Internet :: WWW/HTTP',
37 | 'Programming Language :: Python :: 3',
38 | 'Programming Language :: Python :: 3 :: Only',
39 | ],
40 | keywords='apache nginx listing fuse'
41 | )
42 |
--------------------------------------------------------------------------------