├── .gitignore
├── README.md
├── setup.py
└── webhdfs
├── VERSION
├── __init__.py
├── example.py
└── webhdfs.py
/.gitignore:
--------------------------------------------------------------------------------
# Subversion metadata directories
.svn
# Compiled Python bytecode. A bare ".pyc" pattern only matches an entry that is
# literally named ".pyc", so the wildcard is required to ignore module bytecode.
*.pyc
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WebHDFS Python Client Implementation
2 |
WebHDFS is a REST API for HDFS. webhdfs-py was developed to make WebHDFS easy to use from Python. The library can be installed via easy_install or pip:
4 |
5 | easy_install webhdfs
6 |
webhdfs-py has no further dependencies and relies solely on the Python standard library. Similar to the Python os package, webhdfs-py provides basic capabilities such as the creation, listing and deletion of directories and files.
8 |
9 | ## Hadoop configuration
10 |
11 | Supported Hadoop version: 2.x (including 2.4). Tested with HDP 2.1
12 |
13 | Ensure that WebHDFS is enabled in the `hdfs-site.xml`:
14 |
15 | Relevant properties:
16 |
    <property>
        <name>dfs.webhdfs.enabled</name>
        <value>true</value>
    </property>
21 |
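For details, see the official WebHDFS REST API documentation:
<https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/WebHDFS.html>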
24 |
25 | ## Limitations
26 |
27 | * Kerberos security not supported
28 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import sys
4 | from setuptools import setup, find_packages
5 | import webhdfs
6 |
# Package metadata for the WebHDFS client.
# NOTE: the version string is taken from the imported ``webhdfs`` package, so
# the package has to be importable when this script runs.
setup(
    name='WebHDFS',
    version=webhdfs.version,
    description='HDFS Python client based on WebHDFS REST API',
    author='Andre Luckow',
    author_email='andre.luckow@gmail.com',
    url='na',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Programming Language :: Python',
        'Environment :: Console',
        'Topic :: Utilities',
    ],
    platforms=('Unix', 'Linux', 'Mac OS'),
    packages=['webhdfs'],
    # Install VERSION inside the package directory, next to __init__.py, which
    # is where webhdfs/__init__.py looks for it. The previous
    # ``data_files=['webhdfs/VERSION']`` entry copied the file to the
    # installation prefix instead, so an installed package never found it.
    package_data={'webhdfs': ['VERSION']},
    install_requires=[],
    entry_points={
        'console_scripts': [],
    },
)
26 |
--------------------------------------------------------------------------------
/webhdfs/VERSION:
--------------------------------------------------------------------------------
1 | 0.2.0
2 |
--------------------------------------------------------------------------------
/webhdfs/__init__.py:
--------------------------------------------------------------------------------
1 | # read and log the version of package
2 | import os
3 | import logging
# Importing the package configures the root logger (DEBUG level, timestamped
# records) and creates the package-wide 'webhdfs' logger.
logging.basicConfig(level=logging.DEBUG, datefmt='%m/%d/%Y %I:%M:%S %p',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(name='webhdfs')

# Fallback version, used when the VERSION file next to this module is missing
# or unreadable.
version = "0.2.0"

try:
    fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'VERSION')
    # The context manager closes the file handle deterministically; the
    # previous ``open(fn).read()`` left closing to the garbage collector.
    with open(fn) as version_file:
        version = version_file.read().strip()
    logger.debug("Loading WebHDFS version: %s", version)
except IOError:
    # Deliberate best effort: keep the hard-coded fallback version above.
    pass
16 |
--------------------------------------------------------------------------------
/webhdfs/example.py:
--------------------------------------------------------------------------------
1 | from webhdfs.webhdfs import WebHDFS
2 | import os, tempfile
3 | import time
4 |
# Connect to the WebHDFS endpoint of the NameNode (host, port, HDFS user).
webhdfs = WebHDFS("localhost", 50070, "luckow")

# Every remote path of this example lives in one directory, so the upload,
# the download and the listing all refer to the same file. (Previously the
# file was uploaded to /tmp/test.txt but downloaded from /hello-world/.)
hdfs_dir = "/tmp/hello-world/"
hdfs_file = hdfs_dir + "test.txt"

webhdfs.mkdir(hdfs_dir)

# create a temporary file
f = tempfile.NamedTemporaryFile()
try:
    f.write(b'Hello world!\n')
    # Flush so the content is on disk before it is re-read by copyFromLocal.
    f.flush()

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Upload file: " + f.name)

    webhdfs.copyFromLocal(f.name, hdfs_file)

    webhdfs.copyToLocal(hdfs_file, "/tmp/test1.txt")

    for i in webhdfs.listdir(hdfs_dir):
        print(str(i))
finally:
    # Closing the NamedTemporaryFile also removes it, even if a step failed.
    f.close()
26 |
--------------------------------------------------------------------------------
/webhdfs/webhdfs.py:
--------------------------------------------------------------------------------
1 | import sys, os
2 | import stat
3 | import httplib
4 | import urlparse
5 | import json
6 | import urllib
7 | import logging
# Root logger setup performed at import time: DEBUG level and records of the
# form "<timestamp> - <logger name> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%m/%d/%Y %I:%M:%S %p',
    level=logging.DEBUG,
)

# Logger used by all code in this module.
logger = logging.getLogger('webhdfs')

# URL path prefix that every WebHDFS request of this module starts with.
WEBHDFS_CONTEXT_ROOT = "/webhdfs/v1"
13 |
class WebHDFS(object):
    """Client for accessing HDFS via the WebHDFS REST API of a NameNode.

    To enable WebHDFS in your Hadoop installation add the following
    configuration to your hdfs-site.xml (requires Hadoop >0.20.205.0):

        <property>
            <name>dfs.webhdfs.enabled</name>
            <value>true</value>
        </property>

    see: https://issues.apache.org/jira/secure/attachment/12500090/WebHdfsAPI20111020.pdf

    All path arguments are absolute, unescaped HDFS paths; the client
    percent-encodes them when it builds the request URL. Every request
    carries the configured user as the ``user.name`` query parameter.
    A new HTTP connection is opened for each request and closed again
    before the method returns, also when the request fails.
    """

    # Timeout (seconds) applied to every NameNode and DataNode connection.
    _TIMEOUT_SECONDS = 600
    # Number of bytes read per iteration when a download is streamed to disk.
    _CHUNK_SIZE = 64 * 1024

    def __init__(self, namenode_host, namenode_port, hdfs_username):
        """Store the connection settings; no network access happens here.

        namenode_host -- host name of the NameNode's HTTP endpoint
        namenode_port -- port of the NameNode's HTTP endpoint
        hdfs_username -- user sent as ``user.name`` with every request
        """
        self.namenode_host = namenode_host
        self.namenode_port = namenode_port
        self.username = hdfs_username

    def mkdir(self, path):
        """Send a MKDIRS request for ``path`` and return the HTTP status.

        Raises Exception if ``path`` is not absolute. The status code is
        returned as-is; it is not checked for success.
        """
        url_path = self._build_url(path, 'op=MKDIRS')
        logger.debug("Create directory: " + url_path)
        return self._send_to_namenode('PUT', url_path)

    def rmdir(self, path):
        """Send a recursive DELETE request for ``path``; return the status.

        The request always carries ``recursive=true``.
        Raises Exception if ``path`` is not absolute. The status code is
        returned as-is; it is not checked for success.
        """
        url_path = self._build_url(path, 'op=DELETE&recursive=true')
        logger.debug("Delete directory: " + url_path)
        return self._send_to_namenode('DELETE', url_path)

    def put(self, source_data, target_path, replication=1):
        """Upload ``source_data`` to ``target_path`` (``overwrite=true``).

        The upload takes two requests: a CREATE request to the NameNode,
        whose reply names a DataNode in its Location header, and a PUT of
        the data to that location.

        source_data -- request body handed to ``HTTPConnection.request``
        target_path -- absolute HDFS path of the file to write
        replication -- value appended to the redirect URL as ``replication``

        Returns the HTTP status of the DataNode request. Raises Exception
        if ``target_path`` is not absolute or if the NameNode reply carries
        no Location header.
        """
        url_path = self._build_url(target_path, 'op=CREATE&overwrite=true')

        httpClient = self.__getNameNodeHTTPClient()
        try:
            httpClient.request('PUT', url_path, headers={})
            response = httpClient.getresponse()
            logger.debug("HTTP Response: %d, %s",
                         response.status, response.reason)
            redirect_location = response.getheader("location")
        finally:
            httpClient.close()

        if redirect_location is None:
            raise Exception(
                "No redirect location received for %s (HTTP %d, %s)"
                % (target_path, response.status, response.reason))
        logger.debug("HTTP Location: %s", redirect_location)

        redirect_host, redirect_port, redirect_path = \
            self._parse_redirect(redirect_location)
        # Bug in WebHDFS 0.20.205 => requires param otherwise a
        # NullPointerException is thrown
        redirect_path += "&replication=" + str(replication)

        logger.debug("Send redirect to: host: %s, port: %s, path: %s ",
                     redirect_host, redirect_port, redirect_path)
        fileUploadClient = httplib.HTTPConnection(
            redirect_host, redirect_port, timeout=self._TIMEOUT_SECONDS)
        try:
            # This requires currently Python 2.6 or higher
            fileUploadClient.request('PUT', redirect_path, source_data,
                                     headers={})
            response = fileUploadClient.getresponse()
            logger.debug("HTTP Response: %d, %s",
                         response.status, response.reason)
            return response.status
        finally:
            fileUploadClient.close()

    def get(self, source_path):
        """Return the content of the HDFS file ``source_path``.

        Returns the body of the DataNode response, or None when the
        NameNode reply carries no Location header to follow (the reply this
        client treats as "empty file"). The HTTP status is not checked.
        Raises Exception if ``source_path`` is not absolute.
        """
        fileDownloadClient, response = self._open_for_read(source_path)
        if fileDownloadClient is None:
            return None
        try:
            return response.read()
        finally:
            # The DataNode connection used to be left open here.
            fileDownloadClient.close()

    def copyFromLocal(self, source_path, target_path, replication=1):
        """Upload the local file ``source_path`` to ``target_path``.

        The file is read completely into memory and handed to ``put``,
        whose return value (the HTTP status) is returned.
        """
        # Binary mode: the bytes are uploaded unmodified on every platform.
        with open(source_path, "rb") as source_file:
            source_data = source_file.read()
        return self.put(source_data, target_path, replication)

    def copyToLocal(self, source_path, target_path):
        """Download the HDFS file ``source_path`` to local ``target_path``.

        When the NameNode reply carries no Location header (the reply this
        client treats as "empty file") an empty local file is created.

        Returns the HTTP status of the last response received: the
        DataNode's when a redirect was followed, otherwise the NameNode's.
        Raises Exception if ``source_path`` is not absolute.
        """
        fileDownloadClient, response = self._open_for_read(source_path)
        try:
            # Binary mode keeps the downloaded bytes unmodified.
            with open(target_path, "wb") as target_file:
                if fileDownloadClient is not None:
                    # Stream in chunks instead of holding the file in memory.
                    while True:
                        chunk = response.read(self._CHUNK_SIZE)
                        if not chunk:
                            break
                        target_file.write(chunk)
        finally:
            if fileDownloadClient is not None:
                fileDownloadClient.close()
        return response.status

    def listdir(self, path):
        """Return the ``pathSuffix`` of every entry listed for ``path``.

        Sends a LISTSTATUS request and reads the names from
        ``FileStatuses.FileStatus`` of the JSON reply. Raises Exception if
        ``path`` is not absolute; a reply without that JSON structure
        raises KeyError.
        """
        # Only the path is percent-encoded. Encoding the whole URL, as was
        # done before, also escaped '?', '=' and '&' and thereby destroyed
        # the query string.
        url_path = self._build_url(path, 'op=LISTSTATUS')
        logger.debug("List directory: " + url_path)
        httpClient = self.__getNameNodeHTTPClient()
        try:
            httpClient.request('GET', url_path, headers={})
            response = httpClient.getresponse()
            logger.debug("HTTP Response: %d, %s",
                         response.status, response.reason)
            data_dict = json.loads(response.read())
        finally:
            httpClient.close()
        logger.debug("Data: " + str(data_dict))
        files = []
        for entry in data_dict["FileStatuses"]["FileStatus"]:
            logger.debug("%s: %s", entry["type"], entry["pathSuffix"])
            files.append(entry["pathSuffix"])
        return files

    def _build_url(self, path, operation):
        """Return the request URL (path plus query) for ``operation``.

        ``operation`` is the leading part of the query string, for example
        ``op=MKDIRS``; ``user.name`` is appended to it. Raises Exception if
        ``path`` is not absolute.
        """
        # HDFS paths always use '/', independent of the local platform.
        if not path.startswith("/"):
            raise Exception("Only absolute paths supported: %s" % (path))
        return "%s?%s&user.name=%s" % (
            urllib.quote(WEBHDFS_CONTEXT_ROOT + path),
            operation,
            urllib.quote(self.username, safe=''))

    @staticmethod
    def _parse_redirect(location):
        """Split a Location URL into (host, port, "path?query").

        ``port`` is None when the URL names no port, in which case
        ``HTTPConnection`` falls back to its default port.
        """
        parts = urlparse.urlsplit(location)
        return parts.hostname, parts.port, parts.path + "?" + parts.query

    def _send_to_namenode(self, method, url_path):
        """Send one body-less request to the NameNode; return the status."""
        httpClient = self.__getNameNodeHTTPClient()
        try:
            httpClient.request(method, url_path, headers={})
            response = httpClient.getresponse()
            logger.debug("HTTP Response: %d, %s",
                         response.status, response.reason)
            return response.status
        finally:
            httpClient.close()

    def _open_for_read(self, source_path):
        """Send OPEN for ``source_path`` and follow the redirect, if any.

        Returns ``(connection, response)``. When the NameNode reply has a
        Location header, ``connection`` is the open connection to that
        location and ``response`` its unread response; the caller must
        close the connection. Otherwise ``connection`` is None and
        ``response`` is the NameNode response.
        """
        # The query is kept exactly as earlier releases sent it.
        url_path = self._build_url(source_path, 'op=OPEN&overwrite=true')
        logger.debug("GET URL: %s", url_path)
        httpClient = self.__getNameNodeHTTPClient()
        try:
            httpClient.request('GET', url_path, headers={})
            response = httpClient.getresponse()
            redirect_location = response.getheader("location")
        finally:
            httpClient.close()
        logger.debug("HTTP Response: %d, %s",
                     response.status, response.reason)

        if redirect_location is None:
            return None, response
        logger.debug("HTTP Location: %s", redirect_location)

        redirect_host, redirect_port, redirect_path = \
            self._parse_redirect(redirect_location)
        logger.debug("Send redirect to: host: %s, port: %s, path: %s ",
                     redirect_host, redirect_port, redirect_path)
        fileDownloadClient = httplib.HTTPConnection(
            redirect_host, redirect_port, timeout=self._TIMEOUT_SECONDS)
        try:
            fileDownloadClient.request('GET', redirect_path, headers={})
            response = fileDownloadClient.getresponse()
        except Exception:
            # Do not leak the connection when the request itself fails.
            fileDownloadClient.close()
            raise
        logger.debug("HTTP Response: %d, %s",
                     response.status, response.reason)
        return fileDownloadClient, response

    def __getNameNodeHTTPClient(self):
        """Return a new, not yet connected HTTP connection to the NameNode."""
        httpClient = httplib.HTTPConnection(self.namenode_host,
                                            self.namenode_port,
                                            timeout=self._TIMEOUT_SECONDS)
        return httpClient
196 |
197 |
198 |
if __name__ == "__main__":
    # Manual smoke test against a NameNode on localhost:50070: create a
    # directory, upload a local file into it, download it again and list "/".
    hdfs_dir = "/pilotstore-1/pd-9c2d42c4-30a3-11e1-bab1-00264a13ca4c/"
    hdfs_file = "/pilotstore-1/pd-9c2d42c4-30a3-11e1-bab1-00264a13ca4c/test1.txt"
    local_source = "/Users/luckow/workspace-saga/applications/pilot-store/test/data1/test1.txt"
    local_copy = "/tmp/test1.txt"

    webhdfs = WebHDFS("localhost", 50070, "luckow")
    webhdfs.mkdir(hdfs_dir)
    webhdfs.copyFromLocal(local_source, hdfs_file)
    webhdfs.copyToLocal(hdfs_file, local_copy)
    webhdfs.listdir("/")
209 |
210 |
211 |
212 |
--------------------------------------------------------------------------------