├── .gitignore ├── README.md ├── setup.py └── webhdfs ├── VERSION ├── __init__.py ├── example.py └── webhdfs.py /.gitignore: -------------------------------------------------------------------------------- 1 | .svn 2 | .pyc 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # WebHDFS Python Client Implementation 2 | 3 | WebHDFS is a REST API for HDFS. To facilitate access to WebHDFS from Python, webhdfs-py was developed. The library can easily be installed via easy_install or pip: 4 | 5 | easy_install webhdfs 6 | 7 | Webhdfs-py has no further dependencies and relies solely on the Python standard library. Similar to the Python os module, webhdfs-py provides basic capabilities such as the creation, listing and deletion of directories and files. 8 | 9 | ## Hadoop configuration 10 | 11 | Supported Hadoop version: 2.x (including 2.4). Tested with HDP 2.1. 12 | 13 | Ensure that WebHDFS is enabled in the `hdfs-site.xml`: 14 | 15 | Relevant properties: 16 | 17 | <property> 18 | <name>dfs.webhdfs.enabled</name> 19 | <value>true</value> 20 | </property> 21 | 22 | see 23 | 24 | 25 | ## Limitations 26 | 27 | * Kerberos security is not supported 28 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import sys 4 | from setuptools import setup, find_packages 5 | import webhdfs 6 | 7 | setup(name='WebHDFS', 8 | version=webhdfs.version, 9 | description='HDFS Python client based on WebHDFS REST API', 10 | author='Andre Luckow', 11 | author_email='andre.luckow@gmail.com', 12 | url='na', 13 | classifiers = ['Development Status :: 4 - Beta', 14 | 'Programming Language :: Python', 15 | 'Environment :: Console', 16 | 'Topic :: Utilities', 17 | ], 18 | platforms = ('Unix', 'Linux', 'Mac OS'), 19 | packages=['webhdfs'], 20 | data_files=['webhdfs/VERSION'], 21 | 
install_requires=[], 22 | entry_points = { 23 | 'console_scripts': [] 24 | } 25 | ) 26 | -------------------------------------------------------------------------------- /webhdfs/VERSION: -------------------------------------------------------------------------------- 1 | 0.2.0 2 | -------------------------------------------------------------------------------- /webhdfs/__init__.py: -------------------------------------------------------------------------------- 1 | # read and log the version of package 2 | import os 3 | import logging 4 | logging.basicConfig(level=logging.DEBUG, datefmt='%m/%d/%Y %I:%M:%S %p', 5 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s') 6 | logger = logging.getLogger(name='webhdfs') 7 | 8 | version = "0.2.0" 9 | 10 | try: 11 | fn = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'VERSION') 12 | version = open(fn).read().strip() 13 | logger.debug("Loading WebHDFS version: " + version) 14 | except IOError: 15 | pass 16 | -------------------------------------------------------------------------------- /webhdfs/example.py: -------------------------------------------------------------------------------- 1 | from webhdfs.webhdfs import WebHDFS 2 | import os, tempfile 3 | import time 4 | 5 | webhdfs = WebHDFS("localhost", 50070, "luckow") 6 | 7 | webhdfs.mkdir("/tmp/hello-world/") 8 | 9 | # create a temporary file 10 | f = tempfile.NamedTemporaryFile() 11 | f.write(b'Hello world!\n') 12 | f.flush() 13 | 14 | print "Upload file: " + f.name 15 | 16 | webhdfs.copyFromLocal(f.name, 17 | "/tmp/test.txt") 18 | 19 | webhdfs.copyToLocal("/hello-world/test.txt", 20 | "/tmp/test1.txt") 21 | 22 | for i in webhdfs.listdir("/hello-world/"): 23 | print str(i) 24 | 25 | f.close() 26 | -------------------------------------------------------------------------------- /webhdfs/webhdfs.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | import stat 3 | import httplib 4 | import urlparse 
import json
import urllib
import logging

try:
    # Python 2 module names.
    import httplib
    import urlparse
    from urllib import quote as _quote
except ImportError:
    # Python 3 homes of the same helpers, bound to the names used below.
    import http.client as httplib
    import urllib.parse as urlparse
    from urllib.parse import quote as _quote

logging.basicConfig(level=logging.DEBUG, datefmt='%m/%d/%Y %I:%M:%S %p',
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(name='webhdfs')

WEBHDFS_CONTEXT_ROOT = "/webhdfs/v1"


class WebHDFS(object):
    """Class for accessing HDFS via WebHDFS.

    To enable WebHDFS in your Hadoop installation add the following
    configuration to your hdfs-site.xml (requires Hadoop >0.20.205.0):

        <property>
            <name>dfs.webhdfs.enabled</name>
            <value>true</value>
        </property>

    see: https://issues.apache.org/jira/secure/attachment/12500090/WebHdfsAPI20111020.pdf

    All ``path`` arguments are raw (not percent-encoded) absolute HDFS
    paths; they are percent-encoded here before being sent.
    """

    def __init__(self, namenode_host, namenode_port, hdfs_username, timeout=600):
        """Remember how to reach the namenode; no connection is opened yet.

        namenode_host -- host name of the HDFS namenode
        namenode_port -- HTTP port of the namenode
        hdfs_username -- value sent as the ``user.name`` query parameter
        timeout       -- socket timeout in seconds for every connection
        """
        self.namenode_host = namenode_host
        self.namenode_port = namenode_port
        self.username = hdfs_username
        self.timeout = timeout

    def mkdir(self, path):
        """Send an MKDIRS request for ``path`` to the namenode.

        Raises Exception if ``path`` is not absolute.
        """
        url_path = self._request_url(path, "op=MKDIRS")
        logger.debug("Create directory: " + url_path)
        self._simple_request('PUT', url_path)

    def rmdir(self, path):
        """Send a recursive DELETE request for ``path`` to the namenode.

        Raises Exception if ``path`` is not absolute.
        """
        url_path = self._request_url(path, "op=DELETE&recursive=true")
        logger.debug("Delete directory: " + url_path)
        self._simple_request('DELETE', url_path)

    def put(self, source_data, target_path, replication=1):
        """Upload ``source_data`` to ``target_path``, overwriting it.

        The namenode answers the CREATE request with a redirect; the data
        is then sent to the redirect target.  Returns the HTTP status of
        that second request.  Raises Exception if ``target_path`` is not
        absolute and KeyError if the namenode sends no Location header.
        """
        url_path = self._request_url(target_path, "op=CREATE&overwrite=true")

        namenode = self.__getNameNodeHTTPClient()
        try:
            namenode.request('PUT', url_path, headers={})
            response = namenode.getresponse()
            logger.debug("HTTP Response: %d, %s" % (response.status, response.reason))
            host, port, redirect_path = self._redirect_target(response)
        finally:
            namenode.close()

        # Bug in WebHDFS 0.20.205 => requires param otherwise a
        # NullPointerException is thrown
        redirect_path += "&replication=" + str(replication)

        logger.debug("Send redirect to: host: %s, port: %s, path: %s " % (host, port, redirect_path))
        upload = httplib.HTTPConnection(host, port, timeout=self.timeout)
        try:
            # This requires currently Python 2.6 or higher
            upload.request('PUT', redirect_path, source_data, headers={})
            response = upload.getresponse()
            logger.debug("HTTP Response: %d, %s" % (response.status, response.reason))
            return response.status
        finally:
            upload.close()

    def get(self, source_path):
        """Return the content of ``source_path``.

        Returns None when the namenode response carries no length (which
        is how an empty file is reported).  Raises Exception if
        ``source_path`` is not absolute.
        """
        return self._fetch(source_path)[1]

    def copyFromLocal(self, source_path, target_path, replication=1):
        """Upload the local file ``source_path`` to HDFS ``target_path``.

        Returns the HTTP status reported by put().
        """
        # Binary mode: the bytes must reach HDFS unchanged.
        with open(source_path, "rb") as source_file:
            source_data = source_file.read()
        return self.put(source_data, target_path, replication)

    def copyToLocal(self, source_path, target_path):
        """Download HDFS ``source_path`` into the local file ``target_path``.

        An empty local file is created when the namenode reports no
        content.  Returns the HTTP status of the last request made.
        Raises Exception if ``source_path`` is not absolute.
        """
        status, data = self._fetch(source_path)
        # Opened only after the download succeeded, so a failed request
        # does not leave a truncated target behind.
        with open(target_path, "wb") as target_file:
            if data is not None:
                target_file.write(data)
        return status

    def listdir(self, path):
        """Return the ``pathSuffix`` of every entry listed for ``path``.

        Raises Exception if ``path`` is not absolute.
        """
        url_path = self._request_url(path, "op=LISTSTATUS")
        logger.debug("List directory: " + url_path)
        namenode = self.__getNameNodeHTTPClient()
        try:
            namenode.request('GET', url_path, headers={})
            response = namenode.getresponse()
            logger.debug("HTTP Response: %d, %s" % (response.status, response.reason))
            payload = response.read()
        finally:
            namenode.close()

        data_dict = json.loads(payload.decode("utf-8"))
        logger.debug("Data: " + str(data_dict))
        files = []
        for entry in data_dict["FileStatuses"]["FileStatus"]:
            logger.debug(entry["type"] + ": " + entry["pathSuffix"])
            files.append(entry["pathSuffix"])
        return files

    def _request_url(self, path, query):
        """Build the namenode request path for ``path`` and ``query``.

        Only the path and the user name are percent-encoded; the query
        separators must stay literal or the operation is lost.
        """
        # HDFS paths are POSIX-style on every client platform, so test for
        # the leading slash directly instead of using os.path.isabs.
        if not path.startswith("/"):
            raise Exception("Only absolute paths supported: %s" % (path))
        return "%s%s?%s&user.name=%s" % (
            WEBHDFS_CONTEXT_ROOT, _quote(path), query,
            _quote(self.username, safe=""))

    def _simple_request(self, method, url_path):
        """Send one body-less request to the namenode and log the status."""
        namenode = self.__getNameNodeHTTPClient()
        try:
            namenode.request(method, url_path, headers={})
            response = namenode.getresponse()
            logger.debug("HTTP Response: %d, %s" % (response.status, response.reason))
        finally:
            namenode.close()

    def _redirect_target(self, response):
        """Split the Location header of ``response`` into (host, port, path)."""
        location = response.getheader("location")
        if location is None:
            # Same exception type the header lookup raised before.
            raise KeyError("no Location header in HTTP %d response" % response.status)
        logger.debug("HTTP Location: %s" % (location))
        target = urlparse.urlparse(location)
        return target.hostname, target.port, target.path + "?" + target.query

    def _fetch(self, source_path):
        """Run an OPEN request and return (status, data).

        ``data`` is None when the namenode response has no length and the
        redirect is therefore not followed.
        """
        url_path = self._request_url(source_path, "op=OPEN&overwrite=true")
        logger.debug("GET URL: %s" % url_path)
        namenode = self.__getNameNodeHTTPClient()
        try:
            namenode.request('GET', url_path, headers={})
            response = namenode.getresponse()
            logger.debug("HTTP Response: %d, %s" % (response.status, response.reason))
            # if file is empty GET returns a response with length == None
            # and no location header
            if response.length is None:
                return response.status, None
            host, port, redirect_path = self._redirect_target(response)
        finally:
            namenode.close()

        logger.debug("Send redirect to: host: %s, port: %s, path: %s " % (host, port, redirect_path))
        download = httplib.HTTPConnection(host, port, timeout=self.timeout)
        try:
            download.request('GET', redirect_path, headers={})
            response = download.getresponse()
            logger.debug("HTTP Response: %d, %s" % (response.status, response.reason))
            return response.status, response.read()
        finally:
            download.close()

    def __getNameNodeHTTPClient(self):
        """Return a new, not yet connected HTTP connection to the namenode."""
        return httplib.HTTPConnection(self.namenode_host,
                                      self.namenode_port,
                                      timeout=self.timeout)


if __name__ == "__main__":
    webhdfs = WebHDFS("localhost", 50070, "luckow")
    webhdfs.mkdir("/pilotstore-1/pd-9c2d42c4-30a3-11e1-bab1-00264a13ca4c/")
    webhdfs.copyFromLocal("/Users/luckow/workspace-saga/applications/pilot-store/test/data1/test1.txt",
                          "/pilotstore-1/pd-9c2d42c4-30a3-11e1-bab1-00264a13ca4c/test1.txt")

    webhdfs.copyToLocal("/pilotstore-1/pd-9c2d42c4-30a3-11e1-bab1-00264a13ca4c/test1.txt",
                        "/tmp/test1.txt")

    webhdfs.listdir("/")