├── pydfs ├── dfs.conf ├── minion.py ├── client.py └── master.py ├── LICENSE ├── .gitignore └── README.md /pydfs/dfs.conf: -------------------------------------------------------------------------------- 1 | [master] 2 | block_size = 10 3 | replication_factor = 2 4 | minions = 1:192.168.1.8:8888,2:192.168.1.9:8888 -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Sanket 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
"""PyDFS minion (data node).

Stores blocks as files under ``DATA_DIR`` and forwards each block to the
remaining minions in the replication chain.
"""
import os
import uuid

import rpyc
from rpyc.utils.server import ThreadedServer

# Directory where block files are stored; one file per block, named by UUID.
DATA_DIR = "/tmp/minion/"


class MinionService(rpyc.Service):
    """RPyC service that exposes the ``Minion`` block-storage handle."""

    class exposed_Minion():
        """Remote object used by clients and peer minions to store/read blocks."""

        blocks = {}

        def exposed_put(self, block_uuid, data, minions):
            """Store a block locally, then pass it down the replication chain.

            :param block_uuid: identifier of the block; its ``str()`` is the
                file name under ``DATA_DIR``.
            :param data: block contents, written in text mode.
            :param minions: remaining ``(host, port)`` pairs that must also
                receive the block; empty when this minion is the last one.
            """
            with open(DATA_DIR + str(block_uuid), 'w') as f:
                f.write(data)
            if len(minions) > 0:
                self.forward(block_uuid, data, minions)

        def exposed_get(self, block_uuid):
            """Return the contents of a stored block, or ``None`` if absent."""
            block_addr = DATA_DIR + str(block_uuid)
            if not os.path.isfile(block_addr):
                return None
            with open(block_addr) as f:
                return f.read()

        def forward(self, block_uuid, data, minions):
            """Send the block to the first minion in ``minions``.

            The rest of the list is handed to that minion so it can continue
            the chain.

            :param block_uuid: identifier of the block being replicated.
            :param data: block contents.
            :param minions: non-empty sequence of ``(host, port)`` pairs.
            """
            # Single-argument print() behaves the same on Python 2 and 3.
            print("8888: forwaring to:")
            print("%s %s" % (block_uuid, minions))
            host, port = minions[0]
            remaining = minions[1:]

            con = rpyc.connect(host, port=port)
            try:
                con.root.Minion().put(block_uuid, data, remaining)
            finally:
                # The call above is synchronous, so the link is no longer
                # needed; closing it avoids leaking one socket per block.
                con.close()

        def delete_block(self, uuid):
            """Placeholder: block deletion is not implemented."""
            pass


if __name__ == "__main__":
    if not os.path.isdir(DATA_DIR):
        os.mkdir(DATA_DIR)
    t = ThreadedServer(MinionService, port=8888)
    t.start()
to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### The (simplified) code presented at SRECon is located under [srecon](https://github.com/sanketplus/PyDFS/tree/srecon) branch. 2 | 3 | --- 4 | # PyDFS 5 | Simple (~200 lines) distributed file system like HDFS (and of course GFS). It consists of one Master (NameNode), multiple Minions (DataNode), and a client for interaction. It will dump metadata/namespace when given SIGINT and reload it when fired up next time. It replicates data the way HDFS does: data is sent to one minion, that minion sends it to the next one, and so on. Reading is done in a similar manner: the client contacts the first minion for a block, and if that fails, the second, and so on. Uses RPyC for RPC. 6 | 7 | #### [Blog: Simple Distributed File System in Python : PyDFS](https://superuser.blog/distributed-file-system-python/) 8 | 9 | ### Requirements: 10 | - rpyc (Really! That's it.) 
"""Command-line client for PyDFS.

Usage::

    python client.py put sourcefile.txt sometxt
    python client.py get sometxt

The client asks the master (localhost:2131) where blocks live and then talks
to the minions directly to write or read block data.
"""
import logging
import os
import sys

import rpyc

logging.basicConfig(level=logging.DEBUG)
LOG = logging.getLogger(__name__)

# Failures treated as "this minion is unavailable": socket-level errors
# (EnvironmentError covers socket.error/IOError/OSError) and EOFError, which
# RPyC is expected to raise when a connection drops mid-call.
_MINION_ERRORS = (EnvironmentError, EOFError)


def send_to_minion(block_uuid, data, minions):
    """Write a block to the first minion and let it forward to the rest.

    :param block_uuid: identifier of the block being written.
    :param data: block contents.
    :param minions: non-empty list of ``(host, port)`` pairs; the first one
        receives the block together with the remaining pairs.
    """
    LOG.info("sending: " + str(block_uuid) + str(minions))
    host, port = minions[0]
    remaining = minions[1:]

    con = rpyc.connect(host, port=port)
    try:
        con.root.Minion().put(block_uuid, data, remaining)
    finally:
        # put() is synchronous; release the socket once it returns.
        con.close()


def read_from_minion(block_uuid, minion):
    """Fetch a block from one minion.

    :param block_uuid: identifier of the block to read.
    :param minion: ``(host, port)`` pair of the minion to contact.
    :returns: whatever the minion's ``get`` returns for that block.
    """
    host, port = minion
    con = rpyc.connect(host, port=port)
    try:
        return con.root.Minion().get(block_uuid)
    finally:
        con.close()


def get(master, fname):
    """Write the contents of ``fname`` to stdout, block by block.

    For every block the replicas are tried in order; a minion that cannot be
    reached or that returns no data is skipped in favour of the next one.

    :param master: remote ``Master`` handle.
    :param fname: name of the file in the DFS namespace.
    """
    file_table = master.get_file_table_entry(fname)
    if not file_table:
        LOG.info("404: file not found")
        return

    # One remote call instead of one per block.
    minions = master.get_minions()
    for block in file_table:
        for minion_id in block[1]:
            try:
                data = read_from_minion(block[0], minions[minion_id])
            except _MINION_ERRORS as err:
                # A dead minion must not abort the read; try the next replica.
                LOG.info("minion %s unavailable: %s", minion_id, err)
                continue
            if data:
                sys.stdout.write(data)
                break
        else:
            LOG.info("No blocks found. Possibly a corrupt file")


def put(master, source, dest):
    """Store local file ``source`` in the DFS under the name ``dest``.

    :param master: remote ``Master`` handle.
    :param source: path of the local file to upload.
    :param dest: name of the file in the DFS namespace.
    """
    size = os.path.getsize(source)
    blocks = master.write(dest, size)
    # Both values are constant for the duration of the upload, so fetch them
    # once rather than once per block.
    block_size = master.get_block_size()
    minions = master.get_minions()
    with open(source) as f:
        for block in blocks:
            data = f.read(block_size)
            targets = [minions[minion_id] for minion_id in block[1]]
            send_to_minion(block[0], data, targets)


def main(args):
    """Dispatch the ``get`` / ``put`` sub-commands.

    :param args: command-line arguments without the program name, e.g.
        ``["put", "src", "dest"]`` or ``["get", "name"]``.
    """
    # Minimum number of arguments (command included) for each sub-command.
    required = {"get": 2, "put": 3}
    command = args[0] if args else None
    if command not in required or len(args) < required[command]:
        # Validate before connecting so bad usage never raises IndexError.
        LOG.error("try 'put srcFile destFile OR get file'")
        return

    con = rpyc.connect("localhost", port=2131)
    try:
        master = con.root.Master()
        if command == "get":
            get(master, args[1])
        else:
            put(master, args[1], args[2])
    finally:
        con.close()


if __name__ == "__main__":
    main(sys.argv[1:])
"""PyDFS master (name node).

Keeps the namespace (file name -> list of ``(block_uuid, minion_ids)``) in
class-level dictionaries, hands out block placements to clients, dumps the
namespace to ``fs.img`` on SIGINT (Ctrl+C) and reloads it on start-up.
"""
import math
import os
import pickle
import random
import signal
import sys
import threading
import uuid

try:
    import ConfigParser
except ImportError:  # Python 3 renamed the module
    import configparser as ConfigParser

import rpyc
from rpyc.utils.server import ThreadedServer


def int_handler(signal, frame):
    """SIGINT handler: persist the namespace to ``fs.img`` and exit."""
    master = MasterService.exposed_Master
    with open('fs.img', 'wb') as img:
        pickle.dump((master.file_table, master.block_mapping), img)
    sys.exit(0)


def set_conf():
    """Load ``dfs.conf`` and, if present, the saved namespace ``fs.img``.

    Reads ``block_size``, ``replication_factor`` and ``minions`` (a
    comma-separated list of ``id:host:port``) from the ``[master]`` section
    and stores them on ``MasterService.exposed_Master``.
    """
    master = MasterService.exposed_Master
    conf = ConfigParser.ConfigParser()
    # readfp() no longer exists on recent Python 3; prefer read_file().
    read_file = getattr(conf, 'read_file', None) or conf.readfp
    with open('dfs.conf') as conf_file:
        read_file(conf_file)

    master.block_size = int(conf.get('master', 'block_size'))
    master.replication_factor = int(conf.get('master', 'replication_factor'))
    for entry in conf.get('master', 'minions').split(','):
        minion_id, host, port = entry.strip().split(":")
        master.minions[minion_id] = (host, port)

    if os.path.isfile('fs.img'):
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # fs.img must only ever be a file written by int_handler.
        with open('fs.img', 'rb') as img:
            master.file_table, master.block_mapping = pickle.load(img)


class MasterService(rpyc.Service):
    """RPyC service that exposes the ``Master`` namespace handle."""

    class exposed_Master():
        """Remote object holding the namespace; all state is class-level."""

        # file name -> list of (block_uuid, [minion ids])
        file_table = {}
        # Persisted alongside file_table; not populated by this class.
        block_mapping = {}
        # minion id -> (host, port), filled in by set_conf()
        minions = {}

        block_size = 0
        replication_factor = 0

        def exposed_read(self, fname):
            """Return the block list of ``fname``; raises KeyError if unknown."""
            return self.__class__.file_table[fname]

        def exposed_write(self, dest, size):
            """Allocate blocks for a file of ``size`` bytes named ``dest``.

            An existing entry for ``dest`` is replaced.

            :returns: list of ``(block_uuid, minion_ids)`` tuples.
            """
            if self.exists(dest):
                pass  # ignoring for now, will delete it later

            self.__class__.file_table[dest] = []

            num_blocks = self.calc_num_blocks(size)
            return self.alloc_blocks(dest, num_blocks)

        def exposed_get_file_table_entry(self, fname):
            """Return the block list of ``fname``, or ``None`` if unknown."""
            return self.__class__.file_table.get(fname)

        def exposed_get_block_size(self):
            """Return the configured block size."""
            return self.__class__.block_size

        def exposed_get_minions(self):
            """Return the ``minion id -> (host, port)`` mapping."""
            return self.__class__.minions

        def calc_num_blocks(self, size):
            """Return how many blocks are needed for ``size`` (rounded up)."""
            return int(math.ceil(float(size) / self.__class__.block_size))

        def exists(self, file):
            """Return True if ``file`` has an entry in the namespace."""
            return file in self.__class__.file_table

        def alloc_blocks(self, dest, num):
            """Create ``num`` blocks for ``dest`` and pick minions for each.

            Every block gets a fresh UUID and ``replication_factor`` distinct
            minion ids chosen at random; each placement is appended to
            ``file_table[dest]``.

            :returns: list of ``(block_uuid, minion_ids)`` tuples.
            """
            cls = self.__class__
            # random.sample needs a sequence; a dict keys view is rejected on
            # recent Python 3, so materialize the ids once.
            minion_ids = list(cls.minions)
            blocks = []
            for _ in range(num):
                block_uuid = uuid.uuid1()
                nodes_ids = random.sample(minion_ids, cls.replication_factor)
                blocks.append((block_uuid, nodes_ids))
                cls.file_table[dest].append((block_uuid, nodes_ids))
            return blocks


if __name__ == "__main__":
    set_conf()
    signal.signal(signal.SIGINT, int_handler)
    t = ThreadedServer(MasterService, port=2131)
    t.start()